Sfoglia il codice sorgente

More blanket gitignores messing up vendored deps.

Adam Ierymenko 3 anni fa
parent
commit
f2c12d548d

+ 0 - 1
.gitignore

@@ -125,7 +125,6 @@ attic/world/mkworld
 workspace/
 workspace2/
 zeroidc/target/
-tmp/
 
 #snapcraft specifics
 /parts/

+ 1025 - 0
zeroidc/vendor/ring/pregenerated/tmp/aesni-gcm-x86_64-nasm.asm

@@ -0,0 +1,1025 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+
+
+ALIGN	32
+_aesni_ctr32_ghash_6x:
+
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	sub	rdx,6
+	vpxor	xmm4,xmm4,xmm4
+	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]
+	vpaddb	xmm10,xmm1,xmm2
+	vpaddb	xmm11,xmm10,xmm2
+	vpaddb	xmm12,xmm11,xmm2
+	vpaddb	xmm13,xmm12,xmm2
+	vpaddb	xmm14,xmm13,xmm2
+	vpxor	xmm9,xmm1,xmm15
+	vmovdqu	XMMWORD[(16+8)+rsp],xmm4
+	jmp	NEAR $L$oop6x
+
+ALIGN	32
+$L$oop6x:
+	add	ebx,100663296
+	jc	NEAR $L$handle_ctr32
+	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
+	vpaddb	xmm1,xmm14,xmm2
+	vpxor	xmm10,xmm10,xmm15
+	vpxor	xmm11,xmm11,xmm15
+
+$L$resume_ctr32:
+	vmovdqu	XMMWORD[r8],xmm1
+	vpclmulqdq	xmm5,xmm7,xmm3,0x10
+	vpxor	xmm12,xmm12,xmm15
+	vmovups	xmm2,XMMWORD[((16-128))+rcx]
+	vpclmulqdq	xmm6,xmm7,xmm3,0x01
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	xor	r12,r12
+	cmp	r15,r14
+
+	vaesenc	xmm9,xmm9,xmm2
+	vmovdqu	xmm0,XMMWORD[((48+8))+rsp]
+	vpxor	xmm13,xmm13,xmm15
+	vpclmulqdq	xmm1,xmm7,xmm3,0x00
+	vaesenc	xmm10,xmm10,xmm2
+	vpxor	xmm14,xmm14,xmm15
+	setnc	r12b
+	vpclmulqdq	xmm7,xmm7,xmm3,0x11
+	vaesenc	xmm11,xmm11,xmm2
+	vmovdqu	xmm3,XMMWORD[((16-32))+r9]
+	neg	r12
+	vaesenc	xmm12,xmm12,xmm2
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm3,0x00
+	vpxor	xmm8,xmm8,xmm4
+	vaesenc	xmm13,xmm13,xmm2
+	vpxor	xmm4,xmm1,xmm5
+	and	r12,0x60
+	vmovups	xmm15,XMMWORD[((32-128))+rcx]
+	vpclmulqdq	xmm1,xmm0,xmm3,0x10
+	vaesenc	xmm14,xmm14,xmm2
+
+	vpclmulqdq	xmm2,xmm0,xmm3,0x01
+	lea	r14,[r12*1+r14]
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
+	vpclmulqdq	xmm3,xmm0,xmm3,0x11
+	vmovdqu	xmm0,XMMWORD[((64+8))+rsp]
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[88+r14]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[80+r14]
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((32+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((40+8))+rsp],r12
+	vmovdqu	xmm5,XMMWORD[((48-32))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((48-128))+rcx]
+	vpxor	xmm6,xmm6,xmm1
+	vpclmulqdq	xmm1,xmm0,xmm5,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm2
+	vpclmulqdq	xmm2,xmm0,xmm5,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm7,xmm7,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm5,0x01
+	vaesenc	xmm11,xmm11,xmm15
+	vpclmulqdq	xmm5,xmm0,xmm5,0x11
+	vmovdqu	xmm0,XMMWORD[((80+8))+rsp]
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm4,xmm4,xmm1
+	vmovdqu	xmm1,XMMWORD[((64-32))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((64-128))+rcx]
+	vpxor	xmm6,xmm6,xmm2
+	vpclmulqdq	xmm2,xmm0,xmm1,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm1,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[72+r14]
+	vpxor	xmm7,xmm7,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm1,0x01
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[64+r14]
+	vpclmulqdq	xmm1,xmm0,xmm1,0x11
+	vmovdqu	xmm0,XMMWORD[((96+8))+rsp]
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((48+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((56+8))+rsp],r12
+	vpxor	xmm4,xmm4,xmm2
+	vmovdqu	xmm2,XMMWORD[((96-32))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((80-128))+rcx]
+	vpxor	xmm6,xmm6,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm2,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm2,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[56+r14]
+	vpxor	xmm7,xmm7,xmm1
+	vpclmulqdq	xmm1,xmm0,xmm2,0x01
+	vpxor	xmm8,xmm8,XMMWORD[((112+8))+rsp]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[48+r14]
+	vpclmulqdq	xmm2,xmm0,xmm2,0x11
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((64+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((72+8))+rsp],r12
+	vpxor	xmm4,xmm4,xmm3
+	vmovdqu	xmm3,XMMWORD[((112-32))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((96-128))+rcx]
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm8,xmm3,0x10
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm1
+	vpclmulqdq	xmm1,xmm8,xmm3,0x01
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[40+r14]
+	vpxor	xmm7,xmm7,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm3,0x00
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[32+r14]
+	vpclmulqdq	xmm8,xmm8,xmm3,0x11
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((80+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((88+8))+rsp],r12
+	vpxor	xmm6,xmm6,xmm5
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm6,xmm6,xmm1
+
+	vmovups	xmm15,XMMWORD[((112-128))+rcx]
+	vpslldq	xmm5,xmm6,8
+	vpxor	xmm4,xmm4,xmm2
+	vmovdqu	xmm3,XMMWORD[16+r11]
+
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm7,xmm7,xmm8
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm4,xmm4,xmm5
+	movbe	r13,QWORD[24+r14]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[16+r14]
+	vpalignr	xmm0,xmm4,xmm4,8
+	vpclmulqdq	xmm4,xmm4,xmm3,0x10
+	mov	QWORD[((96+8))+rsp],r13
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((104+8))+rsp],r12
+	vaesenc	xmm13,xmm13,xmm15
+	vmovups	xmm1,XMMWORD[((128-128))+rcx]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vmovups	xmm15,XMMWORD[((144-128))+rcx]
+	vaesenc	xmm10,xmm10,xmm1
+	vpsrldq	xmm6,xmm6,8
+	vaesenc	xmm11,xmm11,xmm1
+	vpxor	xmm7,xmm7,xmm6
+	vaesenc	xmm12,xmm12,xmm1
+	vpxor	xmm4,xmm4,xmm0
+	movbe	r13,QWORD[8+r14]
+	vaesenc	xmm13,xmm13,xmm1
+	movbe	r12,QWORD[r14]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((160-128))+rcx]
+	cmp	ebp,11
+	jb	NEAR $L$enc_tail
+
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+	vmovups	xmm15,XMMWORD[((176-128))+rcx]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((192-128))+rcx]
+
+
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+	vmovups	xmm15,XMMWORD[((208-128))+rcx]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((224-128))+rcx]
+	jmp	NEAR $L$enc_tail
+
+ALIGN	32
+$L$handle_ctr32:
+	vmovdqu	xmm0,XMMWORD[r11]
+	vpshufb	xmm6,xmm1,xmm0
+	vmovdqu	xmm5,XMMWORD[48+r11]
+	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
+	vpaddd	xmm11,xmm6,xmm5
+	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
+	vpaddd	xmm12,xmm10,xmm5
+	vpshufb	xmm10,xmm10,xmm0
+	vpaddd	xmm13,xmm11,xmm5
+	vpshufb	xmm11,xmm11,xmm0
+	vpxor	xmm10,xmm10,xmm15
+	vpaddd	xmm14,xmm12,xmm5
+	vpshufb	xmm12,xmm12,xmm0
+	vpxor	xmm11,xmm11,xmm15
+	vpaddd	xmm1,xmm13,xmm5
+	vpshufb	xmm13,xmm13,xmm0
+	vpshufb	xmm14,xmm14,xmm0
+	vpshufb	xmm1,xmm1,xmm0
+	jmp	NEAR $L$resume_ctr32
+
+ALIGN	32
+$L$enc_tail:
+	vaesenc	xmm9,xmm9,xmm15
+	vmovdqu	XMMWORD[(16+8)+rsp],xmm7
+	vpalignr	xmm8,xmm4,xmm4,8
+	vaesenc	xmm10,xmm10,xmm15
+	vpclmulqdq	xmm4,xmm4,xmm3,0x10
+	vpxor	xmm2,xmm1,XMMWORD[rdi]
+	vaesenc	xmm11,xmm11,xmm15
+	vpxor	xmm0,xmm1,XMMWORD[16+rdi]
+	vaesenc	xmm12,xmm12,xmm15
+	vpxor	xmm5,xmm1,XMMWORD[32+rdi]
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm6,xmm1,XMMWORD[48+rdi]
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm7,xmm1,XMMWORD[64+rdi]
+	vpxor	xmm3,xmm1,XMMWORD[80+rdi]
+	vmovdqu	xmm1,XMMWORD[r8]
+
+	vaesenclast	xmm9,xmm9,xmm2
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	vaesenclast	xmm10,xmm10,xmm0
+	vpaddb	xmm0,xmm1,xmm2
+	mov	QWORD[((112+8))+rsp],r13
+	lea	rdi,[96+rdi]
+	vaesenclast	xmm11,xmm11,xmm5
+	vpaddb	xmm5,xmm0,xmm2
+	mov	QWORD[((120+8))+rsp],r12
+	lea	rsi,[96+rsi]
+	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]
+	vaesenclast	xmm12,xmm12,xmm6
+	vpaddb	xmm6,xmm5,xmm2
+	vaesenclast	xmm13,xmm13,xmm7
+	vpaddb	xmm7,xmm6,xmm2
+	vaesenclast	xmm14,xmm14,xmm3
+	vpaddb	xmm3,xmm7,xmm2
+
+	add	r10,0x60
+	sub	rdx,0x6
+	jc	NEAR $L$6x_done
+
+	vmovups	XMMWORD[(-96)+rsi],xmm9
+	vpxor	xmm9,xmm1,xmm15
+	vmovups	XMMWORD[(-80)+rsi],xmm10
+	vmovdqa	xmm10,xmm0
+	vmovups	XMMWORD[(-64)+rsi],xmm11
+	vmovdqa	xmm11,xmm5
+	vmovups	XMMWORD[(-48)+rsi],xmm12
+	vmovdqa	xmm12,xmm6
+	vmovups	XMMWORD[(-32)+rsi],xmm13
+	vmovdqa	xmm13,xmm7
+	vmovups	XMMWORD[(-16)+rsi],xmm14
+	vmovdqa	xmm14,xmm3
+	vmovdqu	xmm7,XMMWORD[((32+8))+rsp]
+	jmp	NEAR $L$oop6x
+
+$L$6x_done:
+	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
+	vpxor	xmm8,xmm8,xmm4
+
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_aesni_gcm_decrypt
+
+ALIGN	32
+GFp_aesni_gcm_decrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_aesni_gcm_decrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	xor	r10,r10
+
+
+
+	cmp	rdx,0x60
+	jb	NEAR $L$gcm_dec_abort
+
+	lea	rax,[rsp]
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	lea	rsp,[((-168))+rsp]
+	movaps	XMMWORD[(-216)+rax],xmm6
+	movaps	XMMWORD[(-200)+rax],xmm7
+	movaps	XMMWORD[(-184)+rax],xmm8
+	movaps	XMMWORD[(-168)+rax],xmm9
+	movaps	XMMWORD[(-152)+rax],xmm10
+	movaps	XMMWORD[(-136)+rax],xmm11
+	movaps	XMMWORD[(-120)+rax],xmm12
+	movaps	XMMWORD[(-104)+rax],xmm13
+	movaps	XMMWORD[(-88)+rax],xmm14
+	movaps	XMMWORD[(-72)+rax],xmm15
+$L$gcm_dec_body:
+	vzeroupper
+
+	vmovdqu	xmm1,XMMWORD[r8]
+	add	rsp,-128
+	mov	ebx,DWORD[12+r8]
+	lea	r11,[$L$bswap_mask]
+	lea	r14,[((-128))+rcx]
+	mov	r15,0xf80
+	vmovdqu	xmm8,XMMWORD[r9]
+	and	rsp,-128
+	vmovdqu	xmm0,XMMWORD[r11]
+	lea	rcx,[128+rcx]
+	lea	r9,[((32+32))+r9]
+	mov	ebp,DWORD[((240-128))+rcx]
+	vpshufb	xmm8,xmm8,xmm0
+
+	and	r14,r15
+	and	r15,rsp
+	sub	r15,r14
+	jc	NEAR $L$dec_no_key_aliasing
+	cmp	r15,768
+	jnc	NEAR $L$dec_no_key_aliasing
+	sub	rsp,r15
+$L$dec_no_key_aliasing:
+
+	vmovdqu	xmm7,XMMWORD[80+rdi]
+	lea	r14,[rdi]
+	vmovdqu	xmm4,XMMWORD[64+rdi]
+
+
+
+
+
+
+
+	lea	r15,[((-192))+rdx*1+rdi]
+
+	vmovdqu	xmm5,XMMWORD[48+rdi]
+	shr	rdx,4
+	xor	r10,r10
+	vmovdqu	xmm6,XMMWORD[32+rdi]
+	vpshufb	xmm7,xmm7,xmm0
+	vmovdqu	xmm2,XMMWORD[16+rdi]
+	vpshufb	xmm4,xmm4,xmm0
+	vmovdqu	xmm3,XMMWORD[rdi]
+	vpshufb	xmm5,xmm5,xmm0
+	vmovdqu	XMMWORD[48+rsp],xmm4
+	vpshufb	xmm6,xmm6,xmm0
+	vmovdqu	XMMWORD[64+rsp],xmm5
+	vpshufb	xmm2,xmm2,xmm0
+	vmovdqu	XMMWORD[80+rsp],xmm6
+	vpshufb	xmm3,xmm3,xmm0
+	vmovdqu	XMMWORD[96+rsp],xmm2
+	vmovdqu	XMMWORD[112+rsp],xmm3
+
+	call	_aesni_ctr32_ghash_6x
+
+	vmovups	XMMWORD[(-96)+rsi],xmm9
+	vmovups	XMMWORD[(-80)+rsi],xmm10
+	vmovups	XMMWORD[(-64)+rsi],xmm11
+	vmovups	XMMWORD[(-48)+rsi],xmm12
+	vmovups	XMMWORD[(-32)+rsi],xmm13
+	vmovups	XMMWORD[(-16)+rsi],xmm14
+
+	vpshufb	xmm8,xmm8,XMMWORD[r11]
+	vmovdqu	XMMWORD[(-64)+r9],xmm8
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((-216))+rax]
+	movaps	xmm7,XMMWORD[((-200))+rax]
+	movaps	xmm8,XMMWORD[((-184))+rax]
+	movaps	xmm9,XMMWORD[((-168))+rax]
+	movaps	xmm10,XMMWORD[((-152))+rax]
+	movaps	xmm11,XMMWORD[((-136))+rax]
+	movaps	xmm12,XMMWORD[((-120))+rax]
+	movaps	xmm13,XMMWORD[((-104))+rax]
+	movaps	xmm14,XMMWORD[((-88))+rax]
+	movaps	xmm15,XMMWORD[((-72))+rax]
+	mov	r15,QWORD[((-48))+rax]
+
+	mov	r14,QWORD[((-40))+rax]
+
+	mov	r13,QWORD[((-32))+rax]
+
+	mov	r12,QWORD[((-24))+rax]
+
+	mov	rbp,QWORD[((-16))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+
+	lea	rsp,[rax]
+
+$L$gcm_dec_abort:
+	mov	rax,r10
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_aesni_gcm_decrypt:
+
+ALIGN	32
+_aesni_ctr32_6x:
+
+	vmovdqu	xmm4,XMMWORD[((0-128))+rcx]
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	lea	r13,[((-1))+rbp]
+	vmovups	xmm15,XMMWORD[((16-128))+rcx]
+	lea	r12,[((32-128))+rcx]
+	vpxor	xmm9,xmm1,xmm4
+	add	ebx,100663296
+	jc	NEAR $L$handle_ctr32_2
+	vpaddb	xmm10,xmm1,xmm2
+	vpaddb	xmm11,xmm10,xmm2
+	vpxor	xmm10,xmm10,xmm4
+	vpaddb	xmm12,xmm11,xmm2
+	vpxor	xmm11,xmm11,xmm4
+	vpaddb	xmm13,xmm12,xmm2
+	vpxor	xmm12,xmm12,xmm4
+	vpaddb	xmm14,xmm13,xmm2
+	vpxor	xmm13,xmm13,xmm4
+	vpaddb	xmm1,xmm14,xmm2
+	vpxor	xmm14,xmm14,xmm4
+	jmp	NEAR $L$oop_ctr32
+
+ALIGN	16
+$L$oop_ctr32:
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+	vmovups	xmm15,XMMWORD[r12]
+	lea	r12,[16+r12]
+	dec	r13d
+	jnz	NEAR $L$oop_ctr32
+
+	vmovdqu	xmm3,XMMWORD[r12]
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm4,xmm3,XMMWORD[rdi]
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm5,xmm3,XMMWORD[16+rdi]
+	vaesenc	xmm11,xmm11,xmm15
+	vpxor	xmm6,xmm3,XMMWORD[32+rdi]
+	vaesenc	xmm12,xmm12,xmm15
+	vpxor	xmm8,xmm3,XMMWORD[48+rdi]
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm2,xmm3,XMMWORD[64+rdi]
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm3,xmm3,XMMWORD[80+rdi]
+	lea	rdi,[96+rdi]
+
+	vaesenclast	xmm9,xmm9,xmm4
+	vaesenclast	xmm10,xmm10,xmm5
+	vaesenclast	xmm11,xmm11,xmm6
+	vaesenclast	xmm12,xmm12,xmm8
+	vaesenclast	xmm13,xmm13,xmm2
+	vaesenclast	xmm14,xmm14,xmm3
+	vmovups	XMMWORD[rsi],xmm9
+	vmovups	XMMWORD[16+rsi],xmm10
+	vmovups	XMMWORD[32+rsi],xmm11
+	vmovups	XMMWORD[48+rsi],xmm12
+	vmovups	XMMWORD[64+rsi],xmm13
+	vmovups	XMMWORD[80+rsi],xmm14
+	lea	rsi,[96+rsi]
+
+	DB	0F3h,0C3h		;repret
+ALIGN	32
+$L$handle_ctr32_2:
+	vpshufb	xmm6,xmm1,xmm0
+	vmovdqu	xmm5,XMMWORD[48+r11]
+	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
+	vpaddd	xmm11,xmm6,xmm5
+	vpaddd	xmm12,xmm10,xmm5
+	vpshufb	xmm10,xmm10,xmm0
+	vpaddd	xmm13,xmm11,xmm5
+	vpshufb	xmm11,xmm11,xmm0
+	vpxor	xmm10,xmm10,xmm4
+	vpaddd	xmm14,xmm12,xmm5
+	vpshufb	xmm12,xmm12,xmm0
+	vpxor	xmm11,xmm11,xmm4
+	vpaddd	xmm1,xmm13,xmm5
+	vpshufb	xmm13,xmm13,xmm0
+	vpxor	xmm12,xmm12,xmm4
+	vpshufb	xmm14,xmm14,xmm0
+	vpxor	xmm13,xmm13,xmm4
+	vpshufb	xmm1,xmm1,xmm0
+	vpxor	xmm14,xmm14,xmm4
+	jmp	NEAR $L$oop_ctr32
+
+
+
+global	GFp_aesni_gcm_encrypt
+
+ALIGN	32
+GFp_aesni_gcm_encrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_aesni_gcm_encrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	xor	r10,r10
+
+
+
+
+	cmp	rdx,0x60*3
+	jb	NEAR $L$gcm_enc_abort
+
+	lea	rax,[rsp]
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	lea	rsp,[((-168))+rsp]
+	movaps	XMMWORD[(-216)+rax],xmm6
+	movaps	XMMWORD[(-200)+rax],xmm7
+	movaps	XMMWORD[(-184)+rax],xmm8
+	movaps	XMMWORD[(-168)+rax],xmm9
+	movaps	XMMWORD[(-152)+rax],xmm10
+	movaps	XMMWORD[(-136)+rax],xmm11
+	movaps	XMMWORD[(-120)+rax],xmm12
+	movaps	XMMWORD[(-104)+rax],xmm13
+	movaps	XMMWORD[(-88)+rax],xmm14
+	movaps	XMMWORD[(-72)+rax],xmm15
+$L$gcm_enc_body:
+	vzeroupper
+
+	vmovdqu	xmm1,XMMWORD[r8]
+	add	rsp,-128
+	mov	ebx,DWORD[12+r8]
+	lea	r11,[$L$bswap_mask]
+	lea	r14,[((-128))+rcx]
+	mov	r15,0xf80
+	lea	rcx,[128+rcx]
+	vmovdqu	xmm0,XMMWORD[r11]
+	and	rsp,-128
+	mov	ebp,DWORD[((240-128))+rcx]
+
+	and	r14,r15
+	and	r15,rsp
+	sub	r15,r14
+	jc	NEAR $L$enc_no_key_aliasing
+	cmp	r15,768
+	jnc	NEAR $L$enc_no_key_aliasing
+	sub	rsp,r15
+$L$enc_no_key_aliasing:
+
+	lea	r14,[rsi]
+
+
+
+
+
+
+
+
+	lea	r15,[((-192))+rdx*1+rsi]
+
+	shr	rdx,4
+
+	call	_aesni_ctr32_6x
+	vpshufb	xmm8,xmm9,xmm0
+	vpshufb	xmm2,xmm10,xmm0
+	vmovdqu	XMMWORD[112+rsp],xmm8
+	vpshufb	xmm4,xmm11,xmm0
+	vmovdqu	XMMWORD[96+rsp],xmm2
+	vpshufb	xmm5,xmm12,xmm0
+	vmovdqu	XMMWORD[80+rsp],xmm4
+	vpshufb	xmm6,xmm13,xmm0
+	vmovdqu	XMMWORD[64+rsp],xmm5
+	vpshufb	xmm7,xmm14,xmm0
+	vmovdqu	XMMWORD[48+rsp],xmm6
+
+	call	_aesni_ctr32_6x
+
+	vmovdqu	xmm8,XMMWORD[r9]
+	lea	r9,[((32+32))+r9]
+	sub	rdx,12
+	mov	r10,0x60*2
+	vpshufb	xmm8,xmm8,xmm0
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	xmm7,XMMWORD[32+rsp]
+	vmovdqu	xmm0,XMMWORD[r11]
+	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
+	vpunpckhqdq	xmm1,xmm7,xmm7
+	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
+	vmovups	XMMWORD[(-96)+rsi],xmm9
+	vpshufb	xmm9,xmm9,xmm0
+	vpxor	xmm1,xmm1,xmm7
+	vmovups	XMMWORD[(-80)+rsi],xmm10
+	vpshufb	xmm10,xmm10,xmm0
+	vmovups	XMMWORD[(-64)+rsi],xmm11
+	vpshufb	xmm11,xmm11,xmm0
+	vmovups	XMMWORD[(-48)+rsi],xmm12
+	vpshufb	xmm12,xmm12,xmm0
+	vmovups	XMMWORD[(-32)+rsi],xmm13
+	vpshufb	xmm13,xmm13,xmm0
+	vmovups	XMMWORD[(-16)+rsi],xmm14
+	vpshufb	xmm14,xmm14,xmm0
+	vmovdqu	XMMWORD[16+rsp],xmm9
+	vmovdqu	xmm6,XMMWORD[48+rsp]
+	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
+	vpunpckhqdq	xmm2,xmm6,xmm6
+	vpclmulqdq	xmm5,xmm7,xmm3,0x00
+	vpxor	xmm2,xmm2,xmm6
+	vpclmulqdq	xmm7,xmm7,xmm3,0x11
+	vpclmulqdq	xmm1,xmm1,xmm15,0x00
+
+	vmovdqu	xmm9,XMMWORD[64+rsp]
+	vpclmulqdq	xmm4,xmm6,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm5,xmm9,xmm9
+	vpclmulqdq	xmm6,xmm6,xmm0,0x11
+	vpxor	xmm5,xmm5,xmm9
+	vpxor	xmm6,xmm6,xmm7
+	vpclmulqdq	xmm2,xmm2,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
+	vpxor	xmm2,xmm2,xmm1
+
+	vmovdqu	xmm1,XMMWORD[80+rsp]
+	vpclmulqdq	xmm7,xmm9,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
+	vpxor	xmm7,xmm7,xmm4
+	vpunpckhqdq	xmm4,xmm1,xmm1
+	vpclmulqdq	xmm9,xmm9,xmm3,0x11
+	vpxor	xmm4,xmm4,xmm1
+	vpxor	xmm9,xmm9,xmm6
+	vpclmulqdq	xmm5,xmm5,xmm15,0x00
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm2,XMMWORD[96+rsp]
+	vpclmulqdq	xmm6,xmm1,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
+	vpxor	xmm6,xmm6,xmm7
+	vpunpckhqdq	xmm7,xmm2,xmm2
+	vpclmulqdq	xmm1,xmm1,xmm0,0x11
+	vpxor	xmm7,xmm7,xmm2
+	vpxor	xmm1,xmm1,xmm9
+	vpclmulqdq	xmm4,xmm4,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
+	vpxor	xmm4,xmm4,xmm5
+
+	vpxor	xmm8,xmm8,XMMWORD[112+rsp]
+	vpclmulqdq	xmm5,xmm2,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
+	vpunpckhqdq	xmm9,xmm8,xmm8
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm2,xmm2,xmm3,0x11
+	vpxor	xmm9,xmm9,xmm8
+	vpxor	xmm2,xmm2,xmm1
+	vpclmulqdq	xmm7,xmm7,xmm15,0x00
+	vpxor	xmm4,xmm7,xmm4
+
+	vpclmulqdq	xmm6,xmm8,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
+	vpunpckhqdq	xmm1,xmm14,xmm14
+	vpclmulqdq	xmm8,xmm8,xmm0,0x11
+	vpxor	xmm1,xmm1,xmm14
+	vpxor	xmm5,xmm6,xmm5
+	vpclmulqdq	xmm9,xmm9,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
+	vpxor	xmm7,xmm8,xmm2
+	vpxor	xmm6,xmm9,xmm4
+
+	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
+	vpxor	xmm9,xmm7,xmm5
+	vpclmulqdq	xmm4,xmm14,xmm3,0x00
+	vpxor	xmm6,xmm6,xmm9
+	vpunpckhqdq	xmm2,xmm13,xmm13
+	vpclmulqdq	xmm14,xmm14,xmm3,0x11
+	vpxor	xmm2,xmm2,xmm13
+	vpslldq	xmm9,xmm6,8
+	vpclmulqdq	xmm1,xmm1,xmm15,0x00
+	vpxor	xmm8,xmm5,xmm9
+	vpsrldq	xmm6,xmm6,8
+	vpxor	xmm7,xmm7,xmm6
+
+	vpclmulqdq	xmm5,xmm13,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
+	vpxor	xmm5,xmm5,xmm4
+	vpunpckhqdq	xmm9,xmm12,xmm12
+	vpclmulqdq	xmm13,xmm13,xmm0,0x11
+	vpxor	xmm9,xmm9,xmm12
+	vpxor	xmm13,xmm13,xmm14
+	vpalignr	xmm14,xmm8,xmm8,8
+	vpclmulqdq	xmm2,xmm2,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
+	vpxor	xmm2,xmm2,xmm1
+
+	vpclmulqdq	xmm4,xmm12,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm1,xmm11,xmm11
+	vpclmulqdq	xmm12,xmm12,xmm3,0x11
+	vpxor	xmm1,xmm1,xmm11
+	vpxor	xmm12,xmm12,xmm13
+	vxorps	xmm7,xmm7,XMMWORD[16+rsp]
+	vpclmulqdq	xmm9,xmm9,xmm15,0x00
+	vpxor	xmm9,xmm9,xmm2
+
+	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
+	vxorps	xmm8,xmm8,xmm14
+
+	vpclmulqdq	xmm5,xmm11,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
+	vpxor	xmm5,xmm5,xmm4
+	vpunpckhqdq	xmm2,xmm10,xmm10
+	vpclmulqdq	xmm11,xmm11,xmm0,0x11
+	vpxor	xmm2,xmm2,xmm10
+	vpalignr	xmm14,xmm8,xmm8,8
+	vpxor	xmm11,xmm11,xmm12
+	vpclmulqdq	xmm1,xmm1,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
+	vpxor	xmm1,xmm1,xmm9
+
+	vxorps	xmm14,xmm14,xmm7
+	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
+	vxorps	xmm8,xmm8,xmm14
+
+	vpclmulqdq	xmm4,xmm10,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm9,xmm8,xmm8
+	vpclmulqdq	xmm10,xmm10,xmm3,0x11
+	vpxor	xmm9,xmm9,xmm8
+	vpxor	xmm10,xmm10,xmm11
+	vpclmulqdq	xmm2,xmm2,xmm15,0x00
+	vpxor	xmm2,xmm2,xmm1
+
+	vpclmulqdq	xmm5,xmm8,xmm0,0x00
+	vpclmulqdq	xmm7,xmm8,xmm0,0x11
+	vpxor	xmm5,xmm5,xmm4
+	vpclmulqdq	xmm6,xmm9,xmm15,0x10
+	vpxor	xmm7,xmm7,xmm10
+	vpxor	xmm6,xmm6,xmm2
+
+	vpxor	xmm4,xmm7,xmm5
+	vpxor	xmm6,xmm6,xmm4
+	vpslldq	xmm1,xmm6,8
+	vmovdqu	xmm3,XMMWORD[16+r11]
+	vpsrldq	xmm6,xmm6,8
+	vpxor	xmm8,xmm5,xmm1
+	vpxor	xmm7,xmm7,xmm6
+
+	vpalignr	xmm2,xmm8,xmm8,8
+	vpclmulqdq	xmm8,xmm8,xmm3,0x10
+	vpxor	xmm8,xmm8,xmm2
+
+	vpalignr	xmm2,xmm8,xmm8,8
+	vpclmulqdq	xmm8,xmm8,xmm3,0x10
+	vpxor	xmm2,xmm2,xmm7
+	vpxor	xmm8,xmm8,xmm2
+	vpshufb	xmm8,xmm8,XMMWORD[r11]
+	vmovdqu	XMMWORD[(-64)+r9],xmm8
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((-216))+rax]
+	movaps	xmm7,XMMWORD[((-200))+rax]
+	movaps	xmm8,XMMWORD[((-184))+rax]
+	movaps	xmm9,XMMWORD[((-168))+rax]
+	movaps	xmm10,XMMWORD[((-152))+rax]
+	movaps	xmm11,XMMWORD[((-136))+rax]
+	movaps	xmm12,XMMWORD[((-120))+rax]
+	movaps	xmm13,XMMWORD[((-104))+rax]
+	movaps	xmm14,XMMWORD[((-88))+rax]
+	movaps	xmm15,XMMWORD[((-72))+rax]
+	mov	r15,QWORD[((-48))+rax]
+
+	mov	r14,QWORD[((-40))+rax]
+
+	mov	r13,QWORD[((-32))+rax]
+
+	mov	r12,QWORD[((-24))+rax]
+
+	mov	rbp,QWORD[((-16))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+
+	lea	rsp,[rax]
+
+$L$gcm_enc_abort:
+	mov	rax,r10
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_aesni_gcm_encrypt:
+ALIGN	64
+$L$bswap_mask:
+DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$poly:
+DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+$L$one_msb:
+DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$two_lsb:
+DB	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+$L$one_lsb:
+DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+DB	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108
+DB	101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
+DB	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+DB	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ALIGN	64
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+gcm_se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[120+r8]
+
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	rbx,QWORD[((-8))+rax]
+	mov	QWORD[240+r8],r15
+	mov	QWORD[232+r8],r14
+	mov	QWORD[224+r8],r13
+	mov	QWORD[216+r8],r12
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[144+r8],rbx
+
+	lea	rsi,[((-216))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_aesni_gcm_decrypt wrt ..imagebase
+	DD	$L$SEH_end_GFp_aesni_gcm_decrypt wrt ..imagebase
+	DD	$L$SEH_gcm_dec_info wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_aesni_gcm_encrypt wrt ..imagebase
+	DD	$L$SEH_end_GFp_aesni_gcm_encrypt wrt ..imagebase
+	DD	$L$SEH_GFp_gcm_enc_info wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_gcm_dec_info:
+DB	9,0,0,0
+	DD	gcm_se_handler wrt ..imagebase
+	DD	$L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase
+$L$SEH_GFp_gcm_enc_info:
+DB	9,0,0,0
+	DD	gcm_se_handler wrt ..imagebase
+	DD	$L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase

+ 682 - 0
zeroidc/vendor/ring/pregenerated/tmp/aesni-x86-win32n.asm

@@ -0,0 +1,682 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
[email protected] equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+;extern	_GFp_ia32cap_P
+global	_GFp_aes_hw_encrypt
+align	16
+_GFp_aes_hw_encrypt:
+L$_GFp_aes_hw_encrypt_begin:
+	mov	eax,DWORD [4+esp]
+	mov	edx,DWORD [12+esp]
+	movups	xmm2,[eax]
+	mov	ecx,DWORD [240+edx]
+	mov	eax,DWORD [8+esp]
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$000enc1_loop_1:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$000enc1_loop_1
+db	102,15,56,221,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movups	[eax],xmm2
+	pxor	xmm2,xmm2
+	ret
+align	16
+__aesni_encrypt2:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+	add	ecx,16
+L$001enc2_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$001enc2_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,221,208
+db	102,15,56,221,216
+	ret
+align	16
+__aesni_encrypt3:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+	add	ecx,16
+L$002enc3_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+db	102,15,56,220,224
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$002enc3_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+db	102,15,56,221,208
+db	102,15,56,221,216
+db	102,15,56,221,224
+	ret
+align	16
+__aesni_encrypt4:
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	shl	ecx,4
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm0
+	movups	xmm0,[32+edx]
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+db	15,31,64,0
+	add	ecx,16
+L$003enc4_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+db	102,15,56,220,233
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+db	102,15,56,220,224
+db	102,15,56,220,232
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$003enc4_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+db	102,15,56,220,233
+db	102,15,56,221,208
+db	102,15,56,221,216
+db	102,15,56,221,224
+db	102,15,56,221,232
+	ret
+align	16
+__aesni_encrypt6:
+	movups	xmm0,[edx]
+	shl	ecx,4
+	movups	xmm1,[16+edx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+db	102,15,56,220,209
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+db	102,15,56,220,217
+	lea	edx,[32+ecx*1+edx]
+	neg	ecx
+db	102,15,56,220,225
+	pxor	xmm7,xmm0
+	movups	xmm0,[ecx*1+edx]
+	add	ecx,16
+	jmp	NEAR L$004_aesni_encrypt6_inner
+align	16
+L$005enc6_loop:
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+L$004_aesni_encrypt6_inner:
+db	102,15,56,220,233
+db	102,15,56,220,241
+db	102,15,56,220,249
+L$_aesni_encrypt6_enter:
+	movups	xmm1,[ecx*1+edx]
+	add	ecx,32
+db	102,15,56,220,208
+db	102,15,56,220,216
+db	102,15,56,220,224
+db	102,15,56,220,232
+db	102,15,56,220,240
+db	102,15,56,220,248
+	movups	xmm0,[ecx*1+edx-16]
+	jnz	NEAR L$005enc6_loop
+db	102,15,56,220,209
+db	102,15,56,220,217
+db	102,15,56,220,225
+db	102,15,56,220,233
+db	102,15,56,220,241
+db	102,15,56,220,249
+db	102,15,56,221,208
+db	102,15,56,221,216
+db	102,15,56,221,224
+db	102,15,56,221,232
+db	102,15,56,221,240
+db	102,15,56,221,248
+	ret
+global	_GFp_aes_hw_ctr32_encrypt_blocks
+align	16
+_GFp_aes_hw_ctr32_encrypt_blocks:
+L$_GFp_aes_hw_ctr32_encrypt_blocks_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	mov	eax,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebx,DWORD [36+esp]
+	mov	ebp,esp
+	sub	esp,88
+	and	esp,-16
+	mov	DWORD [80+esp],ebp
+	cmp	eax,1
+	je	NEAR L$006ctr32_one_shortcut
+	movdqu	xmm7,[ebx]
+	mov	DWORD [esp],202182159
+	mov	DWORD [4+esp],134810123
+	mov	DWORD [8+esp],67438087
+	mov	DWORD [12+esp],66051
+	mov	ecx,6
+	xor	ebp,ebp
+	mov	DWORD [16+esp],ecx
+	mov	DWORD [20+esp],ecx
+	mov	DWORD [24+esp],ecx
+	mov	DWORD [28+esp],ebp
+db	102,15,58,22,251,3
+db	102,15,58,34,253,3
+	mov	ecx,DWORD [240+edx]
+	bswap	ebx
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movdqa	xmm2,[esp]
+db	102,15,58,34,195,0
+	lea	ebp,[3+ebx]
+db	102,15,58,34,205,0
+	inc	ebx
+db	102,15,58,34,195,1
+	inc	ebp
+db	102,15,58,34,205,1
+	inc	ebx
+db	102,15,58,34,195,2
+	inc	ebp
+db	102,15,58,34,205,2
+	movdqa	[48+esp],xmm0
+db	102,15,56,0,194
+	movdqu	xmm6,[edx]
+	movdqa	[64+esp],xmm1
+db	102,15,56,0,202
+	pshufd	xmm2,xmm0,192
+	pshufd	xmm3,xmm0,128
+	cmp	eax,6
+	jb	NEAR L$007ctr32_tail
+	pxor	xmm7,xmm6
+	shl	ecx,4
+	mov	ebx,16
+	movdqa	[32+esp],xmm7
+	mov	ebp,edx
+	sub	ebx,ecx
+	lea	edx,[32+ecx*1+edx]
+	sub	eax,6
+	jmp	NEAR L$008ctr32_loop6
+align	16
+L$008ctr32_loop6:
+	pshufd	xmm4,xmm0,64
+	movdqa	xmm0,[32+esp]
+	pshufd	xmm5,xmm1,192
+	pxor	xmm2,xmm0
+	pshufd	xmm6,xmm1,128
+	pxor	xmm3,xmm0
+	pshufd	xmm7,xmm1,64
+	movups	xmm1,[16+ebp]
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm0
+db	102,15,56,220,209
+	pxor	xmm6,xmm0
+	pxor	xmm7,xmm0
+db	102,15,56,220,217
+	movups	xmm0,[32+ebp]
+	mov	ecx,ebx
+db	102,15,56,220,225
+db	102,15,56,220,233
+db	102,15,56,220,241
+db	102,15,56,220,249
+	call	L$_aesni_encrypt6_enter
+	movups	xmm1,[esi]
+	movups	xmm0,[16+esi]
+	xorps	xmm2,xmm1
+	movups	xmm1,[32+esi]
+	xorps	xmm3,xmm0
+	movups	[edi],xmm2
+	movdqa	xmm0,[16+esp]
+	xorps	xmm4,xmm1
+	movdqa	xmm1,[64+esp]
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	paddd	xmm1,xmm0
+	paddd	xmm0,[48+esp]
+	movdqa	xmm2,[esp]
+	movups	xmm3,[48+esi]
+	movups	xmm4,[64+esi]
+	xorps	xmm5,xmm3
+	movups	xmm3,[80+esi]
+	lea	esi,[96+esi]
+	movdqa	[48+esp],xmm0
+db	102,15,56,0,194
+	xorps	xmm6,xmm4
+	movups	[48+edi],xmm5
+	xorps	xmm7,xmm3
+	movdqa	[64+esp],xmm1
+db	102,15,56,0,202
+	movups	[64+edi],xmm6
+	pshufd	xmm2,xmm0,192
+	movups	[80+edi],xmm7
+	lea	edi,[96+edi]
+	pshufd	xmm3,xmm0,128
+	sub	eax,6
+	jnc	NEAR L$008ctr32_loop6
+	add	eax,6
+	jz	NEAR L$009ctr32_ret
+	movdqu	xmm7,[ebp]
+	mov	edx,ebp
+	pxor	xmm7,[32+esp]
+	mov	ecx,DWORD [240+ebp]
+L$007ctr32_tail:
+	por	xmm2,xmm7
+	cmp	eax,2
+	jb	NEAR L$010ctr32_one
+	pshufd	xmm4,xmm0,64
+	por	xmm3,xmm7
+	je	NEAR L$011ctr32_two
+	pshufd	xmm5,xmm1,192
+	por	xmm4,xmm7
+	cmp	eax,4
+	jb	NEAR L$012ctr32_three
+	pshufd	xmm6,xmm1,128
+	por	xmm5,xmm7
+	je	NEAR L$013ctr32_four
+	por	xmm6,xmm7
+	call	__aesni_encrypt6
+	movups	xmm1,[esi]
+	movups	xmm0,[16+esi]
+	xorps	xmm2,xmm1
+	movups	xmm1,[32+esi]
+	xorps	xmm3,xmm0
+	movups	xmm0,[48+esi]
+	xorps	xmm4,xmm1
+	movups	xmm1,[64+esi]
+	xorps	xmm5,xmm0
+	movups	[edi],xmm2
+	xorps	xmm6,xmm1
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+	movups	[64+edi],xmm6
+	jmp	NEAR L$009ctr32_ret
+align	16
+L$006ctr32_one_shortcut:
+	movups	xmm2,[ebx]
+	mov	ecx,DWORD [240+edx]
+L$010ctr32_one:
+	movups	xmm0,[edx]
+	movups	xmm1,[16+edx]
+	lea	edx,[32+edx]
+	xorps	xmm2,xmm0
+L$014enc1_loop_2:
+db	102,15,56,220,209
+	dec	ecx
+	movups	xmm1,[edx]
+	lea	edx,[16+edx]
+	jnz	NEAR L$014enc1_loop_2
+db	102,15,56,221,209
+	movups	xmm6,[esi]
+	xorps	xmm6,xmm2
+	movups	[edi],xmm6
+	jmp	NEAR L$009ctr32_ret
+align	16
+L$011ctr32_two:
+	call	__aesni_encrypt2
+	movups	xmm5,[esi]
+	movups	xmm6,[16+esi]
+	xorps	xmm2,xmm5
+	xorps	xmm3,xmm6
+	movups	[edi],xmm2
+	movups	[16+edi],xmm3
+	jmp	NEAR L$009ctr32_ret
+align	16
+L$012ctr32_three:
+	call	__aesni_encrypt3
+	movups	xmm5,[esi]
+	movups	xmm6,[16+esi]
+	xorps	xmm2,xmm5
+	movups	xmm7,[32+esi]
+	xorps	xmm3,xmm6
+	movups	[edi],xmm2
+	xorps	xmm4,xmm7
+	movups	[16+edi],xmm3
+	movups	[32+edi],xmm4
+	jmp	NEAR L$009ctr32_ret
+align	16
+L$013ctr32_four:
+	call	__aesni_encrypt4
+	movups	xmm6,[esi]
+	movups	xmm7,[16+esi]
+	movups	xmm1,[32+esi]
+	xorps	xmm2,xmm6
+	movups	xmm0,[48+esi]
+	xorps	xmm3,xmm7
+	movups	[edi],xmm2
+	xorps	xmm4,xmm1
+	movups	[16+edi],xmm3
+	xorps	xmm5,xmm0
+	movups	[32+edi],xmm4
+	movups	[48+edi],xmm5
+L$009ctr32_ret:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	movdqa	[32+esp],xmm0
+	pxor	xmm5,xmm5
+	movdqa	[48+esp],xmm0
+	pxor	xmm6,xmm6
+	movdqa	[64+esp],xmm0
+	pxor	xmm7,xmm7
+	mov	esp,DWORD [80+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	16
+__aesni_set_encrypt_key:
+	push	ebp
+	push	ebx
+	test	eax,eax
+	jz	NEAR L$015bad_pointer
+	test	edx,edx
+	jz	NEAR L$015bad_pointer
+	call	L$016pic
+L$016pic:
+	pop	ebx
+	lea	ebx,[(L$key_const-L$016pic)+ebx]
+	lea	ebp,[_GFp_ia32cap_P]
+	movups	xmm0,[eax]
+	xorps	xmm4,xmm4
+	mov	ebp,DWORD [4+ebp]
+	lea	edx,[16+edx]
+	and	ebp,268437504
+	cmp	ecx,256
+	je	NEAR L$01714rounds
+	cmp	ecx,128
+	jne	NEAR L$018bad_keybits
+align	16
+L$01910rounds:
+	cmp	ebp,268435456
+	je	NEAR L$02010rounds_alt
+	mov	ecx,9
+	movups	[edx-16],xmm0
+db	102,15,58,223,200,1
+	call	L$021key_128_cold
+db	102,15,58,223,200,2
+	call	L$022key_128
+db	102,15,58,223,200,4
+	call	L$022key_128
+db	102,15,58,223,200,8
+	call	L$022key_128
+db	102,15,58,223,200,16
+	call	L$022key_128
+db	102,15,58,223,200,32
+	call	L$022key_128
+db	102,15,58,223,200,64
+	call	L$022key_128
+db	102,15,58,223,200,128
+	call	L$022key_128
+db	102,15,58,223,200,27
+	call	L$022key_128
+db	102,15,58,223,200,54
+	call	L$022key_128
+	movups	[edx],xmm0
+	mov	DWORD [80+edx],ecx
+	jmp	NEAR L$023good_key
+align	16
+L$022key_128:
+	movups	[edx],xmm0
+	lea	edx,[16+edx]
+L$021key_128_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	ret
+align	16
+L$02010rounds_alt:
+	movdqa	xmm5,[ebx]
+	mov	ecx,8
+	movdqa	xmm4,[32+ebx]
+	movdqa	xmm2,xmm0
+	movdqu	[edx-16],xmm0
+L$024loop_key128:
+db	102,15,56,0,197
+db	102,15,56,221,196
+	pslld	xmm4,1
+	lea	edx,[16+edx]
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm2
+	movdqu	[edx-16],xmm0
+	movdqa	xmm2,xmm0
+	dec	ecx
+	jnz	NEAR L$024loop_key128
+	movdqa	xmm4,[48+ebx]
+db	102,15,56,0,197
+db	102,15,56,221,196
+	pslld	xmm4,1
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm2
+	movdqu	[edx],xmm0
+	movdqa	xmm2,xmm0
+db	102,15,56,0,197
+db	102,15,56,221,196
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm2
+	movdqu	[16+edx],xmm0
+	mov	ecx,9
+	mov	DWORD [96+edx],ecx
+	jmp	NEAR L$023good_key
+align	16
+L$01714rounds:
+	movups	xmm2,[16+eax]
+	lea	edx,[16+edx]
+	cmp	ebp,268435456
+	je	NEAR L$02514rounds_alt
+	mov	ecx,13
+	movups	[edx-32],xmm0
+	movups	[edx-16],xmm2
+db	102,15,58,223,202,1
+	call	L$026key_256a_cold
+db	102,15,58,223,200,1
+	call	L$027key_256b
+db	102,15,58,223,202,2
+	call	L$028key_256a
+db	102,15,58,223,200,2
+	call	L$027key_256b
+db	102,15,58,223,202,4
+	call	L$028key_256a
+db	102,15,58,223,200,4
+	call	L$027key_256b
+db	102,15,58,223,202,8
+	call	L$028key_256a
+db	102,15,58,223,200,8
+	call	L$027key_256b
+db	102,15,58,223,202,16
+	call	L$028key_256a
+db	102,15,58,223,200,16
+	call	L$027key_256b
+db	102,15,58,223,202,32
+	call	L$028key_256a
+db	102,15,58,223,200,32
+	call	L$027key_256b
+db	102,15,58,223,202,64
+	call	L$028key_256a
+	movups	[edx],xmm0
+	mov	DWORD [16+edx],ecx
+	xor	eax,eax
+	jmp	NEAR L$023good_key
+align	16
+L$028key_256a:
+	movups	[edx],xmm2
+	lea	edx,[16+edx]
+L$026key_256a_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	ret
+align	16
+L$027key_256b:
+	movups	[edx],xmm0
+	lea	edx,[16+edx]
+	shufps	xmm4,xmm2,16
+	xorps	xmm2,xmm4
+	shufps	xmm4,xmm2,140
+	xorps	xmm2,xmm4
+	shufps	xmm1,xmm1,170
+	xorps	xmm2,xmm1
+	ret
+align	16
+L$02514rounds_alt:
+	movdqa	xmm5,[ebx]
+	movdqa	xmm4,[32+ebx]
+	mov	ecx,7
+	movdqu	[edx-32],xmm0
+	movdqa	xmm1,xmm2
+	movdqu	[edx-16],xmm2
+L$029loop_key256:
+db	102,15,56,0,213
+db	102,15,56,221,212
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm0,xmm3
+	pslld	xmm4,1
+	pxor	xmm0,xmm2
+	movdqu	[edx],xmm0
+	dec	ecx
+	jz	NEAR L$030done_key256
+	pshufd	xmm2,xmm0,255
+	pxor	xmm3,xmm3
+db	102,15,56,221,211
+	movdqa	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm1,xmm3
+	pxor	xmm2,xmm1
+	movdqu	[16+edx],xmm2
+	lea	edx,[32+edx]
+	movdqa	xmm1,xmm2
+	jmp	NEAR L$029loop_key256
+L$030done_key256:
+	mov	ecx,13
+	mov	DWORD [16+edx],ecx
+L$023good_key:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	xor	eax,eax
+	pop	ebx
+	pop	ebp
+	ret
+align	4
+L$015bad_pointer:
+	mov	eax,-1
+	pop	ebx
+	pop	ebp
+	ret
+align	4
+L$018bad_keybits:
+	pxor	xmm0,xmm0
+	mov	eax,-2
+	pop	ebx
+	pop	ebp
+	ret
+global	_GFp_aes_hw_set_encrypt_key
+align	16
+_GFp_aes_hw_set_encrypt_key:
+L$_GFp_aes_hw_set_encrypt_key_begin:
+	mov	eax,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	edx,DWORD [12+esp]
+	call	__aesni_set_encrypt_key
+	ret
+align	64
+L$key_const:
+dd	202313229,202313229,202313229,202313229
+dd	67569157,67569157,67569157,67569157
+dd	1,1,1,1
+dd	27,27,27,27
+db	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+db	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+db	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+db	115,108,46,111,114,103,62,0
+segment	.bss
+common	_GFp_ia32cap_P 16

+ 1311 - 0
zeroidc/vendor/ring/pregenerated/tmp/aesni-x86_64-nasm.asm

@@ -0,0 +1,1311 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+EXTERN	GFp_ia32cap_P
+global	GFp_aes_hw_encrypt
+
+ALIGN	16
+GFp_aes_hw_encrypt:
+
+	movups	xmm2,XMMWORD[rcx]
+	mov	eax,DWORD[240+r8]
+	movups	xmm0,XMMWORD[r8]
+	movups	xmm1,XMMWORD[16+r8]
+	lea	r8,[32+r8]
+	xorps	xmm2,xmm0
+$L$oop_enc1_1:
+DB	102,15,56,220,209
+	dec	eax
+	movups	xmm1,XMMWORD[r8]
+	lea	r8,[16+r8]
+	jnz	NEAR $L$oop_enc1_1
+DB	102,15,56,221,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	movups	XMMWORD[rdx],xmm2
+	pxor	xmm2,xmm2
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	16
+_aesni_encrypt2:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	add	rax,16
+
+$L$enc_loop2:
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop2
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,221,208
+DB	102,15,56,221,216
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	16
+_aesni_encrypt3:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	xorps	xmm4,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+	add	rax,16
+
+$L$enc_loop3:
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop3
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,221,208
+DB	102,15,56,221,216
+DB	102,15,56,221,224
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	16
+_aesni_encrypt4:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	xorps	xmm4,xmm0
+	xorps	xmm5,xmm0
+	movups	xmm0,XMMWORD[32+rcx]
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+DB	0x0f,0x1f,0x00
+	add	rax,16
+
+$L$enc_loop4:
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop4
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,221,208
+DB	102,15,56,221,216
+DB	102,15,56,221,224
+DB	102,15,56,221,232
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	16
+_aesni_encrypt6:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm0
+DB	102,15,56,220,209
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+DB	102,15,56,220,217
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+DB	102,15,56,220,225
+	pxor	xmm7,xmm0
+	movups	xmm0,XMMWORD[rax*1+rcx]
+	add	rax,16
+	jmp	NEAR $L$enc_loop6_enter
+ALIGN	16
+$L$enc_loop6:
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+$L$enc_loop6_enter:
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop6
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,15,56,221,208
+DB	102,15,56,221,216
+DB	102,15,56,221,224
+DB	102,15,56,221,232
+DB	102,15,56,221,240
+DB	102,15,56,221,248
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	16
+_aesni_encrypt8:
+
+	movups	xmm0,XMMWORD[rcx]
+	shl	eax,4
+	movups	xmm1,XMMWORD[16+rcx]
+	xorps	xmm2,xmm0
+	xorps	xmm3,xmm0
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm0
+	pxor	xmm6,xmm0
+	lea	rcx,[32+rax*1+rcx]
+	neg	rax
+DB	102,15,56,220,209
+	pxor	xmm7,xmm0
+	pxor	xmm8,xmm0
+DB	102,15,56,220,217
+	pxor	xmm9,xmm0
+	movups	xmm0,XMMWORD[rax*1+rcx]
+	add	rax,16
+	jmp	NEAR $L$enc_loop8_inner
+ALIGN	16
+$L$enc_loop8:
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+$L$enc_loop8_inner:
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+$L$enc_loop8_enter:
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+DB	102,68,15,56,220,192
+DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$enc_loop8
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+DB	102,15,56,221,208
+DB	102,15,56,221,216
+DB	102,15,56,221,224
+DB	102,15,56,221,232
+DB	102,15,56,221,240
+DB	102,15,56,221,248
+DB	102,68,15,56,221,192
+DB	102,68,15,56,221,200
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_aes_hw_ctr32_encrypt_blocks
+
+ALIGN	16
+GFp_aes_hw_ctr32_encrypt_blocks:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_aes_hw_ctr32_encrypt_blocks:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+	cmp	rdx,1
+	jne	NEAR $L$ctr32_bulk
+
+
+
+	movups	xmm2,XMMWORD[r8]
+	movups	xmm3,XMMWORD[rdi]
+	mov	edx,DWORD[240+rcx]
+	movups	xmm0,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+rcx]
+	lea	rcx,[32+rcx]
+	xorps	xmm2,xmm0
+$L$oop_enc1_2:
+DB	102,15,56,220,209
+	dec	edx
+	movups	xmm1,XMMWORD[rcx]
+	lea	rcx,[16+rcx]
+	jnz	NEAR $L$oop_enc1_2
+DB	102,15,56,221,209
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	xorps	xmm2,xmm3
+	pxor	xmm3,xmm3
+	movups	XMMWORD[rsi],xmm2
+	xorps	xmm2,xmm2
+	jmp	NEAR $L$ctr32_epilogue
+
+ALIGN	16
+$L$ctr32_bulk:
+	lea	r11,[rsp]
+
+	push	rbp
+
+	sub	rsp,288
+	and	rsp,-16
+	movaps	XMMWORD[(-168)+r11],xmm6
+	movaps	XMMWORD[(-152)+r11],xmm7
+	movaps	XMMWORD[(-136)+r11],xmm8
+	movaps	XMMWORD[(-120)+r11],xmm9
+	movaps	XMMWORD[(-104)+r11],xmm10
+	movaps	XMMWORD[(-88)+r11],xmm11
+	movaps	XMMWORD[(-72)+r11],xmm12
+	movaps	XMMWORD[(-56)+r11],xmm13
+	movaps	XMMWORD[(-40)+r11],xmm14
+	movaps	XMMWORD[(-24)+r11],xmm15
+$L$ctr32_body:
+
+
+
+
+	movdqu	xmm2,XMMWORD[r8]
+	movdqu	xmm0,XMMWORD[rcx]
+	mov	r8d,DWORD[12+r8]
+	pxor	xmm2,xmm0
+	mov	ebp,DWORD[12+rcx]
+	movdqa	XMMWORD[rsp],xmm2
+	bswap	r8d
+	movdqa	xmm3,xmm2
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm2
+	movdqa	XMMWORD[64+rsp],xmm2
+	movdqa	XMMWORD[80+rsp],xmm2
+	movdqa	XMMWORD[96+rsp],xmm2
+	mov	r10,rdx
+	movdqa	XMMWORD[112+rsp],xmm2
+
+	lea	rax,[1+r8]
+	lea	rdx,[2+r8]
+	bswap	eax
+	bswap	edx
+	xor	eax,ebp
+	xor	edx,ebp
+DB	102,15,58,34,216,3
+	lea	rax,[3+r8]
+	movdqa	XMMWORD[16+rsp],xmm3
+DB	102,15,58,34,226,3
+	bswap	eax
+	mov	rdx,r10
+	lea	r10,[4+r8]
+	movdqa	XMMWORD[32+rsp],xmm4
+	xor	eax,ebp
+	bswap	r10d
+DB	102,15,58,34,232,3
+	xor	r10d,ebp
+	movdqa	XMMWORD[48+rsp],xmm5
+	lea	r9,[5+r8]
+	mov	DWORD[((64+12))+rsp],r10d
+	bswap	r9d
+	lea	r10,[6+r8]
+	mov	eax,DWORD[240+rcx]
+	xor	r9d,ebp
+	bswap	r10d
+	mov	DWORD[((80+12))+rsp],r9d
+	xor	r10d,ebp
+	lea	r9,[7+r8]
+	mov	DWORD[((96+12))+rsp],r10d
+	bswap	r9d
+	lea	r10,[GFp_ia32cap_P]
+	mov	r10d,DWORD[4+r10]
+	xor	r9d,ebp
+	and	r10d,71303168
+	mov	DWORD[((112+12))+rsp],r9d
+
+	movups	xmm1,XMMWORD[16+rcx]
+
+	movdqa	xmm6,XMMWORD[64+rsp]
+	movdqa	xmm7,XMMWORD[80+rsp]
+
+	cmp	rdx,8
+	jb	NEAR $L$ctr32_tail
+
+	sub	rdx,6
+	cmp	r10d,4194304
+	je	NEAR $L$ctr32_6x
+
+	lea	rcx,[128+rcx]
+	sub	rdx,2
+	jmp	NEAR $L$ctr32_loop8
+
+ALIGN	16
+$L$ctr32_6x:
+	shl	eax,4
+	mov	r10d,48
+	bswap	ebp
+	lea	rcx,[32+rax*1+rcx]
+	sub	r10,rax
+	jmp	NEAR $L$ctr32_loop6
+
+ALIGN	16
+$L$ctr32_loop6:
+	add	r8d,6
+	movups	xmm0,XMMWORD[((-48))+r10*1+rcx]
+DB	102,15,56,220,209
+	mov	eax,r8d
+	xor	eax,ebp
+DB	102,15,56,220,217
+DB	0x0f,0x38,0xf1,0x44,0x24,12
+	lea	eax,[1+r8]
+DB	102,15,56,220,225
+	xor	eax,ebp
+DB	0x0f,0x38,0xf1,0x44,0x24,28
+DB	102,15,56,220,233
+	lea	eax,[2+r8]
+	xor	eax,ebp
+DB	102,15,56,220,241
+DB	0x0f,0x38,0xf1,0x44,0x24,44
+	lea	eax,[3+r8]
+DB	102,15,56,220,249
+	movups	xmm1,XMMWORD[((-32))+r10*1+rcx]
+	xor	eax,ebp
+
+DB	102,15,56,220,208
+DB	0x0f,0x38,0xf1,0x44,0x24,60
+	lea	eax,[4+r8]
+DB	102,15,56,220,216
+	xor	eax,ebp
+DB	0x0f,0x38,0xf1,0x44,0x24,76
+DB	102,15,56,220,224
+	lea	eax,[5+r8]
+	xor	eax,ebp
+DB	102,15,56,220,232
+DB	0x0f,0x38,0xf1,0x44,0x24,92
+	mov	rax,r10
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+	movups	xmm0,XMMWORD[((-16))+r10*1+rcx]
+
+	call	$L$enc_loop6
+
+	movdqu	xmm8,XMMWORD[rdi]
+	movdqu	xmm9,XMMWORD[16+rdi]
+	movdqu	xmm10,XMMWORD[32+rdi]
+	movdqu	xmm11,XMMWORD[48+rdi]
+	movdqu	xmm12,XMMWORD[64+rdi]
+	movdqu	xmm13,XMMWORD[80+rdi]
+	lea	rdi,[96+rdi]
+	movups	xmm1,XMMWORD[((-64))+r10*1+rcx]
+	pxor	xmm8,xmm2
+	movaps	xmm2,XMMWORD[rsp]
+	pxor	xmm9,xmm3
+	movaps	xmm3,XMMWORD[16+rsp]
+	pxor	xmm10,xmm4
+	movaps	xmm4,XMMWORD[32+rsp]
+	pxor	xmm11,xmm5
+	movaps	xmm5,XMMWORD[48+rsp]
+	pxor	xmm12,xmm6
+	movaps	xmm6,XMMWORD[64+rsp]
+	pxor	xmm13,xmm7
+	movaps	xmm7,XMMWORD[80+rsp]
+	movdqu	XMMWORD[rsi],xmm8
+	movdqu	XMMWORD[16+rsi],xmm9
+	movdqu	XMMWORD[32+rsi],xmm10
+	movdqu	XMMWORD[48+rsi],xmm11
+	movdqu	XMMWORD[64+rsi],xmm12
+	movdqu	XMMWORD[80+rsi],xmm13
+	lea	rsi,[96+rsi]
+
+	sub	rdx,6
+	jnc	NEAR $L$ctr32_loop6
+
+	add	rdx,6
+	jz	NEAR $L$ctr32_done
+
+	lea	eax,[((-48))+r10]
+	lea	rcx,[((-80))+r10*1+rcx]
+	neg	eax
+	shr	eax,4
+	jmp	NEAR $L$ctr32_tail
+
+ALIGN	32
+$L$ctr32_loop8:
+	add	r8d,8
+	movdqa	xmm8,XMMWORD[96+rsp]
+DB	102,15,56,220,209
+	mov	r9d,r8d
+	movdqa	xmm9,XMMWORD[112+rsp]
+DB	102,15,56,220,217
+	bswap	r9d
+	movups	xmm0,XMMWORD[((32-128))+rcx]
+DB	102,15,56,220,225
+	xor	r9d,ebp
+	nop
+DB	102,15,56,220,233
+	mov	DWORD[((0+12))+rsp],r9d
+	lea	r9,[1+r8]
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((48-128))+rcx]
+	bswap	r9d
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+	xor	r9d,ebp
+DB	0x66,0x90
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+	mov	DWORD[((16+12))+rsp],r9d
+	lea	r9,[2+r8]
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+DB	102,68,15,56,220,192
+DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((64-128))+rcx]
+	bswap	r9d
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+	xor	r9d,ebp
+DB	0x66,0x90
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	mov	DWORD[((32+12))+rsp],r9d
+	lea	r9,[3+r8]
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((80-128))+rcx]
+	bswap	r9d
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+	xor	r9d,ebp
+DB	0x66,0x90
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+	mov	DWORD[((48+12))+rsp],r9d
+	lea	r9,[4+r8]
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+DB	102,68,15,56,220,192
+DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((96-128))+rcx]
+	bswap	r9d
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+	xor	r9d,ebp
+DB	0x66,0x90
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	mov	DWORD[((64+12))+rsp],r9d
+	lea	r9,[5+r8]
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((112-128))+rcx]
+	bswap	r9d
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+	xor	r9d,ebp
+DB	0x66,0x90
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+	mov	DWORD[((80+12))+rsp],r9d
+	lea	r9,[6+r8]
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+DB	102,68,15,56,220,192
+DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((128-128))+rcx]
+	bswap	r9d
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+	xor	r9d,ebp
+DB	0x66,0x90
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	mov	DWORD[((96+12))+rsp],r9d
+	lea	r9,[7+r8]
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((144-128))+rcx]
+	bswap	r9d
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+	xor	r9d,ebp
+	movdqu	xmm10,XMMWORD[rdi]
+DB	102,15,56,220,232
+	mov	DWORD[((112+12))+rsp],r9d
+	cmp	eax,11
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+DB	102,68,15,56,220,192
+DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((160-128))+rcx]
+
+	jb	NEAR $L$ctr32_enc_done
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((176-128))+rcx]
+
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+DB	102,68,15,56,220,192
+DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((192-128))+rcx]
+
+
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+	movups	xmm1,XMMWORD[((208-128))+rcx]
+
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+DB	102,68,15,56,220,192
+DB	102,68,15,56,220,200
+	movups	xmm0,XMMWORD[((224-128))+rcx]
+	jmp	NEAR $L$ctr32_enc_done
+
+ALIGN	16
+$L$ctr32_enc_done:
+	movdqu	xmm11,XMMWORD[16+rdi]
+	pxor	xmm10,xmm0
+	movdqu	xmm12,XMMWORD[32+rdi]
+	pxor	xmm11,xmm0
+	movdqu	xmm13,XMMWORD[48+rdi]
+	pxor	xmm12,xmm0
+	movdqu	xmm14,XMMWORD[64+rdi]
+	pxor	xmm13,xmm0
+	movdqu	xmm15,XMMWORD[80+rdi]
+	pxor	xmm14,xmm0
+	pxor	xmm15,xmm0
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+DB	102,68,15,56,220,201
+	movdqu	xmm1,XMMWORD[96+rdi]
+	lea	rdi,[128+rdi]
+
+DB	102,65,15,56,221,210
+	pxor	xmm1,xmm0
+	movdqu	xmm10,XMMWORD[((112-128))+rdi]
+DB	102,65,15,56,221,219
+	pxor	xmm10,xmm0
+	movdqa	xmm11,XMMWORD[rsp]
+DB	102,65,15,56,221,228
+DB	102,65,15,56,221,237
+	movdqa	xmm12,XMMWORD[16+rsp]
+	movdqa	xmm13,XMMWORD[32+rsp]
+DB	102,65,15,56,221,246
+DB	102,65,15,56,221,255
+	movdqa	xmm14,XMMWORD[48+rsp]
+	movdqa	xmm15,XMMWORD[64+rsp]
+DB	102,68,15,56,221,193
+	movdqa	xmm0,XMMWORD[80+rsp]
+	movups	xmm1,XMMWORD[((16-128))+rcx]
+DB	102,69,15,56,221,202
+
+	movups	XMMWORD[rsi],xmm2
+	movdqa	xmm2,xmm11
+	movups	XMMWORD[16+rsi],xmm3
+	movdqa	xmm3,xmm12
+	movups	XMMWORD[32+rsi],xmm4
+	movdqa	xmm4,xmm13
+	movups	XMMWORD[48+rsi],xmm5
+	movdqa	xmm5,xmm14
+	movups	XMMWORD[64+rsi],xmm6
+	movdqa	xmm6,xmm15
+	movups	XMMWORD[80+rsi],xmm7
+	movdqa	xmm7,xmm0
+	movups	XMMWORD[96+rsi],xmm8
+	movups	XMMWORD[112+rsi],xmm9
+	lea	rsi,[128+rsi]
+
+	sub	rdx,8
+	jnc	NEAR $L$ctr32_loop8
+
+	add	rdx,8
+	jz	NEAR $L$ctr32_done
+	lea	rcx,[((-128))+rcx]
+
+$L$ctr32_tail:
+
+
+	lea	rcx,[16+rcx]
+	cmp	rdx,4
+	jb	NEAR $L$ctr32_loop3
+	je	NEAR $L$ctr32_loop4
+
+
+	shl	eax,4
+	movdqa	xmm8,XMMWORD[96+rsp]
+	pxor	xmm9,xmm9
+
+	movups	xmm0,XMMWORD[16+rcx]
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+	lea	rcx,[((32-16))+rax*1+rcx]
+	neg	rax
+DB	102,15,56,220,225
+	add	rax,16
+	movups	xmm10,XMMWORD[rdi]
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+	movups	xmm11,XMMWORD[16+rdi]
+	movups	xmm12,XMMWORD[32+rdi]
+DB	102,15,56,220,249
+DB	102,68,15,56,220,193
+
+	call	$L$enc_loop8_enter
+
+	movdqu	xmm13,XMMWORD[48+rdi]
+	pxor	xmm2,xmm10
+	movdqu	xmm10,XMMWORD[64+rdi]
+	pxor	xmm3,xmm11
+	movdqu	XMMWORD[rsi],xmm2
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[16+rsi],xmm3
+	pxor	xmm5,xmm13
+	movdqu	XMMWORD[32+rsi],xmm4
+	pxor	xmm6,xmm10
+	movdqu	XMMWORD[48+rsi],xmm5
+	movdqu	XMMWORD[64+rsi],xmm6
+	cmp	rdx,6
+	jb	NEAR $L$ctr32_done
+
+	movups	xmm11,XMMWORD[80+rdi]
+	xorps	xmm7,xmm11
+	movups	XMMWORD[80+rsi],xmm7
+	je	NEAR $L$ctr32_done
+
+	movups	xmm12,XMMWORD[96+rdi]
+	xorps	xmm8,xmm12
+	movups	XMMWORD[96+rsi],xmm8
+	jmp	NEAR $L$ctr32_done
+
+ALIGN	32
+$L$ctr32_loop4:
+DB	102,15,56,220,209
+	lea	rcx,[16+rcx]
+	dec	eax
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	movups	xmm1,XMMWORD[rcx]
+	jnz	NEAR $L$ctr32_loop4
+DB	102,15,56,221,209
+DB	102,15,56,221,217
+	movups	xmm10,XMMWORD[rdi]
+	movups	xmm11,XMMWORD[16+rdi]
+DB	102,15,56,221,225
+DB	102,15,56,221,233
+	movups	xmm12,XMMWORD[32+rdi]
+	movups	xmm13,XMMWORD[48+rdi]
+
+	xorps	xmm2,xmm10
+	movups	XMMWORD[rsi],xmm2
+	xorps	xmm3,xmm11
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm4,xmm12
+	movdqu	XMMWORD[32+rsi],xmm4
+	pxor	xmm5,xmm13
+	movdqu	XMMWORD[48+rsi],xmm5
+	jmp	NEAR $L$ctr32_done
+
+ALIGN	32
+$L$ctr32_loop3:
+DB	102,15,56,220,209
+	lea	rcx,[16+rcx]
+	dec	eax
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+	movups	xmm1,XMMWORD[rcx]
+	jnz	NEAR $L$ctr32_loop3
+DB	102,15,56,221,209
+DB	102,15,56,221,217
+DB	102,15,56,221,225
+
+	movups	xmm10,XMMWORD[rdi]
+	xorps	xmm2,xmm10
+	movups	XMMWORD[rsi],xmm2
+	cmp	rdx,2
+	jb	NEAR $L$ctr32_done
+
+	movups	xmm11,XMMWORD[16+rdi]
+	xorps	xmm3,xmm11
+	movups	XMMWORD[16+rsi],xmm3
+	je	NEAR $L$ctr32_done
+
+	movups	xmm12,XMMWORD[32+rdi]
+	xorps	xmm4,xmm12
+	movups	XMMWORD[32+rsi],xmm4
+
+$L$ctr32_done:
+	xorps	xmm0,xmm0
+	xor	ebp,ebp
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movaps	xmm6,XMMWORD[((-168))+r11]
+	movaps	XMMWORD[(-168)+r11],xmm0
+	movaps	xmm7,XMMWORD[((-152))+r11]
+	movaps	XMMWORD[(-152)+r11],xmm0
+	movaps	xmm8,XMMWORD[((-136))+r11]
+	movaps	XMMWORD[(-136)+r11],xmm0
+	movaps	xmm9,XMMWORD[((-120))+r11]
+	movaps	XMMWORD[(-120)+r11],xmm0
+	movaps	xmm10,XMMWORD[((-104))+r11]
+	movaps	XMMWORD[(-104)+r11],xmm0
+	movaps	xmm11,XMMWORD[((-88))+r11]
+	movaps	XMMWORD[(-88)+r11],xmm0
+	movaps	xmm12,XMMWORD[((-72))+r11]
+	movaps	XMMWORD[(-72)+r11],xmm0
+	movaps	xmm13,XMMWORD[((-56))+r11]
+	movaps	XMMWORD[(-56)+r11],xmm0
+	movaps	xmm14,XMMWORD[((-40))+r11]
+	movaps	XMMWORD[(-40)+r11],xmm0
+	movaps	xmm15,XMMWORD[((-24))+r11]
+	movaps	XMMWORD[(-24)+r11],xmm0
+	movaps	XMMWORD[rsp],xmm0
+	movaps	XMMWORD[16+rsp],xmm0
+	movaps	XMMWORD[32+rsp],xmm0
+	movaps	XMMWORD[48+rsp],xmm0
+	movaps	XMMWORD[64+rsp],xmm0
+	movaps	XMMWORD[80+rsp],xmm0
+	movaps	XMMWORD[96+rsp],xmm0
+	movaps	XMMWORD[112+rsp],xmm0
+	mov	rbp,QWORD[((-8))+r11]
+
+	lea	rsp,[r11]
+
+$L$ctr32_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_aes_hw_ctr32_encrypt_blocks:
+global	GFp_aes_hw_set_encrypt_key
+
+ALIGN	16
+GFp_aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+
+DB	0x48,0x83,0xEC,0x08
+
+	mov	rax,-1
+	test	rcx,rcx
+	jz	NEAR $L$enc_key_ret
+	test	r8,r8
+	jz	NEAR $L$enc_key_ret
+
+	movups	xmm0,XMMWORD[rcx]
+	xorps	xmm4,xmm4
+	lea	r10,[GFp_ia32cap_P]
+	mov	r10d,DWORD[4+r10]
+	and	r10d,268437504
+	lea	rax,[16+r8]
+	cmp	edx,256
+	je	NEAR $L$14rounds
+
+	cmp	edx,128
+	jne	NEAR $L$bad_keybits
+
+$L$10rounds:
+	mov	edx,9
+	cmp	r10d,268435456
+	je	NEAR $L$10rounds_alt
+
+	movups	XMMWORD[r8],xmm0
+DB	102,15,58,223,200,1
+	call	$L$key_expansion_128_cold
+DB	102,15,58,223,200,2
+	call	$L$key_expansion_128
+DB	102,15,58,223,200,4
+	call	$L$key_expansion_128
+DB	102,15,58,223,200,8
+	call	$L$key_expansion_128
+DB	102,15,58,223,200,16
+	call	$L$key_expansion_128
+DB	102,15,58,223,200,32
+	call	$L$key_expansion_128
+DB	102,15,58,223,200,64
+	call	$L$key_expansion_128
+DB	102,15,58,223,200,128
+	call	$L$key_expansion_128
+DB	102,15,58,223,200,27
+	call	$L$key_expansion_128
+DB	102,15,58,223,200,54
+	call	$L$key_expansion_128
+	movups	XMMWORD[rax],xmm0
+	mov	DWORD[80+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$10rounds_alt:
+	movdqa	xmm5,XMMWORD[$L$key_rotate]
+	mov	r10d,8
+	movdqa	xmm4,XMMWORD[$L$key_rcon1]
+	movdqa	xmm2,xmm0
+	movdqu	XMMWORD[r8],xmm0
+	jmp	NEAR $L$oop_key128
+
+ALIGN	16
+$L$oop_key128:
+DB	102,15,56,0,197
+DB	102,15,56,221,196
+	pslld	xmm4,1
+	lea	rax,[16+rax]
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[(-16)+rax],xmm0
+	movdqa	xmm2,xmm0
+
+	dec	r10d
+	jnz	NEAR $L$oop_key128
+
+	movdqa	xmm4,XMMWORD[$L$key_rcon1b]
+
+DB	102,15,56,0,197
+DB	102,15,56,221,196
+	pslld	xmm4,1
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[rax],xmm0
+
+	movdqa	xmm2,xmm0
+DB	102,15,56,0,197
+DB	102,15,56,221,196
+
+	movdqa	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm3,xmm2
+	pslldq	xmm2,4
+	pxor	xmm2,xmm3
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[16+rax],xmm0
+
+	mov	DWORD[96+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret
+
+
+
+ALIGN	16
+$L$14rounds:
+	movups	xmm2,XMMWORD[16+rcx]
+	mov	edx,13
+	lea	rax,[16+rax]
+	cmp	r10d,268435456
+	je	NEAR $L$14rounds_alt
+
+	movups	XMMWORD[r8],xmm0
+	movups	XMMWORD[16+r8],xmm2
+DB	102,15,58,223,202,1
+	call	$L$key_expansion_256a_cold
+DB	102,15,58,223,200,1
+	call	$L$key_expansion_256b
+DB	102,15,58,223,202,2
+	call	$L$key_expansion_256a
+DB	102,15,58,223,200,2
+	call	$L$key_expansion_256b
+DB	102,15,58,223,202,4
+	call	$L$key_expansion_256a
+DB	102,15,58,223,200,4
+	call	$L$key_expansion_256b
+DB	102,15,58,223,202,8
+	call	$L$key_expansion_256a
+DB	102,15,58,223,200,8
+	call	$L$key_expansion_256b
+DB	102,15,58,223,202,16
+	call	$L$key_expansion_256a
+DB	102,15,58,223,200,16
+	call	$L$key_expansion_256b
+DB	102,15,58,223,202,32
+	call	$L$key_expansion_256a
+DB	102,15,58,223,200,32
+	call	$L$key_expansion_256b
+DB	102,15,58,223,202,64
+	call	$L$key_expansion_256a
+	movups	XMMWORD[rax],xmm0
+	mov	DWORD[16+rax],edx
+	xor	rax,rax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$14rounds_alt:
+	movdqa	xmm5,XMMWORD[$L$key_rotate]
+	movdqa	xmm4,XMMWORD[$L$key_rcon1]
+	mov	r10d,7
+	movdqu	XMMWORD[r8],xmm0
+	movdqa	xmm1,xmm2
+	movdqu	XMMWORD[16+r8],xmm2
+	jmp	NEAR $L$oop_key256
+
+ALIGN	16
+$L$oop_key256:
+DB	102,15,56,0,213
+DB	102,15,56,221,212
+
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm3,xmm0
+	pslldq	xmm0,4
+	pxor	xmm0,xmm3
+	pslld	xmm4,1
+
+	pxor	xmm0,xmm2
+	movdqu	XMMWORD[rax],xmm0
+
+	dec	r10d
+	jz	NEAR $L$done_key256
+
+	pshufd	xmm2,xmm0,0xff
+	pxor	xmm3,xmm3
+DB	102,15,56,221,211
+
+	movdqa	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm3,xmm1
+	pslldq	xmm1,4
+	pxor	xmm1,xmm3
+
+	pxor	xmm2,xmm1
+	movdqu	XMMWORD[16+rax],xmm2
+	lea	rax,[32+rax]
+	movdqa	xmm1,xmm2
+
+	jmp	NEAR $L$oop_key256
+
+$L$done_key256:
+	mov	DWORD[16+rax],edx
+	xor	eax,eax
+	jmp	NEAR $L$enc_key_ret
+
+ALIGN	16
+$L$bad_keybits:
+	mov	rax,-2
+$L$enc_key_ret:
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	add	rsp,8
+
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_set_encrypt_key:
+
+ALIGN	16
+$L$key_expansion_128:
+	movups	XMMWORD[rax],xmm0
+	lea	rax,[16+rax]
+$L$key_expansion_128_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	DB	0F3h,0C3h		;repret
+
+ALIGN	16
+$L$key_expansion_192a:
+	movups	XMMWORD[rax],xmm0
+	lea	rax,[16+rax]
+$L$key_expansion_192a_cold:
+	movaps	xmm5,xmm2
+$L$key_expansion_192b_warm:
+	shufps	xmm4,xmm0,16
+	movdqa	xmm3,xmm2
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	pslldq	xmm3,4
+	xorps	xmm0,xmm4
+	pshufd	xmm1,xmm1,85
+	pxor	xmm2,xmm3
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm0,255
+	pxor	xmm2,xmm3
+	DB	0F3h,0C3h		;repret
+
+ALIGN	16
+$L$key_expansion_192b:
+	movaps	xmm3,xmm0
+	shufps	xmm5,xmm0,68
+	movups	XMMWORD[rax],xmm5
+	shufps	xmm3,xmm2,78
+	movups	XMMWORD[16+rax],xmm3
+	lea	rax,[32+rax]
+	jmp	NEAR $L$key_expansion_192b_warm
+
+ALIGN	16
+$L$key_expansion_256a:
+	movups	XMMWORD[rax],xmm2
+	lea	rax,[16+rax]
+$L$key_expansion_256a_cold:
+	shufps	xmm4,xmm0,16
+	xorps	xmm0,xmm4
+	shufps	xmm4,xmm0,140
+	xorps	xmm0,xmm4
+	shufps	xmm1,xmm1,255
+	xorps	xmm0,xmm1
+	DB	0F3h,0C3h		;repret
+
+ALIGN	16
+$L$key_expansion_256b:
+	movups	XMMWORD[rax],xmm0
+	lea	rax,[16+rax]
+
+	shufps	xmm4,xmm2,16
+	xorps	xmm2,xmm4
+	shufps	xmm4,xmm2,140
+	xorps	xmm2,xmm4
+	shufps	xmm1,xmm1,170
+	xorps	xmm2,xmm1
+	DB	0F3h,0C3h		;repret
+
+
+ALIGN	64
+$L$bswap_mask:
+DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$increment32:
+	DD	6,6,6,0
+$L$increment64:
+	DD	1,0,0,0
+$L$increment1:
+DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$key_rotate:
+	DD	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+$L$key_rotate192:
+	DD	0x04070605,0x04070605,0x04070605,0x04070605
+$L$key_rcon1:
+	DD	1,1,1,1
+$L$key_rcon1b:
+	DD	0x1b,0x1b,0x1b,0x1b
+
+DB	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+DB	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+DB	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+DB	115,108,46,111,114,103,62,0
+ALIGN	64
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+ctr_xts_se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[208+r8]
+
+	lea	rsi,[((-168))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+
+	mov	rbp,QWORD[((-8))+rax]
+	mov	QWORD[160+r8],rbp
+
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_aes_hw_ctr32_encrypt_blocks wrt ..imagebase
+	DD	$L$SEH_end_GFp_aes_hw_ctr32_encrypt_blocks wrt ..imagebase
+	DD	$L$SEH_info_GFp_ctr32 wrt ..imagebase
+	DD	GFp_aes_hw_set_encrypt_key wrt ..imagebase
+	DD	$L$SEH_end_GFp_set_encrypt_key wrt ..imagebase
+	DD	$L$SEH_info_GFp_key wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_GFp_ctr32:
+DB	9,0,0,0
+	DD	ctr_xts_se_handler wrt ..imagebase
+	DD	$L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase
+$L$SEH_info_GFp_key:
+DB	0x01,0x04,0x01,0x00
+DB	0x04,0x02,0x00,0x00

+ 973 - 0
zeroidc/vendor/ring/pregenerated/tmp/chacha-x86-win32n.asm

@@ -0,0 +1,973 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
[email protected] equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_GFp_ChaCha20_ctr32
+align	16
+_GFp_ChaCha20_ctr32:
+L$_GFp_ChaCha20_ctr32_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	xor	eax,eax
+	cmp	eax,DWORD [28+esp]
+	je	NEAR L$000no_data
+	call	L$pic_point
+L$pic_point:
+	pop	eax
+	lea	ebp,[_GFp_ia32cap_P]
+	test	DWORD [ebp],16777216
+	jz	NEAR L$001x86
+	test	DWORD [4+ebp],512
+	jz	NEAR L$001x86
+	jmp	NEAR L$ssse3_shortcut
+L$001x86:
+	mov	esi,DWORD [32+esp]
+	mov	edi,DWORD [36+esp]
+	sub	esp,132
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [12+esi]
+	mov	DWORD [80+esp],eax
+	mov	DWORD [84+esp],ebx
+	mov	DWORD [88+esp],ecx
+	mov	DWORD [92+esp],edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [28+esi]
+	mov	DWORD [96+esp],eax
+	mov	DWORD [100+esp],ebx
+	mov	DWORD [104+esp],ecx
+	mov	DWORD [108+esp],edx
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	mov	edx,DWORD [12+edi]
+	sub	eax,1
+	mov	DWORD [112+esp],eax
+	mov	DWORD [116+esp],ebx
+	mov	DWORD [120+esp],ecx
+	mov	DWORD [124+esp],edx
+	jmp	NEAR L$002entry
+align	16
+L$003outer_loop:
+	mov	DWORD [156+esp],ebx
+	mov	DWORD [152+esp],eax
+	mov	DWORD [160+esp],ecx
+L$002entry:
+	mov	eax,1634760805
+	mov	DWORD [4+esp],857760878
+	mov	DWORD [8+esp],2036477234
+	mov	DWORD [12+esp],1797285236
+	mov	ebx,DWORD [84+esp]
+	mov	ebp,DWORD [88+esp]
+	mov	ecx,DWORD [104+esp]
+	mov	esi,DWORD [108+esp]
+	mov	edx,DWORD [116+esp]
+	mov	edi,DWORD [120+esp]
+	mov	DWORD [20+esp],ebx
+	mov	DWORD [24+esp],ebp
+	mov	DWORD [40+esp],ecx
+	mov	DWORD [44+esp],esi
+	mov	DWORD [52+esp],edx
+	mov	DWORD [56+esp],edi
+	mov	ebx,DWORD [92+esp]
+	mov	edi,DWORD [124+esp]
+	mov	edx,DWORD [112+esp]
+	mov	ebp,DWORD [80+esp]
+	mov	ecx,DWORD [96+esp]
+	mov	esi,DWORD [100+esp]
+	add	edx,1
+	mov	DWORD [28+esp],ebx
+	mov	DWORD [60+esp],edi
+	mov	DWORD [112+esp],edx
+	mov	ebx,10
+	jmp	NEAR L$004loop
+align	16
+L$004loop:
+	add	eax,ebp
+	mov	DWORD [128+esp],ebx
+	mov	ebx,ebp
+	xor	edx,eax
+	rol	edx,16
+	add	ecx,edx
+	xor	ebx,ecx
+	mov	edi,DWORD [52+esp]
+	rol	ebx,12
+	mov	ebp,DWORD [20+esp]
+	add	eax,ebx
+	xor	edx,eax
+	mov	DWORD [esp],eax
+	rol	edx,8
+	mov	eax,DWORD [4+esp]
+	add	ecx,edx
+	mov	DWORD [48+esp],edx
+	xor	ebx,ecx
+	add	eax,ebp
+	rol	ebx,7
+	xor	edi,eax
+	mov	DWORD [32+esp],ecx
+	rol	edi,16
+	mov	DWORD [16+esp],ebx
+	add	esi,edi
+	mov	ecx,DWORD [40+esp]
+	xor	ebp,esi
+	mov	edx,DWORD [56+esp]
+	rol	ebp,12
+	mov	ebx,DWORD [24+esp]
+	add	eax,ebp
+	xor	edi,eax
+	mov	DWORD [4+esp],eax
+	rol	edi,8
+	mov	eax,DWORD [8+esp]
+	add	esi,edi
+	mov	DWORD [52+esp],edi
+	xor	ebp,esi
+	add	eax,ebx
+	rol	ebp,7
+	xor	edx,eax
+	mov	DWORD [36+esp],esi
+	rol	edx,16
+	mov	DWORD [20+esp],ebp
+	add	ecx,edx
+	mov	esi,DWORD [44+esp]
+	xor	ebx,ecx
+	mov	edi,DWORD [60+esp]
+	rol	ebx,12
+	mov	ebp,DWORD [28+esp]
+	add	eax,ebx
+	xor	edx,eax
+	mov	DWORD [8+esp],eax
+	rol	edx,8
+	mov	eax,DWORD [12+esp]
+	add	ecx,edx
+	mov	DWORD [56+esp],edx
+	xor	ebx,ecx
+	add	eax,ebp
+	rol	ebx,7
+	xor	edi,eax
+	rol	edi,16
+	mov	DWORD [24+esp],ebx
+	add	esi,edi
+	xor	ebp,esi
+	rol	ebp,12
+	mov	ebx,DWORD [20+esp]
+	add	eax,ebp
+	xor	edi,eax
+	mov	DWORD [12+esp],eax
+	rol	edi,8
+	mov	eax,DWORD [esp]
+	add	esi,edi
+	mov	edx,edi
+	xor	ebp,esi
+	add	eax,ebx
+	rol	ebp,7
+	xor	edx,eax
+	rol	edx,16
+	mov	DWORD [28+esp],ebp
+	add	ecx,edx
+	xor	ebx,ecx
+	mov	edi,DWORD [48+esp]
+	rol	ebx,12
+	mov	ebp,DWORD [24+esp]
+	add	eax,ebx
+	xor	edx,eax
+	mov	DWORD [esp],eax
+	rol	edx,8
+	mov	eax,DWORD [4+esp]
+	add	ecx,edx
+	mov	DWORD [60+esp],edx
+	xor	ebx,ecx
+	add	eax,ebp
+	rol	ebx,7
+	xor	edi,eax
+	mov	DWORD [40+esp],ecx
+	rol	edi,16
+	mov	DWORD [20+esp],ebx
+	add	esi,edi
+	mov	ecx,DWORD [32+esp]
+	xor	ebp,esi
+	mov	edx,DWORD [52+esp]
+	rol	ebp,12
+	mov	ebx,DWORD [28+esp]
+	add	eax,ebp
+	xor	edi,eax
+	mov	DWORD [4+esp],eax
+	rol	edi,8
+	mov	eax,DWORD [8+esp]
+	add	esi,edi
+	mov	DWORD [48+esp],edi
+	xor	ebp,esi
+	add	eax,ebx
+	rol	ebp,7
+	xor	edx,eax
+	mov	DWORD [44+esp],esi
+	rol	edx,16
+	mov	DWORD [24+esp],ebp
+	add	ecx,edx
+	mov	esi,DWORD [36+esp]
+	xor	ebx,ecx
+	mov	edi,DWORD [56+esp]
+	rol	ebx,12
+	mov	ebp,DWORD [16+esp]
+	add	eax,ebx
+	xor	edx,eax
+	mov	DWORD [8+esp],eax
+	rol	edx,8
+	mov	eax,DWORD [12+esp]
+	add	ecx,edx
+	mov	DWORD [52+esp],edx
+	xor	ebx,ecx
+	add	eax,ebp
+	rol	ebx,7
+	xor	edi,eax
+	rol	edi,16
+	mov	DWORD [28+esp],ebx
+	add	esi,edi
+	xor	ebp,esi
+	mov	edx,DWORD [48+esp]
+	rol	ebp,12
+	mov	ebx,DWORD [128+esp]
+	add	eax,ebp
+	xor	edi,eax
+	mov	DWORD [12+esp],eax
+	rol	edi,8
+	mov	eax,DWORD [esp]
+	add	esi,edi
+	mov	DWORD [56+esp],edi
+	xor	ebp,esi
+	rol	ebp,7
+	dec	ebx
+	jnz	NEAR L$004loop
+	mov	ebx,DWORD [160+esp]
+	add	eax,1634760805
+	add	ebp,DWORD [80+esp]
+	add	ecx,DWORD [96+esp]
+	add	esi,DWORD [100+esp]
+	cmp	ebx,64
+	jb	NEAR L$005tail
+	mov	ebx,DWORD [156+esp]
+	add	edx,DWORD [112+esp]
+	add	edi,DWORD [120+esp]
+	xor	eax,DWORD [ebx]
+	xor	ebp,DWORD [16+ebx]
+	mov	DWORD [esp],eax
+	mov	eax,DWORD [152+esp]
+	xor	ecx,DWORD [32+ebx]
+	xor	esi,DWORD [36+ebx]
+	xor	edx,DWORD [48+ebx]
+	xor	edi,DWORD [56+ebx]
+	mov	DWORD [16+eax],ebp
+	mov	DWORD [32+eax],ecx
+	mov	DWORD [36+eax],esi
+	mov	DWORD [48+eax],edx
+	mov	DWORD [56+eax],edi
+	mov	ebp,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	esi,DWORD [12+esp]
+	mov	edx,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	add	ebp,857760878
+	add	ecx,2036477234
+	add	esi,1797285236
+	add	edx,DWORD [84+esp]
+	add	edi,DWORD [88+esp]
+	xor	ebp,DWORD [4+ebx]
+	xor	ecx,DWORD [8+ebx]
+	xor	esi,DWORD [12+ebx]
+	xor	edx,DWORD [20+ebx]
+	xor	edi,DWORD [24+ebx]
+	mov	DWORD [4+eax],ebp
+	mov	DWORD [8+eax],ecx
+	mov	DWORD [12+eax],esi
+	mov	DWORD [20+eax],edx
+	mov	DWORD [24+eax],edi
+	mov	ebp,DWORD [28+esp]
+	mov	ecx,DWORD [40+esp]
+	mov	esi,DWORD [44+esp]
+	mov	edx,DWORD [52+esp]
+	mov	edi,DWORD [60+esp]
+	add	ebp,DWORD [92+esp]
+	add	ecx,DWORD [104+esp]
+	add	esi,DWORD [108+esp]
+	add	edx,DWORD [116+esp]
+	add	edi,DWORD [124+esp]
+	xor	ebp,DWORD [28+ebx]
+	xor	ecx,DWORD [40+ebx]
+	xor	esi,DWORD [44+ebx]
+	xor	edx,DWORD [52+ebx]
+	xor	edi,DWORD [60+ebx]
+	lea	ebx,[64+ebx]
+	mov	DWORD [28+eax],ebp
+	mov	ebp,DWORD [esp]
+	mov	DWORD [40+eax],ecx
+	mov	ecx,DWORD [160+esp]
+	mov	DWORD [44+eax],esi
+	mov	DWORD [52+eax],edx
+	mov	DWORD [60+eax],edi
+	mov	DWORD [eax],ebp
+	lea	eax,[64+eax]
+	sub	ecx,64
+	jnz	NEAR L$003outer_loop
+	jmp	NEAR L$006done
+L$005tail:
+	add	edx,DWORD [112+esp]
+	add	edi,DWORD [120+esp]
+	mov	DWORD [esp],eax
+	mov	DWORD [16+esp],ebp
+	mov	DWORD [32+esp],ecx
+	mov	DWORD [36+esp],esi
+	mov	DWORD [48+esp],edx
+	mov	DWORD [56+esp],edi
+	mov	ebp,DWORD [4+esp]
+	mov	ecx,DWORD [8+esp]
+	mov	esi,DWORD [12+esp]
+	mov	edx,DWORD [20+esp]
+	mov	edi,DWORD [24+esp]
+	add	ebp,857760878
+	add	ecx,2036477234
+	add	esi,1797285236
+	add	edx,DWORD [84+esp]
+	add	edi,DWORD [88+esp]
+	mov	DWORD [4+esp],ebp
+	mov	DWORD [8+esp],ecx
+	mov	DWORD [12+esp],esi
+	mov	DWORD [20+esp],edx
+	mov	DWORD [24+esp],edi
+	mov	ebp,DWORD [28+esp]
+	mov	ecx,DWORD [40+esp]
+	mov	esi,DWORD [44+esp]
+	mov	edx,DWORD [52+esp]
+	mov	edi,DWORD [60+esp]
+	add	ebp,DWORD [92+esp]
+	add	ecx,DWORD [104+esp]
+	add	esi,DWORD [108+esp]
+	add	edx,DWORD [116+esp]
+	add	edi,DWORD [124+esp]
+	mov	DWORD [28+esp],ebp
+	mov	ebp,DWORD [156+esp]
+	mov	DWORD [40+esp],ecx
+	mov	ecx,DWORD [152+esp]
+	mov	DWORD [44+esp],esi
+	xor	esi,esi
+	mov	DWORD [52+esp],edx
+	mov	DWORD [60+esp],edi
+	xor	eax,eax
+	xor	edx,edx
+L$007tail_loop:
+	mov	al,BYTE [ebp*1+esi]
+	mov	dl,BYTE [esi*1+esp]
+	lea	esi,[1+esi]
+	xor	al,dl
+	mov	BYTE [esi*1+ecx-1],al
+	dec	ebx
+	jnz	NEAR L$007tail_loop
+L$006done:
+	add	esp,132
+L$000no_data:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	16
+__ChaCha20_ssse3:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+L$ssse3_shortcut:
+	mov	edi,DWORD [20+esp]
+	mov	esi,DWORD [24+esp]
+	mov	ecx,DWORD [28+esp]
+	mov	edx,DWORD [32+esp]
+	mov	ebx,DWORD [36+esp]
+	mov	ebp,esp
+	sub	esp,524
+	and	esp,-64
+	mov	DWORD [512+esp],ebp
+	lea	eax,[(L$ssse3_data-L$pic_point)+eax]
+	movdqu	xmm3,[ebx]
+	cmp	ecx,256
+	jb	NEAR L$0081x
+	mov	DWORD [516+esp],edx
+	mov	DWORD [520+esp],ebx
+	sub	ecx,256
+	lea	ebp,[384+esp]
+	movdqu	xmm7,[edx]
+	pshufd	xmm0,xmm3,0
+	pshufd	xmm1,xmm3,85
+	pshufd	xmm2,xmm3,170
+	pshufd	xmm3,xmm3,255
+	paddd	xmm0,[48+eax]
+	pshufd	xmm4,xmm7,0
+	pshufd	xmm5,xmm7,85
+	psubd	xmm0,[64+eax]
+	pshufd	xmm6,xmm7,170
+	pshufd	xmm7,xmm7,255
+	movdqa	[64+ebp],xmm0
+	movdqa	[80+ebp],xmm1
+	movdqa	[96+ebp],xmm2
+	movdqa	[112+ebp],xmm3
+	movdqu	xmm3,[16+edx]
+	movdqa	[ebp-64],xmm4
+	movdqa	[ebp-48],xmm5
+	movdqa	[ebp-32],xmm6
+	movdqa	[ebp-16],xmm7
+	movdqa	xmm7,[32+eax]
+	lea	ebx,[128+esp]
+	pshufd	xmm0,xmm3,0
+	pshufd	xmm1,xmm3,85
+	pshufd	xmm2,xmm3,170
+	pshufd	xmm3,xmm3,255
+	pshufd	xmm4,xmm7,0
+	pshufd	xmm5,xmm7,85
+	pshufd	xmm6,xmm7,170
+	pshufd	xmm7,xmm7,255
+	movdqa	[ebp],xmm0
+	movdqa	[16+ebp],xmm1
+	movdqa	[32+ebp],xmm2
+	movdqa	[48+ebp],xmm3
+	movdqa	[ebp-128],xmm4
+	movdqa	[ebp-112],xmm5
+	movdqa	[ebp-96],xmm6
+	movdqa	[ebp-80],xmm7
+	lea	esi,[128+esi]
+	lea	edi,[128+edi]
+	jmp	NEAR L$009outer_loop
+align	16
+L$009outer_loop:
+	movdqa	xmm1,[ebp-112]
+	movdqa	xmm2,[ebp-96]
+	movdqa	xmm3,[ebp-80]
+	movdqa	xmm5,[ebp-48]
+	movdqa	xmm6,[ebp-32]
+	movdqa	xmm7,[ebp-16]
+	movdqa	[ebx-112],xmm1
+	movdqa	[ebx-96],xmm2
+	movdqa	[ebx-80],xmm3
+	movdqa	[ebx-48],xmm5
+	movdqa	[ebx-32],xmm6
+	movdqa	[ebx-16],xmm7
+	movdqa	xmm2,[32+ebp]
+	movdqa	xmm3,[48+ebp]
+	movdqa	xmm4,[64+ebp]
+	movdqa	xmm5,[80+ebp]
+	movdqa	xmm6,[96+ebp]
+	movdqa	xmm7,[112+ebp]
+	paddd	xmm4,[64+eax]
+	movdqa	[32+ebx],xmm2
+	movdqa	[48+ebx],xmm3
+	movdqa	[64+ebx],xmm4
+	movdqa	[80+ebx],xmm5
+	movdqa	[96+ebx],xmm6
+	movdqa	[112+ebx],xmm7
+	movdqa	[64+ebp],xmm4
+	movdqa	xmm0,[ebp-128]
+	movdqa	xmm6,xmm4
+	movdqa	xmm3,[ebp-64]
+	movdqa	xmm4,[ebp]
+	movdqa	xmm5,[16+ebp]
+	mov	edx,10
+	nop
+align	16
+L$010loop:
+	paddd	xmm0,xmm3
+	movdqa	xmm2,xmm3
+	pxor	xmm6,xmm0
+	pshufb	xmm6,[eax]
+	paddd	xmm4,xmm6
+	pxor	xmm2,xmm4
+	movdqa	xmm3,[ebx-48]
+	movdqa	xmm1,xmm2
+	pslld	xmm2,12
+	psrld	xmm1,20
+	por	xmm2,xmm1
+	movdqa	xmm1,[ebx-112]
+	paddd	xmm0,xmm2
+	movdqa	xmm7,[80+ebx]
+	pxor	xmm6,xmm0
+	movdqa	[ebx-128],xmm0
+	pshufb	xmm6,[16+eax]
+	paddd	xmm4,xmm6
+	movdqa	[64+ebx],xmm6
+	pxor	xmm2,xmm4
+	paddd	xmm1,xmm3
+	movdqa	xmm0,xmm2
+	pslld	xmm2,7
+	psrld	xmm0,25
+	pxor	xmm7,xmm1
+	por	xmm2,xmm0
+	movdqa	[ebx],xmm4
+	pshufb	xmm7,[eax]
+	movdqa	[ebx-64],xmm2
+	paddd	xmm5,xmm7
+	movdqa	xmm4,[32+ebx]
+	pxor	xmm3,xmm5
+	movdqa	xmm2,[ebx-32]
+	movdqa	xmm0,xmm3
+	pslld	xmm3,12
+	psrld	xmm0,20
+	por	xmm3,xmm0
+	movdqa	xmm0,[ebx-96]
+	paddd	xmm1,xmm3
+	movdqa	xmm6,[96+ebx]
+	pxor	xmm7,xmm1
+	movdqa	[ebx-112],xmm1
+	pshufb	xmm7,[16+eax]
+	paddd	xmm5,xmm7
+	movdqa	[80+ebx],xmm7
+	pxor	xmm3,xmm5
+	paddd	xmm0,xmm2
+	movdqa	xmm1,xmm3
+	pslld	xmm3,7
+	psrld	xmm1,25
+	pxor	xmm6,xmm0
+	por	xmm3,xmm1
+	movdqa	[16+ebx],xmm5
+	pshufb	xmm6,[eax]
+	movdqa	[ebx-48],xmm3
+	paddd	xmm4,xmm6
+	movdqa	xmm5,[48+ebx]
+	pxor	xmm2,xmm4
+	movdqa	xmm3,[ebx-16]
+	movdqa	xmm1,xmm2
+	pslld	xmm2,12
+	psrld	xmm1,20
+	por	xmm2,xmm1
+	movdqa	xmm1,[ebx-80]
+	paddd	xmm0,xmm2
+	movdqa	xmm7,[112+ebx]
+	pxor	xmm6,xmm0
+	movdqa	[ebx-96],xmm0
+	pshufb	xmm6,[16+eax]
+	paddd	xmm4,xmm6
+	movdqa	[96+ebx],xmm6
+	pxor	xmm2,xmm4
+	paddd	xmm1,xmm3
+	movdqa	xmm0,xmm2
+	pslld	xmm2,7
+	psrld	xmm0,25
+	pxor	xmm7,xmm1
+	por	xmm2,xmm0
+	pshufb	xmm7,[eax]
+	movdqa	[ebx-32],xmm2
+	paddd	xmm5,xmm7
+	pxor	xmm3,xmm5
+	movdqa	xmm2,[ebx-48]
+	movdqa	xmm0,xmm3
+	pslld	xmm3,12
+	psrld	xmm0,20
+	por	xmm3,xmm0
+	movdqa	xmm0,[ebx-128]
+	paddd	xmm1,xmm3
+	pxor	xmm7,xmm1
+	movdqa	[ebx-80],xmm1
+	pshufb	xmm7,[16+eax]
+	paddd	xmm5,xmm7
+	movdqa	xmm6,xmm7
+	pxor	xmm3,xmm5
+	paddd	xmm0,xmm2
+	movdqa	xmm1,xmm3
+	pslld	xmm3,7
+	psrld	xmm1,25
+	pxor	xmm6,xmm0
+	por	xmm3,xmm1
+	pshufb	xmm6,[eax]
+	movdqa	[ebx-16],xmm3
+	paddd	xmm4,xmm6
+	pxor	xmm2,xmm4
+	movdqa	xmm3,[ebx-32]
+	movdqa	xmm1,xmm2
+	pslld	xmm2,12
+	psrld	xmm1,20
+	por	xmm2,xmm1
+	movdqa	xmm1,[ebx-112]
+	paddd	xmm0,xmm2
+	movdqa	xmm7,[64+ebx]
+	pxor	xmm6,xmm0
+	movdqa	[ebx-128],xmm0
+	pshufb	xmm6,[16+eax]
+	paddd	xmm4,xmm6
+	movdqa	[112+ebx],xmm6
+	pxor	xmm2,xmm4
+	paddd	xmm1,xmm3
+	movdqa	xmm0,xmm2
+	pslld	xmm2,7
+	psrld	xmm0,25
+	pxor	xmm7,xmm1
+	por	xmm2,xmm0
+	movdqa	[32+ebx],xmm4
+	pshufb	xmm7,[eax]
+	movdqa	[ebx-48],xmm2
+	paddd	xmm5,xmm7
+	movdqa	xmm4,[ebx]
+	pxor	xmm3,xmm5
+	movdqa	xmm2,[ebx-16]
+	movdqa	xmm0,xmm3
+	pslld	xmm3,12
+	psrld	xmm0,20
+	por	xmm3,xmm0
+	movdqa	xmm0,[ebx-96]
+	paddd	xmm1,xmm3
+	movdqa	xmm6,[80+ebx]
+	pxor	xmm7,xmm1
+	movdqa	[ebx-112],xmm1
+	pshufb	xmm7,[16+eax]
+	paddd	xmm5,xmm7
+	movdqa	[64+ebx],xmm7
+	pxor	xmm3,xmm5
+	paddd	xmm0,xmm2
+	movdqa	xmm1,xmm3
+	pslld	xmm3,7
+	psrld	xmm1,25
+	pxor	xmm6,xmm0
+	por	xmm3,xmm1
+	movdqa	[48+ebx],xmm5
+	pshufb	xmm6,[eax]
+	movdqa	[ebx-32],xmm3
+	paddd	xmm4,xmm6
+	movdqa	xmm5,[16+ebx]
+	pxor	xmm2,xmm4
+	movdqa	xmm3,[ebx-64]
+	movdqa	xmm1,xmm2
+	pslld	xmm2,12
+	psrld	xmm1,20
+	por	xmm2,xmm1
+	movdqa	xmm1,[ebx-80]
+	paddd	xmm0,xmm2
+	movdqa	xmm7,[96+ebx]
+	pxor	xmm6,xmm0
+	movdqa	[ebx-96],xmm0
+	pshufb	xmm6,[16+eax]
+	paddd	xmm4,xmm6
+	movdqa	[80+ebx],xmm6
+	pxor	xmm2,xmm4
+	paddd	xmm1,xmm3
+	movdqa	xmm0,xmm2
+	pslld	xmm2,7
+	psrld	xmm0,25
+	pxor	xmm7,xmm1
+	por	xmm2,xmm0
+	pshufb	xmm7,[eax]
+	movdqa	[ebx-16],xmm2
+	paddd	xmm5,xmm7
+	pxor	xmm3,xmm5
+	movdqa	xmm0,xmm3
+	pslld	xmm3,12
+	psrld	xmm0,20
+	por	xmm3,xmm0
+	movdqa	xmm0,[ebx-128]
+	paddd	xmm1,xmm3
+	movdqa	xmm6,[64+ebx]
+	pxor	xmm7,xmm1
+	movdqa	[ebx-80],xmm1
+	pshufb	xmm7,[16+eax]
+	paddd	xmm5,xmm7
+	movdqa	[96+ebx],xmm7
+	pxor	xmm3,xmm5
+	movdqa	xmm1,xmm3
+	pslld	xmm3,7
+	psrld	xmm1,25
+	por	xmm3,xmm1
+	dec	edx
+	jnz	NEAR L$010loop
+	movdqa	[ebx-64],xmm3
+	movdqa	[ebx],xmm4
+	movdqa	[16+ebx],xmm5
+	movdqa	[64+ebx],xmm6
+	movdqa	[96+ebx],xmm7
+	movdqa	xmm1,[ebx-112]
+	movdqa	xmm2,[ebx-96]
+	movdqa	xmm3,[ebx-80]
+	paddd	xmm0,[ebp-128]
+	paddd	xmm1,[ebp-112]
+	paddd	xmm2,[ebp-96]
+	paddd	xmm3,[ebp-80]
+	movdqa	xmm6,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm6,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	movdqu	xmm4,[esi-128]
+	movdqu	xmm5,[esi-64]
+	movdqu	xmm2,[esi]
+	movdqu	xmm7,[64+esi]
+	lea	esi,[16+esi]
+	pxor	xmm4,xmm0
+	movdqa	xmm0,[ebx-64]
+	pxor	xmm5,xmm1
+	movdqa	xmm1,[ebx-48]
+	pxor	xmm6,xmm2
+	movdqa	xmm2,[ebx-32]
+	pxor	xmm7,xmm3
+	movdqa	xmm3,[ebx-16]
+	movdqu	[edi-128],xmm4
+	movdqu	[edi-64],xmm5
+	movdqu	[edi],xmm6
+	movdqu	[64+edi],xmm7
+	lea	edi,[16+edi]
+	paddd	xmm0,[ebp-64]
+	paddd	xmm1,[ebp-48]
+	paddd	xmm2,[ebp-32]
+	paddd	xmm3,[ebp-16]
+	movdqa	xmm6,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm6,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	movdqu	xmm4,[esi-128]
+	movdqu	xmm5,[esi-64]
+	movdqu	xmm2,[esi]
+	movdqu	xmm7,[64+esi]
+	lea	esi,[16+esi]
+	pxor	xmm4,xmm0
+	movdqa	xmm0,[ebx]
+	pxor	xmm5,xmm1
+	movdqa	xmm1,[16+ebx]
+	pxor	xmm6,xmm2
+	movdqa	xmm2,[32+ebx]
+	pxor	xmm7,xmm3
+	movdqa	xmm3,[48+ebx]
+	movdqu	[edi-128],xmm4
+	movdqu	[edi-64],xmm5
+	movdqu	[edi],xmm6
+	movdqu	[64+edi],xmm7
+	lea	edi,[16+edi]
+	paddd	xmm0,[ebp]
+	paddd	xmm1,[16+ebp]
+	paddd	xmm2,[32+ebp]
+	paddd	xmm3,[48+ebp]
+	movdqa	xmm6,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm6,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	movdqu	xmm4,[esi-128]
+	movdqu	xmm5,[esi-64]
+	movdqu	xmm2,[esi]
+	movdqu	xmm7,[64+esi]
+	lea	esi,[16+esi]
+	pxor	xmm4,xmm0
+	movdqa	xmm0,[64+ebx]
+	pxor	xmm5,xmm1
+	movdqa	xmm1,[80+ebx]
+	pxor	xmm6,xmm2
+	movdqa	xmm2,[96+ebx]
+	pxor	xmm7,xmm3
+	movdqa	xmm3,[112+ebx]
+	movdqu	[edi-128],xmm4
+	movdqu	[edi-64],xmm5
+	movdqu	[edi],xmm6
+	movdqu	[64+edi],xmm7
+	lea	edi,[16+edi]
+	paddd	xmm0,[64+ebp]
+	paddd	xmm1,[80+ebp]
+	paddd	xmm2,[96+ebp]
+	paddd	xmm3,[112+ebp]
+	movdqa	xmm6,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm6,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	movdqu	xmm4,[esi-128]
+	movdqu	xmm5,[esi-64]
+	movdqu	xmm2,[esi]
+	movdqu	xmm7,[64+esi]
+	lea	esi,[208+esi]
+	pxor	xmm4,xmm0
+	pxor	xmm5,xmm1
+	pxor	xmm6,xmm2
+	pxor	xmm7,xmm3
+	movdqu	[edi-128],xmm4
+	movdqu	[edi-64],xmm5
+	movdqu	[edi],xmm6
+	movdqu	[64+edi],xmm7
+	lea	edi,[208+edi]
+	sub	ecx,256
+	jnc	NEAR L$009outer_loop
+	add	ecx,256
+	jz	NEAR L$011done
+	mov	ebx,DWORD [520+esp]
+	lea	esi,[esi-128]
+	mov	edx,DWORD [516+esp]
+	lea	edi,[edi-128]
+	movd	xmm2,DWORD [64+ebp]
+	movdqu	xmm3,[ebx]
+	paddd	xmm2,[96+eax]
+	pand	xmm3,[112+eax]
+	por	xmm3,xmm2
+L$0081x:
+	movdqa	xmm0,[32+eax]
+	movdqu	xmm1,[edx]
+	movdqu	xmm2,[16+edx]
+	movdqa	xmm6,[eax]
+	movdqa	xmm7,[16+eax]
+	mov	DWORD [48+esp],ebp
+	movdqa	[esp],xmm0
+	movdqa	[16+esp],xmm1
+	movdqa	[32+esp],xmm2
+	movdqa	[48+esp],xmm3
+	mov	edx,10
+	jmp	NEAR L$012loop1x
+align	16
+L$013outer1x:
+	movdqa	xmm3,[80+eax]
+	movdqa	xmm0,[esp]
+	movdqa	xmm1,[16+esp]
+	movdqa	xmm2,[32+esp]
+	paddd	xmm3,[48+esp]
+	mov	edx,10
+	movdqa	[48+esp],xmm3
+	jmp	NEAR L$012loop1x
+align	16
+L$012loop1x:
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+db	102,15,56,0,222
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,20
+	pslld	xmm4,12
+	por	xmm1,xmm4
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+db	102,15,56,0,223
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,25
+	pslld	xmm4,7
+	por	xmm1,xmm4
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm1,xmm1,57
+	pshufd	xmm3,xmm3,147
+	nop
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+db	102,15,56,0,222
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,20
+	pslld	xmm4,12
+	por	xmm1,xmm4
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+db	102,15,56,0,223
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,25
+	pslld	xmm4,7
+	por	xmm1,xmm4
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm1,xmm1,147
+	pshufd	xmm3,xmm3,57
+	dec	edx
+	jnz	NEAR L$012loop1x
+	paddd	xmm0,[esp]
+	paddd	xmm1,[16+esp]
+	paddd	xmm2,[32+esp]
+	paddd	xmm3,[48+esp]
+	cmp	ecx,64
+	jb	NEAR L$014tail
+	movdqu	xmm4,[esi]
+	movdqu	xmm5,[16+esi]
+	pxor	xmm0,xmm4
+	movdqu	xmm4,[32+esi]
+	pxor	xmm1,xmm5
+	movdqu	xmm5,[48+esi]
+	pxor	xmm2,xmm4
+	pxor	xmm3,xmm5
+	lea	esi,[64+esi]
+	movdqu	[edi],xmm0
+	movdqu	[16+edi],xmm1
+	movdqu	[32+edi],xmm2
+	movdqu	[48+edi],xmm3
+	lea	edi,[64+edi]
+	sub	ecx,64
+	jnz	NEAR L$013outer1x
+	jmp	NEAR L$011done
+L$014tail:
+	movdqa	[esp],xmm0
+	movdqa	[16+esp],xmm1
+	movdqa	[32+esp],xmm2
+	movdqa	[48+esp],xmm3
+	xor	eax,eax
+	xor	edx,edx
+	xor	ebp,ebp
+L$015tail_loop:
+	mov	al,BYTE [ebp*1+esp]
+	mov	dl,BYTE [ebp*1+esi]
+	lea	ebp,[1+ebp]
+	xor	al,dl
+	mov	BYTE [ebp*1+edi-1],al
+	dec	ecx
+	jnz	NEAR L$015tail_loop
+L$011done:
+	mov	esp,DWORD [512+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	64
+L$ssse3_data:
+db	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+db	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+dd	1634760805,857760878,2036477234,1797285236
+dd	0,1,2,3
+dd	4,4,4,4
+dd	1,0,0,0
+dd	4,0,0,0
+dd	0,-1,-1,-1
+align	64
+db	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+db	114,103,62,0
+segment	.bss
+common	_GFp_ia32cap_P 16

+ 1922 - 0
zeroidc/vendor/ring/pregenerated/tmp/chacha-x86_64-nasm.asm

@@ -0,0 +1,1922 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+
+EXTERN	GFp_ia32cap_P
+
+ALIGN	64
+$L$zero:
+	DD	0,0,0,0
+$L$one:
+	DD	1,0,0,0
+$L$inc:
+	DD	0,1,2,3
+$L$four:
+	DD	4,4,4,4
+$L$incy:
+	DD	0,2,4,6,1,3,5,7
+$L$eight:
+	DD	8,8,8,8,8,8,8,8
+$L$rot16:
+DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
+$L$rot24:
+DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
+$L$sigma:
+DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
+DB	0
+ALIGN	64
+$L$zeroz:
+	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
+$L$fourz:
+	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
+$L$incz:
+	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+$L$sixteen:
+	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
+DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
+DB	108,46,111,114,103,62,0
+global	GFp_ChaCha20_ctr32
+
+ALIGN	64
+GFp_ChaCha20_ctr32:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_ChaCha20_ctr32:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+	cmp	rdx,0
+	je	NEAR $L$no_data
+	mov	r10,QWORD[((GFp_ia32cap_P+4))]
+	test	r10d,512
+	jnz	NEAR $L$ChaCha20_ssse3
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,64+24
+
+$L$ctr32_body:
+
+
+	movdqu	xmm1,XMMWORD[rcx]
+	movdqu	xmm2,XMMWORD[16+rcx]
+	movdqu	xmm3,XMMWORD[r8]
+	movdqa	xmm4,XMMWORD[$L$one]
+
+
+	movdqa	XMMWORD[16+rsp],xmm1
+	movdqa	XMMWORD[32+rsp],xmm2
+	movdqa	XMMWORD[48+rsp],xmm3
+	mov	rbp,rdx
+	jmp	NEAR $L$oop_outer
+
+ALIGN	32
+$L$oop_outer:
+	mov	eax,0x61707865
+	mov	ebx,0x3320646e
+	mov	ecx,0x79622d32
+	mov	edx,0x6b206574
+	mov	r8d,DWORD[16+rsp]
+	mov	r9d,DWORD[20+rsp]
+	mov	r10d,DWORD[24+rsp]
+	mov	r11d,DWORD[28+rsp]
+	movd	r12d,xmm3
+	mov	r13d,DWORD[52+rsp]
+	mov	r14d,DWORD[56+rsp]
+	mov	r15d,DWORD[60+rsp]
+
+	mov	QWORD[((64+0))+rsp],rbp
+	mov	ebp,10
+	mov	QWORD[((64+8))+rsp],rsi
+DB	102,72,15,126,214
+	mov	QWORD[((64+16))+rsp],rdi
+	mov	rdi,rsi
+	shr	rdi,32
+	jmp	NEAR $L$oop
+
+ALIGN	32
+$L$oop:
+	add	eax,r8d
+	xor	r12d,eax
+	rol	r12d,16
+	add	ebx,r9d
+	xor	r13d,ebx
+	rol	r13d,16
+	add	esi,r12d
+	xor	r8d,esi
+	rol	r8d,12
+	add	edi,r13d
+	xor	r9d,edi
+	rol	r9d,12
+	add	eax,r8d
+	xor	r12d,eax
+	rol	r12d,8
+	add	ebx,r9d
+	xor	r13d,ebx
+	rol	r13d,8
+	add	esi,r12d
+	xor	r8d,esi
+	rol	r8d,7
+	add	edi,r13d
+	xor	r9d,edi
+	rol	r9d,7
+	mov	DWORD[32+rsp],esi
+	mov	DWORD[36+rsp],edi
+	mov	esi,DWORD[40+rsp]
+	mov	edi,DWORD[44+rsp]
+	add	ecx,r10d
+	xor	r14d,ecx
+	rol	r14d,16
+	add	edx,r11d
+	xor	r15d,edx
+	rol	r15d,16
+	add	esi,r14d
+	xor	r10d,esi
+	rol	r10d,12
+	add	edi,r15d
+	xor	r11d,edi
+	rol	r11d,12
+	add	ecx,r10d
+	xor	r14d,ecx
+	rol	r14d,8
+	add	edx,r11d
+	xor	r15d,edx
+	rol	r15d,8
+	add	esi,r14d
+	xor	r10d,esi
+	rol	r10d,7
+	add	edi,r15d
+	xor	r11d,edi
+	rol	r11d,7
+	add	eax,r9d
+	xor	r15d,eax
+	rol	r15d,16
+	add	ebx,r10d
+	xor	r12d,ebx
+	rol	r12d,16
+	add	esi,r15d
+	xor	r9d,esi
+	rol	r9d,12
+	add	edi,r12d
+	xor	r10d,edi
+	rol	r10d,12
+	add	eax,r9d
+	xor	r15d,eax
+	rol	r15d,8
+	add	ebx,r10d
+	xor	r12d,ebx
+	rol	r12d,8
+	add	esi,r15d
+	xor	r9d,esi
+	rol	r9d,7
+	add	edi,r12d
+	xor	r10d,edi
+	rol	r10d,7
+	mov	DWORD[40+rsp],esi
+	mov	DWORD[44+rsp],edi
+	mov	esi,DWORD[32+rsp]
+	mov	edi,DWORD[36+rsp]
+	add	ecx,r11d
+	xor	r13d,ecx
+	rol	r13d,16
+	add	edx,r8d
+	xor	r14d,edx
+	rol	r14d,16
+	add	esi,r13d
+	xor	r11d,esi
+	rol	r11d,12
+	add	edi,r14d
+	xor	r8d,edi
+	rol	r8d,12
+	add	ecx,r11d
+	xor	r13d,ecx
+	rol	r13d,8
+	add	edx,r8d
+	xor	r14d,edx
+	rol	r14d,8
+	add	esi,r13d
+	xor	r11d,esi
+	rol	r11d,7
+	add	edi,r14d
+	xor	r8d,edi
+	rol	r8d,7
+	dec	ebp
+	jnz	NEAR $L$oop
+	mov	DWORD[36+rsp],edi
+	mov	DWORD[32+rsp],esi
+	mov	rbp,QWORD[64+rsp]
+	movdqa	xmm1,xmm2
+	mov	rsi,QWORD[((64+8))+rsp]
+	paddd	xmm3,xmm4
+	mov	rdi,QWORD[((64+16))+rsp]
+
+	add	eax,0x61707865
+	add	ebx,0x3320646e
+	add	ecx,0x79622d32
+	add	edx,0x6b206574
+	add	r8d,DWORD[16+rsp]
+	add	r9d,DWORD[20+rsp]
+	add	r10d,DWORD[24+rsp]
+	add	r11d,DWORD[28+rsp]
+	add	r12d,DWORD[48+rsp]
+	add	r13d,DWORD[52+rsp]
+	add	r14d,DWORD[56+rsp]
+	add	r15d,DWORD[60+rsp]
+	paddd	xmm1,XMMWORD[32+rsp]
+
+	cmp	rbp,64
+	jb	NEAR $L$tail
+
+	xor	eax,DWORD[rsi]
+	xor	ebx,DWORD[4+rsi]
+	xor	ecx,DWORD[8+rsi]
+	xor	edx,DWORD[12+rsi]
+	xor	r8d,DWORD[16+rsi]
+	xor	r9d,DWORD[20+rsi]
+	xor	r10d,DWORD[24+rsi]
+	xor	r11d,DWORD[28+rsi]
+	movdqu	xmm0,XMMWORD[32+rsi]
+	xor	r12d,DWORD[48+rsi]
+	xor	r13d,DWORD[52+rsi]
+	xor	r14d,DWORD[56+rsi]
+	xor	r15d,DWORD[60+rsi]
+	lea	rsi,[64+rsi]
+	pxor	xmm0,xmm1
+
+	movdqa	XMMWORD[32+rsp],xmm2
+	movd	DWORD[48+rsp],xmm3
+
+	mov	DWORD[rdi],eax
+	mov	DWORD[4+rdi],ebx
+	mov	DWORD[8+rdi],ecx
+	mov	DWORD[12+rdi],edx
+	mov	DWORD[16+rdi],r8d
+	mov	DWORD[20+rdi],r9d
+	mov	DWORD[24+rdi],r10d
+	mov	DWORD[28+rdi],r11d
+	movdqu	XMMWORD[32+rdi],xmm0
+	mov	DWORD[48+rdi],r12d
+	mov	DWORD[52+rdi],r13d
+	mov	DWORD[56+rdi],r14d
+	mov	DWORD[60+rdi],r15d
+	lea	rdi,[64+rdi]
+
+	sub	rbp,64
+	jnz	NEAR $L$oop_outer
+
+	jmp	NEAR $L$done
+
+ALIGN	16
+$L$tail:
+	mov	DWORD[rsp],eax
+	mov	DWORD[4+rsp],ebx
+	xor	rbx,rbx
+	mov	DWORD[8+rsp],ecx
+	mov	DWORD[12+rsp],edx
+	mov	DWORD[16+rsp],r8d
+	mov	DWORD[20+rsp],r9d
+	mov	DWORD[24+rsp],r10d
+	mov	DWORD[28+rsp],r11d
+	movdqa	XMMWORD[32+rsp],xmm1
+	mov	DWORD[48+rsp],r12d
+	mov	DWORD[52+rsp],r13d
+	mov	DWORD[56+rsp],r14d
+	mov	DWORD[60+rsp],r15d
+
+$L$oop_tail:
+	movzx	eax,BYTE[rbx*1+rsi]
+	movzx	edx,BYTE[rbx*1+rsp]
+	lea	rbx,[1+rbx]
+	xor	eax,edx
+	mov	BYTE[((-1))+rbx*1+rdi],al
+	dec	rbp
+	jnz	NEAR $L$oop_tail
+
+$L$done:
+	lea	rsi,[((64+24+48))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$no_data:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_ChaCha20_ctr32:
+
+ALIGN	32
+ChaCha20_ssse3:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ChaCha20_ssse3:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+$L$ChaCha20_ssse3:
+
+	mov	r9,rsp
+
+	cmp	rdx,128
+	ja	NEAR $L$ChaCha20_4x
+
+$L$do_sse3_after_all:
+	sub	rsp,64+40
+	movaps	XMMWORD[(-40)+r9],xmm6
+	movaps	XMMWORD[(-24)+r9],xmm7
+$L$ssse3_body:
+	movdqa	xmm0,XMMWORD[$L$sigma]
+	movdqu	xmm1,XMMWORD[rcx]
+	movdqu	xmm2,XMMWORD[16+rcx]
+	movdqu	xmm3,XMMWORD[r8]
+	movdqa	xmm6,XMMWORD[$L$rot16]
+	movdqa	xmm7,XMMWORD[$L$rot24]
+
+	movdqa	XMMWORD[rsp],xmm0
+	movdqa	XMMWORD[16+rsp],xmm1
+	movdqa	XMMWORD[32+rsp],xmm2
+	movdqa	XMMWORD[48+rsp],xmm3
+	mov	r8,10
+	jmp	NEAR $L$oop_ssse3
+
+ALIGN	32
+$L$oop_outer_ssse3:
+	movdqa	xmm3,XMMWORD[$L$one]
+	movdqa	xmm0,XMMWORD[rsp]
+	movdqa	xmm1,XMMWORD[16+rsp]
+	movdqa	xmm2,XMMWORD[32+rsp]
+	paddd	xmm3,XMMWORD[48+rsp]
+	mov	r8,10
+	movdqa	XMMWORD[48+rsp],xmm3
+	jmp	NEAR $L$oop_ssse3
+
+ALIGN	32
+$L$oop_ssse3:
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+DB	102,15,56,0,222
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,20
+	pslld	xmm4,12
+	por	xmm1,xmm4
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+DB	102,15,56,0,223
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,25
+	pslld	xmm4,7
+	por	xmm1,xmm4
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm1,xmm1,57
+	pshufd	xmm3,xmm3,147
+	nop
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+DB	102,15,56,0,222
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,20
+	pslld	xmm4,12
+	por	xmm1,xmm4
+	paddd	xmm0,xmm1
+	pxor	xmm3,xmm0
+DB	102,15,56,0,223
+	paddd	xmm2,xmm3
+	pxor	xmm1,xmm2
+	movdqa	xmm4,xmm1
+	psrld	xmm1,25
+	pslld	xmm4,7
+	por	xmm1,xmm4
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm1,xmm1,147
+	pshufd	xmm3,xmm3,57
+	dec	r8
+	jnz	NEAR $L$oop_ssse3
+	paddd	xmm0,XMMWORD[rsp]
+	paddd	xmm1,XMMWORD[16+rsp]
+	paddd	xmm2,XMMWORD[32+rsp]
+	paddd	xmm3,XMMWORD[48+rsp]
+
+	cmp	rdx,64
+	jb	NEAR $L$tail_ssse3
+
+	movdqu	xmm4,XMMWORD[rsi]
+	movdqu	xmm5,XMMWORD[16+rsi]
+	pxor	xmm0,xmm4
+	movdqu	xmm4,XMMWORD[32+rsi]
+	pxor	xmm1,xmm5
+	movdqu	xmm5,XMMWORD[48+rsi]
+	lea	rsi,[64+rsi]
+	pxor	xmm2,xmm4
+	pxor	xmm3,xmm5
+
+	movdqu	XMMWORD[rdi],xmm0
+	movdqu	XMMWORD[16+rdi],xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+	lea	rdi,[64+rdi]
+
+	sub	rdx,64
+	jnz	NEAR $L$oop_outer_ssse3
+
+	jmp	NEAR $L$done_ssse3
+
+ALIGN	16
+$L$tail_ssse3:
+	movdqa	XMMWORD[rsp],xmm0
+	movdqa	XMMWORD[16+rsp],xmm1
+	movdqa	XMMWORD[32+rsp],xmm2
+	movdqa	XMMWORD[48+rsp],xmm3
+	xor	r8,r8
+
+$L$oop_tail_ssse3:
+	movzx	eax,BYTE[r8*1+rsi]
+	movzx	ecx,BYTE[r8*1+rsp]
+	lea	r8,[1+r8]
+	xor	eax,ecx
+	mov	BYTE[((-1))+r8*1+rdi],al
+	dec	rdx
+	jnz	NEAR $L$oop_tail_ssse3
+
+$L$done_ssse3:
+	movaps	xmm6,XMMWORD[((-40))+r9]
+	movaps	xmm7,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+
+$L$ssse3_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_ChaCha20_ssse3:
+
+ALIGN	32
+ChaCha20_4x:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ChaCha20_4x:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+$L$ChaCha20_4x:
+
+	mov	r9,rsp
+
+	mov	r11,r10
+	shr	r10,32
+	test	r10,32
+	jnz	NEAR $L$ChaCha20_8x
+	cmp	rdx,192
+	ja	NEAR $L$proceed4x
+
+	and	r11,71303168
+	cmp	r11,4194304
+	je	NEAR $L$do_sse3_after_all
+
+$L$proceed4x:
+	sub	rsp,0x140+168
+	movaps	XMMWORD[(-168)+r9],xmm6
+	movaps	XMMWORD[(-152)+r9],xmm7
+	movaps	XMMWORD[(-136)+r9],xmm8
+	movaps	XMMWORD[(-120)+r9],xmm9
+	movaps	XMMWORD[(-104)+r9],xmm10
+	movaps	XMMWORD[(-88)+r9],xmm11
+	movaps	XMMWORD[(-72)+r9],xmm12
+	movaps	XMMWORD[(-56)+r9],xmm13
+	movaps	XMMWORD[(-40)+r9],xmm14
+	movaps	XMMWORD[(-24)+r9],xmm15
+$L$4x_body:
+	movdqa	xmm11,XMMWORD[$L$sigma]
+	movdqu	xmm15,XMMWORD[rcx]
+	movdqu	xmm7,XMMWORD[16+rcx]
+	movdqu	xmm3,XMMWORD[r8]
+	lea	rcx,[256+rsp]
+	lea	r10,[$L$rot16]
+	lea	r11,[$L$rot24]
+
+	pshufd	xmm8,xmm11,0x00
+	pshufd	xmm9,xmm11,0x55
+	movdqa	XMMWORD[64+rsp],xmm8
+	pshufd	xmm10,xmm11,0xaa
+	movdqa	XMMWORD[80+rsp],xmm9
+	pshufd	xmm11,xmm11,0xff
+	movdqa	XMMWORD[96+rsp],xmm10
+	movdqa	XMMWORD[112+rsp],xmm11
+
+	pshufd	xmm12,xmm15,0x00
+	pshufd	xmm13,xmm15,0x55
+	movdqa	XMMWORD[(128-256)+rcx],xmm12
+	pshufd	xmm14,xmm15,0xaa
+	movdqa	XMMWORD[(144-256)+rcx],xmm13
+	pshufd	xmm15,xmm15,0xff
+	movdqa	XMMWORD[(160-256)+rcx],xmm14
+	movdqa	XMMWORD[(176-256)+rcx],xmm15
+
+	pshufd	xmm4,xmm7,0x00
+	pshufd	xmm5,xmm7,0x55
+	movdqa	XMMWORD[(192-256)+rcx],xmm4
+	pshufd	xmm6,xmm7,0xaa
+	movdqa	XMMWORD[(208-256)+rcx],xmm5
+	pshufd	xmm7,xmm7,0xff
+	movdqa	XMMWORD[(224-256)+rcx],xmm6
+	movdqa	XMMWORD[(240-256)+rcx],xmm7
+
+	pshufd	xmm0,xmm3,0x00
+	pshufd	xmm1,xmm3,0x55
+	paddd	xmm0,XMMWORD[$L$inc]
+	pshufd	xmm2,xmm3,0xaa
+	movdqa	XMMWORD[(272-256)+rcx],xmm1
+	pshufd	xmm3,xmm3,0xff
+	movdqa	XMMWORD[(288-256)+rcx],xmm2
+	movdqa	XMMWORD[(304-256)+rcx],xmm3
+
+	jmp	NEAR $L$oop_enter4x
+
+ALIGN	32
+$L$oop_outer4x:
+	movdqa	xmm8,XMMWORD[64+rsp]
+	movdqa	xmm9,XMMWORD[80+rsp]
+	movdqa	xmm10,XMMWORD[96+rsp]
+	movdqa	xmm11,XMMWORD[112+rsp]
+	movdqa	xmm12,XMMWORD[((128-256))+rcx]
+	movdqa	xmm13,XMMWORD[((144-256))+rcx]
+	movdqa	xmm14,XMMWORD[((160-256))+rcx]
+	movdqa	xmm15,XMMWORD[((176-256))+rcx]
+	movdqa	xmm4,XMMWORD[((192-256))+rcx]
+	movdqa	xmm5,XMMWORD[((208-256))+rcx]
+	movdqa	xmm6,XMMWORD[((224-256))+rcx]
+	movdqa	xmm7,XMMWORD[((240-256))+rcx]
+	movdqa	xmm0,XMMWORD[((256-256))+rcx]
+	movdqa	xmm1,XMMWORD[((272-256))+rcx]
+	movdqa	xmm2,XMMWORD[((288-256))+rcx]
+	movdqa	xmm3,XMMWORD[((304-256))+rcx]
+	paddd	xmm0,XMMWORD[$L$four]
+
+$L$oop_enter4x:
+	movdqa	XMMWORD[32+rsp],xmm6
+	movdqa	XMMWORD[48+rsp],xmm7
+	movdqa	xmm7,XMMWORD[r10]
+	mov	eax,10
+	movdqa	XMMWORD[(256-256)+rcx],xmm0
+	jmp	NEAR $L$oop4x
+
+ALIGN	32
+$L$oop4x:
+	paddd	xmm8,xmm12
+	paddd	xmm9,xmm13
+	pxor	xmm0,xmm8
+	pxor	xmm1,xmm9
+DB	102,15,56,0,199
+DB	102,15,56,0,207
+	paddd	xmm4,xmm0
+	paddd	xmm5,xmm1
+	pxor	xmm12,xmm4
+	pxor	xmm13,xmm5
+	movdqa	xmm6,xmm12
+	pslld	xmm12,12
+	psrld	xmm6,20
+	movdqa	xmm7,xmm13
+	pslld	xmm13,12
+	por	xmm12,xmm6
+	psrld	xmm7,20
+	movdqa	xmm6,XMMWORD[r11]
+	por	xmm13,xmm7
+	paddd	xmm8,xmm12
+	paddd	xmm9,xmm13
+	pxor	xmm0,xmm8
+	pxor	xmm1,xmm9
+DB	102,15,56,0,198
+DB	102,15,56,0,206
+	paddd	xmm4,xmm0
+	paddd	xmm5,xmm1
+	pxor	xmm12,xmm4
+	pxor	xmm13,xmm5
+	movdqa	xmm7,xmm12
+	pslld	xmm12,7
+	psrld	xmm7,25
+	movdqa	xmm6,xmm13
+	pslld	xmm13,7
+	por	xmm12,xmm7
+	psrld	xmm6,25
+	movdqa	xmm7,XMMWORD[r10]
+	por	xmm13,xmm6
+	movdqa	XMMWORD[rsp],xmm4
+	movdqa	XMMWORD[16+rsp],xmm5
+	movdqa	xmm4,XMMWORD[32+rsp]
+	movdqa	xmm5,XMMWORD[48+rsp]
+	paddd	xmm10,xmm14
+	paddd	xmm11,xmm15
+	pxor	xmm2,xmm10
+	pxor	xmm3,xmm11
+DB	102,15,56,0,215
+DB	102,15,56,0,223
+	paddd	xmm4,xmm2
+	paddd	xmm5,xmm3
+	pxor	xmm14,xmm4
+	pxor	xmm15,xmm5
+	movdqa	xmm6,xmm14
+	pslld	xmm14,12
+	psrld	xmm6,20
+	movdqa	xmm7,xmm15
+	pslld	xmm15,12
+	por	xmm14,xmm6
+	psrld	xmm7,20
+	movdqa	xmm6,XMMWORD[r11]
+	por	xmm15,xmm7
+	paddd	xmm10,xmm14
+	paddd	xmm11,xmm15
+	pxor	xmm2,xmm10
+	pxor	xmm3,xmm11
+DB	102,15,56,0,214
+DB	102,15,56,0,222
+	paddd	xmm4,xmm2
+	paddd	xmm5,xmm3
+	pxor	xmm14,xmm4
+	pxor	xmm15,xmm5
+	movdqa	xmm7,xmm14
+	pslld	xmm14,7
+	psrld	xmm7,25
+	movdqa	xmm6,xmm15
+	pslld	xmm15,7
+	por	xmm14,xmm7
+	psrld	xmm6,25
+	movdqa	xmm7,XMMWORD[r10]
+	por	xmm15,xmm6
+	paddd	xmm8,xmm13
+	paddd	xmm9,xmm14
+	pxor	xmm3,xmm8
+	pxor	xmm0,xmm9
+DB	102,15,56,0,223
+DB	102,15,56,0,199
+	paddd	xmm4,xmm3
+	paddd	xmm5,xmm0
+	pxor	xmm13,xmm4
+	pxor	xmm14,xmm5
+	movdqa	xmm6,xmm13
+	pslld	xmm13,12
+	psrld	xmm6,20
+	movdqa	xmm7,xmm14
+	pslld	xmm14,12
+	por	xmm13,xmm6
+	psrld	xmm7,20
+	movdqa	xmm6,XMMWORD[r11]
+	por	xmm14,xmm7
+	paddd	xmm8,xmm13
+	paddd	xmm9,xmm14
+	pxor	xmm3,xmm8
+	pxor	xmm0,xmm9
+DB	102,15,56,0,222
+DB	102,15,56,0,198
+	paddd	xmm4,xmm3
+	paddd	xmm5,xmm0
+	pxor	xmm13,xmm4
+	pxor	xmm14,xmm5
+	movdqa	xmm7,xmm13
+	pslld	xmm13,7
+	psrld	xmm7,25
+	movdqa	xmm6,xmm14
+	pslld	xmm14,7
+	por	xmm13,xmm7
+	psrld	xmm6,25
+	movdqa	xmm7,XMMWORD[r10]
+	por	xmm14,xmm6
+	movdqa	XMMWORD[32+rsp],xmm4
+	movdqa	XMMWORD[48+rsp],xmm5
+	movdqa	xmm4,XMMWORD[rsp]
+	movdqa	xmm5,XMMWORD[16+rsp]
+	paddd	xmm10,xmm15
+	paddd	xmm11,xmm12
+	pxor	xmm1,xmm10
+	pxor	xmm2,xmm11
+DB	102,15,56,0,207
+DB	102,15,56,0,215
+	paddd	xmm4,xmm1
+	paddd	xmm5,xmm2
+	pxor	xmm15,xmm4
+	pxor	xmm12,xmm5
+	movdqa	xmm6,xmm15
+	pslld	xmm15,12
+	psrld	xmm6,20
+	movdqa	xmm7,xmm12
+	pslld	xmm12,12
+	por	xmm15,xmm6
+	psrld	xmm7,20
+	movdqa	xmm6,XMMWORD[r11]
+	por	xmm12,xmm7
+	paddd	xmm10,xmm15
+	paddd	xmm11,xmm12
+	pxor	xmm1,xmm10
+	pxor	xmm2,xmm11
+DB	102,15,56,0,206
+DB	102,15,56,0,214
+	paddd	xmm4,xmm1
+	paddd	xmm5,xmm2
+	pxor	xmm15,xmm4
+	pxor	xmm12,xmm5
+	movdqa	xmm7,xmm15
+	pslld	xmm15,7
+	psrld	xmm7,25
+	movdqa	xmm6,xmm12
+	pslld	xmm12,7
+	por	xmm15,xmm7
+	psrld	xmm6,25
+	movdqa	xmm7,XMMWORD[r10]
+	por	xmm12,xmm6
+	dec	eax
+	jnz	NEAR $L$oop4x
+
+	paddd	xmm8,XMMWORD[64+rsp]
+	paddd	xmm9,XMMWORD[80+rsp]
+	paddd	xmm10,XMMWORD[96+rsp]
+	paddd	xmm11,XMMWORD[112+rsp]
+
+	movdqa	xmm6,xmm8
+	punpckldq	xmm8,xmm9
+	movdqa	xmm7,xmm10
+	punpckldq	xmm10,xmm11
+	punpckhdq	xmm6,xmm9
+	punpckhdq	xmm7,xmm11
+	movdqa	xmm9,xmm8
+	punpcklqdq	xmm8,xmm10
+	movdqa	xmm11,xmm6
+	punpcklqdq	xmm6,xmm7
+	punpckhqdq	xmm9,xmm10
+	punpckhqdq	xmm11,xmm7
+	paddd	xmm12,XMMWORD[((128-256))+rcx]
+	paddd	xmm13,XMMWORD[((144-256))+rcx]
+	paddd	xmm14,XMMWORD[((160-256))+rcx]
+	paddd	xmm15,XMMWORD[((176-256))+rcx]
+
+	movdqa	XMMWORD[rsp],xmm8
+	movdqa	XMMWORD[16+rsp],xmm9
+	movdqa	xmm8,XMMWORD[32+rsp]
+	movdqa	xmm9,XMMWORD[48+rsp]
+
+	movdqa	xmm10,xmm12
+	punpckldq	xmm12,xmm13
+	movdqa	xmm7,xmm14
+	punpckldq	xmm14,xmm15
+	punpckhdq	xmm10,xmm13
+	punpckhdq	xmm7,xmm15
+	movdqa	xmm13,xmm12
+	punpcklqdq	xmm12,xmm14
+	movdqa	xmm15,xmm10
+	punpcklqdq	xmm10,xmm7
+	punpckhqdq	xmm13,xmm14
+	punpckhqdq	xmm15,xmm7
+	paddd	xmm4,XMMWORD[((192-256))+rcx]
+	paddd	xmm5,XMMWORD[((208-256))+rcx]
+	paddd	xmm8,XMMWORD[((224-256))+rcx]
+	paddd	xmm9,XMMWORD[((240-256))+rcx]
+
+	movdqa	XMMWORD[32+rsp],xmm6
+	movdqa	XMMWORD[48+rsp],xmm11
+
+	movdqa	xmm14,xmm4
+	punpckldq	xmm4,xmm5
+	movdqa	xmm7,xmm8
+	punpckldq	xmm8,xmm9
+	punpckhdq	xmm14,xmm5
+	punpckhdq	xmm7,xmm9
+	movdqa	xmm5,xmm4
+	punpcklqdq	xmm4,xmm8
+	movdqa	xmm9,xmm14
+	punpcklqdq	xmm14,xmm7
+	punpckhqdq	xmm5,xmm8
+	punpckhqdq	xmm9,xmm7
+	paddd	xmm0,XMMWORD[((256-256))+rcx]
+	paddd	xmm1,XMMWORD[((272-256))+rcx]
+	paddd	xmm2,XMMWORD[((288-256))+rcx]
+	paddd	xmm3,XMMWORD[((304-256))+rcx]
+
+	movdqa	xmm8,xmm0
+	punpckldq	xmm0,xmm1
+	movdqa	xmm7,xmm2
+	punpckldq	xmm2,xmm3
+	punpckhdq	xmm8,xmm1
+	punpckhdq	xmm7,xmm3
+	movdqa	xmm1,xmm0
+	punpcklqdq	xmm0,xmm2
+	movdqa	xmm3,xmm8
+	punpcklqdq	xmm8,xmm7
+	punpckhqdq	xmm1,xmm2
+	punpckhqdq	xmm3,xmm7
+	cmp	rdx,64*4
+	jb	NEAR $L$tail4x
+
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[rsp]
+	pxor	xmm11,xmm12
+	pxor	xmm2,xmm4
+	pxor	xmm7,xmm0
+
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	xmm6,XMMWORD[64+rsi]
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	xmm11,XMMWORD[80+rsi]
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	xmm2,XMMWORD[96+rsi]
+	movdqu	XMMWORD[48+rdi],xmm7
+	movdqu	xmm7,XMMWORD[112+rsi]
+	lea	rsi,[128+rsi]
+	pxor	xmm6,XMMWORD[16+rsp]
+	pxor	xmm11,xmm13
+	pxor	xmm2,xmm5
+	pxor	xmm7,xmm1
+
+	movdqu	XMMWORD[64+rdi],xmm6
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	XMMWORD[80+rdi],xmm11
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	XMMWORD[96+rdi],xmm2
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	XMMWORD[112+rdi],xmm7
+	lea	rdi,[128+rdi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[32+rsp]
+	pxor	xmm11,xmm10
+	pxor	xmm2,xmm14
+	pxor	xmm7,xmm8
+
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	xmm6,XMMWORD[64+rsi]
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	xmm11,XMMWORD[80+rsi]
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	xmm2,XMMWORD[96+rsi]
+	movdqu	XMMWORD[48+rdi],xmm7
+	movdqu	xmm7,XMMWORD[112+rsi]
+	lea	rsi,[128+rsi]
+	pxor	xmm6,XMMWORD[48+rsp]
+	pxor	xmm11,xmm15
+	pxor	xmm2,xmm9
+	pxor	xmm7,xmm3
+	movdqu	XMMWORD[64+rdi],xmm6
+	movdqu	XMMWORD[80+rdi],xmm11
+	movdqu	XMMWORD[96+rdi],xmm2
+	movdqu	XMMWORD[112+rdi],xmm7
+	lea	rdi,[128+rdi]
+
+	sub	rdx,64*4
+	jnz	NEAR $L$oop_outer4x
+
+	jmp	NEAR $L$done4x
+
+$L$tail4x:
+	cmp	rdx,192
+	jae	NEAR $L$192_or_more4x
+	cmp	rdx,128
+	jae	NEAR $L$128_or_more4x
+	cmp	rdx,64
+	jae	NEAR $L$64_or_more4x
+
+
+	xor	r10,r10
+
+	movdqa	XMMWORD[16+rsp],xmm12
+	movdqa	XMMWORD[32+rsp],xmm4
+	movdqa	XMMWORD[48+rsp],xmm0
+	jmp	NEAR $L$oop_tail4x
+
+ALIGN	32
+$L$64_or_more4x:
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[rsp]
+	pxor	xmm11,xmm12
+	pxor	xmm2,xmm4
+	pxor	xmm7,xmm0
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm7
+	je	NEAR $L$done4x
+
+	movdqa	xmm6,XMMWORD[16+rsp]
+	lea	rsi,[64+rsi]
+	xor	r10,r10
+	movdqa	XMMWORD[rsp],xmm6
+	movdqa	XMMWORD[16+rsp],xmm13
+	lea	rdi,[64+rdi]
+	movdqa	XMMWORD[32+rsp],xmm5
+	sub	rdx,64
+	movdqa	XMMWORD[48+rsp],xmm1
+	jmp	NEAR $L$oop_tail4x
+
+ALIGN	32
+$L$128_or_more4x:
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[rsp]
+	pxor	xmm11,xmm12
+	pxor	xmm2,xmm4
+	pxor	xmm7,xmm0
+
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	xmm6,XMMWORD[64+rsi]
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	xmm11,XMMWORD[80+rsi]
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	xmm2,XMMWORD[96+rsi]
+	movdqu	XMMWORD[48+rdi],xmm7
+	movdqu	xmm7,XMMWORD[112+rsi]
+	pxor	xmm6,XMMWORD[16+rsp]
+	pxor	xmm11,xmm13
+	pxor	xmm2,xmm5
+	pxor	xmm7,xmm1
+	movdqu	XMMWORD[64+rdi],xmm6
+	movdqu	XMMWORD[80+rdi],xmm11
+	movdqu	XMMWORD[96+rdi],xmm2
+	movdqu	XMMWORD[112+rdi],xmm7
+	je	NEAR $L$done4x
+
+	movdqa	xmm6,XMMWORD[32+rsp]
+	lea	rsi,[128+rsi]
+	xor	r10,r10
+	movdqa	XMMWORD[rsp],xmm6
+	movdqa	XMMWORD[16+rsp],xmm10
+	lea	rdi,[128+rdi]
+	movdqa	XMMWORD[32+rsp],xmm14
+	sub	rdx,128
+	movdqa	XMMWORD[48+rsp],xmm8
+	jmp	NEAR $L$oop_tail4x
+
+ALIGN	32
+$L$192_or_more4x:
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[rsp]
+	pxor	xmm11,xmm12
+	pxor	xmm2,xmm4
+	pxor	xmm7,xmm0
+
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	xmm6,XMMWORD[64+rsi]
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	xmm11,XMMWORD[80+rsi]
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	xmm2,XMMWORD[96+rsi]
+	movdqu	XMMWORD[48+rdi],xmm7
+	movdqu	xmm7,XMMWORD[112+rsi]
+	lea	rsi,[128+rsi]
+	pxor	xmm6,XMMWORD[16+rsp]
+	pxor	xmm11,xmm13
+	pxor	xmm2,xmm5
+	pxor	xmm7,xmm1
+
+	movdqu	XMMWORD[64+rdi],xmm6
+	movdqu	xmm6,XMMWORD[rsi]
+	movdqu	XMMWORD[80+rdi],xmm11
+	movdqu	xmm11,XMMWORD[16+rsi]
+	movdqu	XMMWORD[96+rdi],xmm2
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	XMMWORD[112+rdi],xmm7
+	lea	rdi,[128+rdi]
+	movdqu	xmm7,XMMWORD[48+rsi]
+	pxor	xmm6,XMMWORD[32+rsp]
+	pxor	xmm11,xmm10
+	pxor	xmm2,xmm14
+	pxor	xmm7,xmm8
+	movdqu	XMMWORD[rdi],xmm6
+	movdqu	XMMWORD[16+rdi],xmm11
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm7
+	je	NEAR $L$done4x
+
+	movdqa	xmm6,XMMWORD[48+rsp]
+	lea	rsi,[64+rsi]
+	xor	r10,r10
+	movdqa	XMMWORD[rsp],xmm6
+	movdqa	XMMWORD[16+rsp],xmm15
+	lea	rdi,[64+rdi]
+	movdqa	XMMWORD[32+rsp],xmm9
+	sub	rdx,192
+	movdqa	XMMWORD[48+rsp],xmm3
+
+$L$oop_tail4x:
+	movzx	eax,BYTE[r10*1+rsi]
+	movzx	ecx,BYTE[r10*1+rsp]
+	lea	r10,[1+r10]
+	xor	eax,ecx
+	mov	BYTE[((-1))+r10*1+rdi],al
+	dec	rdx
+	jnz	NEAR $L$oop_tail4x
+
+$L$done4x:
+	movaps	xmm6,XMMWORD[((-168))+r9]
+	movaps	xmm7,XMMWORD[((-152))+r9]
+	movaps	xmm8,XMMWORD[((-136))+r9]
+	movaps	xmm9,XMMWORD[((-120))+r9]
+	movaps	xmm10,XMMWORD[((-104))+r9]
+	movaps	xmm11,XMMWORD[((-88))+r9]
+	movaps	xmm12,XMMWORD[((-72))+r9]
+	movaps	xmm13,XMMWORD[((-56))+r9]
+	movaps	xmm14,XMMWORD[((-40))+r9]
+	movaps	xmm15,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+
+$L$4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_ChaCha20_4x:
+
+ALIGN	32
+ChaCha20_8x:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ChaCha20_8x:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+$L$ChaCha20_8x:
+
+	mov	r9,rsp
+
+	sub	rsp,0x280+168
+	and	rsp,-32
+	movaps	XMMWORD[(-168)+r9],xmm6
+	movaps	XMMWORD[(-152)+r9],xmm7
+	movaps	XMMWORD[(-136)+r9],xmm8
+	movaps	XMMWORD[(-120)+r9],xmm9
+	movaps	XMMWORD[(-104)+r9],xmm10
+	movaps	XMMWORD[(-88)+r9],xmm11
+	movaps	XMMWORD[(-72)+r9],xmm12
+	movaps	XMMWORD[(-56)+r9],xmm13
+	movaps	XMMWORD[(-40)+r9],xmm14
+	movaps	XMMWORD[(-24)+r9],xmm15
+$L$8x_body:
+	vzeroupper
+
+
+
+
+
+
+
+
+
+
+	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
+	vbroadcasti128	ymm3,XMMWORD[rcx]
+	vbroadcasti128	ymm15,XMMWORD[16+rcx]
+	vbroadcasti128	ymm7,XMMWORD[r8]
+	lea	rcx,[256+rsp]
+	lea	rax,[512+rsp]
+	lea	r10,[$L$rot16]
+	lea	r11,[$L$rot24]
+
+	vpshufd	ymm8,ymm11,0x00
+	vpshufd	ymm9,ymm11,0x55
+	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
+	vpshufd	ymm10,ymm11,0xaa
+	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
+	vpshufd	ymm11,ymm11,0xff
+	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
+	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
+
+	vpshufd	ymm0,ymm3,0x00
+	vpshufd	ymm1,ymm3,0x55
+	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
+	vpshufd	ymm2,ymm3,0xaa
+	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
+	vpshufd	ymm3,ymm3,0xff
+	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
+	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
+
+	vpshufd	ymm12,ymm15,0x00
+	vpshufd	ymm13,ymm15,0x55
+	vmovdqa	YMMWORD[(384-512)+rax],ymm12
+	vpshufd	ymm14,ymm15,0xaa
+	vmovdqa	YMMWORD[(416-512)+rax],ymm13
+	vpshufd	ymm15,ymm15,0xff
+	vmovdqa	YMMWORD[(448-512)+rax],ymm14
+	vmovdqa	YMMWORD[(480-512)+rax],ymm15
+
+	vpshufd	ymm4,ymm7,0x00
+	vpshufd	ymm5,ymm7,0x55
+	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
+	vpshufd	ymm6,ymm7,0xaa
+	vmovdqa	YMMWORD[(544-512)+rax],ymm5
+	vpshufd	ymm7,ymm7,0xff
+	vmovdqa	YMMWORD[(576-512)+rax],ymm6
+	vmovdqa	YMMWORD[(608-512)+rax],ymm7
+
+	jmp	NEAR $L$oop_enter8x
+
+ALIGN	32
+$L$oop_outer8x:
+	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
+	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
+	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
+	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
+	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
+	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
+	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
+	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
+	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
+	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
+	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
+	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
+	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
+	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
+	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
+	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
+	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
+
+$L$oop_enter8x:
+	vmovdqa	YMMWORD[64+rsp],ymm14
+	vmovdqa	YMMWORD[96+rsp],ymm15
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vmovdqa	YMMWORD[(512-512)+rax],ymm4
+	mov	eax,10
+	jmp	NEAR $L$oop8x
+
+ALIGN	32
+$L$oop8x:
+	vpaddd	ymm8,ymm8,ymm0
+	vpxor	ymm4,ymm8,ymm4
+	vpshufb	ymm4,ymm4,ymm15
+	vpaddd	ymm9,ymm9,ymm1
+	vpxor	ymm5,ymm9,ymm5
+	vpshufb	ymm5,ymm5,ymm15
+	vpaddd	ymm12,ymm12,ymm4
+	vpxor	ymm0,ymm12,ymm0
+	vpslld	ymm14,ymm0,12
+	vpsrld	ymm0,ymm0,20
+	vpor	ymm0,ymm14,ymm0
+	vbroadcasti128	ymm14,XMMWORD[r11]
+	vpaddd	ymm13,ymm13,ymm5
+	vpxor	ymm1,ymm13,ymm1
+	vpslld	ymm15,ymm1,12
+	vpsrld	ymm1,ymm1,20
+	vpor	ymm1,ymm15,ymm1
+	vpaddd	ymm8,ymm8,ymm0
+	vpxor	ymm4,ymm8,ymm4
+	vpshufb	ymm4,ymm4,ymm14
+	vpaddd	ymm9,ymm9,ymm1
+	vpxor	ymm5,ymm9,ymm5
+	vpshufb	ymm5,ymm5,ymm14
+	vpaddd	ymm12,ymm12,ymm4
+	vpxor	ymm0,ymm12,ymm0
+	vpslld	ymm15,ymm0,7
+	vpsrld	ymm0,ymm0,25
+	vpor	ymm0,ymm15,ymm0
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vpaddd	ymm13,ymm13,ymm5
+	vpxor	ymm1,ymm13,ymm1
+	vpslld	ymm14,ymm1,7
+	vpsrld	ymm1,ymm1,25
+	vpor	ymm1,ymm14,ymm1
+	vmovdqa	YMMWORD[rsp],ymm12
+	vmovdqa	YMMWORD[32+rsp],ymm13
+	vmovdqa	ymm12,YMMWORD[64+rsp]
+	vmovdqa	ymm13,YMMWORD[96+rsp]
+	vpaddd	ymm10,ymm10,ymm2
+	vpxor	ymm6,ymm10,ymm6
+	vpshufb	ymm6,ymm6,ymm15
+	vpaddd	ymm11,ymm11,ymm3
+	vpxor	ymm7,ymm11,ymm7
+	vpshufb	ymm7,ymm7,ymm15
+	vpaddd	ymm12,ymm12,ymm6
+	vpxor	ymm2,ymm12,ymm2
+	vpslld	ymm14,ymm2,12
+	vpsrld	ymm2,ymm2,20
+	vpor	ymm2,ymm14,ymm2
+	vbroadcasti128	ymm14,XMMWORD[r11]
+	vpaddd	ymm13,ymm13,ymm7
+	vpxor	ymm3,ymm13,ymm3
+	vpslld	ymm15,ymm3,12
+	vpsrld	ymm3,ymm3,20
+	vpor	ymm3,ymm15,ymm3
+	vpaddd	ymm10,ymm10,ymm2
+	vpxor	ymm6,ymm10,ymm6
+	vpshufb	ymm6,ymm6,ymm14
+	vpaddd	ymm11,ymm11,ymm3
+	vpxor	ymm7,ymm11,ymm7
+	vpshufb	ymm7,ymm7,ymm14
+	vpaddd	ymm12,ymm12,ymm6
+	vpxor	ymm2,ymm12,ymm2
+	vpslld	ymm15,ymm2,7
+	vpsrld	ymm2,ymm2,25
+	vpor	ymm2,ymm15,ymm2
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vpaddd	ymm13,ymm13,ymm7
+	vpxor	ymm3,ymm13,ymm3
+	vpslld	ymm14,ymm3,7
+	vpsrld	ymm3,ymm3,25
+	vpor	ymm3,ymm14,ymm3
+	vpaddd	ymm8,ymm8,ymm1
+	vpxor	ymm7,ymm8,ymm7
+	vpshufb	ymm7,ymm7,ymm15
+	vpaddd	ymm9,ymm9,ymm2
+	vpxor	ymm4,ymm9,ymm4
+	vpshufb	ymm4,ymm4,ymm15
+	vpaddd	ymm12,ymm12,ymm7
+	vpxor	ymm1,ymm12,ymm1
+	vpslld	ymm14,ymm1,12
+	vpsrld	ymm1,ymm1,20
+	vpor	ymm1,ymm14,ymm1
+	vbroadcasti128	ymm14,XMMWORD[r11]
+	vpaddd	ymm13,ymm13,ymm4
+	vpxor	ymm2,ymm13,ymm2
+	vpslld	ymm15,ymm2,12
+	vpsrld	ymm2,ymm2,20
+	vpor	ymm2,ymm15,ymm2
+	vpaddd	ymm8,ymm8,ymm1
+	vpxor	ymm7,ymm8,ymm7
+	vpshufb	ymm7,ymm7,ymm14
+	vpaddd	ymm9,ymm9,ymm2
+	vpxor	ymm4,ymm9,ymm4
+	vpshufb	ymm4,ymm4,ymm14
+	vpaddd	ymm12,ymm12,ymm7
+	vpxor	ymm1,ymm12,ymm1
+	vpslld	ymm15,ymm1,7
+	vpsrld	ymm1,ymm1,25
+	vpor	ymm1,ymm15,ymm1
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vpaddd	ymm13,ymm13,ymm4
+	vpxor	ymm2,ymm13,ymm2
+	vpslld	ymm14,ymm2,7
+	vpsrld	ymm2,ymm2,25
+	vpor	ymm2,ymm14,ymm2
+	vmovdqa	YMMWORD[64+rsp],ymm12
+	vmovdqa	YMMWORD[96+rsp],ymm13
+	vmovdqa	ymm12,YMMWORD[rsp]
+	vmovdqa	ymm13,YMMWORD[32+rsp]
+	vpaddd	ymm10,ymm10,ymm3
+	vpxor	ymm5,ymm10,ymm5
+	vpshufb	ymm5,ymm5,ymm15
+	vpaddd	ymm11,ymm11,ymm0
+	vpxor	ymm6,ymm11,ymm6
+	vpshufb	ymm6,ymm6,ymm15
+	vpaddd	ymm12,ymm12,ymm5
+	vpxor	ymm3,ymm12,ymm3
+	vpslld	ymm14,ymm3,12
+	vpsrld	ymm3,ymm3,20
+	vpor	ymm3,ymm14,ymm3
+	vbroadcasti128	ymm14,XMMWORD[r11]
+	vpaddd	ymm13,ymm13,ymm6
+	vpxor	ymm0,ymm13,ymm0
+	vpslld	ymm15,ymm0,12
+	vpsrld	ymm0,ymm0,20
+	vpor	ymm0,ymm15,ymm0
+	vpaddd	ymm10,ymm10,ymm3
+	vpxor	ymm5,ymm10,ymm5
+	vpshufb	ymm5,ymm5,ymm14
+	vpaddd	ymm11,ymm11,ymm0
+	vpxor	ymm6,ymm11,ymm6
+	vpshufb	ymm6,ymm6,ymm14
+	vpaddd	ymm12,ymm12,ymm5
+	vpxor	ymm3,ymm12,ymm3
+	vpslld	ymm15,ymm3,7
+	vpsrld	ymm3,ymm3,25
+	vpor	ymm3,ymm15,ymm3
+	vbroadcasti128	ymm15,XMMWORD[r10]
+	vpaddd	ymm13,ymm13,ymm6
+	vpxor	ymm0,ymm13,ymm0
+	vpslld	ymm14,ymm0,7
+	vpsrld	ymm0,ymm0,25
+	vpor	ymm0,ymm14,ymm0
+	dec	eax
+	jnz	NEAR $L$oop8x
+
+	lea	rax,[512+rsp]
+	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
+	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
+	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
+	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
+
+	vpunpckldq	ymm14,ymm8,ymm9
+	vpunpckldq	ymm15,ymm10,ymm11
+	vpunpckhdq	ymm8,ymm8,ymm9
+	vpunpckhdq	ymm10,ymm10,ymm11
+	vpunpcklqdq	ymm9,ymm14,ymm15
+	vpunpckhqdq	ymm14,ymm14,ymm15
+	vpunpcklqdq	ymm11,ymm8,ymm10
+	vpunpckhqdq	ymm8,ymm8,ymm10
+	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
+	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
+	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
+	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
+
+	vpunpckldq	ymm10,ymm0,ymm1
+	vpunpckldq	ymm15,ymm2,ymm3
+	vpunpckhdq	ymm0,ymm0,ymm1
+	vpunpckhdq	ymm2,ymm2,ymm3
+	vpunpcklqdq	ymm1,ymm10,ymm15
+	vpunpckhqdq	ymm10,ymm10,ymm15
+	vpunpcklqdq	ymm3,ymm0,ymm2
+	vpunpckhqdq	ymm0,ymm0,ymm2
+	vperm2i128	ymm15,ymm9,ymm1,0x20
+	vperm2i128	ymm1,ymm9,ymm1,0x31
+	vperm2i128	ymm9,ymm14,ymm10,0x20
+	vperm2i128	ymm10,ymm14,ymm10,0x31
+	vperm2i128	ymm14,ymm11,ymm3,0x20
+	vperm2i128	ymm3,ymm11,ymm3,0x31
+	vperm2i128	ymm11,ymm8,ymm0,0x20
+	vperm2i128	ymm0,ymm8,ymm0,0x31
+	vmovdqa	YMMWORD[rsp],ymm15
+	vmovdqa	YMMWORD[32+rsp],ymm9
+	vmovdqa	ymm15,YMMWORD[64+rsp]
+	vmovdqa	ymm9,YMMWORD[96+rsp]
+
+	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
+	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
+	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
+	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
+
+	vpunpckldq	ymm2,ymm12,ymm13
+	vpunpckldq	ymm8,ymm15,ymm9
+	vpunpckhdq	ymm12,ymm12,ymm13
+	vpunpckhdq	ymm15,ymm15,ymm9
+	vpunpcklqdq	ymm13,ymm2,ymm8
+	vpunpckhqdq	ymm2,ymm2,ymm8
+	vpunpcklqdq	ymm9,ymm12,ymm15
+	vpunpckhqdq	ymm12,ymm12,ymm15
+	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
+	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
+	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
+	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
+
+	vpunpckldq	ymm15,ymm4,ymm5
+	vpunpckldq	ymm8,ymm6,ymm7
+	vpunpckhdq	ymm4,ymm4,ymm5
+	vpunpckhdq	ymm6,ymm6,ymm7
+	vpunpcklqdq	ymm5,ymm15,ymm8
+	vpunpckhqdq	ymm15,ymm15,ymm8
+	vpunpcklqdq	ymm7,ymm4,ymm6
+	vpunpckhqdq	ymm4,ymm4,ymm6
+	vperm2i128	ymm8,ymm13,ymm5,0x20
+	vperm2i128	ymm5,ymm13,ymm5,0x31
+	vperm2i128	ymm13,ymm2,ymm15,0x20
+	vperm2i128	ymm15,ymm2,ymm15,0x31
+	vperm2i128	ymm2,ymm9,ymm7,0x20
+	vperm2i128	ymm7,ymm9,ymm7,0x31
+	vperm2i128	ymm9,ymm12,ymm4,0x20
+	vperm2i128	ymm4,ymm12,ymm4,0x31
+	vmovdqa	ymm6,YMMWORD[rsp]
+	vmovdqa	ymm12,YMMWORD[32+rsp]
+
+	cmp	rdx,64*8
+	jb	NEAR $L$tail8x
+
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	lea	rsi,[128+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	lea	rdi,[128+rdi]
+
+	vpxor	ymm12,ymm12,YMMWORD[rsi]
+	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
+	lea	rsi,[128+rsi]
+	vmovdqu	YMMWORD[rdi],ymm12
+	vmovdqu	YMMWORD[32+rdi],ymm13
+	vmovdqu	YMMWORD[64+rdi],ymm10
+	vmovdqu	YMMWORD[96+rdi],ymm15
+	lea	rdi,[128+rdi]
+
+	vpxor	ymm14,ymm14,YMMWORD[rsi]
+	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
+	lea	rsi,[128+rsi]
+	vmovdqu	YMMWORD[rdi],ymm14
+	vmovdqu	YMMWORD[32+rdi],ymm2
+	vmovdqu	YMMWORD[64+rdi],ymm3
+	vmovdqu	YMMWORD[96+rdi],ymm7
+	lea	rdi,[128+rdi]
+
+	vpxor	ymm11,ymm11,YMMWORD[rsi]
+	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
+	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
+	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
+	lea	rsi,[128+rsi]
+	vmovdqu	YMMWORD[rdi],ymm11
+	vmovdqu	YMMWORD[32+rdi],ymm9
+	vmovdqu	YMMWORD[64+rdi],ymm0
+	vmovdqu	YMMWORD[96+rdi],ymm4
+	lea	rdi,[128+rdi]
+
+	sub	rdx,64*8
+	jnz	NEAR $L$oop_outer8x
+
+	jmp	NEAR $L$done8x
+
+$L$tail8x:
+	cmp	rdx,448
+	jae	NEAR $L$448_or_more8x
+	cmp	rdx,384
+	jae	NEAR $L$384_or_more8x
+	cmp	rdx,320
+	jae	NEAR $L$320_or_more8x
+	cmp	rdx,256
+	jae	NEAR $L$256_or_more8x
+	cmp	rdx,192
+	jae	NEAR $L$192_or_more8x
+	cmp	rdx,128
+	jae	NEAR $L$128_or_more8x
+	cmp	rdx,64
+	jae	NEAR $L$64_or_more8x
+
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm6
+	vmovdqa	YMMWORD[32+rsp],ymm8
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$64_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	je	NEAR $L$done8x
+
+	lea	rsi,[64+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm1
+	lea	rdi,[64+rdi]
+	sub	rdx,64
+	vmovdqa	YMMWORD[32+rsp],ymm5
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$128_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	je	NEAR $L$done8x
+
+	lea	rsi,[128+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm12
+	lea	rdi,[128+rdi]
+	sub	rdx,128
+	vmovdqa	YMMWORD[32+rsp],ymm13
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$192_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	je	NEAR $L$done8x
+
+	lea	rsi,[192+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm10
+	lea	rdi,[192+rdi]
+	sub	rdx,192
+	vmovdqa	YMMWORD[32+rsp],ymm15
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$256_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	vmovdqu	YMMWORD[192+rdi],ymm10
+	vmovdqu	YMMWORD[224+rdi],ymm15
+	je	NEAR $L$done8x
+
+	lea	rsi,[256+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm14
+	lea	rdi,[256+rdi]
+	sub	rdx,256
+	vmovdqa	YMMWORD[32+rsp],ymm2
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$320_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
+	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	vmovdqu	YMMWORD[192+rdi],ymm10
+	vmovdqu	YMMWORD[224+rdi],ymm15
+	vmovdqu	YMMWORD[256+rdi],ymm14
+	vmovdqu	YMMWORD[288+rdi],ymm2
+	je	NEAR $L$done8x
+
+	lea	rsi,[320+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm3
+	lea	rdi,[320+rdi]
+	sub	rdx,320
+	vmovdqa	YMMWORD[32+rsp],ymm7
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$384_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
+	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	vmovdqu	YMMWORD[192+rdi],ymm10
+	vmovdqu	YMMWORD[224+rdi],ymm15
+	vmovdqu	YMMWORD[256+rdi],ymm14
+	vmovdqu	YMMWORD[288+rdi],ymm2
+	vmovdqu	YMMWORD[320+rdi],ymm3
+	vmovdqu	YMMWORD[352+rdi],ymm7
+	je	NEAR $L$done8x
+
+	lea	rsi,[384+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm11
+	lea	rdi,[384+rdi]
+	sub	rdx,384
+	vmovdqa	YMMWORD[32+rsp],ymm9
+	jmp	NEAR $L$oop_tail8x
+
+ALIGN	32
+$L$448_or_more8x:
+	vpxor	ymm6,ymm6,YMMWORD[rsi]
+	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
+	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
+	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
+	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
+	vmovdqu	YMMWORD[rdi],ymm6
+	vmovdqu	YMMWORD[32+rdi],ymm8
+	vmovdqu	YMMWORD[64+rdi],ymm1
+	vmovdqu	YMMWORD[96+rdi],ymm5
+	vmovdqu	YMMWORD[128+rdi],ymm12
+	vmovdqu	YMMWORD[160+rdi],ymm13
+	vmovdqu	YMMWORD[192+rdi],ymm10
+	vmovdqu	YMMWORD[224+rdi],ymm15
+	vmovdqu	YMMWORD[256+rdi],ymm14
+	vmovdqu	YMMWORD[288+rdi],ymm2
+	vmovdqu	YMMWORD[320+rdi],ymm3
+	vmovdqu	YMMWORD[352+rdi],ymm7
+	vmovdqu	YMMWORD[384+rdi],ymm11
+	vmovdqu	YMMWORD[416+rdi],ymm9
+	je	NEAR $L$done8x
+
+	lea	rsi,[448+rsi]
+	xor	r10,r10
+	vmovdqa	YMMWORD[rsp],ymm0
+	lea	rdi,[448+rdi]
+	sub	rdx,448
+	vmovdqa	YMMWORD[32+rsp],ymm4
+
+$L$oop_tail8x:
+	movzx	eax,BYTE[r10*1+rsi]
+	movzx	ecx,BYTE[r10*1+rsp]
+	lea	r10,[1+r10]
+	xor	eax,ecx
+	mov	BYTE[((-1))+r10*1+rdi],al
+	dec	rdx
+	jnz	NEAR $L$oop_tail8x
+
+$L$done8x:
+	vzeroall
+	movaps	xmm6,XMMWORD[((-168))+r9]
+	movaps	xmm7,XMMWORD[((-152))+r9]
+	movaps	xmm8,XMMWORD[((-136))+r9]
+	movaps	xmm9,XMMWORD[((-120))+r9]
+	movaps	xmm10,XMMWORD[((-104))+r9]
+	movaps	xmm11,XMMWORD[((-88))+r9]
+	movaps	xmm12,XMMWORD[((-72))+r9]
+	movaps	xmm13,XMMWORD[((-56))+r9]
+	movaps	xmm14,XMMWORD[((-40))+r9]
+	movaps	xmm15,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+
+$L$8x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_ChaCha20_8x:
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	lea	r10,[$L$ctr32_body]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	lea	r10,[$L$no_data]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rax,[((64+24+48))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	16
+ssse3_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[192+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[((-40))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,4
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$common_seh_tail
+
+
+
+ALIGN	16
+full_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[192+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[((-168))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$common_seh_tail
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_ChaCha20_ctr32 wrt ..imagebase
+	DD	$L$SEH_end_GFp_ChaCha20_ctr32 wrt ..imagebase
+	DD	$L$SEH_info_GFp_ChaCha20_ctr32 wrt ..imagebase
+
+	DD	$L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_ssse3 wrt ..imagebase
+
+	DD	$L$SEH_begin_ChaCha20_4x wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_4x wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_4x wrt ..imagebase
+	DD	$L$SEH_begin_ChaCha20_8x wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_8x wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_8x wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_GFp_ChaCha20_ctr32:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+
+$L$SEH_info_ChaCha20_ssse3:
+DB	9,0,0,0
+	DD	ssse3_handler wrt ..imagebase
+	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
+
+$L$SEH_info_ChaCha20_4x:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
+$L$SEH_info_ChaCha20_8x:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase

+ 8941 - 0
zeroidc/vendor/ring/pregenerated/tmp/chacha20_poly1305_x86_64-nasm.asm

@@ -0,0 +1,8941 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+EXTERN	GFp_ia32cap_P
+
+; Read-only constant tables shared by the ChaCha20-Poly1305 code paths.
+chacha20_poly1305_constants:
+
+ALIGN	64
+; ChaCha20 "sigma" constant: the ASCII string "expand 32-byte k",
+; duplicated so a 256-bit (AVX2) load sees two copies.
+$L$chacha20_consts:
+DB	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+DB	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+; pshufb masks implementing per-dword left-rotate by 8 and by 16 bits.
+$L$rol8:
+DB	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+DB	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+$L$rol16:
+DB	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+DB	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+; Block-counter seeds/increments: SSE advances one block at a time,
+; AVX2 two blocks at a time (lanes start at 0 and are bumped by 2).
+$L$avx2_init:
+	DD	0,0,0,0
+$L$sse_inc:
+	DD	1,0,0,0
+$L$avx2_inc:
+	DD	2,0,0,0,2,0,0,0
+; Poly1305 key clamp: low 16 bytes mask "r" per RFC 8439
+; (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff); high 16 bytes all-ones
+; leave the "s" half of the key untouched.
+$L$clamp:
+	DQ	0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC
+	DQ	0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF
+ALIGN	16
+; Byte masks for partial (1..16 byte) blocks: row k keeps the low k bytes.
+$L$and_masks:
+DB	0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+DB	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+
+ALIGN	64
+; poly_hash_ad_internal: absorb the additional authenticated data (AAD)
+; into the Poly1305 accumulator.
+; In:      rcx = AAD pointer, r8 = AAD length in bytes,
+;          [rbp+160] / [rbp+168] = clamped Poly1305 key limbs r0, r1.
+; In/Out:  r10:r11:r12 = 130-bit accumulator h (h0, h1, h2); zeroed on entry.
+; Clobbers rax, rdx, r9, r13-r15, rcx, r8, flags.
+; The repeated mul/reduce sequence below computes
+;   h = (h + block) * r  mod  2^130 - 5
+; using schoolbook 64x64 multiplies; the "and -4 / shrd 2" steps fold the
+; bits above 2^130 back in via the identity 2^130 == 5 (mod 2^130 - 5).
+poly_hash_ad_internal:
+
+
+	xor	r10,r10
+	xor	r11,r11
+	xor	r12,r12
+; Fast path: a 13-byte AAD is the common TLS record-header case.
+	cmp	r8,13
+	jne	NEAR $L$hash_ad_loop
+$L$poly_fast_tls_ad:
+
+; Load 13 bytes as one padded block: low 8 bytes into r10, bytes 5..12
+; into r11 shifted so only the top 5 land there, pad bit in r12.
+	mov	r10,QWORD[rcx]
+	mov	r11,QWORD[5+rcx]
+	shr	r11,24
+	mov	r12,1
+; h *= r  (mod 2^130 - 5)
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+; Reduce the 4-limb product r13:r14:r15:r9 back to 130 bits in r10:r11:r12.
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	DB	0F3h,0C3h		;repret
+; General path: consume full 16-byte blocks.
+$L$hash_ad_loop:
+
+	cmp	r8,16
+	jb	NEAR $L$hash_ad_tail
+; h += block (with the 2^128 pad bit folded into h2 via "adc r12,1").
+	add	r10,QWORD[((0+0))+rcx]
+	adc	r11,QWORD[((8+0))+rcx]
+	adc	r12,1
+; h *= r  (mod 2^130 - 5) -- same sequence as above.
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rcx,[16+rcx]
+	sub	r8,16
+	jmp	NEAR $L$hash_ad_loop
+; Trailing 1..15 bytes: rebuild the partial block in r13:r14 by walking
+; backwards from the end of the AAD, shifting a byte in per iteration
+; (zero-padded on the high side, per the Poly1305 padding rule).
+$L$hash_ad_tail:
+	cmp	r8,0
+	je	NEAR $L$hash_ad_done
+
+	xor	r13,r13
+	xor	r14,r14
+	xor	r15,r15
+	add	rcx,r8
+$L$hash_ad_tail_loop:
+	shld	r14,r13,8
+	shl	r13,8
+	movzx	r15,BYTE[((-1))+rcx]
+	xor	r13,r15
+	dec	rcx
+	dec	r8
+	jne	NEAR $L$hash_ad_tail_loop
+
+; h += padded partial block, then one more h *= r reduction.
+	add	r10,r13
+	adc	r11,r14
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+$L$hash_ad_done:
+	DB	0F3h,0C3h		;repret
+
+
+
+global	GFp_chacha20_poly1305_open
+
+ALIGN	64
+GFp_chacha20_poly1305_open:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_chacha20_poly1305_open:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+
+	push	r9
+
+	sub	rsp,288 + 160 + 32
+
+
+	lea	rbp,[32+rsp]
+	and	rbp,-32
+
+	movaps	XMMWORD[(0+0)+rbp],xmm6
+	movaps	XMMWORD[(16+0)+rbp],xmm7
+	movaps	XMMWORD[(32+0)+rbp],xmm8
+	movaps	XMMWORD[(48+0)+rbp],xmm9
+	movaps	XMMWORD[(64+0)+rbp],xmm10
+	movaps	XMMWORD[(80+0)+rbp],xmm11
+	movaps	XMMWORD[(96+0)+rbp],xmm12
+	movaps	XMMWORD[(112+0)+rbp],xmm13
+	movaps	XMMWORD[(128+0)+rbp],xmm14
+	movaps	XMMWORD[(144+0)+rbp],xmm15
+
+	mov	rbx,rdx
+	mov	QWORD[((0+160+32))+rbp],r8
+	mov	QWORD[((8+160+32))+rbp],rbx
+
+	mov	eax,DWORD[((GFp_ia32cap_P+8))]
+	and	eax,288
+	xor	eax,288
+	jz	NEAR chacha20_poly1305_open_avx2
+
+	cmp	rbx,128
+	jbe	NEAR $L$open_sse_128
+
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqu	xmm4,XMMWORD[r9]
+	movdqu	xmm8,XMMWORD[16+r9]
+	movdqu	xmm12,XMMWORD[32+r9]
+
+	movdqa	xmm7,xmm12
+
+	movdqa	XMMWORD[(160+48)+rbp],xmm4
+	movdqa	XMMWORD[(160+64)+rbp],xmm8
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	mov	r10,10
+$L$open_sse_init_rounds:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	dec	r10
+	jne	NEAR $L$open_sse_init_rounds
+
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+
+	pand	xmm0,XMMWORD[$L$clamp]
+	movdqa	XMMWORD[(160+0)+rbp],xmm0
+	movdqa	XMMWORD[(160+16)+rbp],xmm4
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+$L$open_sse_main_loop:
+	cmp	rbx,16*16
+	jb	NEAR $L$open_sse_tail
+
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm3,xmm0
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,XMMWORD[((160+96))+rbp]
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm15
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+	movdqa	XMMWORD[(160+144)+rbp],xmm15
+
+
+
+	mov	rcx,4
+	mov	r8,rsi
+$L$open_sse_main_loop_rounds:
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+
+	lea	r8,[16+r8]
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+DB	102,15,58,15,255,4
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,12
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+DB	102,15,58,15,255,12
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,4
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	dec	rcx
+	jge	NEAR $L$open_sse_main_loop_rounds
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+	cmp	rcx,-6
+	jg	NEAR $L$open_sse_main_loop_rounds
+	paddd	xmm3,XMMWORD[$L$chacha20_consts]
+	paddd	xmm7,XMMWORD[((160+48))+rbp]
+	paddd	xmm11,XMMWORD[((160+64))+rbp]
+	paddd	xmm15,XMMWORD[((160+144))+rbp]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqa	XMMWORD[(160+80)+rbp],xmm12
+	movdqu	xmm12,XMMWORD[((0 + 0))+rsi]
+	pxor	xmm12,xmm3
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((16 + 0))+rsi]
+	pxor	xmm12,xmm7
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((32 + 0))+rsi]
+	pxor	xmm12,xmm11
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm12,xmm15
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm12
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 128))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 128))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 128))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 128))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 128)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 128)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 128)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 128)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 192))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 192))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 192))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 192))+rsi]
+	pxor	xmm0,xmm3
+	pxor	xmm4,xmm7
+	pxor	xmm8,xmm11
+	pxor	xmm15,XMMWORD[((160+80))+rbp]
+	movdqu	XMMWORD[(0 + 192)+rdi],xmm0
+	movdqu	XMMWORD[(16 + 192)+rdi],xmm4
+	movdqu	XMMWORD[(32 + 192)+rdi],xmm8
+	movdqu	XMMWORD[(48 + 192)+rdi],xmm15
+
+	lea	rsi,[256+rsi]
+	lea	rdi,[256+rdi]
+	sub	rbx,16*16
+	jmp	NEAR $L$open_sse_main_loop
+$L$open_sse_tail:
+
+	test	rbx,rbx
+	jz	NEAR $L$open_sse_finalize
+	cmp	rbx,12*16
+	ja	NEAR $L$open_sse_tail_256
+	cmp	rbx,8*16
+	ja	NEAR $L$open_sse_tail_192
+	cmp	rbx,4*16
+	ja	NEAR $L$open_sse_tail_128
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm12,XMMWORD[((160+96))+rbp]
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+
+	xor	r8,r8
+	mov	rcx,rbx
+	cmp	rcx,16
+	jb	NEAR $L$open_sse_tail_64_rounds
+$L$open_sse_tail_64_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	sub	rcx,16
+$L$open_sse_tail_64_rounds:
+	add	r8,16
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	cmp	rcx,16
+	jae	NEAR $L$open_sse_tail_64_rounds_and_x1hash
+	cmp	r8,10*16
+	jne	NEAR $L$open_sse_tail_64_rounds
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+
+	jmp	NEAR $L$open_sse_tail_64_dec_loop
+
+$L$open_sse_tail_128:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm13,XMMWORD[((160+96))+rbp]
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+
+	mov	rcx,rbx
+	and	rcx,-16
+	xor	r8,r8
+$L$open_sse_tail_128_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$open_sse_tail_128_rounds:
+	add	r8,16
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_sse_tail_128_rounds_and_x1hash
+	cmp	r8,10*16
+	jne	NEAR $L$open_sse_tail_128_rounds
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+
+	sub	rbx,4*16
+	lea	rsi,[64+rsi]
+	lea	rdi,[64+rdi]
+	jmp	NEAR $L$open_sse_tail_64_dec_loop
+
+$L$open_sse_tail_192:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm14,XMMWORD[((160+96))+rbp]
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+
+	mov	rcx,rbx
+	mov	r8,10*16
+	cmp	rcx,10*16
+	cmovg	rcx,r8
+	and	rcx,-16
+	xor	r8,r8
+$L$open_sse_tail_192_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$open_sse_tail_192_rounds:
+	add	r8,16
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_sse_tail_192_rounds_and_x1hash
+	cmp	r8,10*16
+	jne	NEAR $L$open_sse_tail_192_rounds
+	cmp	rbx,11*16
+	jb	NEAR $L$open_sse_tail_192_finish
+	add	r10,QWORD[((0+160))+rsi]
+	adc	r11,QWORD[((8+160))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	cmp	rbx,12*16
+	jb	NEAR $L$open_sse_tail_192_finish
+	add	r10,QWORD[((0+176))+rsi]
+	adc	r11,QWORD[((8+176))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$open_sse_tail_192_finish:
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+
+	sub	rbx,8*16
+	lea	rsi,[128+rsi]
+	lea	rdi,[128+rdi]
+	jmp	NEAR $L$open_sse_tail_64_dec_loop
+
+$L$open_sse_tail_256:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm3,xmm0
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,XMMWORD[((160+96))+rbp]
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm15
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+	movdqa	XMMWORD[(160+144)+rbp],xmm15
+
+	xor	r8,r8
+$L$open_sse_tail_256_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	movdqa	XMMWORD[(160+80)+rbp],xmm11
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm11,xmm4
+	pslld	xmm11,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm11
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm11,xmm4
+	pslld	xmm11,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm11
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm11,xmm5
+	pslld	xmm11,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm11
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm11,xmm5
+	pslld	xmm11,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm11
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm11,xmm6
+	pslld	xmm11,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm11
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm11,xmm6
+	pslld	xmm11,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm11
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	movdqa	xmm11,XMMWORD[((160+80))+rbp]
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	movdqa	XMMWORD[(160+80)+rbp],xmm9
+	paddd	xmm3,xmm7
+	pxor	xmm15,xmm3
+	pshufb	xmm15,XMMWORD[$L$rol16]
+	paddd	xmm11,xmm15
+	pxor	xmm7,xmm11
+	movdqa	xmm9,xmm7
+	pslld	xmm9,12
+	psrld	xmm7,20
+	pxor	xmm7,xmm9
+	paddd	xmm3,xmm7
+	pxor	xmm15,xmm3
+	pshufb	xmm15,XMMWORD[$L$rol8]
+	paddd	xmm11,xmm15
+	pxor	xmm7,xmm11
+	movdqa	xmm9,xmm7
+	pslld	xmm9,7
+	psrld	xmm7,25
+	pxor	xmm7,xmm9
+DB	102,15,58,15,255,4
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,12
+	movdqa	xmm9,XMMWORD[((160+80))+rbp]
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	movdqa	XMMWORD[(160+80)+rbp],xmm11
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm11,xmm4
+	pslld	xmm11,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm11
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm11,xmm4
+	pslld	xmm11,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm11
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm11,xmm5
+	pslld	xmm11,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm11
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm11,xmm5
+	pslld	xmm11,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm11
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm11,xmm6
+	pslld	xmm11,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm11
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm11,xmm6
+	pslld	xmm11,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm11
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+	movdqa	xmm11,XMMWORD[((160+80))+rbp]
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	movdqa	XMMWORD[(160+80)+rbp],xmm9
+	paddd	xmm3,xmm7
+	pxor	xmm15,xmm3
+	pshufb	xmm15,XMMWORD[$L$rol16]
+	paddd	xmm11,xmm15
+	pxor	xmm7,xmm11
+	movdqa	xmm9,xmm7
+	pslld	xmm9,12
+	psrld	xmm7,20
+	pxor	xmm7,xmm9
+	paddd	xmm3,xmm7
+	pxor	xmm15,xmm3
+	pshufb	xmm15,XMMWORD[$L$rol8]
+	paddd	xmm11,xmm15
+	pxor	xmm7,xmm11
+	movdqa	xmm9,xmm7
+	pslld	xmm9,7
+	psrld	xmm7,25
+	pxor	xmm7,xmm9
+DB	102,15,58,15,255,12
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,4
+	movdqa	xmm9,XMMWORD[((160+80))+rbp]
+
+	add	r8,16
+	cmp	r8,10*16
+	jb	NEAR $L$open_sse_tail_256_rounds_and_x1hash
+
+	mov	rcx,rbx
+	and	rcx,-16
+$L$open_sse_tail_256_hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	add	r8,16
+	cmp	r8,rcx
+	jb	NEAR $L$open_sse_tail_256_hash
+	paddd	xmm3,XMMWORD[$L$chacha20_consts]
+	paddd	xmm7,XMMWORD[((160+48))+rbp]
+	paddd	xmm11,XMMWORD[((160+64))+rbp]
+	paddd	xmm15,XMMWORD[((160+144))+rbp]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqa	XMMWORD[(160+80)+rbp],xmm12
+	movdqu	xmm12,XMMWORD[((0 + 0))+rsi]
+	pxor	xmm12,xmm3
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((16 + 0))+rsi]
+	pxor	xmm12,xmm7
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((32 + 0))+rsi]
+	pxor	xmm12,xmm11
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm12
+	movdqu	xmm12,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm12,xmm15
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm12
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 128))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 128))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 128))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 128))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 128)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 128)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 128)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 128)+rdi],xmm15
+
+	movdqa	xmm12,XMMWORD[((160+80))+rbp]
+	sub	rbx,12*16
+	lea	rsi,[192+rsi]
+	lea	rdi,[192+rdi]
+
+
+$L$open_sse_tail_64_dec_loop:
+	cmp	rbx,16
+	jb	NEAR $L$open_sse_tail_16_init
+	sub	rbx,16
+	movdqu	xmm3,XMMWORD[rsi]
+	pxor	xmm0,xmm3
+	movdqu	XMMWORD[rdi],xmm0
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	movdqa	xmm0,xmm4
+	movdqa	xmm4,xmm8
+	movdqa	xmm8,xmm12
+	jmp	NEAR $L$open_sse_tail_64_dec_loop
+$L$open_sse_tail_16_init:
+	movdqa	xmm1,xmm0
+
+
+$L$open_sse_tail_16:
+	test	rbx,rbx
+	jz	NEAR $L$open_sse_finalize
+
+
+
+	pxor	xmm3,xmm3
+	lea	rsi,[((-1))+rbx*1+rsi]
+	mov	r8,rbx
+$L$open_sse_tail_16_compose:
+	pslldq	xmm3,1
+	pinsrb	xmm3,BYTE[rsi],0
+	sub	rsi,1
+	sub	r8,1
+	jnz	NEAR $L$open_sse_tail_16_compose
+
+DB	102,73,15,126,221
+	pextrq	r14,xmm3,1
+
+	pxor	xmm3,xmm1
+
+
+$L$open_sse_tail_16_extract:
+	pextrb	XMMWORD[rdi],xmm3,0
+	psrldq	xmm3,1
+	add	rdi,1
+	sub	rbx,1
+	jne	NEAR $L$open_sse_tail_16_extract
+
+	add	r10,r13
+	adc	r11,r14
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+$L$open_sse_finalize:
+	add	r10,QWORD[((0+160+32))+rbp]
+	adc	r11,QWORD[((8+160+32))+rbp]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	mov	r13,r10
+	mov	r14,r11
+	mov	r15,r12
+	sub	r10,-5
+	sbb	r11,-1
+	sbb	r12,3
+	cmovc	r10,r13
+	cmovc	r11,r14
+	cmovc	r12,r15
+
+	add	r10,QWORD[((0+160+16))+rbp]
+	adc	r11,QWORD[((8+160+16))+rbp]
+
+	movaps	xmm6,XMMWORD[((0+0))+rbp]
+	movaps	xmm7,XMMWORD[((16+0))+rbp]
+	movaps	xmm8,XMMWORD[((32+0))+rbp]
+	movaps	xmm9,XMMWORD[((48+0))+rbp]
+	movaps	xmm10,XMMWORD[((64+0))+rbp]
+	movaps	xmm11,XMMWORD[((80+0))+rbp]
+	movaps	xmm12,XMMWORD[((96+0))+rbp]
+	movaps	xmm13,XMMWORD[((112+0))+rbp]
+	movaps	xmm14,XMMWORD[((128+0))+rbp]
+	movaps	xmm15,XMMWORD[((144+0))+rbp]
+
+
+	add	rsp,288 + 160 + 32
+
+
+	pop	r9
+
+	mov	QWORD[r9],r10
+	mov	QWORD[8+r9],r11
+	pop	r15
+
+	pop	r14
+
+	pop	r13
+
+	pop	r12
+
+	pop	rbx
+
+	pop	rbp
+
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$open_sse_128:
+
+	movdqu	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm1,xmm0
+	movdqa	xmm2,xmm0
+	movdqu	xmm4,XMMWORD[r9]
+	movdqa	xmm5,xmm4
+	movdqa	xmm6,xmm4
+	movdqu	xmm8,XMMWORD[16+r9]
+	movdqa	xmm9,xmm8
+	movdqa	xmm10,xmm8
+	movdqu	xmm12,XMMWORD[32+r9]
+	movdqa	xmm13,xmm12
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm13
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,xmm13
+	mov	r10,10
+
+$L$open_sse_128_rounds:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+
+	dec	r10
+	jnz	NEAR $L$open_sse_128_rounds
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,xmm7
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	paddd	xmm9,xmm11
+	paddd	xmm10,xmm11
+	paddd	xmm13,xmm15
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	paddd	xmm14,xmm15
+
+	pand	xmm0,XMMWORD[$L$clamp]
+	movdqa	XMMWORD[(160+0)+rbp],xmm0
+	movdqa	XMMWORD[(160+16)+rbp],xmm4
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+$L$open_sse_128_xor_hash:
+	cmp	rbx,16
+	jb	NEAR $L$open_sse_tail_16
+	sub	rbx,16
+	add	r10,QWORD[((0+0))+rsi]
+	adc	r11,QWORD[((8+0))+rsi]
+	adc	r12,1
+
+
+	movdqu	xmm3,XMMWORD[rsi]
+	pxor	xmm1,xmm3
+	movdqu	XMMWORD[rdi],xmm1
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	movdqa	xmm1,xmm5
+	movdqa	xmm5,xmm9
+	movdqa	xmm9,xmm13
+	movdqa	xmm13,xmm2
+	movdqa	xmm2,xmm6
+	movdqa	xmm6,xmm10
+	movdqa	xmm10,xmm14
+	jmp	NEAR $L$open_sse_128_xor_hash
+$L$SEH_end_GFp_chacha20_poly1305_open:
+
+
+
+
+
+
+
+global	GFp_chacha20_poly1305_seal
+
+ALIGN	64
+GFp_chacha20_poly1305_seal:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_chacha20_poly1305_seal:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+
+	push	r9
+
+	sub	rsp,288 + 160 + 32
+
+	lea	rbp,[32+rsp]
+	and	rbp,-32
+
+	movaps	XMMWORD[(0+0)+rbp],xmm6
+	movaps	XMMWORD[(16+0)+rbp],xmm7
+	movaps	XMMWORD[(32+0)+rbp],xmm8
+	movaps	XMMWORD[(48+0)+rbp],xmm9
+	movaps	XMMWORD[(64+0)+rbp],xmm10
+	movaps	XMMWORD[(80+0)+rbp],xmm11
+	movaps	XMMWORD[(96+0)+rbp],xmm12
+	movaps	XMMWORD[(112+0)+rbp],xmm13
+	movaps	XMMWORD[(128+0)+rbp],xmm14
+	movaps	XMMWORD[(144+0)+rbp],xmm15
+
+	mov	rbx,QWORD[56+r9]
+	add	rbx,rdx
+	mov	QWORD[((0+160+32))+rbp],r8
+	mov	QWORD[((8+160+32))+rbp],rbx
+	mov	rbx,rdx
+
+	mov	eax,DWORD[((GFp_ia32cap_P+8))]
+	and	eax,288
+	xor	eax,288
+	jz	NEAR chacha20_poly1305_seal_avx2
+
+	cmp	rbx,128
+	jbe	NEAR $L$seal_sse_128
+
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqu	xmm4,XMMWORD[r9]
+	movdqu	xmm8,XMMWORD[16+r9]
+	movdqu	xmm12,XMMWORD[32+r9]
+
+	movdqa	xmm1,xmm0
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm6,xmm4
+	movdqa	xmm7,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm10,xmm8
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,xmm12
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm12
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm12
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+
+	movdqa	XMMWORD[(160+48)+rbp],xmm4
+	movdqa	XMMWORD[(160+64)+rbp],xmm8
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+	movdqa	XMMWORD[(160+144)+rbp],xmm15
+	mov	r10,10
+$L$seal_sse_init_rounds:
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+DB	102,15,58,15,255,4
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,12
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+DB	102,15,58,15,255,12
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,4
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	dec	r10
+	jnz	NEAR $L$seal_sse_init_rounds
+	paddd	xmm3,XMMWORD[$L$chacha20_consts]
+	paddd	xmm7,XMMWORD[((160+48))+rbp]
+	paddd	xmm11,XMMWORD[((160+64))+rbp]
+	paddd	xmm15,XMMWORD[((160+144))+rbp]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+
+
+	pand	xmm3,XMMWORD[$L$clamp]
+	movdqa	XMMWORD[(160+0)+rbp],xmm3
+	movdqa	XMMWORD[(160+16)+rbp],xmm7
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+
+	cmp	rbx,12*16
+	ja	NEAR $L$seal_sse_main_init
+	mov	rcx,8*16
+	sub	rbx,8*16
+	lea	rsi,[128+rsi]
+	jmp	NEAR $L$seal_sse_128_tail_hash
+$L$seal_sse_main_init:
+	movdqu	xmm3,XMMWORD[((0 + 128))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 128))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 128))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 128))+rsi]
+	pxor	xmm0,xmm3
+	pxor	xmm4,xmm7
+	pxor	xmm8,xmm11
+	pxor	xmm15,xmm12
+	movdqu	XMMWORD[(0 + 128)+rdi],xmm0
+	movdqu	XMMWORD[(16 + 128)+rdi],xmm4
+	movdqu	XMMWORD[(32 + 128)+rdi],xmm8
+	movdqu	XMMWORD[(48 + 128)+rdi],xmm15
+
+	mov	rcx,12*16
+	sub	rbx,12*16
+	lea	rsi,[192+rsi]
+	mov	rcx,2
+	mov	r8,8
+	cmp	rbx,4*16
+	jbe	NEAR $L$seal_sse_tail_64
+	cmp	rbx,8*16
+	jbe	NEAR $L$seal_sse_tail_128
+	cmp	rbx,12*16
+	jbe	NEAR $L$seal_sse_tail_192
+
+$L$seal_sse_main_loop:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm3,xmm0
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,XMMWORD[((160+96))+rbp]
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	movdqa	xmm14,xmm15
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+	movdqa	XMMWORD[(160+144)+rbp],xmm15
+
+ALIGN	32
+$L$seal_sse_main_rounds:
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+DB	102,15,58,15,255,4
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,12
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,XMMWORD[$L$rol16]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,20
+	pslld	xmm7,32-20
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,20
+	pslld	xmm6,32-20
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,20
+	pslld	xmm5,32-20
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,20
+	pslld	xmm4,32-20
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[$L$rol8]
+	paddd	xmm3,xmm7
+	paddd	xmm2,xmm6
+	paddd	xmm1,xmm5
+	paddd	xmm0,xmm4
+	pxor	xmm15,xmm3
+	pxor	xmm14,xmm2
+	pxor	xmm13,xmm1
+	pxor	xmm12,xmm0
+DB	102,69,15,56,0,248
+DB	102,69,15,56,0,240
+DB	102,69,15,56,0,232
+DB	102,69,15,56,0,224
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+	paddd	xmm11,xmm15
+	paddd	xmm10,xmm14
+	paddd	xmm9,xmm13
+	paddd	xmm8,xmm12
+	pxor	xmm7,xmm11
+	pxor	xmm6,xmm10
+	pxor	xmm5,xmm9
+	pxor	xmm4,xmm8
+	movdqa	XMMWORD[(160+80)+rbp],xmm8
+	movdqa	xmm8,xmm7
+	psrld	xmm8,25
+	pslld	xmm7,32-25
+	pxor	xmm7,xmm8
+	movdqa	xmm8,xmm6
+	psrld	xmm8,25
+	pslld	xmm6,32-25
+	pxor	xmm6,xmm8
+	movdqa	xmm8,xmm5
+	psrld	xmm8,25
+	pslld	xmm5,32-25
+	pxor	xmm5,xmm8
+	movdqa	xmm8,xmm4
+	psrld	xmm8,25
+	pslld	xmm4,32-25
+	pxor	xmm4,xmm8
+	movdqa	xmm8,XMMWORD[((160+80))+rbp]
+DB	102,15,58,15,255,12
+DB	102,69,15,58,15,219,8
+DB	102,69,15,58,15,255,4
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+
+	lea	rdi,[16+rdi]
+	dec	r8
+	jge	NEAR $L$seal_sse_main_rounds
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_sse_main_rounds
+	paddd	xmm3,XMMWORD[$L$chacha20_consts]
+	paddd	xmm7,XMMWORD[((160+48))+rbp]
+	paddd	xmm11,XMMWORD[((160+64))+rbp]
+	paddd	xmm15,XMMWORD[((160+144))+rbp]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+
+	movdqa	XMMWORD[(160+80)+rbp],xmm14
+	movdqa	XMMWORD[(160+80)+rbp],xmm14
+	movdqu	xmm14,XMMWORD[((0 + 0))+rsi]
+	pxor	xmm14,xmm3
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm14
+	movdqu	xmm14,XMMWORD[((16 + 0))+rsi]
+	pxor	xmm14,xmm7
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm14
+	movdqu	xmm14,XMMWORD[((32 + 0))+rsi]
+	pxor	xmm14,xmm11
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm14
+	movdqu	xmm14,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm14,xmm15
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm14
+
+	movdqa	xmm14,XMMWORD[((160+80))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 128))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 128))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 128))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 128))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 128)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 128)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 128)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 128)+rdi],xmm15
+
+	cmp	rbx,16*16
+	ja	NEAR $L$seal_sse_main_loop_xor
+
+	mov	rcx,12*16
+	sub	rbx,12*16
+	lea	rsi,[192+rsi]
+	jmp	NEAR $L$seal_sse_128_tail_hash
+$L$seal_sse_main_loop_xor:
+	movdqu	xmm3,XMMWORD[((0 + 192))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 192))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 192))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 192))+rsi]
+	pxor	xmm0,xmm3
+	pxor	xmm4,xmm7
+	pxor	xmm8,xmm11
+	pxor	xmm15,xmm12
+	movdqu	XMMWORD[(0 + 192)+rdi],xmm0
+	movdqu	XMMWORD[(16 + 192)+rdi],xmm4
+	movdqu	XMMWORD[(32 + 192)+rdi],xmm8
+	movdqu	XMMWORD[(48 + 192)+rdi],xmm15
+
+	lea	rsi,[256+rsi]
+	sub	rbx,16*16
+	mov	rcx,6
+	mov	r8,4
+	cmp	rbx,12*16
+	jg	NEAR $L$seal_sse_main_loop
+	mov	rcx,rbx
+	test	rbx,rbx
+	je	NEAR $L$seal_sse_128_tail_hash
+	mov	rcx,6
+	cmp	rbx,8*16
+	ja	NEAR $L$seal_sse_tail_192
+	cmp	rbx,4*16
+	ja	NEAR $L$seal_sse_tail_128
+
+$L$seal_sse_tail_64:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm12,XMMWORD[((160+96))+rbp]
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+
+$L$seal_sse_tail_64_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_sse_tail_64_rounds_and_x1hash:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_sse_tail_64_rounds_and_x2hash
+	dec	r8
+	jge	NEAR $L$seal_sse_tail_64_rounds_and_x1hash
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+
+	jmp	NEAR $L$seal_sse_128_tail_xor
+
+$L$seal_sse_tail_128:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm13,XMMWORD[((160+96))+rbp]
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+
+$L$seal_sse_tail_128_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_sse_tail_128_rounds_and_x1hash:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+
+	lea	rdi,[16+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_sse_tail_128_rounds_and_x2hash
+	dec	r8
+	jge	NEAR $L$seal_sse_tail_128_rounds_and_x1hash
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+
+	mov	rcx,4*16
+	sub	rbx,4*16
+	lea	rsi,[64+rsi]
+	jmp	NEAR $L$seal_sse_128_tail_hash
+
+$L$seal_sse_tail_192:
+	movdqa	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm4,XMMWORD[((160+48))+rbp]
+	movdqa	xmm8,XMMWORD[((160+64))+rbp]
+	movdqa	xmm1,xmm0
+	movdqa	xmm5,xmm4
+	movdqa	xmm9,xmm8
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	movdqa	xmm10,xmm8
+	movdqa	xmm14,XMMWORD[((160+96))+rbp]
+	paddd	xmm14,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm14
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm12,xmm13
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	XMMWORD[(160+96)+rbp],xmm12
+	movdqa	XMMWORD[(160+112)+rbp],xmm13
+	movdqa	XMMWORD[(160+128)+rbp],xmm14
+
+$L$seal_sse_tail_192_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_sse_tail_192_rounds_and_x1hash:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+
+	lea	rdi,[16+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_sse_tail_192_rounds_and_x2hash
+	dec	r8
+	jge	NEAR $L$seal_sse_tail_192_rounds_and_x1hash
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm6,XMMWORD[((160+48))+rbp]
+	paddd	xmm10,XMMWORD[((160+64))+rbp]
+	paddd	xmm14,XMMWORD[((160+128))+rbp]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm5,XMMWORD[((160+48))+rbp]
+	paddd	xmm9,XMMWORD[((160+64))+rbp]
+	paddd	xmm13,XMMWORD[((160+112))+rbp]
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,XMMWORD[((160+48))+rbp]
+	paddd	xmm8,XMMWORD[((160+64))+rbp]
+	paddd	xmm12,XMMWORD[((160+96))+rbp]
+	movdqu	xmm3,XMMWORD[((0 + 0))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 0))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 0))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 0))+rsi]
+	pxor	xmm2,xmm3
+	pxor	xmm6,xmm7
+	pxor	xmm10,xmm11
+	pxor	xmm15,xmm14
+	movdqu	XMMWORD[(0 + 0)+rdi],xmm2
+	movdqu	XMMWORD[(16 + 0)+rdi],xmm6
+	movdqu	XMMWORD[(32 + 0)+rdi],xmm10
+	movdqu	XMMWORD[(48 + 0)+rdi],xmm15
+	movdqu	xmm3,XMMWORD[((0 + 64))+rsi]
+	movdqu	xmm7,XMMWORD[((16 + 64))+rsi]
+	movdqu	xmm11,XMMWORD[((32 + 64))+rsi]
+	movdqu	xmm15,XMMWORD[((48 + 64))+rsi]
+	pxor	xmm1,xmm3
+	pxor	xmm5,xmm7
+	pxor	xmm9,xmm11
+	pxor	xmm15,xmm13
+	movdqu	XMMWORD[(0 + 64)+rdi],xmm1
+	movdqu	XMMWORD[(16 + 64)+rdi],xmm5
+	movdqu	XMMWORD[(32 + 64)+rdi],xmm9
+	movdqu	XMMWORD[(48 + 64)+rdi],xmm15
+
+	mov	rcx,8*16
+	sub	rbx,8*16
+	lea	rsi,[128+rsi]
+
+$L$seal_sse_128_tail_hash:
+	cmp	rcx,16
+	jb	NEAR $L$seal_sse_128_tail_xor
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	sub	rcx,16
+	lea	rdi,[16+rdi]
+	jmp	NEAR $L$seal_sse_128_tail_hash
+
+$L$seal_sse_128_tail_xor:
+	cmp	rbx,16
+	jb	NEAR $L$seal_sse_tail_16
+	sub	rbx,16
+
+	movdqu	xmm3,XMMWORD[rsi]
+	pxor	xmm0,xmm3
+	movdqu	XMMWORD[rdi],xmm0
+
+	add	r10,QWORD[rdi]
+	adc	r11,QWORD[8+rdi]
+	adc	r12,1
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm4,xmm8
+	movdqa	xmm8,xmm12
+	movdqa	xmm12,xmm1
+	movdqa	xmm1,xmm5
+	movdqa	xmm5,xmm9
+	movdqa	xmm9,xmm13
+	jmp	NEAR $L$seal_sse_128_tail_xor
+
+$L$seal_sse_tail_16:
+	test	rbx,rbx
+	jz	NEAR $L$process_blocks_of_extra_in
+
+	mov	r8,rbx
+	mov	rcx,rbx
+	lea	rsi,[((-1))+rbx*1+rsi]
+	pxor	xmm15,xmm15
+$L$seal_sse_tail_16_compose:
+	pslldq	xmm15,1
+	pinsrb	xmm15,BYTE[rsi],0
+	lea	rsi,[((-1))+rsi]
+	dec	rcx
+	jne	NEAR $L$seal_sse_tail_16_compose
+
+
+	pxor	xmm15,xmm0
+
+
+	mov	rcx,rbx
+	movdqu	xmm0,xmm15
+$L$seal_sse_tail_16_extract:
+	pextrb	XMMWORD[rdi],xmm0,0
+	psrldq	xmm0,1
+	add	rdi,1
+	sub	rcx,1
+	jnz	NEAR $L$seal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+	mov	r9,QWORD[((288 + 160 + 32))+rsp]
+	mov	r14,QWORD[56+r9]
+	mov	r13,QWORD[48+r9]
+	test	r14,r14
+	jz	NEAR $L$process_partial_block
+
+	mov	r15,16
+	sub	r15,rbx
+	cmp	r14,r15
+
+	jge	NEAR $L$load_extra_in
+	mov	r15,r14
+
+$L$load_extra_in:
+
+
+	lea	rsi,[((-1))+r15*1+r13]
+
+
+	add	r13,r15
+	sub	r14,r15
+	mov	QWORD[48+r9],r13
+	mov	QWORD[56+r9],r14
+
+
+
+	add	r8,r15
+
+
+	pxor	xmm11,xmm11
+$L$load_extra_load_loop:
+	pslldq	xmm11,1
+	pinsrb	xmm11,BYTE[rsi],0
+	lea	rsi,[((-1))+rsi]
+	sub	r15,1
+	jnz	NEAR $L$load_extra_load_loop
+
+
+
+
+	mov	r15,rbx
+
+$L$load_extra_shift_loop:
+	pslldq	xmm11,1
+	sub	r15,1
+	jnz	NEAR $L$load_extra_shift_loop
+
+
+
+
+	lea	r15,[$L$and_masks]
+	shl	rbx,4
+	pand	xmm15,XMMWORD[((-16))+rbx*1+r15]
+
+
+	por	xmm15,xmm11
+
+
+
+DB	102,77,15,126,253
+	pextrq	r14,xmm15,1
+	add	r10,r13
+	adc	r11,r14
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+$L$process_blocks_of_extra_in:
+
+	mov	r9,QWORD[((288+32+160 ))+rsp]
+	mov	rsi,QWORD[48+r9]
+	mov	r8,QWORD[56+r9]
+	mov	rcx,r8
+	shr	r8,4
+
+$L$process_extra_hash_loop:
+	jz	NEAR process_extra_in_trailer
+	add	r10,QWORD[((0+0))+rsi]
+	adc	r11,QWORD[((8+0))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rsi,[16+rsi]
+	sub	r8,1
+	jmp	NEAR $L$process_extra_hash_loop
+process_extra_in_trailer:
+	and	rcx,15
+	mov	rbx,rcx
+	jz	NEAR $L$do_length_block
+	lea	rsi,[((-1))+rcx*1+rsi]
+
+$L$process_extra_in_trailer_load:
+	pslldq	xmm15,1
+	pinsrb	xmm15,BYTE[rsi],0
+	lea	rsi,[((-1))+rsi]
+	sub	rcx,1
+	jnz	NEAR $L$process_extra_in_trailer_load
+
+$L$process_partial_block:
+
+	lea	r15,[$L$and_masks]
+	shl	rbx,4
+	pand	xmm15,XMMWORD[((-16))+rbx*1+r15]
+DB	102,77,15,126,253
+	pextrq	r14,xmm15,1
+	add	r10,r13
+	adc	r11,r14
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+$L$do_length_block:
+	add	r10,QWORD[((0+160+32))+rbp]
+	adc	r11,QWORD[((8+160+32))+rbp]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	mov	r13,r10
+	mov	r14,r11
+	mov	r15,r12
+	sub	r10,-5
+	sbb	r11,-1
+	sbb	r12,3
+	cmovc	r10,r13
+	cmovc	r11,r14
+	cmovc	r12,r15
+
+	add	r10,QWORD[((0+160+16))+rbp]
+	adc	r11,QWORD[((8+160+16))+rbp]
+
+	movaps	xmm6,XMMWORD[((0+0))+rbp]
+	movaps	xmm7,XMMWORD[((16+0))+rbp]
+	movaps	xmm8,XMMWORD[((32+0))+rbp]
+	movaps	xmm9,XMMWORD[((48+0))+rbp]
+	movaps	xmm10,XMMWORD[((64+0))+rbp]
+	movaps	xmm11,XMMWORD[((80+0))+rbp]
+	movaps	xmm12,XMMWORD[((96+0))+rbp]
+	movaps	xmm13,XMMWORD[((112+0))+rbp]
+	movaps	xmm14,XMMWORD[((128+0))+rbp]
+	movaps	xmm15,XMMWORD[((144+0))+rbp]
+
+
+	add	rsp,288 + 160 + 32
+
+
+	pop	r9
+
+	mov	QWORD[r9],r10
+	mov	QWORD[8+r9],r11
+	pop	r15
+
+	pop	r14
+
+	pop	r13
+
+	pop	r12
+
+	pop	rbx
+
+	pop	rbp
+
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$seal_sse_128:
+
+	movdqu	xmm0,XMMWORD[$L$chacha20_consts]
+	movdqa	xmm1,xmm0
+	movdqa	xmm2,xmm0
+	movdqu	xmm4,XMMWORD[r9]
+	movdqa	xmm5,xmm4
+	movdqa	xmm6,xmm4
+	movdqu	xmm8,XMMWORD[16+r9]
+	movdqa	xmm9,xmm8
+	movdqa	xmm10,xmm8
+	movdqu	xmm14,XMMWORD[32+r9]
+	movdqa	xmm12,xmm14
+	paddd	xmm12,XMMWORD[$L$sse_inc]
+	movdqa	xmm13,xmm12
+	paddd	xmm13,XMMWORD[$L$sse_inc]
+	movdqa	xmm7,xmm4
+	movdqa	xmm11,xmm8
+	movdqa	xmm15,xmm12
+	mov	r10,10
+
+$L$seal_sse_128_rounds:
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,4
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,12
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,4
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,12
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,4
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,12
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol16]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,12
+	psrld	xmm4,20
+	pxor	xmm4,xmm3
+	paddd	xmm0,xmm4
+	pxor	xmm12,xmm0
+	pshufb	xmm12,XMMWORD[$L$rol8]
+	paddd	xmm8,xmm12
+	pxor	xmm4,xmm8
+	movdqa	xmm3,xmm4
+	pslld	xmm3,7
+	psrld	xmm4,25
+	pxor	xmm4,xmm3
+DB	102,15,58,15,228,12
+DB	102,69,15,58,15,192,8
+DB	102,69,15,58,15,228,4
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol16]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,12
+	psrld	xmm5,20
+	pxor	xmm5,xmm3
+	paddd	xmm1,xmm5
+	pxor	xmm13,xmm1
+	pshufb	xmm13,XMMWORD[$L$rol8]
+	paddd	xmm9,xmm13
+	pxor	xmm5,xmm9
+	movdqa	xmm3,xmm5
+	pslld	xmm3,7
+	psrld	xmm5,25
+	pxor	xmm5,xmm3
+DB	102,15,58,15,237,12
+DB	102,69,15,58,15,201,8
+DB	102,69,15,58,15,237,4
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol16]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,12
+	psrld	xmm6,20
+	pxor	xmm6,xmm3
+	paddd	xmm2,xmm6
+	pxor	xmm14,xmm2
+	pshufb	xmm14,XMMWORD[$L$rol8]
+	paddd	xmm10,xmm14
+	pxor	xmm6,xmm10
+	movdqa	xmm3,xmm6
+	pslld	xmm3,7
+	psrld	xmm6,25
+	pxor	xmm6,xmm3
+DB	102,15,58,15,246,12
+DB	102,69,15,58,15,210,8
+DB	102,69,15,58,15,246,4
+
+	dec	r10
+	jnz	NEAR $L$seal_sse_128_rounds
+	paddd	xmm0,XMMWORD[$L$chacha20_consts]
+	paddd	xmm1,XMMWORD[$L$chacha20_consts]
+	paddd	xmm2,XMMWORD[$L$chacha20_consts]
+	paddd	xmm4,xmm7
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	paddd	xmm8,xmm11
+	paddd	xmm9,xmm11
+	paddd	xmm12,xmm15
+	paddd	xmm15,XMMWORD[$L$sse_inc]
+	paddd	xmm13,xmm15
+
+	pand	xmm2,XMMWORD[$L$clamp]
+	movdqa	XMMWORD[(160+0)+rbp],xmm2
+	movdqa	XMMWORD[(160+16)+rbp],xmm6
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+	jmp	NEAR $L$seal_sse_128_tail_xor
+$L$SEH_end_GFp_chacha20_poly1305_seal:
+
+
+
+
+ALIGN	64
+chacha20_poly1305_open_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+	vzeroupper
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vbroadcasti128	ymm4,XMMWORD[r9]
+	vbroadcasti128	ymm8,XMMWORD[16+r9]
+	vbroadcasti128	ymm12,XMMWORD[32+r9]
+	vpaddd	ymm12,ymm12,YMMWORD[$L$avx2_init]
+	cmp	rbx,6*32
+	jbe	NEAR $L$open_avx2_192
+	cmp	rbx,10*32
+	jbe	NEAR $L$open_avx2_320
+
+	vmovdqa	YMMWORD[(160+64)+rbp],ymm4
+	vmovdqa	YMMWORD[(160+96)+rbp],ymm8
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	mov	r10,10
+$L$open_avx2_init_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+
+	dec	r10
+	jne	NEAR $L$open_avx2_init_rounds
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+
+	mov	r8,r8
+	call	poly_hash_ad_internal
+
+	xor	rcx,rcx
+$L$open_avx2_init_hash:
+	add	r10,QWORD[((0+0))+rcx*1+rsi]
+	adc	r11,QWORD[((8+0))+rcx*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	add	rcx,16
+	cmp	rcx,2*32
+	jne	NEAR $L$open_avx2_init_hash
+
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vpxor	ymm4,ymm4,YMMWORD[32+rsi]
+
+	vmovdqu	YMMWORD[rdi],ymm0
+	vmovdqu	YMMWORD[32+rdi],ymm4
+	lea	rsi,[64+rsi]
+	lea	rdi,[64+rdi]
+	sub	rbx,2*32
+$L$open_avx2_main_loop:
+
+	cmp	rbx,16*32
+	jb	NEAR $L$open_avx2_main_loop_done
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+	xor	rcx,rcx
+$L$open_avx2_main_loop_rounds:
+	add	r10,QWORD[((0+0))+rcx*1+rsi]
+	adc	r11,QWORD[((8+0))+rcx*1+rsi]
+	adc	r12,1
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	add	r15,rax
+	adc	r9,rdx
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	add	r10,QWORD[((0+16))+rcx*1+rsi]
+	adc	r11,QWORD[((8+16))+rcx*1+rsi]
+	adc	r12,1
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	add	r15,rax
+	adc	r9,rdx
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	add	r10,QWORD[((0+32))+rcx*1+rsi]
+	adc	r11,QWORD[((8+32))+rcx*1+rsi]
+	adc	r12,1
+
+	lea	rcx,[48+rcx]
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	add	r15,rax
+	adc	r9,rdx
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpalignr	ymm12,ymm12,ymm12,4
+
+	cmp	rcx,10*6*8
+	jne	NEAR $L$open_avx2_main_loop_rounds
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm0
+	add	r10,QWORD[((0+480))+rsi]
+	adc	r11,QWORD[((8+480))+rsi]
+	adc	r12,1
+	vperm2i128	ymm0,ymm7,ymm3,0x02
+	vperm2i128	ymm7,ymm7,ymm3,0x13
+	vperm2i128	ymm3,ymm15,ymm11,0x02
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vpxor	ymm0,ymm0,YMMWORD[((0+0))+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[((32+0))+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[((64+0))+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm0
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm7
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm11
+
+	vmovdqa	ymm0,YMMWORD[((160+128))+rbp]
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+128))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+128))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm10
+	add	r10,QWORD[((0+480+16))+rsi]
+	adc	r11,QWORD[((8+480+16))+rsi]
+	adc	r12,1
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+256))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+256))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+256))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+256))+rsi]
+	vmovdqu	YMMWORD[(0+256)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+256)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+256)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+256)+rdi],ymm9
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm12,ymm8,0x02
+	vperm2i128	ymm8,ymm12,ymm8,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+384))+rsi]
+	vpxor	ymm0,ymm0,YMMWORD[((32+384))+rsi]
+	vpxor	ymm4,ymm4,YMMWORD[((64+384))+rsi]
+	vpxor	ymm8,ymm8,YMMWORD[((96+384))+rsi]
+	vmovdqu	YMMWORD[(0+384)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+384)+rdi],ymm0
+	vmovdqu	YMMWORD[(64+384)+rdi],ymm4
+	vmovdqu	YMMWORD[(96+384)+rdi],ymm8
+
+	lea	rsi,[512+rsi]
+	lea	rdi,[512+rdi]
+	sub	rbx,16*32
+	jmp	NEAR $L$open_avx2_main_loop
+$L$open_avx2_main_loop_done:
+	test	rbx,rbx
+	vzeroupper
+	je	NEAR $L$open_sse_finalize
+
+	cmp	rbx,12*32
+	ja	NEAR $L$open_avx2_tail_512
+	cmp	rbx,8*32
+	ja	NEAR $L$open_avx2_tail_384
+	cmp	rbx,4*32
+	ja	NEAR $L$open_avx2_tail_256
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+	xor	r8,r8
+	mov	rcx,rbx
+	and	rcx,-16
+	test	rcx,rcx
+	je	NEAR $L$open_avx2_tail_128_rounds
+$L$open_avx2_tail_128_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+r8*1+rsi]
+	adc	r11,QWORD[((8+0))+r8*1+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$open_avx2_tail_128_rounds:
+	add	r8,16
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_avx2_tail_128_rounds_and_x1hash
+	cmp	r8,160
+	jne	NEAR $L$open_avx2_tail_128_rounds
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	jmp	NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_256:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm13,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+
+	mov	QWORD[((160+128))+rbp],rbx
+	mov	rcx,rbx
+	sub	rcx,4*32
+	shr	rcx,4
+	mov	r8,10
+	cmp	rcx,10
+	cmovg	rcx,r8
+	mov	rbx,rsi
+	xor	r8,r8
+$L$open_avx2_tail_256_rounds_and_x1hash:
+	add	r10,QWORD[((0+0))+rbx]
+	adc	r11,QWORD[((8+0))+rbx]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rbx,[16+rbx]
+$L$open_avx2_tail_256_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+
+	inc	r8
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_avx2_tail_256_rounds_and_x1hash
+	cmp	r8,10
+	jne	NEAR $L$open_avx2_tail_256_rounds
+	mov	r8,rbx
+	sub	rbx,rsi
+	mov	rcx,rbx
+	mov	rbx,QWORD[((160+128))+rbp]
+$L$open_avx2_tail_256_hash:
+	add	rcx,16
+	cmp	rcx,rbx
+	jg	NEAR $L$open_avx2_tail_256_done
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+	jmp	NEAR $L$open_avx2_tail_256_hash
+$L$open_avx2_tail_256_done:
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+0))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+0))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+0))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	lea	rsi,[128+rsi]
+	lea	rdi,[128+rdi]
+	sub	rbx,4*32
+	jmp	NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_384:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm14,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+
+	mov	QWORD[((160+128))+rbp],rbx
+	mov	rcx,rbx
+	sub	rcx,8*32
+	shr	rcx,4
+	add	rcx,6
+	mov	r8,10
+	cmp	rcx,10
+	cmovg	rcx,r8
+	mov	rbx,rsi
+	xor	r8,r8
+$L$open_avx2_tail_384_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+rbx]
+	adc	r11,QWORD[((8+0))+rbx]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rbx,[16+rbx]
+$L$open_avx2_tail_384_rounds_and_x1hash:
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	add	r10,QWORD[((0+0))+rbx]
+	adc	r11,QWORD[((8+0))+rbx]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rbx,[16+rbx]
+	inc	r8
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+
+	cmp	r8,rcx
+	jb	NEAR $L$open_avx2_tail_384_rounds_and_x2hash
+	cmp	r8,10
+	jne	NEAR $L$open_avx2_tail_384_rounds_and_x1hash
+	mov	r8,rbx
+	sub	rbx,rsi
+	mov	rcx,rbx
+	mov	rbx,QWORD[((160+128))+rbp]
+$L$open_avx2_384_tail_hash:
+	add	rcx,16
+	cmp	rcx,rbx
+	jg	NEAR $L$open_avx2_384_tail_done
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+	jmp	NEAR $L$open_avx2_384_tail_hash
+$L$open_avx2_384_tail_done:
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+0))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+0))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+0))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+128))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+128))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	lea	rsi,[256+rsi]
+	lea	rdi,[256+rdi]
+	sub	rbx,8*32
+	jmp	NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_512:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+	xor	rcx,rcx
+	mov	r8,rsi
+$L$open_avx2_tail_512_rounds_and_x2hash:
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+$L$open_avx2_tail_512_rounds_and_x1hash:
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	add	r10,QWORD[((0+16))+r8]
+	adc	r11,QWORD[((8+16))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[32+r8]
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,4
+
+	inc	rcx
+	cmp	rcx,4
+	jl	NEAR $L$open_avx2_tail_512_rounds_and_x2hash
+	cmp	rcx,10
+	jne	NEAR $L$open_avx2_tail_512_rounds_and_x1hash
+	mov	rcx,rbx
+	sub	rcx,12*32
+	and	rcx,-16
+$L$open_avx2_tail_512_hash:
+	test	rcx,rcx
+	je	NEAR $L$open_avx2_tail_512_done
+	add	r10,QWORD[((0+0))+r8]
+	adc	r11,QWORD[((8+0))+r8]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	r8,[16+r8]
+	sub	rcx,2*8
+	jmp	NEAR $L$open_avx2_tail_512_hash
+$L$open_avx2_tail_512_done:
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm0
+	vperm2i128	ymm0,ymm7,ymm3,0x02
+	vperm2i128	ymm7,ymm7,ymm3,0x13
+	vperm2i128	ymm3,ymm15,ymm11,0x02
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vpxor	ymm0,ymm0,YMMWORD[((0+0))+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[((32+0))+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[((64+0))+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm0
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm7
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm11
+
+	vmovdqa	ymm0,YMMWORD[((160+128))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+128))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+128))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+256))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+256))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+256))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+256))+rsi]
+	vmovdqu	YMMWORD[(0+256)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+256)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+256)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+256)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	lea	rsi,[384+rsi]
+	lea	rdi,[384+rdi]
+	sub	rbx,12*32
+$L$open_avx2_tail_128_xor:
+	cmp	rbx,32
+	jb	NEAR $L$open_avx2_tail_32_xor
+	sub	rbx,32
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vmovdqu	YMMWORD[rdi],ymm0
+	lea	rsi,[32+rsi]
+	lea	rdi,[32+rdi]
+	vmovdqa	ymm0,ymm4
+	vmovdqa	ymm4,ymm8
+	vmovdqa	ymm8,ymm12
+	jmp	NEAR $L$open_avx2_tail_128_xor
+$L$open_avx2_tail_32_xor:
+	cmp	rbx,16
+	vmovdqa	xmm1,xmm0
+	jb	NEAR $L$open_avx2_exit
+	sub	rbx,16
+
+	vpxor	xmm1,xmm0,XMMWORD[rsi]
+	vmovdqu	XMMWORD[rdi],xmm1
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	vperm2i128	ymm0,ymm0,ymm0,0x11
+	vmovdqa	xmm1,xmm0
+$L$open_avx2_exit:
+	vzeroupper
+	jmp	NEAR $L$open_sse_tail_16
+
+$L$open_avx2_192:
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vpaddd	ymm13,ymm12,YMMWORD[$L$avx2_inc]
+	vmovdqa	ymm11,ymm12
+	vmovdqa	ymm15,ymm13
+	mov	r10,10
+$L$open_avx2_192_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+
+	dec	r10
+	jne	NEAR $L$open_avx2_192_rounds
+	vpaddd	ymm0,ymm0,ymm2
+	vpaddd	ymm1,ymm1,ymm2
+	vpaddd	ymm4,ymm4,ymm6
+	vpaddd	ymm5,ymm5,ymm6
+	vpaddd	ymm8,ymm8,ymm10
+	vpaddd	ymm9,ymm9,ymm10
+	vpaddd	ymm12,ymm12,ymm11
+	vpaddd	ymm13,ymm13,ymm15
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+	vperm2i128	ymm8,ymm5,ymm1,0x02
+	vperm2i128	ymm12,ymm13,ymm9,0x02
+	vperm2i128	ymm1,ymm5,ymm1,0x13
+	vperm2i128	ymm5,ymm13,ymm9,0x13
+$L$open_avx2_short:
+	mov	r8,r8
+	call	poly_hash_ad_internal
+$L$open_avx2_short_hash_and_xor_loop:
+	cmp	rbx,32
+	jb	NEAR $L$open_avx2_short_tail_32
+	sub	rbx,32
+	add	r10,QWORD[((0+0))+rsi]
+	adc	r11,QWORD[((8+0))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	add	r10,QWORD[((0+16))+rsi]
+	adc	r11,QWORD[((8+16))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vmovdqu	YMMWORD[rdi],ymm0
+	lea	rsi,[32+rsi]
+	lea	rdi,[32+rdi]
+
+	vmovdqa	ymm0,ymm4
+	vmovdqa	ymm4,ymm8
+	vmovdqa	ymm8,ymm12
+	vmovdqa	ymm12,ymm1
+	vmovdqa	ymm1,ymm5
+	vmovdqa	ymm5,ymm9
+	vmovdqa	ymm9,ymm13
+	vmovdqa	ymm13,ymm2
+	vmovdqa	ymm2,ymm6
+	jmp	NEAR $L$open_avx2_short_hash_and_xor_loop
+$L$open_avx2_short_tail_32:
+	cmp	rbx,16
+	vmovdqa	xmm1,xmm0
+	jb	NEAR $L$open_avx2_short_tail_32_exit
+	sub	rbx,16
+	add	r10,QWORD[((0+0))+rsi]
+	adc	r11,QWORD[((8+0))+rsi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	vpxor	xmm3,xmm0,XMMWORD[rsi]
+	vmovdqu	XMMWORD[rdi],xmm3
+	lea	rsi,[16+rsi]
+	lea	rdi,[16+rdi]
+	vextracti128	xmm1,ymm0,1
+$L$open_avx2_short_tail_32_exit:
+	vzeroupper
+	jmp	NEAR $L$open_sse_tail_16
+
+$L$open_avx2_320:
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vpaddd	ymm13,ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm14,ymm13,YMMWORD[$L$avx2_inc]
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	mov	r10,10
+$L$open_avx2_320_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+
+	dec	r10
+	jne	NEAR $L$open_avx2_320_rounds
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,ymm7
+	vpaddd	ymm5,ymm5,ymm7
+	vpaddd	ymm6,ymm6,ymm7
+	vpaddd	ymm8,ymm8,ymm11
+	vpaddd	ymm9,ymm9,ymm11
+	vpaddd	ymm10,ymm10,ymm11
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+	vperm2i128	ymm8,ymm5,ymm1,0x02
+	vperm2i128	ymm12,ymm13,ymm9,0x02
+	vperm2i128	ymm1,ymm5,ymm1,0x13
+	vperm2i128	ymm5,ymm13,ymm9,0x13
+	vperm2i128	ymm9,ymm6,ymm2,0x02
+	vperm2i128	ymm13,ymm14,ymm10,0x02
+	vperm2i128	ymm2,ymm6,ymm2,0x13
+	vperm2i128	ymm6,ymm14,ymm10,0x13
+	jmp	NEAR $L$open_avx2_short
+
+
+
+
+
+ALIGN	64
+chacha20_poly1305_seal_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+	vzeroupper
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vbroadcasti128	ymm4,XMMWORD[r9]
+	vbroadcasti128	ymm8,XMMWORD[16+r9]
+	vbroadcasti128	ymm12,XMMWORD[32+r9]
+	vpaddd	ymm12,ymm12,YMMWORD[$L$avx2_init]
+	cmp	rbx,6*32
+	jbe	NEAR $L$seal_avx2_192
+	cmp	rbx,10*32
+	jbe	NEAR $L$seal_avx2_320
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm7,ymm4
+	vmovdqa	YMMWORD[(160+64)+rbp],ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm11,ymm8
+	vmovdqa	YMMWORD[(160+96)+rbp],ymm8
+	vmovdqa	ymm15,ymm12
+	vpaddd	ymm14,ymm15,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm13,ymm14,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm12,ymm13,YMMWORD[$L$avx2_inc]
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	mov	r10,10
+$L$seal_avx2_init_rounds:
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,4
+
+	dec	r10
+	jnz	NEAR $L$seal_avx2_init_rounds
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vperm2i128	ymm15,ymm7,ymm3,0x02
+	vperm2i128	ymm3,ymm7,ymm3,0x13
+	vpand	ymm15,ymm15,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm15
+	mov	r8,r8
+	call	poly_hash_ad_internal
+
+	vpxor	ymm3,ymm3,YMMWORD[rsi]
+	vpxor	ymm11,ymm11,YMMWORD[32+rsi]
+	vmovdqu	YMMWORD[rdi],ymm3
+	vmovdqu	YMMWORD[32+rdi],ymm11
+	vperm2i128	ymm15,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm15,ymm15,YMMWORD[((0+64))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+64))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+64))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+64))+rsi]
+	vmovdqu	YMMWORD[(0+64)+rdi],ymm15
+	vmovdqu	YMMWORD[(32+64)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+64)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+64)+rdi],ymm10
+	vperm2i128	ymm15,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm15,ymm15,YMMWORD[((0+192))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+192))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+192))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+192))+rsi]
+	vmovdqu	YMMWORD[(0+192)+rdi],ymm15
+	vmovdqu	YMMWORD[(32+192)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+192)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+192)+rdi],ymm9
+	vperm2i128	ymm15,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm15
+
+	lea	rsi,[320+rsi]
+	sub	rbx,10*32
+	mov	rcx,10*32
+	cmp	rbx,4*32
+	jbe	NEAR $L$seal_avx2_short_hash_remainder
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vpxor	ymm4,ymm4,YMMWORD[32+rsi]
+	vpxor	ymm8,ymm8,YMMWORD[64+rsi]
+	vpxor	ymm12,ymm12,YMMWORD[96+rsi]
+	vmovdqu	YMMWORD[320+rdi],ymm0
+	vmovdqu	YMMWORD[352+rdi],ymm4
+	vmovdqu	YMMWORD[384+rdi],ymm8
+	vmovdqu	YMMWORD[416+rdi],ymm12
+	lea	rsi,[128+rsi]
+	sub	rbx,4*32
+	mov	rcx,8
+	mov	r8,2
+	cmp	rbx,4*32
+	jbe	NEAR $L$seal_avx2_tail_128
+	cmp	rbx,8*32
+	jbe	NEAR $L$seal_avx2_tail_256
+	cmp	rbx,12*32
+	jbe	NEAR $L$seal_avx2_tail_384
+	cmp	rbx,16*32
+	jbe	NEAR $L$seal_avx2_tail_512
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,4
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+
+	sub	rdi,16
+	mov	rcx,9
+	jmp	NEAR $L$seal_avx2_main_loop_rounds_entry
+ALIGN	32
+$L$seal_avx2_main_loop:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+	mov	rcx,10
+ALIGN	32
+$L$seal_avx2_main_loop_rounds:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	add	r15,rax
+	adc	r9,rdx
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+$L$seal_avx2_main_loop_rounds_entry:
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	add	r15,rax
+	adc	r9,rdx
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	add	r10,QWORD[((0+32))+rdi]
+	adc	r11,QWORD[((8+32))+rdi]
+	adc	r12,1
+
+	lea	rdi,[48+rdi]
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	add	r15,rax
+	adc	r9,rdx
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpalignr	ymm12,ymm12,ymm12,4
+
+	dec	rcx
+	jne	NEAR $L$seal_avx2_main_loop_rounds
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm0
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	vperm2i128	ymm0,ymm7,ymm3,0x02
+	vperm2i128	ymm7,ymm7,ymm3,0x13
+	vperm2i128	ymm3,ymm15,ymm11,0x02
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vpxor	ymm0,ymm0,YMMWORD[((0+0))+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[((32+0))+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[((64+0))+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm0
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm7
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm11
+
+	vmovdqa	ymm0,YMMWORD[((160+128))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+128))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+128))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+256))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+256))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+256))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+256))+rsi]
+	vmovdqu	YMMWORD[(0+256)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+256)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+256)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+256)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm12,ymm8,0x02
+	vperm2i128	ymm8,ymm12,ymm8,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+384))+rsi]
+	vpxor	ymm0,ymm0,YMMWORD[((32+384))+rsi]
+	vpxor	ymm4,ymm4,YMMWORD[((64+384))+rsi]
+	vpxor	ymm8,ymm8,YMMWORD[((96+384))+rsi]
+	vmovdqu	YMMWORD[(0+384)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+384)+rdi],ymm0
+	vmovdqu	YMMWORD[(64+384)+rdi],ymm4
+	vmovdqu	YMMWORD[(96+384)+rdi],ymm8
+
+	lea	rsi,[512+rsi]
+	sub	rbx,16*32
+	cmp	rbx,16*32
+	jg	NEAR $L$seal_avx2_main_loop
+
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	mov	rcx,10
+	xor	r8,r8
+
+	cmp	rbx,12*32
+	ja	NEAR $L$seal_avx2_tail_512
+	cmp	rbx,8*32
+	ja	NEAR $L$seal_avx2_tail_384
+	cmp	rbx,4*32
+	ja	NEAR $L$seal_avx2_tail_256
+
+$L$seal_avx2_tail_128:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+$L$seal_avx2_tail_128_rounds_and_3xhash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_avx2_tail_128_rounds_and_2xhash:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_avx2_tail_128_rounds_and_3xhash
+	dec	r8
+	jge	NEAR $L$seal_avx2_tail_128_rounds_and_2xhash
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	jmp	NEAR $L$seal_avx2_short_loop
+
+$L$seal_avx2_tail_256:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm13,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+
+$L$seal_avx2_tail_256_rounds_and_3xhash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_avx2_tail_256_rounds_and_2xhash:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_avx2_tail_256_rounds_and_3xhash
+	dec	r8
+	jge	NEAR $L$seal_avx2_tail_256_rounds_and_2xhash
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+0))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+0))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+0))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	mov	rcx,4*32
+	lea	rsi,[128+rsi]
+	sub	rbx,4*32
+	jmp	NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_tail_384:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm14,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+
+$L$seal_avx2_tail_384_rounds_and_3xhash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_avx2_tail_384_rounds_and_2xhash:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+
+	lea	rdi,[32+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_avx2_tail_384_rounds_and_3xhash
+	dec	r8
+	jge	NEAR $L$seal_avx2_tail_384_rounds_and_2xhash
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+0))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+0))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+0))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+128))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+128))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	mov	rcx,8*32
+	lea	rsi,[256+rsi]
+	sub	rbx,8*32
+	jmp	NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_tail_512:
+	vmovdqa	ymm0,YMMWORD[$L$chacha20_consts]
+	vmovdqa	ymm4,YMMWORD[((160+64))+rbp]
+	vmovdqa	ymm8,YMMWORD[((160+96))+rbp]
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm10,ymm8
+	vmovdqa	ymm3,ymm0
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm15,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm14,ymm12,ymm15
+	vpaddd	ymm13,ymm12,ymm14
+	vpaddd	ymm12,ymm12,ymm13
+	vmovdqa	YMMWORD[(160+256)+rbp],ymm15
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+
+$L$seal_avx2_tail_512_rounds_and_3xhash:
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	add	r15,rax
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+$L$seal_avx2_tail_512_rounds_and_2xhash:
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,4
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,12
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm4,ymm4,ymm4,4
+	add	r15,rax
+	adc	r9,rdx
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,12
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol16]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,20
+	vpslld	ymm7,ymm7,32-20
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,20
+	vpslld	ymm6,ymm6,32-20
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,20
+	vpslld	ymm5,ymm5,32-20
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,20
+	vpslld	ymm4,ymm4,32-20
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[$L$rol8]
+	vpaddd	ymm3,ymm3,ymm7
+	vpaddd	ymm2,ymm2,ymm6
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	vpaddd	ymm1,ymm1,ymm5
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm15,ymm15,ymm3
+	vpxor	ymm14,ymm14,ymm2
+	vpxor	ymm13,ymm13,ymm1
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm15,ymm15,ymm8
+	vpshufb	ymm14,ymm14,ymm8
+	vpshufb	ymm13,ymm13,ymm8
+	vpshufb	ymm12,ymm12,ymm8
+	vpaddd	ymm11,ymm11,ymm15
+	vpaddd	ymm10,ymm10,ymm14
+	vpaddd	ymm9,ymm9,ymm13
+	vpaddd	ymm8,ymm12,YMMWORD[((160+128))+rbp]
+	vpxor	ymm7,ymm7,ymm11
+	vpxor	ymm6,ymm6,ymm10
+	vpxor	ymm5,ymm5,ymm9
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm8
+	vpsrld	ymm8,ymm7,25
+	mov	rdx,QWORD[((0+160+0))+rbp]
+	mov	r15,rdx
+	mulx	r14,r13,r10
+	mulx	rdx,rax,r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	vpslld	ymm7,ymm7,32-25
+	vpxor	ymm7,ymm7,ymm8
+	vpsrld	ymm8,ymm6,25
+	vpslld	ymm6,ymm6,32-25
+	vpxor	ymm6,ymm6,ymm8
+	vpsrld	ymm8,ymm5,25
+	vpslld	ymm5,ymm5,32-25
+	vpxor	ymm5,ymm5,ymm8
+	vpsrld	ymm8,ymm4,25
+	vpslld	ymm4,ymm4,32-25
+	vpxor	ymm4,ymm4,ymm8
+	vmovdqa	ymm8,YMMWORD[((160+128))+rbp]
+	vpalignr	ymm7,ymm7,ymm7,12
+	vpalignr	ymm11,ymm11,ymm11,8
+	vpalignr	ymm15,ymm15,ymm15,4
+	vpalignr	ymm6,ymm6,ymm6,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	mov	rdx,QWORD[((8+160+0))+rbp]
+	mulx	rax,r10,r10
+	add	r14,r10
+	mulx	r9,r11,r11
+	adc	r15,r11
+	adc	r9,0
+	imul	rdx,r12
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm12,ymm12,ymm12,4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	add	r15,rax
+	adc	r9,rdx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+	dec	rcx
+	jg	NEAR $L$seal_avx2_tail_512_rounds_and_3xhash
+	dec	r8
+	jge	NEAR $L$seal_avx2_tail_512_rounds_and_2xhash
+	vpaddd	ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm7,ymm7,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm11,ymm11,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm15,ymm15,YMMWORD[((160+256))+rbp]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm6,ymm6,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm10,ymm10,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm5,ymm5,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm9,ymm9,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,YMMWORD[((160+64))+rbp]
+	vpaddd	ymm8,ymm8,YMMWORD[((160+96))+rbp]
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+	vmovdqa	YMMWORD[(160+128)+rbp],ymm0
+	vperm2i128	ymm0,ymm7,ymm3,0x02
+	vperm2i128	ymm7,ymm7,ymm3,0x13
+	vperm2i128	ymm3,ymm15,ymm11,0x02
+	vperm2i128	ymm11,ymm15,ymm11,0x13
+	vpxor	ymm0,ymm0,YMMWORD[((0+0))+rsi]
+	vpxor	ymm3,ymm3,YMMWORD[((32+0))+rsi]
+	vpxor	ymm7,ymm7,YMMWORD[((64+0))+rsi]
+	vpxor	ymm11,ymm11,YMMWORD[((96+0))+rsi]
+	vmovdqu	YMMWORD[(0+0)+rdi],ymm0
+	vmovdqu	YMMWORD[(32+0)+rdi],ymm3
+	vmovdqu	YMMWORD[(64+0)+rdi],ymm7
+	vmovdqu	YMMWORD[(96+0)+rdi],ymm11
+
+	vmovdqa	ymm0,YMMWORD[((160+128))+rbp]
+	vperm2i128	ymm3,ymm6,ymm2,0x02
+	vperm2i128	ymm6,ymm6,ymm2,0x13
+	vperm2i128	ymm2,ymm14,ymm10,0x02
+	vperm2i128	ymm10,ymm14,ymm10,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+128))+rsi]
+	vpxor	ymm2,ymm2,YMMWORD[((32+128))+rsi]
+	vpxor	ymm6,ymm6,YMMWORD[((64+128))+rsi]
+	vpxor	ymm10,ymm10,YMMWORD[((96+128))+rsi]
+	vmovdqu	YMMWORD[(0+128)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+128)+rdi],ymm2
+	vmovdqu	YMMWORD[(64+128)+rdi],ymm6
+	vmovdqu	YMMWORD[(96+128)+rdi],ymm10
+	vperm2i128	ymm3,ymm5,ymm1,0x02
+	vperm2i128	ymm5,ymm5,ymm1,0x13
+	vperm2i128	ymm1,ymm13,ymm9,0x02
+	vperm2i128	ymm9,ymm13,ymm9,0x13
+	vpxor	ymm3,ymm3,YMMWORD[((0+256))+rsi]
+	vpxor	ymm1,ymm1,YMMWORD[((32+256))+rsi]
+	vpxor	ymm5,ymm5,YMMWORD[((64+256))+rsi]
+	vpxor	ymm9,ymm9,YMMWORD[((96+256))+rsi]
+	vmovdqu	YMMWORD[(0+256)+rdi],ymm3
+	vmovdqu	YMMWORD[(32+256)+rdi],ymm1
+	vmovdqu	YMMWORD[(64+256)+rdi],ymm5
+	vmovdqu	YMMWORD[(96+256)+rdi],ymm9
+	vperm2i128	ymm3,ymm4,ymm0,0x13
+	vperm2i128	ymm0,ymm4,ymm0,0x02
+	vperm2i128	ymm4,ymm12,ymm8,0x02
+	vperm2i128	ymm12,ymm12,ymm8,0x13
+	vmovdqa	ymm8,ymm3
+
+	mov	rcx,12*32
+	lea	rsi,[384+rsi]
+	sub	rbx,12*32
+	jmp	NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_320:
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vpaddd	ymm13,ymm12,YMMWORD[$L$avx2_inc]
+	vpaddd	ymm14,ymm13,YMMWORD[$L$avx2_inc]
+	vmovdqa	ymm7,ymm4
+	vmovdqa	ymm11,ymm8
+	vmovdqa	YMMWORD[(160+160)+rbp],ymm12
+	vmovdqa	YMMWORD[(160+192)+rbp],ymm13
+	vmovdqa	YMMWORD[(160+224)+rbp],ymm14
+	mov	r10,10
+$L$seal_avx2_320_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,12
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol16]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpsrld	ymm3,ymm6,20
+	vpslld	ymm6,ymm6,12
+	vpxor	ymm6,ymm6,ymm3
+	vpaddd	ymm2,ymm2,ymm6
+	vpxor	ymm14,ymm14,ymm2
+	vpshufb	ymm14,ymm14,YMMWORD[$L$rol8]
+	vpaddd	ymm10,ymm10,ymm14
+	vpxor	ymm6,ymm6,ymm10
+	vpslld	ymm3,ymm6,7
+	vpsrld	ymm6,ymm6,25
+	vpxor	ymm6,ymm6,ymm3
+	vpalignr	ymm14,ymm14,ymm14,4
+	vpalignr	ymm10,ymm10,ymm10,8
+	vpalignr	ymm6,ymm6,ymm6,12
+
+	dec	r10
+	jne	NEAR $L$seal_avx2_320_rounds
+	vpaddd	ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+	vpaddd	ymm4,ymm4,ymm7
+	vpaddd	ymm5,ymm5,ymm7
+	vpaddd	ymm6,ymm6,ymm7
+	vpaddd	ymm8,ymm8,ymm11
+	vpaddd	ymm9,ymm9,ymm11
+	vpaddd	ymm10,ymm10,ymm11
+	vpaddd	ymm12,ymm12,YMMWORD[((160+160))+rbp]
+	vpaddd	ymm13,ymm13,YMMWORD[((160+192))+rbp]
+	vpaddd	ymm14,ymm14,YMMWORD[((160+224))+rbp]
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+	vperm2i128	ymm8,ymm5,ymm1,0x02
+	vperm2i128	ymm12,ymm13,ymm9,0x02
+	vperm2i128	ymm1,ymm5,ymm1,0x13
+	vperm2i128	ymm5,ymm13,ymm9,0x13
+	vperm2i128	ymm9,ymm6,ymm2,0x02
+	vperm2i128	ymm13,ymm14,ymm10,0x02
+	vperm2i128	ymm2,ymm6,ymm2,0x13
+	vperm2i128	ymm6,ymm14,ymm10,0x13
+	jmp	NEAR $L$seal_avx2_short
+
+$L$seal_avx2_192:
+	vmovdqa	ymm1,ymm0
+	vmovdqa	ymm2,ymm0
+	vmovdqa	ymm5,ymm4
+	vmovdqa	ymm6,ymm4
+	vmovdqa	ymm9,ymm8
+	vmovdqa	ymm10,ymm8
+	vpaddd	ymm13,ymm12,YMMWORD[$L$avx2_inc]
+	vmovdqa	ymm11,ymm12
+	vmovdqa	ymm15,ymm13
+	mov	r10,10
+$L$seal_avx2_192_rounds:
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,12
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,4
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,12
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,4
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol16]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpsrld	ymm3,ymm4,20
+	vpslld	ymm4,ymm4,12
+	vpxor	ymm4,ymm4,ymm3
+	vpaddd	ymm0,ymm0,ymm4
+	vpxor	ymm12,ymm12,ymm0
+	vpshufb	ymm12,ymm12,YMMWORD[$L$rol8]
+	vpaddd	ymm8,ymm8,ymm12
+	vpxor	ymm4,ymm4,ymm8
+	vpslld	ymm3,ymm4,7
+	vpsrld	ymm4,ymm4,25
+	vpxor	ymm4,ymm4,ymm3
+	vpalignr	ymm12,ymm12,ymm12,4
+	vpalignr	ymm8,ymm8,ymm8,8
+	vpalignr	ymm4,ymm4,ymm4,12
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol16]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpsrld	ymm3,ymm5,20
+	vpslld	ymm5,ymm5,12
+	vpxor	ymm5,ymm5,ymm3
+	vpaddd	ymm1,ymm1,ymm5
+	vpxor	ymm13,ymm13,ymm1
+	vpshufb	ymm13,ymm13,YMMWORD[$L$rol8]
+	vpaddd	ymm9,ymm9,ymm13
+	vpxor	ymm5,ymm5,ymm9
+	vpslld	ymm3,ymm5,7
+	vpsrld	ymm5,ymm5,25
+	vpxor	ymm5,ymm5,ymm3
+	vpalignr	ymm13,ymm13,ymm13,4
+	vpalignr	ymm9,ymm9,ymm9,8
+	vpalignr	ymm5,ymm5,ymm5,12
+
+	dec	r10
+	jne	NEAR $L$seal_avx2_192_rounds
+	vpaddd	ymm0,ymm0,ymm2
+	vpaddd	ymm1,ymm1,ymm2
+	vpaddd	ymm4,ymm4,ymm6
+	vpaddd	ymm5,ymm5,ymm6
+	vpaddd	ymm8,ymm8,ymm10
+	vpaddd	ymm9,ymm9,ymm10
+	vpaddd	ymm12,ymm12,ymm11
+	vpaddd	ymm13,ymm13,ymm15
+	vperm2i128	ymm3,ymm4,ymm0,0x02
+
+	vpand	ymm3,ymm3,YMMWORD[$L$clamp]
+	vmovdqa	YMMWORD[(160+0)+rbp],ymm3
+
+	vperm2i128	ymm0,ymm4,ymm0,0x13
+	vperm2i128	ymm4,ymm12,ymm8,0x13
+	vperm2i128	ymm8,ymm5,ymm1,0x02
+	vperm2i128	ymm12,ymm13,ymm9,0x02
+	vperm2i128	ymm1,ymm5,ymm1,0x13
+	vperm2i128	ymm5,ymm13,ymm9,0x13
+$L$seal_avx2_short:
+	mov	r8,r8
+	call	poly_hash_ad_internal
+	xor	rcx,rcx
+$L$seal_avx2_short_hash_remainder:
+	cmp	rcx,16
+	jb	NEAR $L$seal_avx2_short_loop
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	sub	rcx,16
+	add	rdi,16
+	jmp	NEAR $L$seal_avx2_short_hash_remainder
+$L$seal_avx2_short_loop:
+	cmp	rbx,32
+	jb	NEAR $L$seal_avx2_short_tail
+	sub	rbx,32
+
+	vpxor	ymm0,ymm0,YMMWORD[rsi]
+	vmovdqu	YMMWORD[rdi],ymm0
+	lea	rsi,[32+rsi]
+
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+	add	r10,QWORD[((0+16))+rdi]
+	adc	r11,QWORD[((8+16))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[32+rdi]
+
+	vmovdqa	ymm0,ymm4
+	vmovdqa	ymm4,ymm8
+	vmovdqa	ymm8,ymm12
+	vmovdqa	ymm12,ymm1
+	vmovdqa	ymm1,ymm5
+	vmovdqa	ymm5,ymm9
+	vmovdqa	ymm9,ymm13
+	vmovdqa	ymm13,ymm2
+	vmovdqa	ymm2,ymm6
+	jmp	NEAR $L$seal_avx2_short_loop
+$L$seal_avx2_short_tail:
+	cmp	rbx,16
+	jb	NEAR $L$seal_avx2_exit
+	sub	rbx,16
+	vpxor	xmm3,xmm0,XMMWORD[rsi]
+	vmovdqu	XMMWORD[rdi],xmm3
+	lea	rsi,[16+rsi]
+	add	r10,QWORD[((0+0))+rdi]
+	adc	r11,QWORD[((8+0))+rdi]
+	adc	r12,1
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mov	r15,rax
+	mul	r10
+	mov	r13,rax
+	mov	r14,rdx
+	mov	rax,QWORD[((0+160+0))+rbp]
+	mul	r11
+	imul	r15,r12
+	add	r14,rax
+	adc	r15,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mov	r9,rax
+	mul	r10
+	add	r14,rax
+	adc	rdx,0
+	mov	r10,rdx
+	mov	rax,QWORD[((8+160+0))+rbp]
+	mul	r11
+	add	r15,rax
+	adc	rdx,0
+	imul	r9,r12
+	add	r15,r10
+	adc	r9,rdx
+	mov	r10,r13
+	mov	r11,r14
+	mov	r12,r15
+	and	r12,3
+	mov	r13,r15
+	and	r13,-4
+	mov	r14,r9
+	shrd	r15,r9,2
+	shr	r9,2
+	add	r15,r13
+	adc	r9,r14
+	add	r10,r15
+	adc	r11,r9
+	adc	r12,0
+
+	lea	rdi,[16+rdi]
+	vextracti128	xmm0,ymm0,1
+$L$seal_avx2_exit:
+	vzeroupper
+	jmp	NEAR $L$seal_sse_tail_16
+
+

+ 1105 - 0
zeroidc/vendor/ring/pregenerated/tmp/ecp_nistz256-x86-win32n.asm

@@ -0,0 +1,1105 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
[email protected] equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+;extern	_GFp_ia32cap_P
+L$ONE_mont:
+dd	1,0,0,-1,-1,-1,-2,0
+align	16
+__ecp_nistz256_div_by_2:
+	mov	ebp,DWORD [esi]
+	xor	edx,edx
+	mov	ebx,DWORD [4+esi]
+	mov	eax,ebp
+	and	ebp,1
+	mov	ecx,DWORD [8+esi]
+	sub	edx,ebp
+	add	eax,edx
+	adc	ebx,edx
+	mov	DWORD [edi],eax
+	adc	ecx,edx
+	mov	DWORD [4+edi],ebx
+	mov	DWORD [8+edi],ecx
+	mov	eax,DWORD [12+esi]
+	mov	ebx,DWORD [16+esi]
+	adc	eax,0
+	mov	ecx,DWORD [20+esi]
+	adc	ebx,0
+	mov	DWORD [12+edi],eax
+	adc	ecx,0
+	mov	DWORD [16+edi],ebx
+	mov	DWORD [20+edi],ecx
+	mov	eax,DWORD [24+esi]
+	mov	ebx,DWORD [28+esi]
+	adc	eax,ebp
+	adc	ebx,edx
+	mov	DWORD [24+edi],eax
+	sbb	esi,esi
+	mov	DWORD [28+edi],ebx
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	mov	edx,DWORD [12+edi]
+	shr	eax,1
+	mov	ebp,ebx
+	shl	ebx,31
+	or	eax,ebx
+	shr	ebp,1
+	mov	ebx,ecx
+	shl	ecx,31
+	mov	DWORD [edi],eax
+	or	ebp,ecx
+	mov	eax,DWORD [16+edi]
+	shr	ebx,1
+	mov	ecx,edx
+	shl	edx,31
+	mov	DWORD [4+edi],ebp
+	or	ebx,edx
+	mov	ebp,DWORD [20+edi]
+	shr	ecx,1
+	mov	edx,eax
+	shl	eax,31
+	mov	DWORD [8+edi],ebx
+	or	ecx,eax
+	mov	ebx,DWORD [24+edi]
+	shr	edx,1
+	mov	eax,ebp
+	shl	ebp,31
+	mov	DWORD [12+edi],ecx
+	or	edx,ebp
+	mov	ecx,DWORD [28+edi]
+	shr	eax,1
+	mov	ebp,ebx
+	shl	ebx,31
+	mov	DWORD [16+edi],edx
+	or	eax,ebx
+	shr	ebp,1
+	mov	ebx,ecx
+	shl	ecx,31
+	mov	DWORD [20+edi],eax
+	or	ebp,ecx
+	shr	ebx,1
+	shl	esi,31
+	mov	DWORD [24+edi],ebp
+	or	ebx,esi
+	mov	DWORD [28+edi],ebx
+	ret
+global	_GFp_nistz256_add
+align	16
+_GFp_nistz256_add:
+L$_GFp_nistz256_add_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [24+esp]
+	mov	ebp,DWORD [28+esp]
+	mov	edi,DWORD [20+esp]
+	call	__ecp_nistz256_add
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	16
+__ecp_nistz256_add:
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	add	eax,DWORD [ebp]
+	mov	edx,DWORD [12+esi]
+	adc	ebx,DWORD [4+ebp]
+	mov	DWORD [edi],eax
+	adc	ecx,DWORD [8+ebp]
+	mov	DWORD [4+edi],ebx
+	adc	edx,DWORD [12+ebp]
+	mov	DWORD [8+edi],ecx
+	mov	DWORD [12+edi],edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	adc	eax,DWORD [16+ebp]
+	mov	edx,DWORD [28+esi]
+	adc	ebx,DWORD [20+ebp]
+	mov	DWORD [16+edi],eax
+	adc	ecx,DWORD [24+ebp]
+	mov	DWORD [20+edi],ebx
+	mov	esi,0
+	adc	edx,DWORD [28+ebp]
+	mov	DWORD [24+edi],ecx
+	adc	esi,0
+	mov	DWORD [28+edi],edx
+	mov	eax,DWORD [edi]
+	mov	ebx,DWORD [4+edi]
+	mov	ecx,DWORD [8+edi]
+	sub	eax,-1
+	mov	edx,DWORD [12+edi]
+	sbb	ebx,-1
+	mov	eax,DWORD [16+edi]
+	sbb	ecx,-1
+	mov	ebx,DWORD [20+edi]
+	sbb	edx,0
+	mov	ecx,DWORD [24+edi]
+	sbb	eax,0
+	mov	edx,DWORD [28+edi]
+	sbb	ebx,0
+	sbb	ecx,1
+	sbb	edx,-1
+	sbb	esi,0
+	not	esi
+	mov	eax,DWORD [edi]
+	mov	ebp,esi
+	mov	ebx,DWORD [4+edi]
+	shr	ebp,31
+	mov	ecx,DWORD [8+edi]
+	sub	eax,esi
+	mov	edx,DWORD [12+edi]
+	sbb	ebx,esi
+	mov	DWORD [edi],eax
+	sbb	ecx,esi
+	mov	DWORD [4+edi],ebx
+	sbb	edx,0
+	mov	DWORD [8+edi],ecx
+	mov	DWORD [12+edi],edx
+	mov	eax,DWORD [16+edi]
+	mov	ebx,DWORD [20+edi]
+	mov	ecx,DWORD [24+edi]
+	sbb	eax,0
+	mov	edx,DWORD [28+edi]
+	sbb	ebx,0
+	mov	DWORD [16+edi],eax
+	sbb	ecx,ebp
+	mov	DWORD [20+edi],ebx
+	sbb	edx,esi
+	mov	DWORD [24+edi],ecx
+	mov	DWORD [28+edi],edx
+	ret
+align	16
+__ecp_nistz256_sub:
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	sub	eax,DWORD [ebp]
+	mov	edx,DWORD [12+esi]
+	sbb	ebx,DWORD [4+ebp]
+	mov	DWORD [edi],eax
+	sbb	ecx,DWORD [8+ebp]
+	mov	DWORD [4+edi],ebx
+	sbb	edx,DWORD [12+ebp]
+	mov	DWORD [8+edi],ecx
+	mov	DWORD [12+edi],edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	sbb	eax,DWORD [16+ebp]
+	mov	edx,DWORD [28+esi]
+	sbb	ebx,DWORD [20+ebp]
+	sbb	ecx,DWORD [24+ebp]
+	mov	DWORD [16+edi],eax
+	sbb	edx,DWORD [28+ebp]
+	mov	DWORD [20+edi],ebx
+	sbb	esi,esi
+	mov	DWORD [24+edi],ecx
+	mov	DWORD [28+edi],edx
+	mov	eax,DWORD [edi]
+	mov	ebp,esi
+	mov	ebx,DWORD [4+edi]
+	shr	ebp,31
+	mov	ecx,DWORD [8+edi]
+	add	eax,esi
+	mov	edx,DWORD [12+edi]
+	adc	ebx,esi
+	mov	DWORD [edi],eax
+	adc	ecx,esi
+	mov	DWORD [4+edi],ebx
+	adc	edx,0
+	mov	DWORD [8+edi],ecx
+	mov	DWORD [12+edi],edx
+	mov	eax,DWORD [16+edi]
+	mov	ebx,DWORD [20+edi]
+	mov	ecx,DWORD [24+edi]
+	adc	eax,0
+	mov	edx,DWORD [28+edi]
+	adc	ebx,0
+	mov	DWORD [16+edi],eax
+	adc	ecx,ebp
+	mov	DWORD [20+edi],ebx
+	adc	edx,esi
+	mov	DWORD [24+edi],ecx
+	mov	DWORD [28+edi],edx
+	ret
+global	_GFp_nistz256_neg
+align	16
+_GFp_nistz256_neg:
+L$_GFp_nistz256_neg_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	ebp,DWORD [24+esp]
+	mov	edi,DWORD [20+esp]
+	xor	eax,eax
+	sub	esp,32
+	mov	DWORD [esp],eax
+	mov	esi,esp
+	mov	DWORD [4+esp],eax
+	mov	DWORD [8+esp],eax
+	mov	DWORD [12+esp],eax
+	mov	DWORD [16+esp],eax
+	mov	DWORD [20+esp],eax
+	mov	DWORD [24+esp],eax
+	mov	DWORD [28+esp],eax
+	call	__ecp_nistz256_sub
+	add	esp,32
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	16
+__picup_eax:
+	mov	eax,DWORD [esp]
+	ret
+global	_GFp_nistz256_mul_mont
+align	16
+_GFp_nistz256_mul_mont:
+L$_GFp_nistz256_mul_mont_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [24+esp]
+	mov	ebp,DWORD [28+esp]
+	call	__picup_eax
+L$000pic:
+	lea	eax,[_GFp_ia32cap_P]
+	mov	eax,DWORD [eax]
+	mov	edi,DWORD [20+esp]
+	call	__ecp_nistz256_mul_mont
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	16
+__ecp_nistz256_mul_mont:
+	mov	edx,esp
+	sub	esp,256
+	movd	xmm7,DWORD [ebp]
+	lea	ebp,[4+ebp]
+	pcmpeqd	xmm6,xmm6
+	psrlq	xmm6,48
+	pshuflw	xmm7,xmm7,220
+	and	esp,-64
+	pshufd	xmm7,xmm7,220
+	lea	ebx,[128+esp]
+	movd	xmm0,DWORD [esi]
+	pshufd	xmm0,xmm0,204
+	movd	xmm1,DWORD [4+esi]
+	movdqa	[ebx],xmm0
+	pmuludq	xmm0,xmm7
+	movd	xmm2,DWORD [8+esi]
+	pshufd	xmm1,xmm1,204
+	movdqa	[16+ebx],xmm1
+	pmuludq	xmm1,xmm7
+	movq	xmm4,xmm0
+	pslldq	xmm4,6
+	paddq	xmm4,xmm0
+	movdqa	xmm5,xmm4
+	psrldq	xmm4,10
+	pand	xmm5,xmm6
+	movd	xmm3,DWORD [12+esi]
+	pshufd	xmm2,xmm2,204
+	movdqa	[32+ebx],xmm2
+	pmuludq	xmm2,xmm7
+	paddq	xmm1,xmm4
+	movdqa	[esp],xmm1
+	movd	xmm0,DWORD [16+esi]
+	pshufd	xmm3,xmm3,204
+	movdqa	[48+ebx],xmm3
+	pmuludq	xmm3,xmm7
+	movdqa	[16+esp],xmm2
+	movd	xmm1,DWORD [20+esi]
+	pshufd	xmm0,xmm0,204
+	movdqa	[64+ebx],xmm0
+	pmuludq	xmm0,xmm7
+	paddq	xmm3,xmm5
+	movdqa	[32+esp],xmm3
+	movd	xmm2,DWORD [24+esi]
+	pshufd	xmm1,xmm1,204
+	movdqa	[80+ebx],xmm1
+	pmuludq	xmm1,xmm7
+	movdqa	[48+esp],xmm0
+	pshufd	xmm4,xmm5,177
+	movd	xmm3,DWORD [28+esi]
+	pshufd	xmm2,xmm2,204
+	movdqa	[96+ebx],xmm2
+	pmuludq	xmm2,xmm7
+	movdqa	[64+esp],xmm1
+	psubq	xmm4,xmm5
+	movd	xmm0,DWORD [ebp]
+	pshufd	xmm3,xmm3,204
+	movdqa	[112+ebx],xmm3
+	pmuludq	xmm3,xmm7
+	pshuflw	xmm7,xmm0,220
+	movdqa	xmm0,[ebx]
+	pshufd	xmm7,xmm7,220
+	mov	ecx,6
+	lea	ebp,[4+ebp]
+	jmp	NEAR L$001madd_sse2
+align	16
+L$001madd_sse2:
+	paddq	xmm2,xmm5
+	paddq	xmm3,xmm4
+	movdqa	xmm1,[16+ebx]
+	pmuludq	xmm0,xmm7
+	movdqa	[80+esp],xmm2
+	movdqa	xmm2,[32+ebx]
+	pmuludq	xmm1,xmm7
+	movdqa	[96+esp],xmm3
+	paddq	xmm0,[esp]
+	movdqa	xmm3,[48+ebx]
+	pmuludq	xmm2,xmm7
+	movq	xmm4,xmm0
+	pslldq	xmm4,6
+	paddq	xmm1,[16+esp]
+	paddq	xmm4,xmm0
+	movdqa	xmm5,xmm4
+	psrldq	xmm4,10
+	movdqa	xmm0,[64+ebx]
+	pmuludq	xmm3,xmm7
+	paddq	xmm1,xmm4
+	paddq	xmm2,[32+esp]
+	movdqa	[esp],xmm1
+	movdqa	xmm1,[80+ebx]
+	pmuludq	xmm0,xmm7
+	paddq	xmm3,[48+esp]
+	movdqa	[16+esp],xmm2
+	pand	xmm5,xmm6
+	movdqa	xmm2,[96+ebx]
+	pmuludq	xmm1,xmm7
+	paddq	xmm3,xmm5
+	paddq	xmm0,[64+esp]
+	movdqa	[32+esp],xmm3
+	pshufd	xmm4,xmm5,177
+	movdqa	xmm3,xmm7
+	pmuludq	xmm2,xmm7
+	movd	xmm7,DWORD [ebp]
+	lea	ebp,[4+ebp]
+	paddq	xmm1,[80+esp]
+	psubq	xmm4,xmm5
+	movdqa	[48+esp],xmm0
+	pshuflw	xmm7,xmm7,220
+	pmuludq	xmm3,[112+ebx]
+	pshufd	xmm7,xmm7,220
+	movdqa	xmm0,[ebx]
+	movdqa	[64+esp],xmm1
+	paddq	xmm2,[96+esp]
+	dec	ecx
+	jnz	NEAR L$001madd_sse2
+	paddq	xmm2,xmm5
+	paddq	xmm3,xmm4
+	movdqa	xmm1,[16+ebx]
+	pmuludq	xmm0,xmm7
+	movdqa	[80+esp],xmm2
+	movdqa	xmm2,[32+ebx]
+	pmuludq	xmm1,xmm7
+	movdqa	[96+esp],xmm3
+	paddq	xmm0,[esp]
+	movdqa	xmm3,[48+ebx]
+	pmuludq	xmm2,xmm7
+	movq	xmm4,xmm0
+	pslldq	xmm4,6
+	paddq	xmm1,[16+esp]
+	paddq	xmm4,xmm0
+	movdqa	xmm5,xmm4
+	psrldq	xmm4,10
+	movdqa	xmm0,[64+ebx]
+	pmuludq	xmm3,xmm7
+	paddq	xmm1,xmm4
+	paddq	xmm2,[32+esp]
+	movdqa	[esp],xmm1
+	movdqa	xmm1,[80+ebx]
+	pmuludq	xmm0,xmm7
+	paddq	xmm3,[48+esp]
+	movdqa	[16+esp],xmm2
+	pand	xmm5,xmm6
+	movdqa	xmm2,[96+ebx]
+	pmuludq	xmm1,xmm7
+	paddq	xmm3,xmm5
+	paddq	xmm0,[64+esp]
+	movdqa	[32+esp],xmm3
+	pshufd	xmm4,xmm5,177
+	movdqa	xmm3,[112+ebx]
+	pmuludq	xmm2,xmm7
+	paddq	xmm1,[80+esp]
+	psubq	xmm4,xmm5
+	movdqa	[48+esp],xmm0
+	pmuludq	xmm3,xmm7
+	pcmpeqd	xmm7,xmm7
+	movdqa	xmm0,[esp]
+	pslldq	xmm7,8
+	movdqa	[64+esp],xmm1
+	paddq	xmm2,[96+esp]
+	paddq	xmm2,xmm5
+	paddq	xmm3,xmm4
+	movdqa	[80+esp],xmm2
+	movdqa	[96+esp],xmm3
+	movdqa	xmm1,[16+esp]
+	movdqa	xmm2,[32+esp]
+	movdqa	xmm3,[48+esp]
+	movq	xmm4,xmm0
+	pand	xmm0,xmm7
+	xor	ebp,ebp
+	pslldq	xmm4,6
+	movq	xmm5,xmm1
+	paddq	xmm0,xmm4
+	pand	xmm1,xmm7
+	psrldq	xmm0,6
+	movd	eax,xmm0
+	psrldq	xmm0,4
+	paddq	xmm5,xmm0
+	movdqa	xmm0,[64+esp]
+	sub	eax,-1
+	pslldq	xmm5,6
+	movq	xmm4,xmm2
+	paddq	xmm1,xmm5
+	pand	xmm2,xmm7
+	psrldq	xmm1,6
+	mov	DWORD [edi],eax
+	movd	eax,xmm1
+	psrldq	xmm1,4
+	paddq	xmm4,xmm1
+	movdqa	xmm1,[80+esp]
+	sbb	eax,-1
+	pslldq	xmm4,6
+	movq	xmm5,xmm3
+	paddq	xmm2,xmm4
+	pand	xmm3,xmm7
+	psrldq	xmm2,6
+	mov	DWORD [4+edi],eax
+	movd	eax,xmm2
+	psrldq	xmm2,4
+	paddq	xmm5,xmm2
+	movdqa	xmm2,[96+esp]
+	sbb	eax,-1
+	pslldq	xmm5,6
+	movq	xmm4,xmm0
+	paddq	xmm3,xmm5
+	pand	xmm0,xmm7
+	psrldq	xmm3,6
+	mov	DWORD [8+edi],eax
+	movd	eax,xmm3
+	psrldq	xmm3,4
+	paddq	xmm4,xmm3
+	sbb	eax,0
+	pslldq	xmm4,6
+	movq	xmm5,xmm1
+	paddq	xmm0,xmm4
+	pand	xmm1,xmm7
+	psrldq	xmm0,6
+	mov	DWORD [12+edi],eax
+	movd	eax,xmm0
+	psrldq	xmm0,4
+	paddq	xmm5,xmm0
+	sbb	eax,0
+	pslldq	xmm5,6
+	movq	xmm4,xmm2
+	paddq	xmm1,xmm5
+	pand	xmm2,xmm7
+	psrldq	xmm1,6
+	movd	ebx,xmm1
+	psrldq	xmm1,4
+	mov	esp,edx
+	paddq	xmm4,xmm1
+	pslldq	xmm4,6
+	paddq	xmm2,xmm4
+	psrldq	xmm2,6
+	movd	ecx,xmm2
+	psrldq	xmm2,4
+	sbb	ebx,0
+	movd	edx,xmm2
+	pextrw	esi,xmm2,2
+	sbb	ecx,1
+	sbb	edx,-1
+	sbb	esi,0
+	sub	ebp,esi
+	add	DWORD [edi],esi
+	adc	DWORD [4+edi],esi
+	adc	DWORD [8+edi],esi
+	adc	DWORD [12+edi],0
+	adc	eax,0
+	adc	ebx,0
+	mov	DWORD [16+edi],eax
+	adc	ecx,ebp
+	mov	DWORD [20+edi],ebx
+	adc	edx,esi
+	mov	DWORD [24+edi],ecx
+	mov	DWORD [28+edi],edx
+	ret
+global	_GFp_nistz256_point_double
+align	16
+_GFp_nistz256_point_double:
+L$_GFp_nistz256_point_double_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [24+esp]
+	sub	esp,164
+	call	__picup_eax
+L$002pic:
+	lea	edx,[_GFp_ia32cap_P]
+	mov	ebp,DWORD [edx]
+L$point_double_shortcut:
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [12+esi]
+	mov	DWORD [96+esp],eax
+	mov	DWORD [100+esp],ebx
+	mov	DWORD [104+esp],ecx
+	mov	DWORD [108+esp],edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [28+esi]
+	mov	DWORD [112+esp],eax
+	mov	DWORD [116+esp],ebx
+	mov	DWORD [120+esp],ecx
+	mov	DWORD [124+esp],edx
+	mov	DWORD [160+esp],ebp
+	lea	ebp,[32+esi]
+	lea	esi,[32+esi]
+	lea	edi,[esp]
+	call	__ecp_nistz256_add
+	mov	eax,DWORD [160+esp]
+	mov	esi,64
+	add	esi,DWORD [188+esp]
+	lea	edi,[64+esp]
+	mov	ebp,esi
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [160+esp]
+	lea	esi,[esp]
+	lea	ebp,[esp]
+	lea	edi,[esp]
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [160+esp]
+	mov	ebp,DWORD [188+esp]
+	lea	esi,[32+ebp]
+	lea	ebp,[64+ebp]
+	lea	edi,[128+esp]
+	call	__ecp_nistz256_mul_mont
+	lea	esi,[96+esp]
+	lea	ebp,[64+esp]
+	lea	edi,[32+esp]
+	call	__ecp_nistz256_add
+	mov	edi,64
+	lea	esi,[128+esp]
+	lea	ebp,[128+esp]
+	add	edi,DWORD [184+esp]
+	call	__ecp_nistz256_add
+	lea	esi,[96+esp]
+	lea	ebp,[64+esp]
+	lea	edi,[64+esp]
+	call	__ecp_nistz256_sub
+	mov	eax,DWORD [160+esp]
+	lea	esi,[esp]
+	lea	ebp,[esp]
+	lea	edi,[128+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [160+esp]
+	lea	esi,[32+esp]
+	lea	ebp,[64+esp]
+	lea	edi,[32+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	edi,32
+	lea	esi,[128+esp]
+	add	edi,DWORD [184+esp]
+	call	__ecp_nistz256_div_by_2
+	lea	esi,[32+esp]
+	lea	ebp,[32+esp]
+	lea	edi,[128+esp]
+	call	__ecp_nistz256_add
+	mov	eax,DWORD [160+esp]
+	lea	esi,[96+esp]
+	lea	ebp,[esp]
+	lea	edi,[esp]
+	call	__ecp_nistz256_mul_mont
+	lea	esi,[128+esp]
+	lea	ebp,[32+esp]
+	lea	edi,[32+esp]
+	call	__ecp_nistz256_add
+	lea	esi,[esp]
+	lea	ebp,[esp]
+	lea	edi,[128+esp]
+	call	__ecp_nistz256_add
+	mov	eax,DWORD [160+esp]
+	lea	esi,[32+esp]
+	lea	ebp,[32+esp]
+	mov	edi,DWORD [184+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	esi,edi
+	lea	ebp,[128+esp]
+	call	__ecp_nistz256_sub
+	lea	esi,[esp]
+	mov	ebp,edi
+	lea	edi,[esp]
+	call	__ecp_nistz256_sub
+	mov	eax,DWORD [160+esp]
+	mov	esi,edi
+	lea	ebp,[32+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	ebp,32
+	lea	esi,[esp]
+	add	ebp,DWORD [184+esp]
+	mov	edi,ebp
+	call	__ecp_nistz256_sub
+	add	esp,164
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_GFp_nistz256_point_add_affine
+align	16
+_GFp_nistz256_point_add_affine:
+L$_GFp_nistz256_point_add_affine_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [24+esp]
+	sub	esp,492
+	call	__picup_eax
+L$003pic:
+	lea	edx,[_GFp_ia32cap_P]
+	mov	ebp,DWORD [edx]
+	lea	edi,[96+esp]
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [12+esi]
+	mov	DWORD [edi],eax
+	mov	DWORD [488+esp],ebp
+	mov	DWORD [4+edi],ebx
+	mov	DWORD [8+edi],ecx
+	mov	DWORD [12+edi],edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [28+esi]
+	mov	DWORD [16+edi],eax
+	mov	DWORD [20+edi],ebx
+	mov	DWORD [24+edi],ecx
+	mov	DWORD [28+edi],edx
+	mov	eax,DWORD [32+esi]
+	mov	ebx,DWORD [36+esi]
+	mov	ecx,DWORD [40+esi]
+	mov	edx,DWORD [44+esi]
+	mov	DWORD [32+edi],eax
+	mov	DWORD [36+edi],ebx
+	mov	DWORD [40+edi],ecx
+	mov	DWORD [44+edi],edx
+	mov	eax,DWORD [48+esi]
+	mov	ebx,DWORD [52+esi]
+	mov	ecx,DWORD [56+esi]
+	mov	edx,DWORD [60+esi]
+	mov	DWORD [48+edi],eax
+	mov	DWORD [52+edi],ebx
+	mov	DWORD [56+edi],ecx
+	mov	DWORD [60+edi],edx
+	mov	eax,DWORD [64+esi]
+	mov	ebx,DWORD [68+esi]
+	mov	ecx,DWORD [72+esi]
+	mov	edx,DWORD [76+esi]
+	mov	DWORD [64+edi],eax
+	mov	ebp,eax
+	mov	DWORD [68+edi],ebx
+	or	ebp,ebx
+	mov	DWORD [72+edi],ecx
+	or	ebp,ecx
+	mov	DWORD [76+edi],edx
+	or	ebp,edx
+	mov	eax,DWORD [80+esi]
+	mov	ebx,DWORD [84+esi]
+	mov	ecx,DWORD [88+esi]
+	mov	edx,DWORD [92+esi]
+	mov	DWORD [80+edi],eax
+	or	ebp,eax
+	mov	DWORD [84+edi],ebx
+	or	ebp,ebx
+	mov	DWORD [88+edi],ecx
+	or	ebp,ecx
+	mov	DWORD [92+edi],edx
+	or	ebp,edx
+	xor	eax,eax
+	mov	esi,DWORD [520+esp]
+	sub	eax,ebp
+	or	ebp,eax
+	sar	ebp,31
+	mov	DWORD [480+esp],ebp
+	lea	edi,[192+esp]
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	edx,DWORD [12+esi]
+	mov	DWORD [edi],eax
+	mov	ebp,eax
+	mov	DWORD [4+edi],ebx
+	or	ebp,ebx
+	mov	DWORD [8+edi],ecx
+	or	ebp,ecx
+	mov	DWORD [12+edi],edx
+	or	ebp,edx
+	mov	eax,DWORD [16+esi]
+	mov	ebx,DWORD [20+esi]
+	mov	ecx,DWORD [24+esi]
+	mov	edx,DWORD [28+esi]
+	mov	DWORD [16+edi],eax
+	or	ebp,eax
+	mov	DWORD [20+edi],ebx
+	or	ebp,ebx
+	mov	DWORD [24+edi],ecx
+	or	ebp,ecx
+	mov	DWORD [28+edi],edx
+	or	ebp,edx
+	mov	eax,DWORD [32+esi]
+	mov	ebx,DWORD [36+esi]
+	mov	ecx,DWORD [40+esi]
+	mov	edx,DWORD [44+esi]
+	mov	DWORD [32+edi],eax
+	or	ebp,eax
+	mov	DWORD [36+edi],ebx
+	or	ebp,ebx
+	mov	DWORD [40+edi],ecx
+	or	ebp,ecx
+	mov	DWORD [44+edi],edx
+	or	ebp,edx
+	mov	eax,DWORD [48+esi]
+	mov	ebx,DWORD [52+esi]
+	mov	ecx,DWORD [56+esi]
+	mov	edx,DWORD [60+esi]
+	mov	DWORD [48+edi],eax
+	or	ebp,eax
+	mov	DWORD [52+edi],ebx
+	or	ebp,ebx
+	mov	DWORD [56+edi],ecx
+	or	ebp,ecx
+	mov	DWORD [60+edi],edx
+	or	ebp,edx
+	xor	ebx,ebx
+	mov	eax,DWORD [488+esp]
+	sub	ebx,ebp
+	lea	esi,[160+esp]
+	or	ebx,ebp
+	lea	ebp,[160+esp]
+	sar	ebx,31
+	lea	edi,[288+esp]
+	mov	DWORD [484+esp],ebx
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [488+esp]
+	lea	esi,[192+esp]
+	mov	ebp,edi
+	lea	edi,[256+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [488+esp]
+	lea	esi,[160+esp]
+	lea	ebp,[288+esp]
+	lea	edi,[288+esp]
+	call	__ecp_nistz256_mul_mont
+	lea	esi,[256+esp]
+	lea	ebp,[96+esp]
+	lea	edi,[320+esp]
+	call	__ecp_nistz256_sub
+	mov	eax,DWORD [488+esp]
+	lea	esi,[224+esp]
+	lea	ebp,[288+esp]
+	lea	edi,[288+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [488+esp]
+	lea	esi,[160+esp]
+	lea	ebp,[320+esp]
+	lea	edi,[64+esp]
+	call	__ecp_nistz256_mul_mont
+	lea	esi,[288+esp]
+	lea	ebp,[128+esp]
+	lea	edi,[352+esp]
+	call	__ecp_nistz256_sub
+	mov	eax,DWORD [488+esp]
+	lea	esi,[320+esp]
+	lea	ebp,[320+esp]
+	lea	edi,[384+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [488+esp]
+	lea	esi,[352+esp]
+	lea	ebp,[352+esp]
+	lea	edi,[448+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [488+esp]
+	lea	esi,[96+esp]
+	lea	ebp,[384+esp]
+	lea	edi,[256+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [488+esp]
+	lea	esi,[320+esp]
+	lea	ebp,[384+esp]
+	lea	edi,[416+esp]
+	call	__ecp_nistz256_mul_mont
+	lea	esi,[256+esp]
+	lea	ebp,[256+esp]
+	lea	edi,[384+esp]
+	call	__ecp_nistz256_add
+	lea	esi,[448+esp]
+	lea	ebp,[384+esp]
+	lea	edi,[esp]
+	call	__ecp_nistz256_sub
+	lea	esi,[esp]
+	lea	ebp,[416+esp]
+	lea	edi,[esp]
+	call	__ecp_nistz256_sub
+	lea	esi,[256+esp]
+	lea	ebp,[esp]
+	lea	edi,[32+esp]
+	call	__ecp_nistz256_sub
+	mov	eax,DWORD [488+esp]
+	lea	esi,[416+esp]
+	lea	ebp,[128+esp]
+	lea	edi,[288+esp]
+	call	__ecp_nistz256_mul_mont
+	mov	eax,DWORD [488+esp]
+	lea	esi,[352+esp]
+	lea	ebp,[32+esp]
+	lea	edi,[32+esp]
+	call	__ecp_nistz256_mul_mont
+	lea	esi,[32+esp]
+	lea	ebp,[288+esp]
+	lea	edi,[32+esp]
+	call	__ecp_nistz256_sub
+	mov	ebp,DWORD [480+esp]
+	mov	esi,DWORD [484+esp]
+	mov	edi,DWORD [512+esp]
+	mov	edx,ebp
+	not	ebp
+	and	edx,esi
+	and	ebp,esi
+	not	esi
+	mov	eax,edx
+	and	eax,DWORD [64+esp]
+	mov	ebx,ebp
+	and	ebx,1
+	mov	ecx,esi
+	and	ecx,DWORD [160+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [64+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [68+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [164+esp]
+	or	eax,ecx
+	mov	DWORD [68+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [72+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [168+esp]
+	or	eax,ecx
+	mov	DWORD [72+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [76+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [172+esp]
+	or	eax,ebp
+	or	eax,ecx
+	mov	DWORD [76+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [80+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [176+esp]
+	or	eax,ebp
+	or	eax,ecx
+	mov	DWORD [80+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [84+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [180+esp]
+	or	eax,ebp
+	or	eax,ecx
+	mov	DWORD [84+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [88+esp]
+	mov	ebx,ebp
+	and	ebx,-2
+	mov	ecx,esi
+	and	ecx,DWORD [184+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [88+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [92+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [188+esp]
+	or	eax,ecx
+	mov	DWORD [92+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [192+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [96+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [edi],eax
+	mov	eax,edx
+	and	eax,DWORD [4+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [196+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [100+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [4+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [8+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [200+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [104+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [8+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [12+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [204+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [108+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [12+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [16+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [208+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [112+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [16+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [20+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [212+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [116+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [20+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [24+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [216+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [120+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [24+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [28+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [220+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [124+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [28+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [32+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [224+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [128+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [32+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [36+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [228+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [132+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [36+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [40+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [232+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [136+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [40+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [44+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [236+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [140+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [44+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [48+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [240+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [144+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [48+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [52+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [244+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [148+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [52+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [56+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [248+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [152+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [56+edi],eax
+	mov	eax,edx
+	and	eax,DWORD [60+esp]
+	mov	ebx,ebp
+	and	ebx,DWORD [252+esp]
+	mov	ecx,esi
+	and	ecx,DWORD [156+esp]
+	or	eax,ebx
+	or	eax,ecx
+	mov	DWORD [60+edi],eax
+	add	esp,492
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+segment	.bss
+common	_GFp_ia32cap_P 16

+ 359 - 0
zeroidc/vendor/ring/pregenerated/tmp/ghash-x86-win32n.asm

@@ -0,0 +1,359 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
[email protected] equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+global	_GFp_gcm_init_clmul
+align	16
+_GFp_gcm_init_clmul:
+L$_GFp_gcm_init_clmul_begin:
+	mov	edx,DWORD [4+esp]
+	mov	eax,DWORD [8+esp]
+	call	L$000pic
+L$000pic:
+	pop	ecx
+	lea	ecx,[(L$bswap-L$000pic)+ecx]
+	movdqu	xmm2,[eax]
+	pshufd	xmm2,xmm2,78
+	pshufd	xmm4,xmm2,255
+	movdqa	xmm3,xmm2
+	psllq	xmm2,1
+	pxor	xmm5,xmm5
+	psrlq	xmm3,63
+	pcmpgtd	xmm5,xmm4
+	pslldq	xmm3,8
+	por	xmm2,xmm3
+	pand	xmm5,[16+ecx]
+	pxor	xmm2,xmm5
+	movdqa	xmm0,xmm2
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pshufd	xmm4,xmm2,78
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm2
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,220,0
+	xorps	xmm3,xmm0
+	xorps	xmm3,xmm1
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm2,78
+	pshufd	xmm4,xmm0,78
+	pxor	xmm3,xmm2
+	movdqu	[edx],xmm2
+	pxor	xmm4,xmm0
+	movdqu	[16+edx],xmm0
+db	102,15,58,15,227,8
+	movdqu	[32+edx],xmm4
+	ret
+global	_GFp_gcm_gmult_clmul
+align	16
+_GFp_gcm_gmult_clmul:
+L$_GFp_gcm_gmult_clmul_begin:
+	mov	eax,DWORD [4+esp]
+	mov	edx,DWORD [8+esp]
+	call	L$001pic
+L$001pic:
+	pop	ecx
+	lea	ecx,[(L$bswap-L$001pic)+ecx]
+	movdqu	xmm0,[eax]
+	movdqa	xmm5,[ecx]
+	movups	xmm2,[edx]
+db	102,15,56,0,197
+	movups	xmm4,[32+edx]
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,220,0
+	xorps	xmm3,xmm0
+	xorps	xmm3,xmm1
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+db	102,15,56,0,197
+	movdqu	[eax],xmm0
+	ret
+global	_GFp_gcm_ghash_clmul
+align	16
+_GFp_gcm_ghash_clmul:
+L$_GFp_gcm_ghash_clmul_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	eax,DWORD [20+esp]
+	mov	edx,DWORD [24+esp]
+	mov	esi,DWORD [28+esp]
+	mov	ebx,DWORD [32+esp]
+	call	L$002pic
+L$002pic:
+	pop	ecx
+	lea	ecx,[(L$bswap-L$002pic)+ecx]
+	movdqu	xmm0,[eax]
+	movdqa	xmm5,[ecx]
+	movdqu	xmm2,[edx]
+db	102,15,56,0,197
+	sub	ebx,16
+	jz	NEAR L$003odd_tail
+	movdqu	xmm3,[esi]
+	movdqu	xmm6,[16+esi]
+db	102,15,56,0,221
+db	102,15,56,0,245
+	movdqu	xmm5,[32+edx]
+	pxor	xmm0,xmm3
+	pshufd	xmm3,xmm6,78
+	movdqa	xmm7,xmm6
+	pxor	xmm3,xmm6
+	lea	esi,[32+esi]
+db	102,15,58,68,242,0
+db	102,15,58,68,250,17
+db	102,15,58,68,221,0
+	movups	xmm2,[16+edx]
+	nop
+	sub	ebx,32
+	jbe	NEAR L$004even_tail
+	jmp	NEAR L$005mod_loop
+align	32
+L$005mod_loop:
+	pshufd	xmm4,xmm0,78
+	movdqa	xmm1,xmm0
+	pxor	xmm4,xmm0
+	nop
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,229,16
+	movups	xmm2,[edx]
+	xorps	xmm0,xmm6
+	movdqa	xmm5,[ecx]
+	xorps	xmm1,xmm7
+	movdqu	xmm7,[esi]
+	pxor	xmm3,xmm0
+	movdqu	xmm6,[16+esi]
+	pxor	xmm3,xmm1
+db	102,15,56,0,253
+	pxor	xmm4,xmm3
+	movdqa	xmm3,xmm4
+	psrldq	xmm4,8
+	pslldq	xmm3,8
+	pxor	xmm1,xmm4
+	pxor	xmm0,xmm3
+db	102,15,56,0,245
+	pxor	xmm1,xmm7
+	movdqa	xmm7,xmm6
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+db	102,15,58,68,242,0
+	movups	xmm5,[32+edx]
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	pshufd	xmm3,xmm7,78
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm3,xmm7
+	pxor	xmm1,xmm4
+db	102,15,58,68,250,17
+	movups	xmm2,[16+edx]
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+db	102,15,58,68,221,0
+	lea	esi,[32+esi]
+	sub	ebx,32
+	ja	NEAR L$005mod_loop
+L$004even_tail:
+	pshufd	xmm4,xmm0,78
+	movdqa	xmm1,xmm0
+	pxor	xmm4,xmm0
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,229,16
+	movdqa	xmm5,[ecx]
+	xorps	xmm0,xmm6
+	xorps	xmm1,xmm7
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+	pxor	xmm4,xmm3
+	movdqa	xmm3,xmm4
+	psrldq	xmm4,8
+	pslldq	xmm3,8
+	pxor	xmm1,xmm4
+	pxor	xmm0,xmm3
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	test	ebx,ebx
+	jnz	NEAR L$006done
+	movups	xmm2,[edx]
+L$003odd_tail:
+	movdqu	xmm3,[esi]
+db	102,15,56,0,221
+	pxor	xmm0,xmm3
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pshufd	xmm4,xmm2,78
+	pxor	xmm3,xmm0
+	pxor	xmm4,xmm2
+db	102,15,58,68,194,0
+db	102,15,58,68,202,17
+db	102,15,58,68,220,0
+	xorps	xmm3,xmm0
+	xorps	xmm3,xmm1
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+L$006done:
+db	102,15,56,0,197
+	movdqu	[eax],xmm0
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+align	64
+L$bswap:
+db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+db	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+align	64
+L$007rem_8bit:
+dw	0,450,900,582,1800,1738,1164,1358
+dw	3600,4050,3476,3158,2328,2266,2716,2910
+dw	7200,7650,8100,7782,6952,6890,6316,6510
+dw	4656,5106,4532,4214,5432,5370,5820,6014
+dw	14400,14722,15300,14854,16200,16010,15564,15630
+dw	13904,14226,13780,13334,12632,12442,13020,13086
+dw	9312,9634,10212,9766,9064,8874,8428,8494
+dw	10864,11186,10740,10294,11640,11450,12028,12094
+dw	28800,28994,29444,29382,30600,30282,29708,30158
+dw	32400,32594,32020,31958,31128,30810,31260,31710
+dw	27808,28002,28452,28390,27560,27242,26668,27118
+dw	25264,25458,24884,24822,26040,25722,26172,26622
+dw	18624,18690,19268,19078,20424,19978,19532,19854
+dw	18128,18194,17748,17558,16856,16410,16988,17310
+dw	21728,21794,22372,22182,21480,21034,20588,20910
+dw	23280,23346,22900,22710,24056,23610,24188,24510
+dw	57600,57538,57988,58182,58888,59338,58764,58446
+dw	61200,61138,60564,60758,59416,59866,60316,59998
+dw	64800,64738,65188,65382,64040,64490,63916,63598
+dw	62256,62194,61620,61814,62520,62970,63420,63102
+dw	55616,55426,56004,56070,56904,57226,56780,56334
+dw	55120,54930,54484,54550,53336,53658,54236,53790
+dw	50528,50338,50916,50982,49768,50090,49644,49198
+dw	52080,51890,51444,51510,52344,52666,53244,52798
+dw	37248,36930,37380,37830,38536,38730,38156,38094
+dw	40848,40530,39956,40406,39064,39258,39708,39646
+dw	36256,35938,36388,36838,35496,35690,35116,35054
+dw	33712,33394,32820,33270,33976,34170,34620,34558
+dw	43456,43010,43588,43910,44744,44810,44364,44174
+dw	42960,42514,42068,42390,41176,41242,41820,41630
+dw	46560,46114,46692,47014,45800,45866,45420,45230
+dw	48112,47666,47220,47542,48376,48442,49020,48830
+db	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+db	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+db	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+db	0

+ 1209 - 0
zeroidc/vendor/ring/pregenerated/tmp/ghash-x86_64-nasm.asm

@@ -0,0 +1,1209 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+EXTERN	GFp_ia32cap_P
+global	GFp_gcm_init_clmul
+
+ALIGN	16
+GFp_gcm_init_clmul:
+
+$L$_init_clmul:
+$L$SEH_begin_GFp_gcm_init_clmul:
+
+DB	0x48,0x83,0xec,0x18
+DB	0x0f,0x29,0x34,0x24
+	movdqu	xmm2,XMMWORD[rdx]
+	pshufd	xmm2,xmm2,78
+
+
+	pshufd	xmm4,xmm2,255
+	movdqa	xmm3,xmm2
+	psllq	xmm2,1
+	pxor	xmm5,xmm5
+	psrlq	xmm3,63
+	pcmpgtd	xmm5,xmm4
+	pslldq	xmm3,8
+	por	xmm2,xmm3
+
+
+	pand	xmm5,XMMWORD[$L$0x1c2_polynomial]
+	pxor	xmm2,xmm5
+
+
+	pshufd	xmm6,xmm2,78
+	movdqa	xmm0,xmm2
+	pxor	xmm6,xmm2
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,222,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm2,78
+	pshufd	xmm4,xmm0,78
+	pxor	xmm3,xmm2
+	movdqu	XMMWORD[rcx],xmm2
+	pxor	xmm4,xmm0
+	movdqu	XMMWORD[16+rcx],xmm0
+DB	102,15,58,15,227,8
+	movdqu	XMMWORD[32+rcx],xmm4
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,222,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	movdqa	xmm5,xmm0
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,222,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	pshufd	xmm3,xmm5,78
+	pshufd	xmm4,xmm0,78
+	pxor	xmm3,xmm5
+	movdqu	XMMWORD[48+rcx],xmm5
+	pxor	xmm4,xmm0
+	movdqu	XMMWORD[64+rcx],xmm0
+DB	102,15,58,15,227,8
+	movdqu	XMMWORD[80+rcx],xmm4
+	movaps	xmm6,XMMWORD[rsp]
+	lea	rsp,[24+rsp]
+$L$SEH_end_GFp_gcm_init_clmul:
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_gcm_gmult_clmul
+
+ALIGN	16
+GFp_gcm_gmult_clmul:
+
+$L$_gmult_clmul:
+	movdqu	xmm0,XMMWORD[rcx]
+	movdqa	xmm5,XMMWORD[$L$bswap_mask]
+	movdqu	xmm2,XMMWORD[rdx]
+	movdqu	xmm4,XMMWORD[32+rdx]
+DB	102,15,56,0,197
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,220,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+DB	102,15,56,0,197
+	movdqu	XMMWORD[rcx],xmm0
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_gcm_ghash_clmul
+
+ALIGN	32
+GFp_gcm_ghash_clmul:
+
+$L$_ghash_clmul:
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_GFp_gcm_ghash_clmul:
+
+DB	0x48,0x8d,0x60,0xe0
+DB	0x0f,0x29,0x70,0xe0
+DB	0x0f,0x29,0x78,0xf0
+DB	0x44,0x0f,0x29,0x00
+DB	0x44,0x0f,0x29,0x48,0x10
+DB	0x44,0x0f,0x29,0x50,0x20
+DB	0x44,0x0f,0x29,0x58,0x30
+DB	0x44,0x0f,0x29,0x60,0x40
+DB	0x44,0x0f,0x29,0x68,0x50
+DB	0x44,0x0f,0x29,0x70,0x60
+DB	0x44,0x0f,0x29,0x78,0x70
+	movdqa	xmm10,XMMWORD[$L$bswap_mask]
+
+	movdqu	xmm0,XMMWORD[rcx]
+	movdqu	xmm2,XMMWORD[rdx]
+	movdqu	xmm7,XMMWORD[32+rdx]
+DB	102,65,15,56,0,194
+
+	sub	r9,0x10
+	jz	NEAR $L$odd_tail
+
+	movdqu	xmm6,XMMWORD[16+rdx]
+	lea	rax,[GFp_ia32cap_P]
+	mov	eax,DWORD[4+rax]
+	cmp	r9,0x30
+	jb	NEAR $L$skip4x
+
+	and	eax,71303168
+	cmp	eax,4194304
+	je	NEAR $L$skip4x
+
+	sub	r9,0x30
+	mov	rax,0xA040608020C0E000
+	movdqu	xmm14,XMMWORD[48+rdx]
+	movdqu	xmm15,XMMWORD[64+rdx]
+
+
+
+
+	movdqu	xmm3,XMMWORD[48+r8]
+	movdqu	xmm11,XMMWORD[32+r8]
+DB	102,65,15,56,0,218
+DB	102,69,15,56,0,218
+	movdqa	xmm5,xmm3
+	pshufd	xmm4,xmm3,78
+	pxor	xmm4,xmm3
+DB	102,15,58,68,218,0
+DB	102,15,58,68,234,17
+DB	102,15,58,68,231,0
+
+	movdqa	xmm13,xmm11
+	pshufd	xmm12,xmm11,78
+	pxor	xmm12,xmm11
+DB	102,68,15,58,68,222,0
+DB	102,68,15,58,68,238,17
+DB	102,68,15,58,68,231,16
+	xorps	xmm3,xmm11
+	xorps	xmm5,xmm13
+	movups	xmm7,XMMWORD[80+rdx]
+	xorps	xmm4,xmm12
+
+	movdqu	xmm11,XMMWORD[16+r8]
+	movdqu	xmm8,XMMWORD[r8]
+DB	102,69,15,56,0,218
+DB	102,69,15,56,0,194
+	movdqa	xmm13,xmm11
+	pshufd	xmm12,xmm11,78
+	pxor	xmm0,xmm8
+	pxor	xmm12,xmm11
+DB	102,69,15,58,68,222,0
+	movdqa	xmm1,xmm0
+	pshufd	xmm8,xmm0,78
+	pxor	xmm8,xmm0
+DB	102,69,15,58,68,238,17
+DB	102,68,15,58,68,231,0
+	xorps	xmm3,xmm11
+	xorps	xmm5,xmm13
+
+	lea	r8,[64+r8]
+	sub	r9,0x40
+	jc	NEAR $L$tail4x
+
+	jmp	NEAR $L$mod4_loop
+ALIGN	32
+$L$mod4_loop:
+DB	102,65,15,58,68,199,0
+	xorps	xmm4,xmm12
+	movdqu	xmm11,XMMWORD[48+r8]
+DB	102,69,15,56,0,218
+DB	102,65,15,58,68,207,17
+	xorps	xmm0,xmm3
+	movdqu	xmm3,XMMWORD[32+r8]
+	movdqa	xmm13,xmm11
+DB	102,68,15,58,68,199,16
+	pshufd	xmm12,xmm11,78
+	xorps	xmm1,xmm5
+	pxor	xmm12,xmm11
+DB	102,65,15,56,0,218
+	movups	xmm7,XMMWORD[32+rdx]
+	xorps	xmm8,xmm4
+DB	102,68,15,58,68,218,0
+	pshufd	xmm4,xmm3,78
+
+	pxor	xmm8,xmm0
+	movdqa	xmm5,xmm3
+	pxor	xmm8,xmm1
+	pxor	xmm4,xmm3
+	movdqa	xmm9,xmm8
+DB	102,68,15,58,68,234,17
+	pslldq	xmm8,8
+	psrldq	xmm9,8
+	pxor	xmm0,xmm8
+	movdqa	xmm8,XMMWORD[$L$7_mask]
+	pxor	xmm1,xmm9
+DB	102,76,15,110,200
+
+	pand	xmm8,xmm0
+DB	102,69,15,56,0,200
+	pxor	xmm9,xmm0
+DB	102,68,15,58,68,231,0
+	psllq	xmm9,57
+	movdqa	xmm8,xmm9
+	pslldq	xmm9,8
+DB	102,15,58,68,222,0
+	psrldq	xmm8,8
+	pxor	xmm0,xmm9
+	pxor	xmm1,xmm8
+	movdqu	xmm8,XMMWORD[r8]
+
+	movdqa	xmm9,xmm0
+	psrlq	xmm0,1
+DB	102,15,58,68,238,17
+	xorps	xmm3,xmm11
+	movdqu	xmm11,XMMWORD[16+r8]
+DB	102,69,15,56,0,218
+DB	102,15,58,68,231,16
+	xorps	xmm5,xmm13
+	movups	xmm7,XMMWORD[80+rdx]
+DB	102,69,15,56,0,194
+	pxor	xmm1,xmm9
+	pxor	xmm9,xmm0
+	psrlq	xmm0,5
+
+	movdqa	xmm13,xmm11
+	pxor	xmm4,xmm12
+	pshufd	xmm12,xmm11,78
+	pxor	xmm0,xmm9
+	pxor	xmm1,xmm8
+	pxor	xmm12,xmm11
+DB	102,69,15,58,68,222,0
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	movdqa	xmm1,xmm0
+DB	102,69,15,58,68,238,17
+	xorps	xmm3,xmm11
+	pshufd	xmm8,xmm0,78
+	pxor	xmm8,xmm0
+
+DB	102,68,15,58,68,231,0
+	xorps	xmm5,xmm13
+
+	lea	r8,[64+r8]
+	sub	r9,0x40
+	jnc	NEAR $L$mod4_loop
+
+$L$tail4x:
+DB	102,65,15,58,68,199,0
+DB	102,65,15,58,68,207,17
+DB	102,68,15,58,68,199,16
+	xorps	xmm4,xmm12
+	xorps	xmm0,xmm3
+	xorps	xmm1,xmm5
+	pxor	xmm1,xmm0
+	pxor	xmm8,xmm4
+
+	pxor	xmm8,xmm1
+	pxor	xmm1,xmm0
+
+	movdqa	xmm9,xmm8
+	psrldq	xmm8,8
+	pslldq	xmm9,8
+	pxor	xmm1,xmm8
+	pxor	xmm0,xmm9
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	add	r9,0x40
+	jz	NEAR $L$done
+	movdqu	xmm7,XMMWORD[32+rdx]
+	sub	r9,0x10
+	jz	NEAR $L$odd_tail
+$L$skip4x:
+
+
+
+
+
+	movdqu	xmm8,XMMWORD[r8]
+	movdqu	xmm3,XMMWORD[16+r8]
+DB	102,69,15,56,0,194
+DB	102,65,15,56,0,218
+	pxor	xmm0,xmm8
+
+	movdqa	xmm5,xmm3
+	pshufd	xmm4,xmm3,78
+	pxor	xmm4,xmm3
+DB	102,15,58,68,218,0
+DB	102,15,58,68,234,17
+DB	102,15,58,68,231,0
+
+	lea	r8,[32+r8]
+	nop
+	sub	r9,0x20
+	jbe	NEAR $L$even_tail
+	nop
+	jmp	NEAR $L$mod_loop
+
+ALIGN	32
+$L$mod_loop:
+	movdqa	xmm1,xmm0
+	movdqa	xmm8,xmm4
+	pshufd	xmm4,xmm0,78
+	pxor	xmm4,xmm0
+
+DB	102,15,58,68,198,0
+DB	102,15,58,68,206,17
+DB	102,15,58,68,231,16
+
+	pxor	xmm0,xmm3
+	pxor	xmm1,xmm5
+	movdqu	xmm9,XMMWORD[r8]
+	pxor	xmm8,xmm0
+DB	102,69,15,56,0,202
+	movdqu	xmm3,XMMWORD[16+r8]
+
+	pxor	xmm8,xmm1
+	pxor	xmm1,xmm9
+	pxor	xmm4,xmm8
+DB	102,65,15,56,0,218
+	movdqa	xmm8,xmm4
+	psrldq	xmm8,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm8
+	pxor	xmm0,xmm4
+
+	movdqa	xmm5,xmm3
+
+	movdqa	xmm9,xmm0
+	movdqa	xmm8,xmm0
+	psllq	xmm0,5
+	pxor	xmm8,xmm0
+DB	102,15,58,68,218,0
+	psllq	xmm0,1
+	pxor	xmm0,xmm8
+	psllq	xmm0,57
+	movdqa	xmm8,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm8,8
+	pxor	xmm0,xmm9
+	pshufd	xmm4,xmm5,78
+	pxor	xmm1,xmm8
+	pxor	xmm4,xmm5
+
+	movdqa	xmm9,xmm0
+	psrlq	xmm0,1
+DB	102,15,58,68,234,17
+	pxor	xmm1,xmm9
+	pxor	xmm9,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm9
+	lea	r8,[32+r8]
+	psrlq	xmm0,1
+DB	102,15,58,68,231,0
+	pxor	xmm0,xmm1
+
+	sub	r9,0x20
+	ja	NEAR $L$mod_loop
+
+$L$even_tail:
+	movdqa	xmm1,xmm0
+	movdqa	xmm8,xmm4
+	pshufd	xmm4,xmm0,78
+	pxor	xmm4,xmm0
+
+DB	102,15,58,68,198,0
+DB	102,15,58,68,206,17
+DB	102,15,58,68,231,16
+
+	pxor	xmm0,xmm3
+	pxor	xmm1,xmm5
+	pxor	xmm8,xmm0
+	pxor	xmm8,xmm1
+	pxor	xmm4,xmm8
+	movdqa	xmm8,xmm4
+	psrldq	xmm8,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm8
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+	test	r9,r9
+	jnz	NEAR $L$done
+
+$L$odd_tail:
+	movdqu	xmm8,XMMWORD[r8]
+DB	102,69,15,56,0,194
+	pxor	xmm0,xmm8
+	movdqa	xmm1,xmm0
+	pshufd	xmm3,xmm0,78
+	pxor	xmm3,xmm0
+DB	102,15,58,68,194,0
+DB	102,15,58,68,202,17
+DB	102,15,58,68,223,0
+	pxor	xmm3,xmm0
+	pxor	xmm3,xmm1
+
+	movdqa	xmm4,xmm3
+	psrldq	xmm3,8
+	pslldq	xmm4,8
+	pxor	xmm1,xmm3
+	pxor	xmm0,xmm4
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm3,xmm0
+	psllq	xmm0,5
+	pxor	xmm3,xmm0
+	psllq	xmm0,1
+	pxor	xmm0,xmm3
+	psllq	xmm0,57
+	movdqa	xmm3,xmm0
+	pslldq	xmm0,8
+	psrldq	xmm3,8
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm3
+
+
+	movdqa	xmm4,xmm0
+	psrlq	xmm0,1
+	pxor	xmm1,xmm4
+	pxor	xmm4,xmm0
+	psrlq	xmm0,5
+	pxor	xmm0,xmm4
+	psrlq	xmm0,1
+	pxor	xmm0,xmm1
+$L$done:
+DB	102,65,15,56,0,194
+	movdqu	XMMWORD[rcx],xmm0
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+$L$SEH_end_GFp_gcm_ghash_clmul:
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_gcm_init_avx
+
+ALIGN	32
+GFp_gcm_init_avx:
+
+$L$SEH_begin_GFp_gcm_init_avx:
+
+DB	0x48,0x83,0xec,0x18
+DB	0x0f,0x29,0x34,0x24
+	vzeroupper
+
+	vmovdqu	xmm2,XMMWORD[rdx]
+	vpshufd	xmm2,xmm2,78
+
+
+	vpshufd	xmm4,xmm2,255
+	vpsrlq	xmm3,xmm2,63
+	vpsllq	xmm2,xmm2,1
+	vpxor	xmm5,xmm5,xmm5
+	vpcmpgtd	xmm5,xmm5,xmm4
+	vpslldq	xmm3,xmm3,8
+	vpor	xmm2,xmm2,xmm3
+
+
+	vpand	xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial]
+	vpxor	xmm2,xmm2,xmm5
+
+	vpunpckhqdq	xmm6,xmm2,xmm2
+	vmovdqa	xmm0,xmm2
+	vpxor	xmm6,xmm6,xmm2
+	mov	r10,4
+	jmp	NEAR $L$init_start_avx
+ALIGN	32
+$L$init_loop_avx:
+	vpalignr	xmm5,xmm4,xmm3,8
+	vmovdqu	XMMWORD[(-16)+rcx],xmm5
+	vpunpckhqdq	xmm3,xmm0,xmm0
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm1,xmm0,xmm2,0x11
+	vpclmulqdq	xmm0,xmm0,xmm2,0x00
+	vpclmulqdq	xmm3,xmm3,xmm6,0x00
+	vpxor	xmm4,xmm1,xmm0
+	vpxor	xmm3,xmm3,xmm4
+
+	vpslldq	xmm4,xmm3,8
+	vpsrldq	xmm3,xmm3,8
+	vpxor	xmm0,xmm0,xmm4
+	vpxor	xmm1,xmm1,xmm3
+	vpsllq	xmm3,xmm0,57
+	vpsllq	xmm4,xmm0,62
+	vpxor	xmm4,xmm4,xmm3
+	vpsllq	xmm3,xmm0,63
+	vpxor	xmm4,xmm4,xmm3
+	vpslldq	xmm3,xmm4,8
+	vpsrldq	xmm4,xmm4,8
+	vpxor	xmm0,xmm0,xmm3
+	vpxor	xmm1,xmm1,xmm4
+
+	vpsrlq	xmm4,xmm0,1
+	vpxor	xmm1,xmm1,xmm0
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm4,xmm4,5
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm0,xmm0,1
+	vpxor	xmm0,xmm0,xmm1
+$L$init_start_avx:
+	vmovdqa	xmm5,xmm0
+	vpunpckhqdq	xmm3,xmm0,xmm0
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm1,xmm0,xmm2,0x11
+	vpclmulqdq	xmm0,xmm0,xmm2,0x00
+	vpclmulqdq	xmm3,xmm3,xmm6,0x00
+	vpxor	xmm4,xmm1,xmm0
+	vpxor	xmm3,xmm3,xmm4
+
+	vpslldq	xmm4,xmm3,8
+	vpsrldq	xmm3,xmm3,8
+	vpxor	xmm0,xmm0,xmm4
+	vpxor	xmm1,xmm1,xmm3
+	vpsllq	xmm3,xmm0,57
+	vpsllq	xmm4,xmm0,62
+	vpxor	xmm4,xmm4,xmm3
+	vpsllq	xmm3,xmm0,63
+	vpxor	xmm4,xmm4,xmm3
+	vpslldq	xmm3,xmm4,8
+	vpsrldq	xmm4,xmm4,8
+	vpxor	xmm0,xmm0,xmm3
+	vpxor	xmm1,xmm1,xmm4
+
+	vpsrlq	xmm4,xmm0,1
+	vpxor	xmm1,xmm1,xmm0
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm4,xmm4,5
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm0,xmm0,1
+	vpxor	xmm0,xmm0,xmm1
+	vpshufd	xmm3,xmm5,78
+	vpshufd	xmm4,xmm0,78
+	vpxor	xmm3,xmm3,xmm5
+	vmovdqu	XMMWORD[rcx],xmm5
+	vpxor	xmm4,xmm4,xmm0
+	vmovdqu	XMMWORD[16+rcx],xmm0
+	lea	rcx,[48+rcx]
+	sub	r10,1
+	jnz	NEAR $L$init_loop_avx
+
+	vpalignr	xmm5,xmm3,xmm4,8
+	vmovdqu	XMMWORD[(-16)+rcx],xmm5
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	lea	rsp,[24+rsp]
+$L$SEH_end_GFp_gcm_init_avx:
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_gcm_ghash_avx
+
+ALIGN	32
+GFp_gcm_ghash_avx:
+
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_GFp_gcm_ghash_avx:
+
+DB	0x48,0x8d,0x60,0xe0
+DB	0x0f,0x29,0x70,0xe0
+DB	0x0f,0x29,0x78,0xf0
+DB	0x44,0x0f,0x29,0x00
+DB	0x44,0x0f,0x29,0x48,0x10
+DB	0x44,0x0f,0x29,0x50,0x20
+DB	0x44,0x0f,0x29,0x58,0x30
+DB	0x44,0x0f,0x29,0x60,0x40
+DB	0x44,0x0f,0x29,0x68,0x50
+DB	0x44,0x0f,0x29,0x70,0x60
+DB	0x44,0x0f,0x29,0x78,0x70
+	vzeroupper
+
+	vmovdqu	xmm10,XMMWORD[rcx]
+	lea	r10,[$L$0x1c2_polynomial]
+	lea	rdx,[64+rdx]
+	vmovdqu	xmm13,XMMWORD[$L$bswap_mask]
+	vpshufb	xmm10,xmm10,xmm13
+	cmp	r9,0x80
+	jb	NEAR $L$short_avx
+	sub	r9,0x80
+
+	vmovdqu	xmm14,XMMWORD[112+r8]
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vpshufb	xmm14,xmm14,xmm13
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vmovdqu	xmm15,XMMWORD[96+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm9,xmm9,xmm14
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vmovdqu	xmm14,XMMWORD[80+r8]
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vmovdqu	xmm15,XMMWORD[64+r8]
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[48+r8]
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm1,xmm1,xmm4
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpxor	xmm2,xmm2,xmm5
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[32+r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[16+r8]
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm1,xmm1,xmm4
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpxor	xmm2,xmm2,xmm5
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((176-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((160-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm9,xmm7,0x10
+
+	lea	r8,[128+r8]
+	cmp	r9,0x80
+	jb	NEAR $L$tail_avx
+
+	vpxor	xmm15,xmm15,xmm10
+	sub	r9,0x80
+	jmp	NEAR $L$oop8x_avx
+
+ALIGN	32
+$L$oop8x_avx:
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vmovdqu	xmm14,XMMWORD[112+r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpxor	xmm8,xmm8,xmm15
+	vpclmulqdq	xmm10,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm11,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm12,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[96+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm10,xmm10,xmm3
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vxorps	xmm11,xmm11,xmm4
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm12,xmm12,xmm5
+	vxorps	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[80+r8]
+	vpxor	xmm12,xmm12,xmm10
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm12,xmm12,xmm11
+	vpslldq	xmm9,xmm12,8
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vpsrldq	xmm12,xmm12,8
+	vpxor	xmm10,xmm10,xmm9
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpshufb	xmm14,xmm14,xmm13
+	vxorps	xmm11,xmm11,xmm12
+	vpxor	xmm4,xmm4,xmm1
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[64+r8]
+	vpalignr	xmm12,xmm10,xmm10,8
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vxorps	xmm8,xmm8,xmm15
+	vpxor	xmm2,xmm2,xmm5
+
+	vmovdqu	xmm14,XMMWORD[48+r8]
+	vpclmulqdq	xmm10,xmm10,XMMWORD[r10],0x10
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[32+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vpxor	xmm2,xmm2,xmm5
+	vxorps	xmm10,xmm10,xmm12
+
+	vmovdqu	xmm14,XMMWORD[16+r8]
+	vpalignr	xmm12,xmm10,xmm10,8
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpclmulqdq	xmm10,xmm10,XMMWORD[r10],0x10
+	vxorps	xmm12,xmm12,xmm11
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((176-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((160-64))+rdx]
+	vpxor	xmm15,xmm15,xmm12
+	vpclmulqdq	xmm2,xmm9,xmm7,0x10
+	vpxor	xmm15,xmm15,xmm10
+
+	lea	r8,[128+r8]
+	sub	r9,0x80
+	jnc	NEAR $L$oop8x_avx
+
+	add	r9,0x80
+	jmp	NEAR $L$tail_no_xor_avx
+
+ALIGN	32
+$L$short_avx:
+	vmovdqu	xmm14,XMMWORD[((-16))+r9*1+r8]
+	lea	r8,[r9*1+r8]
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+
+	vmovdqa	xmm3,xmm0
+	vmovdqa	xmm4,xmm1
+	vmovdqa	xmm5,xmm2
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-32))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-48))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-64))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-80))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-96))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-112))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovq	xmm7,QWORD[((184-64))+rdx]
+	sub	r9,0x10
+	jmp	NEAR $L$tail_avx
+
+ALIGN	32
+$L$tail_avx:
+	vpxor	xmm15,xmm15,xmm10
+$L$tail_no_xor_avx:
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+
+	vmovdqu	xmm12,XMMWORD[r10]
+
+	vpxor	xmm10,xmm3,xmm0
+	vpxor	xmm11,xmm4,xmm1
+	vpxor	xmm5,xmm5,xmm2
+
+	vpxor	xmm5,xmm5,xmm10
+	vpxor	xmm5,xmm5,xmm11
+	vpslldq	xmm9,xmm5,8
+	vpsrldq	xmm5,xmm5,8
+	vpxor	xmm10,xmm10,xmm9
+	vpxor	xmm11,xmm11,xmm5
+
+	vpclmulqdq	xmm9,xmm10,xmm12,0x10
+	vpalignr	xmm10,xmm10,xmm10,8
+	vpxor	xmm10,xmm10,xmm9
+
+	vpclmulqdq	xmm9,xmm10,xmm12,0x10
+	vpalignr	xmm10,xmm10,xmm10,8
+	vpxor	xmm10,xmm10,xmm11
+	vpxor	xmm10,xmm10,xmm9
+
+	cmp	r9,0
+	jne	NEAR $L$short_avx
+
+	vpshufb	xmm10,xmm10,xmm13
+	vmovdqu	XMMWORD[rcx],xmm10
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+$L$SEH_end_GFp_gcm_ghash_avx:
+	DB	0F3h,0C3h		;repret
+
+
+ALIGN	64
+$L$bswap_mask:
+DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$0x1c2_polynomial:
+DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+$L$7_mask:
+	DD	7,0,7,0
+ALIGN	64
+
+DB	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52
+DB	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+DB	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+DB	114,103,62,0
+ALIGN	64
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_gcm_init_clmul wrt ..imagebase
+	DD	$L$SEH_end_GFp_gcm_init_clmul wrt ..imagebase
+	DD	$L$SEH_info_GFp_gcm_init_clmul wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_gcm_ghash_clmul wrt ..imagebase
+	DD	$L$SEH_end_GFp_gcm_ghash_clmul wrt ..imagebase
+	DD	$L$SEH_info_GFp_gcm_ghash_clmul wrt ..imagebase
+	DD	$L$SEH_begin_GFp_gcm_init_avx wrt ..imagebase
+	DD	$L$SEH_end_GFp_gcm_init_avx wrt ..imagebase
+	DD	$L$SEH_info_GFp_gcm_init_clmul wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_gcm_ghash_avx wrt ..imagebase
+	DD	$L$SEH_end_GFp_gcm_ghash_avx wrt ..imagebase
+	DD	$L$SEH_info_GFp_gcm_ghash_clmul wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_GFp_gcm_init_clmul:
+DB	0x01,0x08,0x03,0x00
+DB	0x08,0x68,0x00,0x00
+DB	0x04,0x22,0x00,0x00
+$L$SEH_info_GFp_gcm_ghash_clmul:
+DB	0x01,0x33,0x16,0x00
+DB	0x33,0xf8,0x09,0x00
+DB	0x2e,0xe8,0x08,0x00
+DB	0x29,0xd8,0x07,0x00
+DB	0x24,0xc8,0x06,0x00
+DB	0x1f,0xb8,0x05,0x00
+DB	0x1a,0xa8,0x04,0x00
+DB	0x15,0x98,0x03,0x00
+DB	0x10,0x88,0x02,0x00
+DB	0x0c,0x78,0x01,0x00
+DB	0x08,0x68,0x00,0x00
+DB	0x04,0x01,0x15,0x00

+ 5037 - 0
zeroidc/vendor/ring/pregenerated/tmp/p256-x86_64-asm-nasm.asm

@@ -0,0 +1,5037 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+EXTERN	GFp_ia32cap_P
+
+
+ALIGN	64
+$L$poly:
+	DQ	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+
+$L$One:
+	DD	1,1,1,1,1,1,1,1
+$L$Two:
+	DD	2,2,2,2,2,2,2,2
+$L$Three:
+	DD	3,3,3,3,3,3,3,3
+$L$ONE_mont:
+	DQ	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+
+
+$L$ord:
+	DQ	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+$L$ordK:
+	DQ	0xccd1c8aaee00bc4f
+
+
+
+global	GFp_nistz256_add
+
+ALIGN	32
+GFp_nistz256_add:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_add:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+	push	r12
+	push	r13
+
+	mov	r8,QWORD[rsi]
+	xor	r13,r13
+	mov	r9,QWORD[8+rsi]
+	mov	r10,QWORD[16+rsi]
+	mov	r11,QWORD[24+rsi]
+	lea	rsi,[$L$poly]
+
+	add	r8,QWORD[rdx]
+	adc	r9,QWORD[8+rdx]
+	mov	rax,r8
+	adc	r10,QWORD[16+rdx]
+	adc	r11,QWORD[24+rdx]
+	mov	rdx,r9
+	adc	r13,0
+
+	sub	r8,QWORD[rsi]
+	mov	rcx,r10
+	sbb	r9,QWORD[8+rsi]
+	sbb	r10,QWORD[16+rsi]
+	mov	r12,r11
+	sbb	r11,QWORD[24+rsi]
+	sbb	r13,0
+
+	cmovc	r8,rax
+	cmovc	r9,rdx
+	mov	QWORD[rdi],r8
+	cmovc	r10,rcx
+	mov	QWORD[8+rdi],r9
+	cmovc	r11,r12
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+
+	pop	r13
+	pop	r12
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_GFp_nistz256_add:
+
+
+
+global	GFp_nistz256_neg
+
+ALIGN	32
+GFp_nistz256_neg:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_neg:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+	push	r12
+
+	push	r13
+
+$L$neg_body:
+
+	xor	r8,r8
+	xor	r9,r9
+	xor	r10,r10
+	xor	r11,r11
+	xor	r13,r13
+
+	sub	r8,QWORD[rsi]
+	sbb	r9,QWORD[8+rsi]
+	sbb	r10,QWORD[16+rsi]
+	mov	rax,r8
+	sbb	r11,QWORD[24+rsi]
+	lea	rsi,[$L$poly]
+	mov	rdx,r9
+	sbb	r13,0
+
+	add	r8,QWORD[rsi]
+	mov	rcx,r10
+	adc	r9,QWORD[8+rsi]
+	adc	r10,QWORD[16+rsi]
+	mov	r12,r11
+	adc	r11,QWORD[24+rsi]
+	test	r13,r13
+
+	cmovz	r8,rax
+	cmovz	r9,rdx
+	mov	QWORD[rdi],r8
+	cmovz	r10,rcx
+	mov	QWORD[8+rdi],r9
+	cmovz	r11,r12
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+
+	mov	r13,QWORD[rsp]
+
+	mov	r12,QWORD[8+rsp]
+
+	lea	rsp,[16+rsp]
+
+$L$neg_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_neg:
+
+
+
+
+
+
+global	GFp_p256_scalar_mul_mont
+
+ALIGN	32
+GFp_p256_scalar_mul_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_p256_scalar_mul_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+	lea	rcx,[GFp_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$ecp_nistz256_ord_mul_montx
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$ord_mul_body:
+
+	mov	rax,QWORD[rdx]
+	mov	rbx,rdx
+	lea	r14,[$L$ord]
+	mov	r15,QWORD[$L$ordK]
+
+
+	mov	rcx,rax
+	mul	QWORD[rsi]
+	mov	r8,rax
+	mov	rax,rcx
+	mov	r9,rdx
+
+	mul	QWORD[8+rsi]
+	add	r9,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	QWORD[16+rsi]
+	add	r10,rax
+	mov	rax,rcx
+	adc	rdx,0
+
+	mov	r13,r8
+	imul	r8,r15
+
+	mov	r11,rdx
+	mul	QWORD[24+rsi]
+	add	r11,rax
+	mov	rax,r8
+	adc	rdx,0
+	mov	r12,rdx
+
+
+	mul	QWORD[r14]
+	mov	rbp,r8
+	add	r13,rax
+	mov	rax,r8
+	adc	rdx,0
+	mov	rcx,rdx
+
+	sub	r10,r8
+	sbb	r8,0
+
+	mul	QWORD[8+r14]
+	add	r9,rcx
+	adc	rdx,0
+	add	r9,rax
+	mov	rax,rbp
+	adc	r10,rdx
+	mov	rdx,rbp
+	adc	r8,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r11,rax
+	mov	rax,QWORD[8+rbx]
+	sbb	rbp,rdx
+
+	add	r11,r8
+	adc	r12,rbp
+	adc	r13,0
+
+
+	mov	rcx,rax
+	mul	QWORD[rsi]
+	add	r9,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[8+rsi]
+	add	r10,rbp
+	adc	rdx,0
+	add	r10,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[16+rsi]
+	add	r11,rbp
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rcx
+	adc	rdx,0
+
+	mov	rcx,r9
+	imul	r9,r15
+
+	mov	rbp,rdx
+	mul	QWORD[24+rsi]
+	add	r12,rbp
+	adc	rdx,0
+	xor	r8,r8
+	add	r12,rax
+	mov	rax,r9
+	adc	r13,rdx
+	adc	r8,0
+
+
+	mul	QWORD[r14]
+	mov	rbp,r9
+	add	rcx,rax
+	mov	rax,r9
+	adc	rcx,rdx
+
+	sub	r11,r9
+	sbb	r9,0
+
+	mul	QWORD[8+r14]
+	add	r10,rcx
+	adc	rdx,0
+	add	r10,rax
+	mov	rax,rbp
+	adc	r11,rdx
+	mov	rdx,rbp
+	adc	r9,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r12,rax
+	mov	rax,QWORD[16+rbx]
+	sbb	rbp,rdx
+
+	add	r12,r9
+	adc	r13,rbp
+	adc	r8,0
+
+
+	mov	rcx,rax
+	mul	QWORD[rsi]
+	add	r10,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[8+rsi]
+	add	r11,rbp
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[16+rsi]
+	add	r12,rbp
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rcx
+	adc	rdx,0
+
+	mov	rcx,r10
+	imul	r10,r15
+
+	mov	rbp,rdx
+	mul	QWORD[24+rsi]
+	add	r13,rbp
+	adc	rdx,0
+	xor	r9,r9
+	add	r13,rax
+	mov	rax,r10
+	adc	r8,rdx
+	adc	r9,0
+
+
+	mul	QWORD[r14]
+	mov	rbp,r10
+	add	rcx,rax
+	mov	rax,r10
+	adc	rcx,rdx
+
+	sub	r12,r10
+	sbb	r10,0
+
+	mul	QWORD[8+r14]
+	add	r11,rcx
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rbp
+	adc	r12,rdx
+	mov	rdx,rbp
+	adc	r10,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r13,rax
+	mov	rax,QWORD[24+rbx]
+	sbb	rbp,rdx
+
+	add	r13,r10
+	adc	r8,rbp
+	adc	r9,0
+
+
+	mov	rcx,rax
+	mul	QWORD[rsi]
+	add	r11,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[8+rsi]
+	add	r12,rbp
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rcx
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	QWORD[16+rsi]
+	add	r13,rbp
+	adc	rdx,0
+	add	r13,rax
+	mov	rax,rcx
+	adc	rdx,0
+
+	mov	rcx,r11
+	imul	r11,r15
+
+	mov	rbp,rdx
+	mul	QWORD[24+rsi]
+	add	r8,rbp
+	adc	rdx,0
+	xor	r10,r10
+	add	r8,rax
+	mov	rax,r11
+	adc	r9,rdx
+	adc	r10,0
+
+
+	mul	QWORD[r14]
+	mov	rbp,r11
+	add	rcx,rax
+	mov	rax,r11
+	adc	rcx,rdx
+
+	sub	r13,r11
+	sbb	r11,0
+
+	mul	QWORD[8+r14]
+	add	r12,rcx
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rbp
+	adc	r13,rdx
+	mov	rdx,rbp
+	adc	r11,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r8,rax
+	sbb	rbp,rdx
+
+	add	r8,r11
+	adc	r9,rbp
+	adc	r10,0
+
+
+	mov	rsi,r12
+	sub	r12,QWORD[r14]
+	mov	r11,r13
+	sbb	r13,QWORD[8+r14]
+	mov	rcx,r8
+	sbb	r8,QWORD[16+r14]
+	mov	rbp,r9
+	sbb	r9,QWORD[24+r14]
+	sbb	r10,0
+
+	cmovc	r12,rsi
+	cmovc	r13,r11
+	cmovc	r8,rcx
+	cmovc	r9,rbp
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$ord_mul_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_p256_scalar_mul_mont:
+
+
+
+
+
+
+
+global	GFp_p256_scalar_sqr_rep_mont
+
+ALIGN	32
+GFp_p256_scalar_sqr_rep_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_p256_scalar_sqr_rep_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+	lea	rcx,[GFp_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$ecp_nistz256_ord_sqr_montx
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$ord_sqr_body:
+
+	mov	r8,QWORD[rsi]
+	mov	rax,QWORD[8+rsi]
+	mov	r14,QWORD[16+rsi]
+	mov	r15,QWORD[24+rsi]
+	lea	rsi,[$L$ord]
+	mov	rbx,rdx
+	jmp	NEAR $L$oop_ord_sqr
+
+ALIGN	32
+$L$oop_ord_sqr:
+
+	mov	rbp,rax
+	mul	r8
+	mov	r9,rax
+DB	102,72,15,110,205
+	mov	rax,r14
+	mov	r10,rdx
+
+	mul	r8
+	add	r10,rax
+	mov	rax,r15
+DB	102,73,15,110,214
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	r8
+	add	r11,rax
+	mov	rax,r15
+DB	102,73,15,110,223
+	adc	rdx,0
+	mov	r12,rdx
+
+
+	mul	r14
+	mov	r13,rax
+	mov	rax,r14
+	mov	r14,rdx
+
+
+	mul	rbp
+	add	r11,rax
+	mov	rax,r15
+	adc	rdx,0
+	mov	r15,rdx
+
+	mul	rbp
+	add	r12,rax
+	adc	rdx,0
+
+	add	r12,r15
+	adc	r13,rdx
+	adc	r14,0
+
+
+	xor	r15,r15
+	mov	rax,r8
+	add	r9,r9
+	adc	r10,r10
+	adc	r11,r11
+	adc	r12,r12
+	adc	r13,r13
+	adc	r14,r14
+	adc	r15,0
+
+
+	mul	rax
+	mov	r8,rax
+DB	102,72,15,126,200
+	mov	rbp,rdx
+
+	mul	rax
+	add	r9,rbp
+	adc	r10,rax
+DB	102,72,15,126,208
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	rax
+	add	r11,rbp
+	adc	r12,rax
+DB	102,72,15,126,216
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mov	rcx,r8
+	imul	r8,QWORD[32+rsi]
+
+	mul	rax
+	add	r13,rbp
+	adc	r14,rax
+	mov	rax,QWORD[rsi]
+	adc	r15,rdx
+
+
+	mul	r8
+	mov	rbp,r8
+	add	rcx,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rcx,rdx
+
+	sub	r10,r8
+	sbb	rbp,0
+
+	mul	r8
+	add	r9,rcx
+	adc	rdx,0
+	add	r9,rax
+	mov	rax,r8
+	adc	r10,rdx
+	mov	rdx,r8
+	adc	rbp,0
+
+	mov	rcx,r9
+	imul	r9,QWORD[32+rsi]
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r11,rax
+	mov	rax,QWORD[rsi]
+	sbb	r8,rdx
+
+	add	r11,rbp
+	adc	r8,0
+
+
+	mul	r9
+	mov	rbp,r9
+	add	rcx,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rcx,rdx
+
+	sub	r11,r9
+	sbb	rbp,0
+
+	mul	r9
+	add	r10,rcx
+	adc	rdx,0
+	add	r10,rax
+	mov	rax,r9
+	adc	r11,rdx
+	mov	rdx,r9
+	adc	rbp,0
+
+	mov	rcx,r10
+	imul	r10,QWORD[32+rsi]
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r8,rax
+	mov	rax,QWORD[rsi]
+	sbb	r9,rdx
+
+	add	r8,rbp
+	adc	r9,0
+
+
+	mul	r10
+	mov	rbp,r10
+	add	rcx,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rcx,rdx
+
+	sub	r8,r10
+	sbb	rbp,0
+
+	mul	r10
+	add	r11,rcx
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,r10
+	adc	r8,rdx
+	mov	rdx,r10
+	adc	rbp,0
+
+	mov	rcx,r11
+	imul	r11,QWORD[32+rsi]
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r9,rax
+	mov	rax,QWORD[rsi]
+	sbb	r10,rdx
+
+	add	r9,rbp
+	adc	r10,0
+
+
+	mul	r11
+	mov	rbp,r11
+	add	rcx,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rcx,rdx
+
+	sub	r9,r11
+	sbb	rbp,0
+
+	mul	r11
+	add	r8,rcx
+	adc	rdx,0
+	add	r8,rax
+	mov	rax,r11
+	adc	r9,rdx
+	mov	rdx,r11
+	adc	rbp,0
+
+	shl	rax,32
+	shr	rdx,32
+	sub	r10,rax
+	sbb	r11,rdx
+
+	add	r10,rbp
+	adc	r11,0
+
+
+	xor	rdx,rdx
+	add	r8,r12
+	adc	r9,r13
+	mov	r12,r8
+	adc	r10,r14
+	adc	r11,r15
+	mov	rax,r9
+	adc	rdx,0
+
+
+	sub	r8,QWORD[rsi]
+	mov	r14,r10
+	sbb	r9,QWORD[8+rsi]
+	sbb	r10,QWORD[16+rsi]
+	mov	r15,r11
+	sbb	r11,QWORD[24+rsi]
+	sbb	rdx,0
+
+	cmovc	r8,r12
+	cmovnc	rax,r9
+	cmovnc	r14,r10
+	cmovnc	r15,r11
+
+	dec	rbx
+	jnz	NEAR $L$oop_ord_sqr
+
+	mov	QWORD[rdi],r8
+	mov	QWORD[8+rdi],rax
+	pxor	xmm1,xmm1
+	mov	QWORD[16+rdi],r14
+	pxor	xmm2,xmm2
+	mov	QWORD[24+rdi],r15
+	pxor	xmm3,xmm3
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$ord_sqr_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_p256_scalar_sqr_rep_mont:
+
+
+ALIGN	32
+ecp_nistz256_ord_mul_montx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_mul_montx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$ecp_nistz256_ord_mul_montx:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$ord_mulx_body:
+
+	mov	rbx,rdx
+	mov	rdx,QWORD[rdx]
+	mov	r9,QWORD[rsi]
+	mov	r10,QWORD[8+rsi]
+	mov	r11,QWORD[16+rsi]
+	mov	r12,QWORD[24+rsi]
+	lea	rsi,[((-128))+rsi]
+	lea	r14,[(($L$ord-128))]
+	mov	r15,QWORD[$L$ordK]
+
+
+	mulx	r9,r8,r9
+	mulx	r10,rcx,r10
+	mulx	r11,rbp,r11
+	add	r9,rcx
+	mulx	r12,rcx,r12
+	mov	rdx,r8
+	mulx	rax,rdx,r15
+	adc	r10,rbp
+	adc	r11,rcx
+	adc	r12,0
+
+
+	xor	r13,r13
+	mulx	rbp,rcx,QWORD[((0+128))+r14]
+	adcx	r8,rcx
+	adox	r9,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+r14]
+	adcx	r9,rcx
+	adox	r10,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+r14]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+r14]
+	mov	rdx,QWORD[8+rbx]
+	adcx	r11,rcx
+	adox	r12,rbp
+	adcx	r12,r8
+	adox	r13,r8
+	adc	r13,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r9,rcx
+	adox	r10,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r9
+	mulx	rax,rdx,r15
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	adcx	r13,r8
+	adox	r8,r8
+	adc	r8,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+r14]
+	adcx	r9,rcx
+	adox	r10,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+r14]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+r14]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+r14]
+	mov	rdx,QWORD[16+rbx]
+	adcx	r12,rcx
+	adox	r13,rbp
+	adcx	r13,r9
+	adox	r8,r9
+	adc	r8,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r10
+	mulx	rax,rdx,r15
+	adcx	r13,rcx
+	adox	r8,rbp
+
+	adcx	r8,r9
+	adox	r9,r9
+	adc	r9,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+r14]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+r14]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+r14]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+r14]
+	mov	rdx,QWORD[24+rbx]
+	adcx	r13,rcx
+	adox	r8,rbp
+	adcx	r8,r10
+	adox	r9,r10
+	adc	r9,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r13,rcx
+	adox	r8,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r11
+	mulx	rax,rdx,r15
+	adcx	r8,rcx
+	adox	r9,rbp
+
+	adcx	r9,r10
+	adox	r10,r10
+	adc	r10,0
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+r14]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+r14]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+r14]
+	adcx	r13,rcx
+	adox	r8,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+r14]
+	lea	r14,[128+r14]
+	mov	rbx,r12
+	adcx	r8,rcx
+	adox	r9,rbp
+	mov	rdx,r13
+	adcx	r9,r11
+	adox	r10,r11
+	adc	r10,0
+
+
+
+	mov	rcx,r8
+	sub	r12,QWORD[r14]
+	sbb	r13,QWORD[8+r14]
+	sbb	r8,QWORD[16+r14]
+	mov	rbp,r9
+	sbb	r9,QWORD[24+r14]
+	sbb	r10,0
+
+	cmovc	r12,rbx
+	cmovc	r13,rdx
+	cmovc	r8,rcx
+	cmovc	r9,rbp
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$ord_mulx_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_ecp_nistz256_ord_mul_montx:
+
+
+ALIGN	32
+ecp_nistz256_ord_sqr_montx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_sqr_montx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$ecp_nistz256_ord_sqr_montx:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$ord_sqrx_body:
+
+	mov	rbx,rdx
+	mov	rdx,QWORD[rsi]
+	mov	r14,QWORD[8+rsi]
+	mov	r15,QWORD[16+rsi]
+	mov	r8,QWORD[24+rsi]
+	lea	rsi,[$L$ord]
+	jmp	NEAR $L$oop_ord_sqrx
+
+ALIGN	32
+$L$oop_ord_sqrx:
+	mulx	r10,r9,r14
+	mulx	r11,rcx,r15
+	mov	rax,rdx
+DB	102,73,15,110,206
+	mulx	r12,rbp,r8
+	mov	rdx,r14
+	add	r10,rcx
+DB	102,73,15,110,215
+	adc	r11,rbp
+	adc	r12,0
+	xor	r13,r13
+
+	mulx	rbp,rcx,r15
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,r8
+	mov	rdx,r15
+	adcx	r12,rcx
+	adox	r13,rbp
+	adc	r13,0
+
+	mulx	r14,rcx,r8
+	mov	rdx,rax
+DB	102,73,15,110,216
+	xor	r15,r15
+	adcx	r9,r9
+	adox	r13,rcx
+	adcx	r10,r10
+	adox	r14,r15
+
+
+	mulx	rbp,r8,rdx
+DB	102,72,15,126,202
+	adcx	r11,r11
+	adox	r9,rbp
+	adcx	r12,r12
+	mulx	rax,rcx,rdx
+DB	102,72,15,126,210
+	adcx	r13,r13
+	adox	r10,rcx
+	adcx	r14,r14
+	mulx	rbp,rcx,rdx
+DB	0x67
+DB	102,72,15,126,218
+	adox	r11,rax
+	adcx	r15,r15
+	adox	r12,rcx
+	adox	r13,rbp
+	mulx	rax,rcx,rdx
+	adox	r14,rcx
+	adox	r15,rax
+
+
+	mov	rdx,r8
+	mulx	rcx,rdx,QWORD[32+rsi]
+
+	xor	rax,rax
+	mulx	rbp,rcx,QWORD[rsi]
+	adcx	r8,rcx
+	adox	r9,rbp
+	mulx	rbp,rcx,QWORD[8+rsi]
+	adcx	r9,rcx
+	adox	r10,rbp
+	mulx	rbp,rcx,QWORD[16+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+	mulx	rbp,rcx,QWORD[24+rsi]
+	adcx	r11,rcx
+	adox	r8,rbp
+	adcx	r8,rax
+
+
+	mov	rdx,r9
+	mulx	rcx,rdx,QWORD[32+rsi]
+
+	mulx	rbp,rcx,QWORD[rsi]
+	adox	r9,rcx
+	adcx	r10,rbp
+	mulx	rbp,rcx,QWORD[8+rsi]
+	adox	r10,rcx
+	adcx	r11,rbp
+	mulx	rbp,rcx,QWORD[16+rsi]
+	adox	r11,rcx
+	adcx	r8,rbp
+	mulx	rbp,rcx,QWORD[24+rsi]
+	adox	r8,rcx
+	adcx	r9,rbp
+	adox	r9,rax
+
+
+	mov	rdx,r10
+	mulx	rcx,rdx,QWORD[32+rsi]
+
+	mulx	rbp,rcx,QWORD[rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+	mulx	rbp,rcx,QWORD[8+rsi]
+	adcx	r11,rcx
+	adox	r8,rbp
+	mulx	rbp,rcx,QWORD[16+rsi]
+	adcx	r8,rcx
+	adox	r9,rbp
+	mulx	rbp,rcx,QWORD[24+rsi]
+	adcx	r9,rcx
+	adox	r10,rbp
+	adcx	r10,rax
+
+
+	mov	rdx,r11
+	mulx	rcx,rdx,QWORD[32+rsi]
+
+	mulx	rbp,rcx,QWORD[rsi]
+	adox	r11,rcx
+	adcx	r8,rbp
+	mulx	rbp,rcx,QWORD[8+rsi]
+	adox	r8,rcx
+	adcx	r9,rbp
+	mulx	rbp,rcx,QWORD[16+rsi]
+	adox	r9,rcx
+	adcx	r10,rbp
+	mulx	rbp,rcx,QWORD[24+rsi]
+	adox	r10,rcx
+	adcx	r11,rbp
+	adox	r11,rax
+
+
+	add	r12,r8
+	adc	r9,r13
+	mov	rdx,r12
+	adc	r10,r14
+	adc	r11,r15
+	mov	r14,r9
+	adc	rax,0
+
+
+	sub	r12,QWORD[rsi]
+	mov	r15,r10
+	sbb	r9,QWORD[8+rsi]
+	sbb	r10,QWORD[16+rsi]
+	mov	r8,r11
+	sbb	r11,QWORD[24+rsi]
+	sbb	rax,0
+
+	cmovnc	rdx,r12
+	cmovnc	r14,r9
+	cmovnc	r15,r10
+	cmovnc	r8,r11
+
+	dec	rbx
+	jnz	NEAR $L$oop_ord_sqrx
+
+	mov	QWORD[rdi],rdx
+	mov	QWORD[8+rdi],r14
+	pxor	xmm1,xmm1
+	mov	QWORD[16+rdi],r15
+	pxor	xmm2,xmm2
+	mov	QWORD[24+rdi],r8
+	pxor	xmm3,xmm3
+
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$ord_sqrx_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_ecp_nistz256_ord_sqr_montx:
+
+
+
+
+
+
+global	GFp_nistz256_mul_mont
+
+ALIGN	32
+GFp_nistz256_mul_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_mul_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+	lea	rcx,[GFp_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+$L$mul_mont:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$mul_body:
+	cmp	ecx,0x80100
+	je	NEAR $L$mul_montx
+	mov	rbx,rdx
+	mov	rax,QWORD[rdx]
+	mov	r9,QWORD[rsi]
+	mov	r10,QWORD[8+rsi]
+	mov	r11,QWORD[16+rsi]
+	mov	r12,QWORD[24+rsi]
+
+	call	__ecp_nistz256_mul_montq
+	jmp	NEAR $L$mul_mont_done
+
+ALIGN	32
+$L$mul_montx:
+	mov	rbx,rdx
+	mov	rdx,QWORD[rdx]
+	mov	r9,QWORD[rsi]
+	mov	r10,QWORD[8+rsi]
+	mov	r11,QWORD[16+rsi]
+	mov	r12,QWORD[24+rsi]
+	lea	rsi,[((-128))+rsi]
+
+	call	__ecp_nistz256_mul_montx
+$L$mul_mont_done:
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$mul_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_mul_mont:
+
+
+ALIGN	32
+__ecp_nistz256_mul_montq:
+
+
+
+	mov	rbp,rax
+	mul	r9
+	mov	r14,QWORD[(($L$poly+8))]
+	mov	r8,rax
+	mov	rax,rbp
+	mov	r9,rdx
+
+	mul	r10
+	mov	r15,QWORD[(($L$poly+24))]
+	add	r9,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	r11
+	add	r10,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	r12
+	add	r11,rax
+	mov	rax,r8
+	adc	rdx,0
+	xor	r13,r13
+	mov	r12,rdx
+
+
+
+
+
+
+
+
+
+
+	mov	rbp,r8
+	shl	r8,32
+	mul	r15
+	shr	rbp,32
+	add	r9,r8
+	adc	r10,rbp
+	adc	r11,rax
+	mov	rax,QWORD[8+rbx]
+	adc	r12,rdx
+	adc	r13,0
+	xor	r8,r8
+
+
+
+	mov	rbp,rax
+	mul	QWORD[rsi]
+	add	r9,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[8+rsi]
+	add	r10,rcx
+	adc	rdx,0
+	add	r10,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[16+rsi]
+	add	r11,rcx
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[24+rsi]
+	add	r12,rcx
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,r9
+	adc	r13,rdx
+	adc	r8,0
+
+
+
+	mov	rbp,r9
+	shl	r9,32
+	mul	r15
+	shr	rbp,32
+	add	r10,r9
+	adc	r11,rbp
+	adc	r12,rax
+	mov	rax,QWORD[16+rbx]
+	adc	r13,rdx
+	adc	r8,0
+	xor	r9,r9
+
+
+
+	mov	rbp,rax
+	mul	QWORD[rsi]
+	add	r10,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[8+rsi]
+	add	r11,rcx
+	adc	rdx,0
+	add	r11,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[16+rsi]
+	add	r12,rcx
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[24+rsi]
+	add	r13,rcx
+	adc	rdx,0
+	add	r13,rax
+	mov	rax,r10
+	adc	r8,rdx
+	adc	r9,0
+
+
+
+	mov	rbp,r10
+	shl	r10,32
+	mul	r15
+	shr	rbp,32
+	add	r11,r10
+	adc	r12,rbp
+	adc	r13,rax
+	mov	rax,QWORD[24+rbx]
+	adc	r8,rdx
+	adc	r9,0
+	xor	r10,r10
+
+
+
+	mov	rbp,rax
+	mul	QWORD[rsi]
+	add	r11,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[8+rsi]
+	add	r12,rcx
+	adc	rdx,0
+	add	r12,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[16+rsi]
+	add	r13,rcx
+	adc	rdx,0
+	add	r13,rax
+	mov	rax,rbp
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	QWORD[24+rsi]
+	add	r8,rcx
+	adc	rdx,0
+	add	r8,rax
+	mov	rax,r11
+	adc	r9,rdx
+	adc	r10,0
+
+
+
+	mov	rbp,r11
+	shl	r11,32
+	mul	r15
+	shr	rbp,32
+	add	r12,r11
+	adc	r13,rbp
+	mov	rcx,r12
+	adc	r8,rax
+	adc	r9,rdx
+	mov	rbp,r13
+	adc	r10,0
+
+
+
+	sub	r12,-1
+	mov	rbx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	rdx,r9
+	sbb	r9,r15
+	sbb	r10,0
+
+	cmovc	r12,rcx
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rbx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,rdx
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+
+
+
+
+
+
+global	GFp_nistz256_sqr_mont
+
+ALIGN	32
+GFp_nistz256_sqr_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_sqr_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+	lea	rcx,[GFp_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$sqr_body:
+	cmp	ecx,0x80100
+	je	NEAR $L$sqr_montx
+	mov	rax,QWORD[rsi]
+	mov	r14,QWORD[8+rsi]
+	mov	r15,QWORD[16+rsi]
+	mov	r8,QWORD[24+rsi]
+
+	call	__ecp_nistz256_sqr_montq
+	jmp	NEAR $L$sqr_mont_done
+
+ALIGN	32
+$L$sqr_montx:
+	mov	rdx,QWORD[rsi]
+	mov	r14,QWORD[8+rsi]
+	mov	r15,QWORD[16+rsi]
+	mov	r8,QWORD[24+rsi]
+	lea	rsi,[((-128))+rsi]
+
+	call	__ecp_nistz256_sqr_montx
+$L$sqr_mont_done:
+	mov	r15,QWORD[rsp]
+
+	mov	r14,QWORD[8+rsp]
+
+	mov	r13,QWORD[16+rsp]
+
+	mov	r12,QWORD[24+rsp]
+
+	mov	rbx,QWORD[32+rsp]
+
+	mov	rbp,QWORD[40+rsp]
+
+	lea	rsp,[48+rsp]
+
+$L$sqr_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_sqr_mont:
+
+
+ALIGN	32
+__ecp_nistz256_sqr_montq:
+
+	mov	r13,rax
+	mul	r14
+	mov	r9,rax
+	mov	rax,r15
+	mov	r10,rdx
+
+	mul	r13
+	add	r10,rax
+	mov	rax,r8
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	r13
+	add	r11,rax
+	mov	rax,r15
+	adc	rdx,0
+	mov	r12,rdx
+
+
+	mul	r14
+	add	r11,rax
+	mov	rax,r8
+	adc	rdx,0
+	mov	rbp,rdx
+
+	mul	r14
+	add	r12,rax
+	mov	rax,r8
+	adc	rdx,0
+	add	r12,rbp
+	mov	r13,rdx
+	adc	r13,0
+
+
+	mul	r15
+	xor	r15,r15
+	add	r13,rax
+	mov	rax,QWORD[rsi]
+	mov	r14,rdx
+	adc	r14,0
+
+	add	r9,r9
+	adc	r10,r10
+	adc	r11,r11
+	adc	r12,r12
+	adc	r13,r13
+	adc	r14,r14
+	adc	r15,0
+
+	mul	rax
+	mov	r8,rax
+	mov	rax,QWORD[8+rsi]
+	mov	rcx,rdx
+
+	mul	rax
+	add	r9,rcx
+	adc	r10,rax
+	mov	rax,QWORD[16+rsi]
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	rax
+	add	r11,rcx
+	adc	r12,rax
+	mov	rax,QWORD[24+rsi]
+	adc	rdx,0
+	mov	rcx,rdx
+
+	mul	rax
+	add	r13,rcx
+	adc	r14,rax
+	mov	rax,r8
+	adc	r15,rdx
+
+	mov	rsi,QWORD[(($L$poly+8))]
+	mov	rbp,QWORD[(($L$poly+24))]
+
+
+
+
+	mov	rcx,r8
+	shl	r8,32
+	mul	rbp
+	shr	rcx,32
+	add	r9,r8
+	adc	r10,rcx
+	adc	r11,rax
+	mov	rax,r9
+	adc	rdx,0
+
+
+
+	mov	rcx,r9
+	shl	r9,32
+	mov	r8,rdx
+	mul	rbp
+	shr	rcx,32
+	add	r10,r9
+	adc	r11,rcx
+	adc	r8,rax
+	mov	rax,r10
+	adc	rdx,0
+
+
+
+	mov	rcx,r10
+	shl	r10,32
+	mov	r9,rdx
+	mul	rbp
+	shr	rcx,32
+	add	r11,r10
+	adc	r8,rcx
+	adc	r9,rax
+	mov	rax,r11
+	adc	rdx,0
+
+
+
+	mov	rcx,r11
+	shl	r11,32
+	mov	r10,rdx
+	mul	rbp
+	shr	rcx,32
+	add	r8,r11
+	adc	r9,rcx
+	adc	r10,rax
+	adc	rdx,0
+	xor	r11,r11
+
+
+
+	add	r12,r8
+	adc	r13,r9
+	mov	r8,r12
+	adc	r14,r10
+	adc	r15,rdx
+	mov	r9,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	r10,r14
+	sbb	r13,rsi
+	sbb	r14,0
+	mov	rcx,r15
+	sbb	r15,rbp
+	sbb	r11,0
+
+	cmovc	r12,r8
+	cmovc	r13,r9
+	mov	QWORD[rdi],r12
+	cmovc	r14,r10
+	mov	QWORD[8+rdi],r13
+	cmovc	r15,rcx
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	32
+__ecp_nistz256_mul_montx:
+
+
+
+	mulx	r9,r8,r9
+	mulx	r10,rcx,r10
+	mov	r14,32
+	xor	r13,r13
+	mulx	r11,rbp,r11
+	mov	r15,QWORD[(($L$poly+24))]
+	adc	r9,rcx
+	mulx	r12,rcx,r12
+	mov	rdx,r8
+	adc	r10,rbp
+	shlx	rbp,r8,r14
+	adc	r11,rcx
+	shrx	rcx,r8,r14
+	adc	r12,0
+
+
+
+	add	r9,rbp
+	adc	r10,rcx
+
+	mulx	rbp,rcx,r15
+	mov	rdx,QWORD[8+rbx]
+	adc	r11,rcx
+	adc	r12,rbp
+	adc	r13,0
+	xor	r8,r8
+
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r9,rcx
+	adox	r10,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r9
+	adcx	r12,rcx
+	shlx	rcx,r9,r14
+	adox	r13,rbp
+	shrx	rbp,r9,r14
+
+	adcx	r13,r8
+	adox	r8,r8
+	adc	r8,0
+
+
+
+	add	r10,rcx
+	adc	r11,rbp
+
+	mulx	rbp,rcx,r15
+	mov	rdx,QWORD[16+rbx]
+	adc	r12,rcx
+	adc	r13,rbp
+	adc	r8,0
+	xor	r9,r9
+
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r10,rcx
+	adox	r11,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r10
+	adcx	r13,rcx
+	shlx	rcx,r10,r14
+	adox	r8,rbp
+	shrx	rbp,r10,r14
+
+	adcx	r8,r9
+	adox	r9,r9
+	adc	r9,0
+
+
+
+	add	r11,rcx
+	adc	r12,rbp
+
+	mulx	rbp,rcx,r15
+	mov	rdx,QWORD[24+rbx]
+	adc	r13,rcx
+	adc	r8,rbp
+	adc	r9,0
+	xor	r10,r10
+
+
+
+	mulx	rbp,rcx,QWORD[((0+128))+rsi]
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,QWORD[((8+128))+rsi]
+	adcx	r12,rcx
+	adox	r13,rbp
+
+	mulx	rbp,rcx,QWORD[((16+128))+rsi]
+	adcx	r13,rcx
+	adox	r8,rbp
+
+	mulx	rbp,rcx,QWORD[((24+128))+rsi]
+	mov	rdx,r11
+	adcx	r8,rcx
+	shlx	rcx,r11,r14
+	adox	r9,rbp
+	shrx	rbp,r11,r14
+
+	adcx	r9,r10
+	adox	r10,r10
+	adc	r10,0
+
+
+
+	add	r12,rcx
+	adc	r13,rbp
+
+	mulx	rbp,rcx,r15
+	mov	rbx,r12
+	mov	r14,QWORD[(($L$poly+8))]
+	adc	r8,rcx
+	mov	rdx,r13
+	adc	r9,rbp
+	adc	r10,0
+
+
+
+	xor	eax,eax
+	mov	rcx,r8
+	sbb	r12,-1
+	sbb	r13,r14
+	sbb	r8,0
+	mov	rbp,r9
+	sbb	r9,r15
+	sbb	r10,0
+
+	cmovc	r12,rbx
+	cmovc	r13,rdx
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,rbp
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+ALIGN	32
+__ecp_nistz256_sqr_montx:
+
+	mulx	r10,r9,r14
+	mulx	r11,rcx,r15
+	xor	eax,eax
+	adc	r10,rcx
+	mulx	r12,rbp,r8
+	mov	rdx,r14
+	adc	r11,rbp
+	adc	r12,0
+	xor	r13,r13
+
+
+	mulx	rbp,rcx,r15
+	adcx	r11,rcx
+	adox	r12,rbp
+
+	mulx	rbp,rcx,r8
+	mov	rdx,r15
+	adcx	r12,rcx
+	adox	r13,rbp
+	adc	r13,0
+
+
+	mulx	r14,rcx,r8
+	mov	rdx,QWORD[((0+128))+rsi]
+	xor	r15,r15
+	adcx	r9,r9
+	adox	r13,rcx
+	adcx	r10,r10
+	adox	r14,r15
+
+	mulx	rbp,r8,rdx
+	mov	rdx,QWORD[((8+128))+rsi]
+	adcx	r11,r11
+	adox	r9,rbp
+	adcx	r12,r12
+	mulx	rax,rcx,rdx
+	mov	rdx,QWORD[((16+128))+rsi]
+	adcx	r13,r13
+	adox	r10,rcx
+	adcx	r14,r14
+DB	0x67
+	mulx	rbp,rcx,rdx
+	mov	rdx,QWORD[((24+128))+rsi]
+	adox	r11,rax
+	adcx	r15,r15
+	adox	r12,rcx
+	mov	rsi,32
+	adox	r13,rbp
+DB	0x67,0x67
+	mulx	rax,rcx,rdx
+	mov	rdx,QWORD[(($L$poly+24))]
+	adox	r14,rcx
+	shlx	rcx,r8,rsi
+	adox	r15,rax
+	shrx	rax,r8,rsi
+	mov	rbp,rdx
+
+
+	add	r9,rcx
+	adc	r10,rax
+
+	mulx	r8,rcx,r8
+	adc	r11,rcx
+	shlx	rcx,r9,rsi
+	adc	r8,0
+	shrx	rax,r9,rsi
+
+
+	add	r10,rcx
+	adc	r11,rax
+
+	mulx	r9,rcx,r9
+	adc	r8,rcx
+	shlx	rcx,r10,rsi
+	adc	r9,0
+	shrx	rax,r10,rsi
+
+
+	add	r11,rcx
+	adc	r8,rax
+
+	mulx	r10,rcx,r10
+	adc	r9,rcx
+	shlx	rcx,r11,rsi
+	adc	r10,0
+	shrx	rax,r11,rsi
+
+
+	add	r8,rcx
+	adc	r9,rax
+
+	mulx	r11,rcx,r11
+	adc	r10,rcx
+	adc	r11,0
+
+	xor	rdx,rdx
+	add	r12,r8
+	mov	rsi,QWORD[(($L$poly+8))]
+	adc	r13,r9
+	mov	r8,r12
+	adc	r14,r10
+	adc	r15,r11
+	mov	r9,r13
+	adc	rdx,0
+
+	sub	r12,-1
+	mov	r10,r14
+	sbb	r13,rsi
+	sbb	r14,0
+	mov	r11,r15
+	sbb	r15,rbp
+	sbb	rdx,0
+
+	cmovc	r12,r8
+	cmovc	r13,r9
+	mov	QWORD[rdi],r12
+	cmovc	r14,r10
+	mov	QWORD[8+rdi],r13
+	cmovc	r15,r11
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+global	GFp_nistz256_select_w5
+
+ALIGN	32
+GFp_nistz256_select_w5:
+
+	lea	rax,[GFp_ia32cap_P]
+	mov	rax,QWORD[8+rax]
+	test	eax,32
+	jnz	NEAR $L$avx2_select_w5
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_GFp_nistz256_select_w5:
+DB	0x48,0x8d,0x60,0xe0
+DB	0x0f,0x29,0x70,0xe0
+DB	0x0f,0x29,0x78,0xf0
+DB	0x44,0x0f,0x29,0x00
+DB	0x44,0x0f,0x29,0x48,0x10
+DB	0x44,0x0f,0x29,0x50,0x20
+DB	0x44,0x0f,0x29,0x58,0x30
+DB	0x44,0x0f,0x29,0x60,0x40
+DB	0x44,0x0f,0x29,0x68,0x50
+DB	0x44,0x0f,0x29,0x70,0x60
+DB	0x44,0x0f,0x29,0x78,0x70
+	movdqa	xmm0,XMMWORD[$L$One]
+	movd	xmm1,r8d
+
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+
+	movdqa	xmm8,xmm0
+	pshufd	xmm1,xmm1,0
+
+	mov	rax,16
+$L$select_loop_sse_w5:
+
+	movdqa	xmm15,xmm8
+	paddd	xmm8,xmm0
+	pcmpeqd	xmm15,xmm1
+
+	movdqa	xmm9,XMMWORD[rdx]
+	movdqa	xmm10,XMMWORD[16+rdx]
+	movdqa	xmm11,XMMWORD[32+rdx]
+	movdqa	xmm12,XMMWORD[48+rdx]
+	movdqa	xmm13,XMMWORD[64+rdx]
+	movdqa	xmm14,XMMWORD[80+rdx]
+	lea	rdx,[96+rdx]
+
+	pand	xmm9,xmm15
+	pand	xmm10,xmm15
+	por	xmm2,xmm9
+	pand	xmm11,xmm15
+	por	xmm3,xmm10
+	pand	xmm12,xmm15
+	por	xmm4,xmm11
+	pand	xmm13,xmm15
+	por	xmm5,xmm12
+	pand	xmm14,xmm15
+	por	xmm6,xmm13
+	por	xmm7,xmm14
+
+	dec	rax
+	jnz	NEAR $L$select_loop_sse_w5
+
+	movdqu	XMMWORD[rcx],xmm2
+	movdqu	XMMWORD[16+rcx],xmm3
+	movdqu	XMMWORD[32+rcx],xmm4
+	movdqu	XMMWORD[48+rcx],xmm5
+	movdqu	XMMWORD[64+rcx],xmm6
+	movdqu	XMMWORD[80+rcx],xmm7
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_select_w5:
+
+
+
+
+global	GFp_nistz256_select_w7
+
+ALIGN	32
+GFp_nistz256_select_w7:
+
+	lea	rax,[GFp_ia32cap_P]
+	mov	rax,QWORD[8+rax]
+	test	eax,32
+	jnz	NEAR $L$avx2_select_w7
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_GFp_nistz256_select_w7:
+DB	0x48,0x8d,0x60,0xe0
+DB	0x0f,0x29,0x70,0xe0
+DB	0x0f,0x29,0x78,0xf0
+DB	0x44,0x0f,0x29,0x00
+DB	0x44,0x0f,0x29,0x48,0x10
+DB	0x44,0x0f,0x29,0x50,0x20
+DB	0x44,0x0f,0x29,0x58,0x30
+DB	0x44,0x0f,0x29,0x60,0x40
+DB	0x44,0x0f,0x29,0x68,0x50
+DB	0x44,0x0f,0x29,0x70,0x60
+DB	0x44,0x0f,0x29,0x78,0x70
+	movdqa	xmm8,XMMWORD[$L$One]
+	movd	xmm1,r8d
+
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+
+	movdqa	xmm0,xmm8
+	pshufd	xmm1,xmm1,0
+	mov	rax,64
+
+$L$select_loop_sse_w7:
+	movdqa	xmm15,xmm8
+	paddd	xmm8,xmm0
+	movdqa	xmm9,XMMWORD[rdx]
+	movdqa	xmm10,XMMWORD[16+rdx]
+	pcmpeqd	xmm15,xmm1
+	movdqa	xmm11,XMMWORD[32+rdx]
+	movdqa	xmm12,XMMWORD[48+rdx]
+	lea	rdx,[64+rdx]
+
+	pand	xmm9,xmm15
+	pand	xmm10,xmm15
+	por	xmm2,xmm9
+	pand	xmm11,xmm15
+	por	xmm3,xmm10
+	pand	xmm12,xmm15
+	por	xmm4,xmm11
+	prefetcht0	[255+rdx]
+	por	xmm5,xmm12
+
+	dec	rax
+	jnz	NEAR $L$select_loop_sse_w7
+
+	movdqu	XMMWORD[rcx],xmm2
+	movdqu	XMMWORD[16+rcx],xmm3
+	movdqu	XMMWORD[32+rcx],xmm4
+	movdqu	XMMWORD[48+rcx],xmm5
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_select_w7:
+
+
+
+
+ALIGN	32
+GFp_nistz256_avx2_select_w5:
+
+$L$avx2_select_w5:
+	vzeroupper
+	lea	rax,[((-136))+rsp]
+	mov	r11,rsp
+$L$SEH_begin_GFp_nistz256_avx2_select_w5:
+DB	0x48,0x8d,0x60,0xe0
+DB	0xc5,0xf8,0x29,0x70,0xe0
+DB	0xc5,0xf8,0x29,0x78,0xf0
+DB	0xc5,0x78,0x29,0x40,0x00
+DB	0xc5,0x78,0x29,0x48,0x10
+DB	0xc5,0x78,0x29,0x50,0x20
+DB	0xc5,0x78,0x29,0x58,0x30
+DB	0xc5,0x78,0x29,0x60,0x40
+DB	0xc5,0x78,0x29,0x68,0x50
+DB	0xc5,0x78,0x29,0x70,0x60
+DB	0xc5,0x78,0x29,0x78,0x70
+	vmovdqa	ymm0,YMMWORD[$L$Two]
+
+	vpxor	ymm2,ymm2,ymm2
+	vpxor	ymm3,ymm3,ymm3
+	vpxor	ymm4,ymm4,ymm4
+
+	vmovdqa	ymm5,YMMWORD[$L$One]
+	vmovdqa	ymm10,YMMWORD[$L$Two]
+
+	vmovd	xmm1,r8d
+	vpermd	ymm1,ymm2,ymm1
+
+	mov	rax,8
+$L$select_loop_avx2_w5:
+
+	vmovdqa	ymm6,YMMWORD[rdx]
+	vmovdqa	ymm7,YMMWORD[32+rdx]
+	vmovdqa	ymm8,YMMWORD[64+rdx]
+
+	vmovdqa	ymm11,YMMWORD[96+rdx]
+	vmovdqa	ymm12,YMMWORD[128+rdx]
+	vmovdqa	ymm13,YMMWORD[160+rdx]
+
+	vpcmpeqd	ymm9,ymm5,ymm1
+	vpcmpeqd	ymm14,ymm10,ymm1
+
+	vpaddd	ymm5,ymm5,ymm0
+	vpaddd	ymm10,ymm10,ymm0
+	lea	rdx,[192+rdx]
+
+	vpand	ymm6,ymm6,ymm9
+	vpand	ymm7,ymm7,ymm9
+	vpand	ymm8,ymm8,ymm9
+	vpand	ymm11,ymm11,ymm14
+	vpand	ymm12,ymm12,ymm14
+	vpand	ymm13,ymm13,ymm14
+
+	vpxor	ymm2,ymm2,ymm6
+	vpxor	ymm3,ymm3,ymm7
+	vpxor	ymm4,ymm4,ymm8
+	vpxor	ymm2,ymm2,ymm11
+	vpxor	ymm3,ymm3,ymm12
+	vpxor	ymm4,ymm4,ymm13
+
+	dec	rax
+	jnz	NEAR $L$select_loop_avx2_w5
+
+	vmovdqu	YMMWORD[rcx],ymm2
+	vmovdqu	YMMWORD[32+rcx],ymm3
+	vmovdqu	YMMWORD[64+rcx],ymm4
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[r11]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_avx2_select_w5:
+
+
+
+
+global	GFp_nistz256_avx2_select_w7
+
+ALIGN	32
+GFp_nistz256_avx2_select_w7:
+
+$L$avx2_select_w7:
+	vzeroupper
+	mov	r11,rsp
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_GFp_nistz256_avx2_select_w7:
+DB	0x48,0x8d,0x60,0xe0
+DB	0xc5,0xf8,0x29,0x70,0xe0
+DB	0xc5,0xf8,0x29,0x78,0xf0
+DB	0xc5,0x78,0x29,0x40,0x00
+DB	0xc5,0x78,0x29,0x48,0x10
+DB	0xc5,0x78,0x29,0x50,0x20
+DB	0xc5,0x78,0x29,0x58,0x30
+DB	0xc5,0x78,0x29,0x60,0x40
+DB	0xc5,0x78,0x29,0x68,0x50
+DB	0xc5,0x78,0x29,0x70,0x60
+DB	0xc5,0x78,0x29,0x78,0x70
+	vmovdqa	ymm0,YMMWORD[$L$Three]
+
+	vpxor	ymm2,ymm2,ymm2
+	vpxor	ymm3,ymm3,ymm3
+
+	vmovdqa	ymm4,YMMWORD[$L$One]
+	vmovdqa	ymm8,YMMWORD[$L$Two]
+	vmovdqa	ymm12,YMMWORD[$L$Three]
+
+	vmovd	xmm1,r8d
+	vpermd	ymm1,ymm2,ymm1
+
+
+	mov	rax,21
+$L$select_loop_avx2_w7:
+
+	vmovdqa	ymm5,YMMWORD[rdx]
+	vmovdqa	ymm6,YMMWORD[32+rdx]
+
+	vmovdqa	ymm9,YMMWORD[64+rdx]
+	vmovdqa	ymm10,YMMWORD[96+rdx]
+
+	vmovdqa	ymm13,YMMWORD[128+rdx]
+	vmovdqa	ymm14,YMMWORD[160+rdx]
+
+	vpcmpeqd	ymm7,ymm4,ymm1
+	vpcmpeqd	ymm11,ymm8,ymm1
+	vpcmpeqd	ymm15,ymm12,ymm1
+
+	vpaddd	ymm4,ymm4,ymm0
+	vpaddd	ymm8,ymm8,ymm0
+	vpaddd	ymm12,ymm12,ymm0
+	lea	rdx,[192+rdx]
+
+	vpand	ymm5,ymm5,ymm7
+	vpand	ymm6,ymm6,ymm7
+	vpand	ymm9,ymm9,ymm11
+	vpand	ymm10,ymm10,ymm11
+	vpand	ymm13,ymm13,ymm15
+	vpand	ymm14,ymm14,ymm15
+
+	vpxor	ymm2,ymm2,ymm5
+	vpxor	ymm3,ymm3,ymm6
+	vpxor	ymm2,ymm2,ymm9
+	vpxor	ymm3,ymm3,ymm10
+	vpxor	ymm2,ymm2,ymm13
+	vpxor	ymm3,ymm3,ymm14
+
+	dec	rax
+	jnz	NEAR $L$select_loop_avx2_w7
+
+
+	vmovdqa	ymm5,YMMWORD[rdx]
+	vmovdqa	ymm6,YMMWORD[32+rdx]
+
+	vpcmpeqd	ymm7,ymm4,ymm1
+
+	vpand	ymm5,ymm5,ymm7
+	vpand	ymm6,ymm6,ymm7
+
+	vpxor	ymm2,ymm2,ymm5
+	vpxor	ymm3,ymm3,ymm6
+
+	vmovdqu	YMMWORD[rcx],ymm2
+	vmovdqu	YMMWORD[32+rcx],ymm3
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[r11]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_avx2_select_w7:
+
+
+ALIGN	32
+__ecp_nistz256_add_toq:
+
+	xor	r11,r11
+	add	r12,QWORD[rbx]
+	adc	r13,QWORD[8+rbx]
+	mov	rax,r12
+	adc	r8,QWORD[16+rbx]
+	adc	r9,QWORD[24+rbx]
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+ALIGN	32
+__ecp_nistz256_sub_fromq:
+
+	sub	r12,QWORD[rbx]
+	sbb	r13,QWORD[8+rbx]
+	mov	rax,r12
+	sbb	r8,QWORD[16+rbx]
+	sbb	r9,QWORD[24+rbx]
+	mov	rbp,r13
+	sbb	r11,r11
+
+	add	r12,-1
+	mov	rcx,r8
+	adc	r13,r14
+	adc	r8,0
+	mov	r10,r9
+	adc	r9,r15
+	test	r11,r11
+
+	cmovz	r12,rax
+	cmovz	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovz	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovz	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+ALIGN	32
+__ecp_nistz256_subq:
+
+	sub	rax,r12
+	sbb	rbp,r13
+	mov	r12,rax
+	sbb	rcx,r8
+	sbb	r10,r9
+	mov	r13,rbp
+	sbb	r11,r11
+
+	add	rax,-1
+	mov	r8,rcx
+	adc	rbp,r14
+	adc	rcx,0
+	mov	r9,r10
+	adc	r10,r15
+	test	r11,r11
+
+	cmovnz	r12,rax
+	cmovnz	r13,rbp
+	cmovnz	r8,rcx
+	cmovnz	r9,r10
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+ALIGN	32
+__ecp_nistz256_mul_by_2q:
+
+	xor	r11,r11
+	add	r12,r12
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_nistz256_point_double
+
+;-----------------------------------------------------------------------
+; void GFp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
+; Win64 ABI entry (args in rcx/rdx; rdi/rsi saved to the caller's
+; shadow space). Doubles a P-256 point in Jacobian (X,Y,Z) form using
+; the generic mul/sqr Montgomery helpers. Dispatches to the ADX/BMI2
+; variant when both bits of mask 0x80100 are set in GFp_ia32cap_P[1].
+; Stack frame: five 256-bit temporaries (32*5 bytes) + alignment pad.
+;-----------------------------------------------------------------------
+ALIGN	32
+GFp_nistz256_point_double:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_point_double:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+	lea	rcx,[GFp_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$point_doublex	; both CPU feature bits set -> mulx path
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*5+8
+
+$L$point_doubleq_body:
+
+; Re-entered from GFp_nistz256_point_add when the two inputs are equal.
+$L$point_double_shortcutq:
+	movdqu	xmm0,XMMWORD[rsi]
+	mov	rbx,rsi
+	movdqu	xmm1,XMMWORD[16+rsi]
+	mov	r12,QWORD[((32+0))+rsi]
+	mov	r13,QWORD[((32+8))+rsi]
+	mov	r8,QWORD[((32+16))+rsi]
+	mov	r9,QWORD[((32+24))+rsi]
+	mov	r14,QWORD[(($L$poly+8))]	; p[1] for the reduction helpers
+	mov	r15,QWORD[(($L$poly+24))]	; p[3]
+	movdqa	XMMWORD[96+rsp],xmm0
+	movdqa	XMMWORD[(96+16)+rsp],xmm1
+	lea	r10,[32+rdi]
+	lea	r11,[64+rdi]
+DB	102,72,15,110,199
+DB	102,73,15,110,202
+DB	102,73,15,110,211
+
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_by_2q
+
+	mov	rax,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	lea	rsi,[((64-0))+rsi]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[32+rbx]
+	mov	r9,QWORD[((64+0))+rbx]
+	mov	r10,QWORD[((64+8))+rbx]
+	mov	r11,QWORD[((64+16))+rbx]
+	mov	r12,QWORD[((64+24))+rbx]
+	lea	rsi,[((64-0))+rbx]
+	lea	rbx,[32+rbx]
+DB	102,72,15,126,215
+	call	__ecp_nistz256_mul_montq
+	call	__ecp_nistz256_mul_by_2q
+
+	mov	r12,QWORD[((96+0))+rsp]
+	mov	r13,QWORD[((96+8))+rsp]
+	lea	rbx,[64+rsp]
+	mov	r8,QWORD[((96+16))+rsp]
+	mov	r9,QWORD[((96+24))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_add_toq
+
+	mov	r12,QWORD[((96+0))+rsp]
+	mov	r13,QWORD[((96+8))+rsp]
+	lea	rbx,[64+rsp]
+	mov	r8,QWORD[((96+16))+rsp]
+	mov	r9,QWORD[((96+24))+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+DB	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montq
+; Halve mod p: add p when the value is odd (test rax,1 — the cmovz chain
+; keeps the un-added value when even), then shift the 4-limb value right
+; one bit, propagating the r9 carry into the top limb.
+; NOTE(review): rsi/rbp appear to hold p[1]/p[3] here, presumably set up
+; by __ecp_nistz256_sqr_montq — confirm against that helper.
+	xor	r9,r9
+	mov	rax,r12
+	add	r12,-1
+	mov	r10,r13
+	adc	r13,rsi
+	mov	rcx,r14
+	adc	r14,0
+	mov	r8,r15
+	adc	r15,rbp
+	adc	r9,0
+	xor	rsi,rsi
+	test	rax,1
+
+	cmovz	r12,rax
+	cmovz	r13,r10
+	cmovz	r14,rcx
+	cmovz	r15,r8
+	cmovz	r9,rsi
+
+	mov	rax,r13
+	shr	r12,1
+	shl	rax,63
+	mov	r10,r14
+	shr	r13,1
+	or	r12,rax
+	shl	r10,63
+	mov	rcx,r15
+	shr	r14,1
+	or	r13,r10
+	shl	rcx,63
+	mov	QWORD[rdi],r12
+	shr	r15,1
+	mov	QWORD[8+rdi],r13
+	shl	r9,63
+	or	r14,rcx
+	or	r15,r9
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+	mov	rax,QWORD[64+rsp]
+	lea	rbx,[64+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_by_2q
+
+	lea	rbx,[32+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_add_toq
+
+	mov	rax,QWORD[96+rsp]
+	lea	rbx,[96+rsp]
+	mov	r9,QWORD[((0+0))+rsp]
+	mov	r10,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r11,QWORD[((16+0))+rsp]
+	mov	r12,QWORD[((24+0))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_by_2q
+
+	mov	rax,QWORD[((0+32))+rsp]
+	mov	r14,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r15,QWORD[((16+32))+rsp]
+	mov	r8,QWORD[((24+32))+rsp]
+DB	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montq
+
+	lea	rbx,[128+rsp]
+	mov	r8,r14
+	mov	r9,r15
+	mov	r14,rsi
+	mov	r15,rbp
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	rbp,QWORD[((0+8))+rsp]
+	mov	rcx,QWORD[((0+16))+rsp]
+	mov	r10,QWORD[((0+24))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_subq
+
+	mov	rax,QWORD[32+rsp]
+	lea	rbx,[32+rsp]
+	mov	r14,r12
+	xor	ecx,ecx
+	mov	QWORD[((0+0))+rsp],r12
+	mov	r10,r13
+	mov	QWORD[((0+8))+rsp],r13
+	cmovz	r11,r8
+	mov	QWORD[((0+16))+rsp],r8
+	lea	rsi,[((0-0))+rsp]
+	cmovz	r12,r9
+	mov	QWORD[((0+24))+rsp],r9
+	mov	r9,r14
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montq
+
+DB	102,72,15,126,203
+DB	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromq
+
+; Epilogue: restore callee-saved registers relative to the saved frame.
+	lea	rsi,[((160+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$point_doubleq_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_point_double:
+global	GFp_nistz256_point_add
+
+;-----------------------------------------------------------------------
+; void GFp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
+;                             const P256_POINT *b);
+; Win64 ABI entry (args in rcx/rdx/r8). Adds two P-256 points given in
+; Jacobian (X,Y,Z) form. xmm5 and xmm4 are built into all-ones masks
+; that flag Z==0 (point at infinity) for the two inputs; they drive the
+; branchless result selection at the end. When a == b the code falls
+; back into the point-doubling routine. Dispatches to the ADX/BMI2
+; variant when both bits of 0x80100 are set in GFp_ia32cap_P[1].
+; Stack frame: eighteen 256-bit temporaries (32*18 bytes) + pad.
+;-----------------------------------------------------------------------
+ALIGN	32
+GFp_nistz256_point_add:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_point_add:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+	lea	rcx,[GFp_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$point_addx	; both CPU feature bits set -> mulx path
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*18+8
+
+$L$point_addq_body:
+
+; Copy point a to the frame; xmm5 accumulates an "is Z_a zero" test.
+	movdqu	xmm0,XMMWORD[rsi]
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm3,XMMWORD[48+rsi]
+	movdqu	xmm4,XMMWORD[64+rsi]
+	movdqu	xmm5,XMMWORD[80+rsi]
+	mov	rbx,rsi
+	mov	rsi,rdx
+	movdqa	XMMWORD[384+rsp],xmm0
+	movdqa	XMMWORD[(384+16)+rsp],xmm1
+	movdqa	XMMWORD[416+rsp],xmm2
+	movdqa	XMMWORD[(416+16)+rsp],xmm3
+	movdqa	XMMWORD[448+rsp],xmm4
+	movdqa	XMMWORD[(448+16)+rsp],xmm5
+	por	xmm5,xmm4
+
+; Copy point b; xmm1 accumulates an "is Z_b zero" test.
+	movdqu	xmm0,XMMWORD[rsi]
+	pshufd	xmm3,xmm5,0xb1
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	por	xmm5,xmm3
+	movdqu	xmm3,XMMWORD[48+rsi]
+	mov	rax,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	movdqa	XMMWORD[480+rsp],xmm0
+	pshufd	xmm4,xmm5,0x1e
+	movdqa	XMMWORD[(480+16)+rsp],xmm1
+	movdqu	xmm0,XMMWORD[64+rsi]
+	movdqu	xmm1,XMMWORD[80+rsi]
+	movdqa	XMMWORD[512+rsp],xmm2
+	movdqa	XMMWORD[(512+16)+rsp],xmm3
+	por	xmm5,xmm4
+	pxor	xmm4,xmm4
+	por	xmm1,xmm0
+DB	102,72,15,110,199
+
+	lea	rsi,[((64-0))+rsi]
+	mov	QWORD[((544+0))+rsp],rax
+	mov	QWORD[((544+8))+rsp],r14
+	mov	QWORD[((544+16))+rsp],r15
+	mov	QWORD[((544+24))+rsp],r8
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+; Collapse the Z tests into full-width masks: xmm5 = -(Z_a==0),
+; xmm4 = -(Z_b==0); used by the constant-time selection below.
+	pcmpeqd	xmm5,xmm4
+	pshufd	xmm4,xmm1,0xb1
+	por	xmm4,xmm1
+	pshufd	xmm5,xmm5,0
+	pshufd	xmm3,xmm4,0x1e
+	por	xmm4,xmm3
+	pxor	xmm3,xmm3
+	pcmpeqd	xmm4,xmm3
+	pshufd	xmm4,xmm4,0
+	mov	rax,QWORD[((64+0))+rbx]
+	mov	r14,QWORD[((64+8))+rbx]
+	mov	r15,QWORD[((64+16))+rbx]
+	mov	r8,QWORD[((64+24))+rbx]
+DB	102,72,15,110,203
+
+	lea	rsi,[((64-0))+rbx]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[544+rsp]
+	lea	rbx,[544+rsp]
+	mov	r9,QWORD[((0+96))+rsp]
+	mov	r10,QWORD[((8+96))+rsp]
+	lea	rsi,[((0+96))+rsp]
+	mov	r11,QWORD[((16+96))+rsp]
+	mov	r12,QWORD[((24+96))+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[416+rsp]
+	lea	rbx,[416+rsp]
+	mov	r9,QWORD[((0+224))+rsp]
+	mov	r10,QWORD[((8+224))+rsp]
+	lea	rsi,[((0+224))+rsp]
+	mov	r11,QWORD[((16+224))+rsp]
+	mov	r12,QWORD[((24+224))+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[512+rsp]
+	lea	rbx,[512+rsp]
+	mov	r9,QWORD[((0+256))+rsp]
+	mov	r10,QWORD[((8+256))+rsp]
+	lea	rsi,[((0+256))+rsp]
+	mov	r11,QWORD[((16+256))+rsp]
+	mov	r12,QWORD[((24+256))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[224+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	or	r12,r13			; r12 = OR of the difference's limbs (zero test)
+	movdqa	xmm2,xmm4
+	or	r12,r8
+	or	r12,r9
+	por	xmm2,xmm5
+DB	102,73,15,110,220
+
+	mov	rax,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+96))+rsp]
+	mov	r10,QWORD[((8+96))+rsp]
+	lea	rsi,[((0+96))+rsp]
+	mov	r11,QWORD[((16+96))+rsp]
+	mov	r12,QWORD[((24+96))+rsp]
+	lea	rdi,[160+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[480+rsp]
+	lea	rbx,[480+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[160+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	or	r12,r13
+	or	r12,r8
+	or	r12,r9
+
+; If both differences are zero and neither input is at infinity, the
+; points are equal and the doubling path must be taken instead.
+DB	102,73,15,126,208
+DB	102,73,15,126,217
+	or	r12,r8
+DB	0x3e
+	jnz	NEAR $L$add_proceedq	; 0x3e prefix: branch hint / padding
+
+
+
+	test	r9,r9
+	jz	NEAR $L$add_doubleq
+
+
+
+
+
+; One input is the point at infinity while the points "match": the
+; result is the zero point; clear all 96 output bytes.
+DB	102,72,15,126,199
+	pxor	xmm0,xmm0
+	movdqu	XMMWORD[rdi],xmm0
+	movdqu	XMMWORD[16+rdi],xmm0
+	movdqu	XMMWORD[32+rdi],xmm0
+	movdqu	XMMWORD[48+rdi],xmm0
+	movdqu	XMMWORD[64+rdi],xmm0
+	movdqu	XMMWORD[80+rdi],xmm0
+	jmp	NEAR $L$add_doneq
+
+ALIGN	32
+$L$add_doubleq:
+; a == b: shrink the frame from 32*18 to the doubling routine's 32*5
+; layout and re-enter point_double past its prologue.
+DB	102,72,15,126,206
+DB	102,72,15,126,199
+	add	rsp,416
+
+	jmp	NEAR $L$point_double_shortcutq
+
+
+ALIGN	32
+$L$add_proceedq:
+	mov	rax,QWORD[((0+64))+rsp]
+	mov	r14,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r15,QWORD[((16+64))+rsp]
+	mov	r8,QWORD[((24+64))+rsp]
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+0))+rsp]
+	mov	r10,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r11,QWORD[((16+0))+rsp]
+	mov	r12,QWORD[((24+0))+rsp]
+	lea	rdi,[352+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((0+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[544+rsp]
+	lea	rbx,[544+rsp]
+	mov	r9,QWORD[((0+352))+rsp]
+	mov	r10,QWORD[((8+352))+rsp]
+	lea	rsi,[((0+352))+rsp]
+	mov	r11,QWORD[((16+352))+rsp]
+	mov	r12,QWORD[((24+352))+rsp]
+	lea	rdi,[352+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[rsp]
+	lea	rbx,[rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[160+rsp]
+	lea	rbx,[160+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_mul_montq
+
+
+
+; Inlined mul_by_2 (doubling mod p with cmovc reduction), then subtract.
+	xor	r11,r11
+	add	r12,r12
+	lea	rsi,[96+rsp]
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	mov	rax,QWORD[rsi]
+	cmovc	r13,rbp
+	mov	rbp,QWORD[8+rsi]
+	cmovc	r8,rcx
+	mov	rcx,QWORD[16+rsi]
+	cmovc	r9,r10
+	mov	r10,QWORD[24+rsi]
+
+	call	__ecp_nistz256_subq
+
+	lea	rbx,[128+rsp]
+	lea	rdi,[288+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((192+0))+rsp]
+	mov	rbp,QWORD[((192+8))+rsp]
+	mov	rcx,QWORD[((192+16))+rsp]
+	mov	r10,QWORD[((192+24))+rsp]
+	lea	rdi,[320+rsp]
+
+	call	__ecp_nistz256_subq
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+	mov	rax,QWORD[128+rsp]
+	lea	rbx,[128+rsp]
+	mov	r9,QWORD[((0+224))+rsp]
+	mov	r10,QWORD[((8+224))+rsp]
+	lea	rsi,[((0+224))+rsp]
+	mov	r11,QWORD[((16+224))+rsp]
+	mov	r12,QWORD[((24+224))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[320+rsp]
+	lea	rbx,[320+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[320+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[256+rsp]
+	lea	rdi,[320+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+DB	102,72,15,126,199
+
+; Branchless output selection: for each coordinate pick the computed
+; sum, input b, or input a, according to the infinity masks xmm5/xmm4.
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[352+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((352+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[544+rsp]
+	pand	xmm3,XMMWORD[((544+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[448+rsp]
+	pand	xmm3,XMMWORD[((448+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[64+rdi],xmm2
+	movdqu	XMMWORD[80+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[288+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((288+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[480+rsp]
+	pand	xmm3,XMMWORD[((480+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[384+rsp]
+	pand	xmm3,XMMWORD[((384+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[rdi],xmm2
+	movdqu	XMMWORD[16+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[320+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((320+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[512+rsp]
+	pand	xmm3,XMMWORD[((512+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[416+rsp]
+	pand	xmm3,XMMWORD[((416+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+
+$L$add_doneq:
+	lea	rsi,[((576+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$point_addq_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_point_add:
+global	GFp_nistz256_point_add_affine
+
+;-----------------------------------------------------------------------
+; void GFp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
+;                                    const P256_POINT_AFFINE *b);
+; Win64 ABI entry (args in rcx/rdx/r8). Adds a Jacobian point a and an
+; affine point b (Z_b is implicitly 1: when a is at infinity the output
+; Z is selected from the $L$ONE_mont constant below). xmm5/xmm4 become
+; all-ones masks flagging "a is infinity" / "b is infinity" and drive
+; the branchless output selection. Dispatches to the ADX/BMI2 variant
+; when both bits of 0x80100 are set in GFp_ia32cap_P[1].
+; Stack frame: fifteen 256-bit temporaries (32*15 bytes) + pad.
+;-----------------------------------------------------------------------
+ALIGN	32
+GFp_nistz256_point_add_affine:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_point_add_affine:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+	lea	rcx,[GFp_ia32cap_P]
+	mov	rcx,QWORD[8+rcx]
+	and	ecx,0x80100
+	cmp	ecx,0x80100
+	je	NEAR $L$point_add_affinex	; mulx path if available
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*15+8
+
+$L$add_affineq_body:
+
+; Copy point a; xmm5 accumulates the "Z_a == 0" test.
+	movdqu	xmm0,XMMWORD[rsi]
+	mov	rbx,rdx
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm3,XMMWORD[48+rsi]
+	movdqu	xmm4,XMMWORD[64+rsi]
+	movdqu	xmm5,XMMWORD[80+rsi]
+	mov	rax,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	movdqa	XMMWORD[320+rsp],xmm0
+	movdqa	XMMWORD[(320+16)+rsp],xmm1
+	movdqa	XMMWORD[352+rsp],xmm2
+	movdqa	XMMWORD[(352+16)+rsp],xmm3
+	movdqa	XMMWORD[384+rsp],xmm4
+	movdqa	XMMWORD[(384+16)+rsp],xmm5
+	por	xmm5,xmm4
+
+; Copy affine point b (x,y only); xmm3 accumulates the "b == 0" test.
+	movdqu	xmm0,XMMWORD[rbx]
+	pshufd	xmm3,xmm5,0xb1
+	movdqu	xmm1,XMMWORD[16+rbx]
+	movdqu	xmm2,XMMWORD[32+rbx]
+	por	xmm5,xmm3
+	movdqu	xmm3,XMMWORD[48+rbx]
+	movdqa	XMMWORD[416+rsp],xmm0
+	pshufd	xmm4,xmm5,0x1e
+	movdqa	XMMWORD[(416+16)+rsp],xmm1
+	por	xmm1,xmm0
+DB	102,72,15,110,199
+	movdqa	XMMWORD[448+rsp],xmm2
+	movdqa	XMMWORD[(448+16)+rsp],xmm3
+	por	xmm3,xmm2
+	por	xmm5,xmm4
+	pxor	xmm4,xmm4
+	por	xmm3,xmm1
+
+	lea	rsi,[((64-0))+rsi]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+; Finish the infinity masks: xmm5 = -(a at infinity), xmm4 = -(b zero).
+	pcmpeqd	xmm5,xmm4
+	pshufd	xmm4,xmm3,0xb1
+	mov	rax,QWORD[rbx]
+
+	mov	r9,r12
+	por	xmm4,xmm3
+	pshufd	xmm5,xmm5,0
+	pshufd	xmm3,xmm4,0x1e
+	mov	r10,r13
+	por	xmm4,xmm3
+	pxor	xmm3,xmm3
+	mov	r11,r14
+	pcmpeqd	xmm4,xmm3
+	pshufd	xmm4,xmm4,0
+
+	lea	rsi,[((32-0))+rsp]
+	mov	r12,r15
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[320+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[288+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((0+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[352+rsp]
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((0+64))+rsp]
+	mov	r14,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r15,QWORD[((16+64))+rsp]
+	mov	r8,QWORD[((24+64))+rsp]
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[((0+96))+rsp]
+	mov	r14,QWORD[((8+96))+rsp]
+	lea	rsi,[((0+96))+rsp]
+	mov	r15,QWORD[((16+96))+rsp]
+	mov	r8,QWORD[((24+96))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_sqr_montq
+
+	mov	rax,QWORD[128+rsp]
+	lea	rbx,[128+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[160+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[320+rsp]
+	lea	rbx,[320+rsp]
+	mov	r9,QWORD[((0+128))+rsp]
+	mov	r10,QWORD[((8+128))+rsp]
+	lea	rsi,[((0+128))+rsp]
+	mov	r11,QWORD[((16+128))+rsp]
+	mov	r12,QWORD[((24+128))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montq
+
+
+
+; Inlined mul_by_2 (doubling mod p with cmovc reduction), then subtract.
+	xor	r11,r11
+	add	r12,r12
+	lea	rsi,[192+rsp]
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	mov	rax,QWORD[rsi]
+	cmovc	r13,rbp
+	mov	rbp,QWORD[8+rsi]
+	cmovc	r8,rcx
+	mov	rcx,QWORD[16+rsi]
+	cmovc	r9,r10
+	mov	r10,QWORD[24+rsi]
+
+	call	__ecp_nistz256_subq
+
+	lea	rbx,[160+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	rbp,QWORD[((0+8))+rsp]
+	mov	rcx,QWORD[((0+16))+rsp]
+	mov	r10,QWORD[((0+24))+rsp]
+	lea	rdi,[64+rsp]
+
+	call	__ecp_nistz256_subq
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+	mov	rax,QWORD[352+rsp]
+	lea	rbx,[352+rsp]
+	mov	r9,QWORD[((0+160))+rsp]
+	mov	r10,QWORD[((8+160))+rsp]
+	lea	rsi,[((0+160))+rsp]
+	mov	r11,QWORD[((16+160))+rsp]
+	mov	r12,QWORD[((24+160))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	mov	rax,QWORD[96+rsp]
+	lea	rbx,[96+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((0+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_mul_montq
+
+	lea	rbx,[32+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_sub_fromq
+
+DB	102,72,15,126,199
+
+; Branchless output selection; note the Z coordinate falls back to the
+; Montgomery-form constant 1 ($L$ONE_mont) when a is at infinity.
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[288+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((288+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[$L$ONE_mont]
+	pand	xmm3,XMMWORD[(($L$ONE_mont+16))]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[384+rsp]
+	pand	xmm3,XMMWORD[((384+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[64+rdi],xmm2
+	movdqu	XMMWORD[80+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[224+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((224+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[416+rsp]
+	pand	xmm3,XMMWORD[((416+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[320+rsp]
+	pand	xmm3,XMMWORD[((320+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[rdi],xmm2
+	movdqu	XMMWORD[16+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[256+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((256+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[448+rsp]
+	pand	xmm3,XMMWORD[((448+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[352+rsp]
+	pand	xmm3,XMMWORD[((352+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+
+	lea	rsi,[((480+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$add_affineq_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_point_add_affine:
+
+ALIGN	32
+;-----------------------------------------------------------------------
+; __ecp_nistz256_add_tox (ADX/BMI2-path variant)
+; r12:r13:r8:r9 = (r12:r13:r8:r9 + [rbx]) mod p, also stored to [rdi].
+; The leading `xor r11,r11` clears CF, so the first `adc` acts as a
+; plain add; likewise `xor r10,r10` clears CF before the sbb chain.
+; r14/r15 = p[1]/p[3] (callers load them from $L$poly+8/+24).
+; Clobbers: rax, rbp, rcx, r10, r11, flags. Constant time (cmovc).
+;-----------------------------------------------------------------------
+__ecp_nistz256_add_tox:
+
+	xor	r11,r11
+	adc	r12,QWORD[rbx]
+	adc	r13,QWORD[8+rbx]
+	mov	rax,r12
+	adc	r8,QWORD[16+rbx]
+	adc	r9,QWORD[24+rbx]
+	mov	rbp,r13
+	adc	r11,0
+
+	xor	r10,r10
+	sbb	r12,-1			; subtract p; borrow => sum < p
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+ALIGN	32
+;-----------------------------------------------------------------------
+; __ecp_nistz256_sub_fromx (ADX/BMI2-path variant)
+; r12:r13:r8:r9 = (r12:r13:r8:r9 - [rbx]) mod p, also stored to [rdi].
+; Subtracts, then unconditionally adds p back; `bt r11,0` recovers the
+; borrow bit and cmovnc keeps the un-corrected value when there was no
+; borrow. r14/r15 = p[1]/p[3]. Clobbers rax, rbp, rcx, r10, r11, flags.
+;-----------------------------------------------------------------------
+__ecp_nistz256_sub_fromx:
+
+	xor	r11,r11			; also clears CF for the first sbb
+	sbb	r12,QWORD[rbx]
+	sbb	r13,QWORD[8+rbx]
+	mov	rax,r12
+	sbb	r8,QWORD[16+rbx]
+	sbb	r9,QWORD[24+rbx]
+	mov	rbp,r13
+	sbb	r11,0			; r11 = -borrow
+
+	xor	r10,r10
+	adc	r12,-1			; add p back (speculatively)
+	mov	rcx,r8
+	adc	r13,r14
+	adc	r8,0
+	mov	r10,r9
+	adc	r9,r15
+
+	bt	r11,0			; CF = original borrow
+	cmovnc	r12,rax
+	cmovnc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovnc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovnc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+ALIGN	32
+;-----------------------------------------------------------------------
+; __ecp_nistz256_subx (ADX/BMI2-path variant)
+; r12:r13:r8:r9 = (rax:rbp:rcx:r10 - r12:r13:r8:r9) mod p.
+; Register-only helper: nothing is stored to memory. On borrow the
+; speculatively p-corrected value is kept (bt r11,0 / cmovc).
+; r14/r15 = p[1]/p[3]. Clobbers rax, rbp, rcx, r10, r11, flags.
+;-----------------------------------------------------------------------
+__ecp_nistz256_subx:
+
+	xor	r11,r11			; also clears CF for the first sbb
+	sbb	rax,r12
+	sbb	rbp,r13
+	mov	r12,rax
+	sbb	rcx,r8
+	sbb	r10,r9
+	mov	r13,rbp
+	sbb	r11,0			; r11 = -borrow
+
+	xor	r9,r9
+	adc	rax,-1			; add p back (speculatively)
+	mov	r8,rcx
+	adc	rbp,r14
+	adc	rcx,0
+	mov	r9,r10
+	adc	r10,r15
+
+	bt	r11,0			; CF = original borrow
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	cmovc	r8,rcx
+	cmovc	r9,r10
+
+	DB	0F3h,0C3h		;repret
+
+
+
+
+ALIGN	32
+;-----------------------------------------------------------------------
+; __ecp_nistz256_mul_by_2x (ADX/BMI2-path variant)
+; r12:r13:r8:r9 = (2 * r12:r13:r8:r9) mod p, also stored to [rdi].
+; Same algorithm as __ecp_nistz256_mul_by_2q; the `xor ... / adc`
+; opening is the adc-after-clear idiom used throughout the x path.
+; r14/r15 = p[1]/p[3]. Clobbers rax, rbp, rcx, r10, r11, flags.
+;-----------------------------------------------------------------------
+__ecp_nistz256_mul_by_2x:
+
+	xor	r11,r11			; clears CF; first adc == add
+	adc	r12,r12
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	xor	r10,r10
+	sbb	r12,-1			; subtract p; borrow => 2*a < p
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	cmovc	r13,rbp
+	mov	QWORD[rdi],r12
+	cmovc	r8,rcx
+	mov	QWORD[8+rdi],r13
+	cmovc	r9,r10
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	32
+;-----------------------------------------------------------------------
+; GFp_nistz256_point_doublex — ADX/BMI2 (mulx) variant of point_double.
+; Reached via the feature dispatch in GFp_nistz256_point_double.
+; Differences from the q path: multiplicands are passed to the *_montx
+; helpers in rdx (mulx convention), and source pointers are biased by
+; -128 (the `lea ...-128` forms) — presumably to keep displacements in
+; signed-byte range inside the helpers; confirm against *_montx.
+;-----------------------------------------------------------------------
+GFp_nistz256_point_doublex:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_point_doublex:
+	mov	rdi,rcx
+	mov	rsi,rdx
+
+
+
+$L$point_doublex:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*5+8
+
+$L$point_doublex_body:
+
+; Re-entered from GFp_nistz256_point_addx when the two inputs are equal.
+$L$point_double_shortcutx:
+	movdqu	xmm0,XMMWORD[rsi]
+	mov	rbx,rsi
+	movdqu	xmm1,XMMWORD[16+rsi]
+	mov	r12,QWORD[((32+0))+rsi]
+	mov	r13,QWORD[((32+8))+rsi]
+	mov	r8,QWORD[((32+16))+rsi]
+	mov	r9,QWORD[((32+24))+rsi]
+	mov	r14,QWORD[(($L$poly+8))]	; p[1] for the reduction helpers
+	mov	r15,QWORD[(($L$poly+24))]	; p[3]
+	movdqa	XMMWORD[96+rsp],xmm0
+	movdqa	XMMWORD[(96+16)+rsp],xmm1
+	lea	r10,[32+rdi]
+	lea	r11,[64+rdi]
+DB	102,72,15,110,199
+DB	102,73,15,110,202
+DB	102,73,15,110,211
+
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_by_2x
+
+	mov	rdx,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	lea	rsi,[((64-128))+rsi]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[32+rbx]
+	mov	r9,QWORD[((64+0))+rbx]
+	mov	r10,QWORD[((64+8))+rbx]
+	mov	r11,QWORD[((64+16))+rbx]
+	mov	r12,QWORD[((64+24))+rbx]
+	lea	rsi,[((64-128))+rbx]
+	lea	rbx,[32+rbx]
+DB	102,72,15,126,215
+	call	__ecp_nistz256_mul_montx
+	call	__ecp_nistz256_mul_by_2x
+
+	mov	r12,QWORD[((96+0))+rsp]
+	mov	r13,QWORD[((96+8))+rsp]
+	lea	rbx,[64+rsp]
+	mov	r8,QWORD[((96+16))+rsp]
+	mov	r9,QWORD[((96+24))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_add_tox
+
+	mov	r12,QWORD[((96+0))+rsp]
+	mov	r13,QWORD[((96+8))+rsp]
+	lea	rbx,[64+rsp]
+	mov	r8,QWORD[((96+16))+rsp]
+	mov	r9,QWORD[((96+24))+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rdx,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+DB	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montx
+; Halve mod p: add p when the value is odd (test rax,1 — the cmovz chain
+; keeps the un-added value when even), then shift the 4-limb value right
+; one bit, propagating the r9 carry into the top limb.
+; NOTE(review): rsi/rbp appear to hold p[1]/p[3] here, presumably set up
+; by __ecp_nistz256_sqr_montx — confirm against that helper.
+	xor	r9,r9
+	mov	rax,r12
+	add	r12,-1
+	mov	r10,r13
+	adc	r13,rsi
+	mov	rcx,r14
+	adc	r14,0
+	mov	r8,r15
+	adc	r15,rbp
+	adc	r9,0
+	xor	rsi,rsi
+	test	rax,1
+
+	cmovz	r12,rax
+	cmovz	r13,r10
+	cmovz	r14,rcx
+	cmovz	r15,r8
+	cmovz	r9,rsi
+
+	mov	rax,r13
+	shr	r12,1
+	shl	rax,63
+	mov	r10,r14
+	shr	r13,1
+	or	r12,rax
+	shl	r10,63
+	mov	rcx,r15
+	shr	r14,1
+	or	r13,r10
+	shl	rcx,63
+	mov	QWORD[rdi],r12
+	shr	r15,1
+	mov	QWORD[8+rdi],r13
+	shl	r9,63
+	or	r14,rcx
+	or	r15,r9
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+	mov	rdx,QWORD[64+rsp]
+	lea	rbx,[64+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_by_2x
+
+	lea	rbx,[32+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_add_tox
+
+	mov	rdx,QWORD[96+rsp]
+	lea	rbx,[96+rsp]
+	mov	r9,QWORD[((0+0))+rsp]
+	mov	r10,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r11,QWORD[((16+0))+rsp]
+	mov	r12,QWORD[((24+0))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_by_2x
+
+	mov	rdx,QWORD[((0+32))+rsp]
+	mov	r14,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r15,QWORD[((16+32))+rsp]
+	mov	r8,QWORD[((24+32))+rsp]
+DB	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montx
+
+	lea	rbx,[128+rsp]
+	mov	r8,r14
+	mov	r9,r15
+	mov	r14,rsi
+	mov	r15,rbp
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	rbp,QWORD[((0+8))+rsp]
+	mov	rcx,QWORD[((0+16))+rsp]
+	mov	r10,QWORD[((0+24))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_subx
+
+	mov	rdx,QWORD[32+rsp]
+	lea	rbx,[32+rsp]
+	mov	r14,r12
+	xor	ecx,ecx
+	mov	QWORD[((0+0))+rsp],r12
+	mov	r10,r13
+	mov	QWORD[((0+8))+rsp],r13
+	cmovz	r11,r8
+	mov	QWORD[((0+16))+rsp],r8
+	lea	rsi,[((0-128))+rsp]
+	cmovz	r12,r9
+	mov	QWORD[((0+24))+rsp],r9
+	mov	r9,r14
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montx
+
+DB	102,72,15,126,203
+DB	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromx
+
+; Epilogue: restore callee-saved registers relative to the saved frame.
+	lea	rsi,[((160+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$point_doublex_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_point_doublex:
+
+ALIGN	32
+GFp_nistz256_point_addx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_point_addx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$point_addx:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*18+8
+
+$L$point_addx_body:
+
+	movdqu	xmm0,XMMWORD[rsi]
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm3,XMMWORD[48+rsi]
+	movdqu	xmm4,XMMWORD[64+rsi]
+	movdqu	xmm5,XMMWORD[80+rsi]
+	mov	rbx,rsi
+	mov	rsi,rdx
+	movdqa	XMMWORD[384+rsp],xmm0
+	movdqa	XMMWORD[(384+16)+rsp],xmm1
+	movdqa	XMMWORD[416+rsp],xmm2
+	movdqa	XMMWORD[(416+16)+rsp],xmm3
+	movdqa	XMMWORD[448+rsp],xmm4
+	movdqa	XMMWORD[(448+16)+rsp],xmm5
+	por	xmm5,xmm4
+
+	movdqu	xmm0,XMMWORD[rsi]
+	pshufd	xmm3,xmm5,0xb1
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	por	xmm5,xmm3
+	movdqu	xmm3,XMMWORD[48+rsi]
+	mov	rdx,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	movdqa	XMMWORD[480+rsp],xmm0
+	pshufd	xmm4,xmm5,0x1e
+	movdqa	XMMWORD[(480+16)+rsp],xmm1
+	movdqu	xmm0,XMMWORD[64+rsi]
+	movdqu	xmm1,XMMWORD[80+rsi]
+	movdqa	XMMWORD[512+rsp],xmm2
+	movdqa	XMMWORD[(512+16)+rsp],xmm3
+	por	xmm5,xmm4
+	pxor	xmm4,xmm4
+	por	xmm1,xmm0
+DB	102,72,15,110,199
+
+	lea	rsi,[((64-128))+rsi]
+	mov	QWORD[((544+0))+rsp],rdx
+	mov	QWORD[((544+8))+rsp],r14
+	mov	QWORD[((544+16))+rsp],r15
+	mov	QWORD[((544+24))+rsp],r8
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	xmm5,xmm4
+	pshufd	xmm4,xmm1,0xb1
+	por	xmm4,xmm1
+	pshufd	xmm5,xmm5,0
+	pshufd	xmm3,xmm4,0x1e
+	por	xmm4,xmm3
+	pxor	xmm3,xmm3
+	pcmpeqd	xmm4,xmm3
+	pshufd	xmm4,xmm4,0
+	mov	rdx,QWORD[((64+0))+rbx]
+	mov	r14,QWORD[((64+8))+rbx]
+	mov	r15,QWORD[((64+16))+rbx]
+	mov	r8,QWORD[((64+24))+rbx]
+DB	102,72,15,110,203
+
+	lea	rsi,[((64-128))+rbx]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[544+rsp]
+	lea	rbx,[544+rsp]
+	mov	r9,QWORD[((0+96))+rsp]
+	mov	r10,QWORD[((8+96))+rsp]
+	lea	rsi,[((-128+96))+rsp]
+	mov	r11,QWORD[((16+96))+rsp]
+	mov	r12,QWORD[((24+96))+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[416+rsp]
+	lea	rbx,[416+rsp]
+	mov	r9,QWORD[((0+224))+rsp]
+	mov	r10,QWORD[((8+224))+rsp]
+	lea	rsi,[((-128+224))+rsp]
+	mov	r11,QWORD[((16+224))+rsp]
+	mov	r12,QWORD[((24+224))+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[512+rsp]
+	lea	rbx,[512+rsp]
+	mov	r9,QWORD[((0+256))+rsp]
+	mov	r10,QWORD[((8+256))+rsp]
+	lea	rsi,[((-128+256))+rsp]
+	mov	r11,QWORD[((16+256))+rsp]
+	mov	r12,QWORD[((24+256))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[224+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	or	r12,r13
+	movdqa	xmm2,xmm4
+	or	r12,r8
+	or	r12,r9
+	por	xmm2,xmm5
+DB	102,73,15,110,220
+
+	mov	rdx,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+96))+rsp]
+	mov	r10,QWORD[((8+96))+rsp]
+	lea	rsi,[((-128+96))+rsp]
+	mov	r11,QWORD[((16+96))+rsp]
+	mov	r12,QWORD[((24+96))+rsp]
+	lea	rdi,[160+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[480+rsp]
+	lea	rbx,[480+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[160+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	or	r12,r13
+	or	r12,r8
+	or	r12,r9
+
+DB	102,73,15,126,208
+DB	102,73,15,126,217
+	or	r12,r8
+DB	0x3e
+	jnz	NEAR $L$add_proceedx
+
+
+
+	test	r9,r9
+	jz	NEAR $L$add_doublex
+
+
+
+
+
+
+DB	102,72,15,126,199
+	pxor	xmm0,xmm0
+	movdqu	XMMWORD[rdi],xmm0
+	movdqu	XMMWORD[16+rdi],xmm0
+	movdqu	XMMWORD[32+rdi],xmm0
+	movdqu	XMMWORD[48+rdi],xmm0
+	movdqu	XMMWORD[64+rdi],xmm0
+	movdqu	XMMWORD[80+rdi],xmm0
+	jmp	NEAR $L$add_donex
+
+ALIGN	32
+$L$add_doublex:
+DB	102,72,15,126,206
+DB	102,72,15,126,199
+	add	rsp,416
+
+	jmp	NEAR $L$point_double_shortcutx
+
+
+ALIGN	32
+$L$add_proceedx:
+	mov	rdx,QWORD[((0+64))+rsp]
+	mov	r14,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r15,QWORD[((16+64))+rsp]
+	mov	r8,QWORD[((24+64))+rsp]
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+0))+rsp]
+	mov	r10,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r11,QWORD[((16+0))+rsp]
+	mov	r12,QWORD[((24+0))+rsp]
+	lea	rdi,[352+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[((0+0))+rsp]
+	mov	r14,QWORD[((8+0))+rsp]
+	lea	rsi,[((-128+0))+rsp]
+	mov	r15,QWORD[((16+0))+rsp]
+	mov	r8,QWORD[((24+0))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[544+rsp]
+	lea	rbx,[544+rsp]
+	mov	r9,QWORD[((0+352))+rsp]
+	mov	r10,QWORD[((8+352))+rsp]
+	lea	rsi,[((-128+352))+rsp]
+	mov	r11,QWORD[((16+352))+rsp]
+	mov	r12,QWORD[((24+352))+rsp]
+	lea	rdi,[352+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[rsp]
+	lea	rbx,[rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[160+rsp]
+	lea	rbx,[160+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xor	r11,r11
+	add	r12,r12
+	lea	rsi,[96+rsp]
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	mov	rax,QWORD[rsi]
+	cmovc	r13,rbp
+	mov	rbp,QWORD[8+rsi]
+	cmovc	r8,rcx
+	mov	rcx,QWORD[16+rsi]
+	cmovc	r9,r10
+	mov	r10,QWORD[24+rsi]
+
+	call	__ecp_nistz256_subx
+
+	lea	rbx,[128+rsp]
+	lea	rdi,[288+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rax,QWORD[((192+0))+rsp]
+	mov	rbp,QWORD[((192+8))+rsp]
+	mov	rcx,QWORD[((192+16))+rsp]
+	mov	r10,QWORD[((192+24))+rsp]
+	lea	rdi,[320+rsp]
+
+	call	__ecp_nistz256_subx
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+	mov	rdx,QWORD[128+rsp]
+	lea	rbx,[128+rsp]
+	mov	r9,QWORD[((0+224))+rsp]
+	mov	r10,QWORD[((8+224))+rsp]
+	lea	rsi,[((-128+224))+rsp]
+	mov	r11,QWORD[((16+224))+rsp]
+	mov	r12,QWORD[((24+224))+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[320+rsp]
+	lea	rbx,[320+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[320+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[256+rsp]
+	lea	rdi,[320+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+DB	102,72,15,126,199
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[352+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((352+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[544+rsp]
+	pand	xmm3,XMMWORD[((544+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[448+rsp]
+	pand	xmm3,XMMWORD[((448+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[64+rdi],xmm2
+	movdqu	XMMWORD[80+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[288+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((288+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[480+rsp]
+	pand	xmm3,XMMWORD[((480+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[384+rsp]
+	pand	xmm3,XMMWORD[((384+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[rdi],xmm2
+	movdqu	XMMWORD[16+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[320+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((320+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[512+rsp]
+	pand	xmm3,XMMWORD[((512+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[416+rsp]
+	pand	xmm3,XMMWORD[((416+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+
+$L$add_donex:
+	lea	rsi,[((576+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$point_addx_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_point_addx:
+
+ALIGN	32
+GFp_nistz256_point_add_affinex:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_nistz256_point_add_affinex:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$point_add_affinex:
+	push	rbp
+
+	push	rbx
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	sub	rsp,32*15+8
+
+$L$add_affinex_body:
+
+	movdqu	xmm0,XMMWORD[rsi]
+	mov	rbx,rdx
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+	movdqu	xmm3,XMMWORD[48+rsi]
+	movdqu	xmm4,XMMWORD[64+rsi]
+	movdqu	xmm5,XMMWORD[80+rsi]
+	mov	rdx,QWORD[((64+0))+rsi]
+	mov	r14,QWORD[((64+8))+rsi]
+	mov	r15,QWORD[((64+16))+rsi]
+	mov	r8,QWORD[((64+24))+rsi]
+	movdqa	XMMWORD[320+rsp],xmm0
+	movdqa	XMMWORD[(320+16)+rsp],xmm1
+	movdqa	XMMWORD[352+rsp],xmm2
+	movdqa	XMMWORD[(352+16)+rsp],xmm3
+	movdqa	XMMWORD[384+rsp],xmm4
+	movdqa	XMMWORD[(384+16)+rsp],xmm5
+	por	xmm5,xmm4
+
+	movdqu	xmm0,XMMWORD[rbx]
+	pshufd	xmm3,xmm5,0xb1
+	movdqu	xmm1,XMMWORD[16+rbx]
+	movdqu	xmm2,XMMWORD[32+rbx]
+	por	xmm5,xmm3
+	movdqu	xmm3,XMMWORD[48+rbx]
+	movdqa	XMMWORD[416+rsp],xmm0
+	pshufd	xmm4,xmm5,0x1e
+	movdqa	XMMWORD[(416+16)+rsp],xmm1
+	por	xmm1,xmm0
+DB	102,72,15,110,199
+	movdqa	XMMWORD[448+rsp],xmm2
+	movdqa	XMMWORD[(448+16)+rsp],xmm3
+	por	xmm3,xmm2
+	por	xmm5,xmm4
+	pxor	xmm4,xmm4
+	por	xmm3,xmm1
+
+	lea	rsi,[((64-128))+rsi]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	xmm5,xmm4
+	pshufd	xmm4,xmm3,0xb1
+	mov	rdx,QWORD[rbx]
+
+	mov	r9,r12
+	por	xmm4,xmm3
+	pshufd	xmm5,xmm5,0
+	pshufd	xmm3,xmm4,0x1e
+	mov	r10,r13
+	por	xmm4,xmm3
+	pxor	xmm3,xmm3
+	mov	r11,r14
+	pcmpeqd	xmm4,xmm3
+	pshufd	xmm4,xmm4,0
+
+	lea	rsi,[((32-128))+rsp]
+	mov	r12,r15
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[320+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rdx,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[384+rsp]
+	lea	rbx,[384+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[288+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[448+rsp]
+	lea	rbx,[448+rsp]
+	mov	r9,QWORD[((0+32))+rsp]
+	mov	r10,QWORD[((8+32))+rsp]
+	lea	rsi,[((-128+32))+rsp]
+	mov	r11,QWORD[((16+32))+rsp]
+	mov	r12,QWORD[((24+32))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[352+rsp]
+	lea	rdi,[96+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rdx,QWORD[((0+64))+rsp]
+	mov	r14,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r15,QWORD[((16+64))+rsp]
+	mov	r8,QWORD[((24+64))+rsp]
+	lea	rdi,[128+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[((0+96))+rsp]
+	mov	r14,QWORD[((8+96))+rsp]
+	lea	rsi,[((-128+96))+rsp]
+	mov	r15,QWORD[((16+96))+rsp]
+	mov	r8,QWORD[((24+96))+rsp]
+	lea	rdi,[192+rsp]
+	call	__ecp_nistz256_sqr_montx
+
+	mov	rdx,QWORD[128+rsp]
+	lea	rbx,[128+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[160+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[320+rsp]
+	lea	rbx,[320+rsp]
+	mov	r9,QWORD[((0+128))+rsp]
+	mov	r10,QWORD[((8+128))+rsp]
+	lea	rsi,[((-128+128))+rsp]
+	mov	r11,QWORD[((16+128))+rsp]
+	mov	r12,QWORD[((24+128))+rsp]
+	lea	rdi,[rsp]
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xor	r11,r11
+	add	r12,r12
+	lea	rsi,[192+rsp]
+	adc	r13,r13
+	mov	rax,r12
+	adc	r8,r8
+	adc	r9,r9
+	mov	rbp,r13
+	adc	r11,0
+
+	sub	r12,-1
+	mov	rcx,r8
+	sbb	r13,r14
+	sbb	r8,0
+	mov	r10,r9
+	sbb	r9,r15
+	sbb	r11,0
+
+	cmovc	r12,rax
+	mov	rax,QWORD[rsi]
+	cmovc	r13,rbp
+	mov	rbp,QWORD[8+rsi]
+	cmovc	r8,rcx
+	mov	rcx,QWORD[16+rsi]
+	cmovc	r9,r10
+	mov	r10,QWORD[24+rsi]
+
+	call	__ecp_nistz256_subx
+
+	lea	rbx,[160+rsp]
+	lea	rdi,[224+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+	mov	rax,QWORD[((0+0))+rsp]
+	mov	rbp,QWORD[((0+8))+rsp]
+	mov	rcx,QWORD[((0+16))+rsp]
+	mov	r10,QWORD[((0+24))+rsp]
+	lea	rdi,[64+rsp]
+
+	call	__ecp_nistz256_subx
+
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r8
+	mov	QWORD[24+rdi],r9
+	mov	rdx,QWORD[352+rsp]
+	lea	rbx,[352+rsp]
+	mov	r9,QWORD[((0+160))+rsp]
+	mov	r10,QWORD[((8+160))+rsp]
+	lea	rsi,[((-128+160))+rsp]
+	mov	r11,QWORD[((16+160))+rsp]
+	mov	r12,QWORD[((24+160))+rsp]
+	lea	rdi,[32+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	mov	rdx,QWORD[96+rsp]
+	lea	rbx,[96+rsp]
+	mov	r9,QWORD[((0+64))+rsp]
+	mov	r10,QWORD[((8+64))+rsp]
+	lea	rsi,[((-128+64))+rsp]
+	mov	r11,QWORD[((16+64))+rsp]
+	mov	r12,QWORD[((24+64))+rsp]
+	lea	rdi,[64+rsp]
+	call	__ecp_nistz256_mul_montx
+
+	lea	rbx,[32+rsp]
+	lea	rdi,[256+rsp]
+	call	__ecp_nistz256_sub_fromx
+
+DB	102,72,15,126,199
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[288+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((288+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[$L$ONE_mont]
+	pand	xmm3,XMMWORD[(($L$ONE_mont+16))]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[384+rsp]
+	pand	xmm3,XMMWORD[((384+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[64+rdi],xmm2
+	movdqu	XMMWORD[80+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[224+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((224+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[416+rsp]
+	pand	xmm3,XMMWORD[((416+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[320+rsp]
+	pand	xmm3,XMMWORD[((320+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[rdi],xmm2
+	movdqu	XMMWORD[16+rdi],xmm3
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm1,xmm5
+	pandn	xmm0,XMMWORD[256+rsp]
+	movdqa	xmm2,xmm5
+	pandn	xmm1,XMMWORD[((256+16))+rsp]
+	movdqa	xmm3,xmm5
+	pand	xmm2,XMMWORD[448+rsp]
+	pand	xmm3,XMMWORD[((448+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm4
+	pandn	xmm0,xmm2
+	movdqa	xmm2,xmm4
+	pandn	xmm1,xmm3
+	movdqa	xmm3,xmm4
+	pand	xmm2,XMMWORD[352+rsp]
+	pand	xmm3,XMMWORD[((352+16))+rsp]
+	por	xmm2,xmm0
+	por	xmm3,xmm1
+	movdqu	XMMWORD[32+rdi],xmm2
+	movdqu	XMMWORD[48+rdi],xmm3
+
+	lea	rsi,[((480+56))+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbx,QWORD[((-16))+rsi]
+
+	mov	rbp,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$add_affinex_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_nistz256_point_add_affinex:
+EXTERN	__imp_RtlVirtualUnwind
+
+
+ALIGN	16
+short_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rax,[16+rax]
+
+	mov	r12,QWORD[((-8))+rax]
+	mov	r13,QWORD[((-16))+rax]
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+
+	jmp	NEAR $L$common_seh_tail
+
+
+
+ALIGN	16
+full_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[8+r11]
+	lea	rax,[r10*1+rax]
+
+	mov	rbp,QWORD[((-8))+rax]
+	mov	rbx,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_nistz256_neg wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_neg wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_neg wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_p256_scalar_mul_mont wrt ..imagebase
+	DD	$L$SEH_end_GFp_p256_scalar_mul_mont wrt ..imagebase
+	DD	$L$SEH_info_GFp_p256_scalar_mul_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_p256_scalar_sqr_rep_mont wrt ..imagebase
+	DD	$L$SEH_end_GFp_p256_scalar_sqr_rep_mont wrt ..imagebase
+	DD	$L$SEH_info_GFp_p256_scalar_sqr_rep_mont wrt ..imagebase
+	DD	$L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase
+
+	DD	$L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+	DD	$L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+	DD	$L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+	DD	$L$SEH_begin_GFp_nistz256_mul_mont wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_mul_mont wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_mul_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_nistz256_sqr_mont wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_sqr_mont wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_sqr_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_nistz256_select_w5 wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_select_w5 wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_select_wX wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_nistz256_select_w7 wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_select_w7 wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_select_wX wrt ..imagebase
+	DD	$L$SEH_begin_GFp_nistz256_avx2_select_w5 wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_avx2_select_w5 wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_avx2_select_wX wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_nistz256_avx2_select_w7 wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_avx2_select_w7 wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_avx2_select_wX wrt ..imagebase
+	DD	$L$SEH_begin_GFp_nistz256_point_double wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_point_double wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_point_double wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_nistz256_point_add wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_point_add wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_point_add wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_nistz256_point_add_affine wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_point_add_affine wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_point_add_affine wrt ..imagebase
+	DD	$L$SEH_begin_GFp_nistz256_point_doublex wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_point_doublex wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_point_doublex wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_nistz256_point_addx wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_point_addx wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_point_addx wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_nistz256_point_add_affinex wrt ..imagebase
+	DD	$L$SEH_end_GFp_nistz256_point_add_affinex wrt ..imagebase
+	DD	$L$SEH_info_GFp_nistz256_point_add_affinex wrt ..imagebase
+
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_GFp_nistz256_neg:
+DB	9,0,0,0
+	DD	short_handler wrt ..imagebase
+	DD	$L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase
+$L$SEH_info_GFp_p256_scalar_mul_mont:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_GFp_p256_scalar_sqr_rep_mont:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_ecp_nistz256_ord_mul_montx:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_ecp_nistz256_ord_sqr_montx:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_GFp_nistz256_mul_mont:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_GFp_nistz256_sqr_mont:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase
+	DD	48,0
+$L$SEH_info_GFp_nistz256_select_wX:
+DB	0x01,0x33,0x16,0x00
+DB	0x33,0xf8,0x09,0x00
+DB	0x2e,0xe8,0x08,0x00
+DB	0x29,0xd8,0x07,0x00
+DB	0x24,0xc8,0x06,0x00
+DB	0x1f,0xb8,0x05,0x00
+DB	0x1a,0xa8,0x04,0x00
+DB	0x15,0x98,0x03,0x00
+DB	0x10,0x88,0x02,0x00
+DB	0x0c,0x78,0x01,0x00
+DB	0x08,0x68,0x00,0x00
+DB	0x04,0x01,0x15,0x00
+ALIGN	8
+$L$SEH_info_GFp_nistz256_avx2_select_wX:
+DB	0x01,0x36,0x17,0x0b
+DB	0x36,0xf8,0x09,0x00
+DB	0x31,0xe8,0x08,0x00
+DB	0x2c,0xd8,0x07,0x00
+DB	0x27,0xc8,0x06,0x00
+DB	0x22,0xb8,0x05,0x00
+DB	0x1d,0xa8,0x04,0x00
+DB	0x18,0x98,0x03,0x00
+DB	0x13,0x88,0x02,0x00
+DB	0x0e,0x78,0x01,0x00
+DB	0x09,0x68,0x00,0x00
+DB	0x04,0x01,0x15,0x00
+DB	0x00,0xb3,0x00,0x00
+ALIGN	8
+$L$SEH_info_GFp_nistz256_point_double:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase
+	DD	32*5+56,0
+$L$SEH_info_GFp_nistz256_point_add:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase
+	DD	32*18+56,0
+$L$SEH_info_GFp_nistz256_point_add_affine:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase
+	DD	32*15+56,0
+ALIGN	8
+$L$SEH_info_GFp_nistz256_point_doublex:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase
+	DD	32*5+56,0
+$L$SEH_info_GFp_nistz256_point_addx:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase
+	DD	32*18+56,0
+$L$SEH_info_GFp_nistz256_point_add_affinex:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase
+	DD	32*15+56,0

+ 4138 - 0
zeroidc/vendor/ring/pregenerated/tmp/sha256-x86_64-nasm.asm

@@ -0,0 +1,4138 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+
+EXTERN	GFp_ia32cap_P
+global	GFp_sha256_block_data_order
+
+ALIGN	16
+GFp_sha256_block_data_order:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_sha256_block_data_order:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+	lea	r11,[GFp_ia32cap_P]
+	mov	r9d,DWORD[r11]
+	mov	r10d,DWORD[4+r11]
+	mov	r11d,DWORD[8+r11]
+	and	r9d,1073741824
+	and	r10d,268435968
+	or	r10d,r9d
+	cmp	r10d,1342177792
+	je	NEAR $L$avx_shortcut
+	test	r10d,512
+	jnz	NEAR $L$ssse3_shortcut
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,16*4+4*8
+	lea	rdx,[rdx*4+rsi]
+	and	rsp,-64
+	mov	QWORD[((64+0))+rsp],rdi
+	mov	QWORD[((64+8))+rsp],rsi
+	mov	QWORD[((64+16))+rsp],rdx
+	mov	QWORD[88+rsp],rax
+
+$L$prologue:
+
+	mov	eax,DWORD[rdi]
+	mov	ebx,DWORD[4+rdi]
+	mov	ecx,DWORD[8+rdi]
+	mov	edx,DWORD[12+rdi]
+	mov	r8d,DWORD[16+rdi]
+	mov	r9d,DWORD[20+rdi]
+	mov	r10d,DWORD[24+rdi]
+	mov	r11d,DWORD[28+rdi]
+	jmp	NEAR $L$loop
+
+ALIGN	16
+$L$loop:
+	mov	edi,ebx
+	lea	rbp,[K256]
+	xor	edi,ecx
+	mov	r12d,DWORD[rsi]
+	mov	r13d,r8d
+	mov	r14d,eax
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,r9d
+
+	xor	r13d,r8d
+	ror	r14d,9
+	xor	r15d,r10d
+
+	mov	DWORD[rsp],r12d
+	xor	r14d,eax
+	and	r15d,r8d
+
+	ror	r13d,5
+	add	r12d,r11d
+	xor	r15d,r10d
+
+	ror	r14d,11
+	xor	r13d,r8d
+	add	r12d,r15d
+
+	mov	r15d,eax
+	add	r12d,DWORD[rbp]
+	xor	r14d,eax
+
+	xor	r15d,ebx
+	ror	r13d,6
+	mov	r11d,ebx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r11d,edi
+	add	edx,r12d
+	add	r11d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r11d,r14d
+	mov	r12d,DWORD[4+rsi]
+	mov	r13d,edx
+	mov	r14d,r11d
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,r8d
+
+	xor	r13d,edx
+	ror	r14d,9
+	xor	edi,r9d
+
+	mov	DWORD[4+rsp],r12d
+	xor	r14d,r11d
+	and	edi,edx
+
+	ror	r13d,5
+	add	r12d,r10d
+	xor	edi,r9d
+
+	ror	r14d,11
+	xor	r13d,edx
+	add	r12d,edi
+
+	mov	edi,r11d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r11d
+
+	xor	edi,eax
+	ror	r13d,6
+	mov	r10d,eax
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r10d,r15d
+	add	ecx,r12d
+	add	r10d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r10d,r14d
+	mov	r12d,DWORD[8+rsi]
+	mov	r13d,ecx
+	mov	r14d,r10d
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,edx
+
+	xor	r13d,ecx
+	ror	r14d,9
+	xor	r15d,r8d
+
+	mov	DWORD[8+rsp],r12d
+	xor	r14d,r10d
+	and	r15d,ecx
+
+	ror	r13d,5
+	add	r12d,r9d
+	xor	r15d,r8d
+
+	ror	r14d,11
+	xor	r13d,ecx
+	add	r12d,r15d
+
+	mov	r15d,r10d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r10d
+
+	xor	r15d,r11d
+	ror	r13d,6
+	mov	r9d,r11d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r9d,edi
+	add	ebx,r12d
+	add	r9d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r9d,r14d
+	mov	r12d,DWORD[12+rsi]
+	mov	r13d,ebx
+	mov	r14d,r9d
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,ecx
+
+	xor	r13d,ebx
+	ror	r14d,9
+	xor	edi,edx
+
+	mov	DWORD[12+rsp],r12d
+	xor	r14d,r9d
+	and	edi,ebx
+
+	ror	r13d,5
+	add	r12d,r8d
+	xor	edi,edx
+
+	ror	r14d,11
+	xor	r13d,ebx
+	add	r12d,edi
+
+	mov	edi,r9d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r9d
+
+	xor	edi,r10d
+	ror	r13d,6
+	mov	r8d,r10d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r8d,r15d
+	add	eax,r12d
+	add	r8d,r12d
+
+	lea	rbp,[20+rbp]
+	add	r8d,r14d
+	mov	r12d,DWORD[16+rsi]
+	mov	r13d,eax
+	mov	r14d,r8d
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,ebx
+
+	xor	r13d,eax
+	ror	r14d,9
+	xor	r15d,ecx
+
+	mov	DWORD[16+rsp],r12d
+	xor	r14d,r8d
+	and	r15d,eax
+
+	ror	r13d,5
+	add	r12d,edx
+	xor	r15d,ecx
+
+	ror	r14d,11
+	xor	r13d,eax
+	add	r12d,r15d
+
+	mov	r15d,r8d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r8d
+
+	xor	r15d,r9d
+	ror	r13d,6
+	mov	edx,r9d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	edx,edi
+	add	r11d,r12d
+	add	edx,r12d
+
+	lea	rbp,[4+rbp]
+	add	edx,r14d
+	mov	r12d,DWORD[20+rsi]
+	mov	r13d,r11d
+	mov	r14d,edx
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,eax
+
+	xor	r13d,r11d
+	ror	r14d,9
+	xor	edi,ebx
+
+	mov	DWORD[20+rsp],r12d
+	xor	r14d,edx
+	and	edi,r11d
+
+	ror	r13d,5
+	add	r12d,ecx
+	xor	edi,ebx
+
+	ror	r14d,11
+	xor	r13d,r11d
+	add	r12d,edi
+
+	mov	edi,edx
+	add	r12d,DWORD[rbp]
+	xor	r14d,edx
+
+	xor	edi,r8d
+	ror	r13d,6
+	mov	ecx,r8d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ecx,r15d
+	add	r10d,r12d
+	add	ecx,r12d
+
+	lea	rbp,[4+rbp]
+	add	ecx,r14d
+	mov	r12d,DWORD[24+rsi]
+	mov	r13d,r10d
+	mov	r14d,ecx
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,r11d
+
+	xor	r13d,r10d
+	ror	r14d,9
+	xor	r15d,eax
+
+	mov	DWORD[24+rsp],r12d
+	xor	r14d,ecx
+	and	r15d,r10d
+
+	ror	r13d,5
+	add	r12d,ebx
+	xor	r15d,eax
+
+	ror	r14d,11
+	xor	r13d,r10d
+	add	r12d,r15d
+
+	mov	r15d,ecx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ecx
+
+	xor	r15d,edx
+	ror	r13d,6
+	mov	ebx,edx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ebx,edi
+	add	r9d,r12d
+	add	ebx,r12d
+
+	lea	rbp,[4+rbp]
+	add	ebx,r14d
+	mov	r12d,DWORD[28+rsi]
+	mov	r13d,r9d
+	mov	r14d,ebx
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,r10d
+
+	xor	r13d,r9d
+	ror	r14d,9
+	xor	edi,r11d
+
+	mov	DWORD[28+rsp],r12d
+	xor	r14d,ebx
+	and	edi,r9d
+
+	ror	r13d,5
+	add	r12d,eax
+	xor	edi,r11d
+
+	ror	r14d,11
+	xor	r13d,r9d
+	add	r12d,edi
+
+	mov	edi,ebx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ebx
+
+	xor	edi,ecx
+	ror	r13d,6
+	mov	eax,ecx
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	eax,r15d
+	add	r8d,r12d
+	add	eax,r12d
+
+	lea	rbp,[20+rbp]
+	add	eax,r14d
+	mov	r12d,DWORD[32+rsi]
+	mov	r13d,r8d
+	mov	r14d,eax
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,r9d
+
+	xor	r13d,r8d
+	ror	r14d,9
+	xor	r15d,r10d
+
+	mov	DWORD[32+rsp],r12d
+	xor	r14d,eax
+	and	r15d,r8d
+
+	ror	r13d,5
+	add	r12d,r11d
+	xor	r15d,r10d
+
+	ror	r14d,11
+	xor	r13d,r8d
+	add	r12d,r15d
+
+	mov	r15d,eax
+	add	r12d,DWORD[rbp]
+	xor	r14d,eax
+
+	xor	r15d,ebx
+	ror	r13d,6
+	mov	r11d,ebx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r11d,edi
+	add	edx,r12d
+	add	r11d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r11d,r14d
+	mov	r12d,DWORD[36+rsi]
+	mov	r13d,edx
+	mov	r14d,r11d
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,r8d
+
+	xor	r13d,edx
+	ror	r14d,9
+	xor	edi,r9d
+
+	mov	DWORD[36+rsp],r12d
+	xor	r14d,r11d
+	and	edi,edx
+
+	ror	r13d,5
+	add	r12d,r10d
+	xor	edi,r9d
+
+	ror	r14d,11
+	xor	r13d,edx
+	add	r12d,edi
+
+	mov	edi,r11d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r11d
+
+	xor	edi,eax
+	ror	r13d,6
+	mov	r10d,eax
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r10d,r15d
+	add	ecx,r12d
+	add	r10d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r10d,r14d
+	mov	r12d,DWORD[40+rsi]
+	mov	r13d,ecx
+	mov	r14d,r10d
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,edx
+
+	xor	r13d,ecx
+	ror	r14d,9
+	xor	r15d,r8d
+
+	mov	DWORD[40+rsp],r12d
+	xor	r14d,r10d
+	and	r15d,ecx
+
+	ror	r13d,5
+	add	r12d,r9d
+	xor	r15d,r8d
+
+	ror	r14d,11
+	xor	r13d,ecx
+	add	r12d,r15d
+
+	mov	r15d,r10d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r10d
+
+	xor	r15d,r11d
+	ror	r13d,6
+	mov	r9d,r11d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r9d,edi
+	add	ebx,r12d
+	add	r9d,r12d
+
+	lea	rbp,[4+rbp]
+	add	r9d,r14d
+	mov	r12d,DWORD[44+rsi]
+	mov	r13d,ebx
+	mov	r14d,r9d
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,ecx
+
+	xor	r13d,ebx
+	ror	r14d,9
+	xor	edi,edx
+
+	mov	DWORD[44+rsp],r12d
+	xor	r14d,r9d
+	and	edi,ebx
+
+	ror	r13d,5
+	add	r12d,r8d
+	xor	edi,edx
+
+	ror	r14d,11
+	xor	r13d,ebx
+	add	r12d,edi
+
+	mov	edi,r9d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r9d
+
+	xor	edi,r10d
+	ror	r13d,6
+	mov	r8d,r10d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r8d,r15d
+	add	eax,r12d
+	add	r8d,r12d
+
+	lea	rbp,[20+rbp]
+	add	r8d,r14d
+	mov	r12d,DWORD[48+rsi]
+	mov	r13d,eax
+	mov	r14d,r8d
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,ebx
+
+	xor	r13d,eax
+	ror	r14d,9
+	xor	r15d,ecx
+
+	mov	DWORD[48+rsp],r12d
+	xor	r14d,r8d
+	and	r15d,eax
+
+	ror	r13d,5
+	add	r12d,edx
+	xor	r15d,ecx
+
+	ror	r14d,11
+	xor	r13d,eax
+	add	r12d,r15d
+
+	mov	r15d,r8d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r8d
+
+	xor	r15d,r9d
+	ror	r13d,6
+	mov	edx,r9d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	edx,edi
+	add	r11d,r12d
+	add	edx,r12d
+
+	lea	rbp,[4+rbp]
+	add	edx,r14d
+	mov	r12d,DWORD[52+rsi]
+	mov	r13d,r11d
+	mov	r14d,edx
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,eax
+
+	xor	r13d,r11d
+	ror	r14d,9
+	xor	edi,ebx
+
+	mov	DWORD[52+rsp],r12d
+	xor	r14d,edx
+	and	edi,r11d
+
+	ror	r13d,5
+	add	r12d,ecx
+	xor	edi,ebx
+
+	ror	r14d,11
+	xor	r13d,r11d
+	add	r12d,edi
+
+	mov	edi,edx
+	add	r12d,DWORD[rbp]
+	xor	r14d,edx
+
+	xor	edi,r8d
+	ror	r13d,6
+	mov	ecx,r8d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ecx,r15d
+	add	r10d,r12d
+	add	ecx,r12d
+
+	lea	rbp,[4+rbp]
+	add	ecx,r14d
+	mov	r12d,DWORD[56+rsi]
+	mov	r13d,r10d
+	mov	r14d,ecx
+	bswap	r12d
+	ror	r13d,14
+	mov	r15d,r11d
+
+	xor	r13d,r10d
+	ror	r14d,9
+	xor	r15d,eax
+
+	mov	DWORD[56+rsp],r12d
+	xor	r14d,ecx
+	and	r15d,r10d
+
+	ror	r13d,5
+	add	r12d,ebx
+	xor	r15d,eax
+
+	ror	r14d,11
+	xor	r13d,r10d
+	add	r12d,r15d
+
+	mov	r15d,ecx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ecx
+
+	xor	r15d,edx
+	ror	r13d,6
+	mov	ebx,edx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ebx,edi
+	add	r9d,r12d
+	add	ebx,r12d
+
+	lea	rbp,[4+rbp]
+	add	ebx,r14d
+	mov	r12d,DWORD[60+rsi]
+	mov	r13d,r9d
+	mov	r14d,ebx
+	bswap	r12d
+	ror	r13d,14
+	mov	edi,r10d
+
+	xor	r13d,r9d
+	ror	r14d,9
+	xor	edi,r11d
+
+	mov	DWORD[60+rsp],r12d
+	xor	r14d,ebx
+	and	edi,r9d
+
+	ror	r13d,5
+	add	r12d,eax
+	xor	edi,r11d
+
+	ror	r14d,11
+	xor	r13d,r9d
+	add	r12d,edi
+
+	mov	edi,ebx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ebx
+
+	xor	edi,ecx
+	ror	r13d,6
+	mov	eax,ecx
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	eax,r15d
+	add	r8d,r12d
+	add	eax,r12d
+
+	lea	rbp,[20+rbp]
+	jmp	NEAR $L$rounds_16_xx
+ALIGN	16
+$L$rounds_16_xx:
+	mov	r13d,DWORD[4+rsp]
+	mov	r15d,DWORD[56+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	eax,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[36+rsp]
+
+	add	r12d,DWORD[rsp]
+	mov	r13d,r8d
+	add	r12d,r15d
+	mov	r14d,eax
+	ror	r13d,14
+	mov	r15d,r9d
+
+	xor	r13d,r8d
+	ror	r14d,9
+	xor	r15d,r10d
+
+	mov	DWORD[rsp],r12d
+	xor	r14d,eax
+	and	r15d,r8d
+
+	ror	r13d,5
+	add	r12d,r11d
+	xor	r15d,r10d
+
+	ror	r14d,11
+	xor	r13d,r8d
+	add	r12d,r15d
+
+	mov	r15d,eax
+	add	r12d,DWORD[rbp]
+	xor	r14d,eax
+
+	xor	r15d,ebx
+	ror	r13d,6
+	mov	r11d,ebx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r11d,edi
+	add	edx,r12d
+	add	r11d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[8+rsp]
+	mov	edi,DWORD[60+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r11d,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[40+rsp]
+
+	add	r12d,DWORD[4+rsp]
+	mov	r13d,edx
+	add	r12d,edi
+	mov	r14d,r11d
+	ror	r13d,14
+	mov	edi,r8d
+
+	xor	r13d,edx
+	ror	r14d,9
+	xor	edi,r9d
+
+	mov	DWORD[4+rsp],r12d
+	xor	r14d,r11d
+	and	edi,edx
+
+	ror	r13d,5
+	add	r12d,r10d
+	xor	edi,r9d
+
+	ror	r14d,11
+	xor	r13d,edx
+	add	r12d,edi
+
+	mov	edi,r11d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r11d
+
+	xor	edi,eax
+	ror	r13d,6
+	mov	r10d,eax
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r10d,r15d
+	add	ecx,r12d
+	add	r10d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[12+rsp]
+	mov	r15d,DWORD[rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r10d,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[44+rsp]
+
+	add	r12d,DWORD[8+rsp]
+	mov	r13d,ecx
+	add	r12d,r15d
+	mov	r14d,r10d
+	ror	r13d,14
+	mov	r15d,edx
+
+	xor	r13d,ecx
+	ror	r14d,9
+	xor	r15d,r8d
+
+	mov	DWORD[8+rsp],r12d
+	xor	r14d,r10d
+	and	r15d,ecx
+
+	ror	r13d,5
+	add	r12d,r9d
+	xor	r15d,r8d
+
+	ror	r14d,11
+	xor	r13d,ecx
+	add	r12d,r15d
+
+	mov	r15d,r10d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r10d
+
+	xor	r15d,r11d
+	ror	r13d,6
+	mov	r9d,r11d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r9d,edi
+	add	ebx,r12d
+	add	r9d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[16+rsp]
+	mov	edi,DWORD[4+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r9d,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[48+rsp]
+
+	add	r12d,DWORD[12+rsp]
+	mov	r13d,ebx
+	add	r12d,edi
+	mov	r14d,r9d
+	ror	r13d,14
+	mov	edi,ecx
+
+	xor	r13d,ebx
+	ror	r14d,9
+	xor	edi,edx
+
+	mov	DWORD[12+rsp],r12d
+	xor	r14d,r9d
+	and	edi,ebx
+
+	ror	r13d,5
+	add	r12d,r8d
+	xor	edi,edx
+
+	ror	r14d,11
+	xor	r13d,ebx
+	add	r12d,edi
+
+	mov	edi,r9d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r9d
+
+	xor	edi,r10d
+	ror	r13d,6
+	mov	r8d,r10d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r8d,r15d
+	add	eax,r12d
+	add	r8d,r12d
+
+	lea	rbp,[20+rbp]
+	mov	r13d,DWORD[20+rsp]
+	mov	r15d,DWORD[8+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r8d,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[52+rsp]
+
+	add	r12d,DWORD[16+rsp]
+	mov	r13d,eax
+	add	r12d,r15d
+	mov	r14d,r8d
+	ror	r13d,14
+	mov	r15d,ebx
+
+	xor	r13d,eax
+	ror	r14d,9
+	xor	r15d,ecx
+
+	mov	DWORD[16+rsp],r12d
+	xor	r14d,r8d
+	and	r15d,eax
+
+	ror	r13d,5
+	add	r12d,edx
+	xor	r15d,ecx
+
+	ror	r14d,11
+	xor	r13d,eax
+	add	r12d,r15d
+
+	mov	r15d,r8d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r8d
+
+	xor	r15d,r9d
+	ror	r13d,6
+	mov	edx,r9d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	edx,edi
+	add	r11d,r12d
+	add	edx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[24+rsp]
+	mov	edi,DWORD[12+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	edx,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[56+rsp]
+
+	add	r12d,DWORD[20+rsp]
+	mov	r13d,r11d
+	add	r12d,edi
+	mov	r14d,edx
+	ror	r13d,14
+	mov	edi,eax
+
+	xor	r13d,r11d
+	ror	r14d,9
+	xor	edi,ebx
+
+	mov	DWORD[20+rsp],r12d
+	xor	r14d,edx
+	and	edi,r11d
+
+	ror	r13d,5
+	add	r12d,ecx
+	xor	edi,ebx
+
+	ror	r14d,11
+	xor	r13d,r11d
+	add	r12d,edi
+
+	mov	edi,edx
+	add	r12d,DWORD[rbp]
+	xor	r14d,edx
+
+	xor	edi,r8d
+	ror	r13d,6
+	mov	ecx,r8d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ecx,r15d
+	add	r10d,r12d
+	add	ecx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[28+rsp]
+	mov	r15d,DWORD[16+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	ecx,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[60+rsp]
+
+	add	r12d,DWORD[24+rsp]
+	mov	r13d,r10d
+	add	r12d,r15d
+	mov	r14d,ecx
+	ror	r13d,14
+	mov	r15d,r11d
+
+	xor	r13d,r10d
+	ror	r14d,9
+	xor	r15d,eax
+
+	mov	DWORD[24+rsp],r12d
+	xor	r14d,ecx
+	and	r15d,r10d
+
+	ror	r13d,5
+	add	r12d,ebx
+	xor	r15d,eax
+
+	ror	r14d,11
+	xor	r13d,r10d
+	add	r12d,r15d
+
+	mov	r15d,ecx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ecx
+
+	xor	r15d,edx
+	ror	r13d,6
+	mov	ebx,edx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ebx,edi
+	add	r9d,r12d
+	add	ebx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[32+rsp]
+	mov	edi,DWORD[20+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	ebx,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[rsp]
+
+	add	r12d,DWORD[28+rsp]
+	mov	r13d,r9d
+	add	r12d,edi
+	mov	r14d,ebx
+	ror	r13d,14
+	mov	edi,r10d
+
+	xor	r13d,r9d
+	ror	r14d,9
+	xor	edi,r11d
+
+	mov	DWORD[28+rsp],r12d
+	xor	r14d,ebx
+	and	edi,r9d
+
+	ror	r13d,5
+	add	r12d,eax
+	xor	edi,r11d
+
+	ror	r14d,11
+	xor	r13d,r9d
+	add	r12d,edi
+
+	mov	edi,ebx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ebx
+
+	xor	edi,ecx
+	ror	r13d,6
+	mov	eax,ecx
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	eax,r15d
+	add	r8d,r12d
+	add	eax,r12d
+
+	lea	rbp,[20+rbp]
+	mov	r13d,DWORD[36+rsp]
+	mov	r15d,DWORD[24+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	eax,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[4+rsp]
+
+	add	r12d,DWORD[32+rsp]
+	mov	r13d,r8d
+	add	r12d,r15d
+	mov	r14d,eax
+	ror	r13d,14
+	mov	r15d,r9d
+
+	xor	r13d,r8d
+	ror	r14d,9
+	xor	r15d,r10d
+
+	mov	DWORD[32+rsp],r12d
+	xor	r14d,eax
+	and	r15d,r8d
+
+	ror	r13d,5
+	add	r12d,r11d
+	xor	r15d,r10d
+
+	ror	r14d,11
+	xor	r13d,r8d
+	add	r12d,r15d
+
+	mov	r15d,eax
+	add	r12d,DWORD[rbp]
+	xor	r14d,eax
+
+	xor	r15d,ebx
+	ror	r13d,6
+	mov	r11d,ebx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r11d,edi
+	add	edx,r12d
+	add	r11d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[40+rsp]
+	mov	edi,DWORD[28+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r11d,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[8+rsp]
+
+	add	r12d,DWORD[36+rsp]
+	mov	r13d,edx
+	add	r12d,edi
+	mov	r14d,r11d
+	ror	r13d,14
+	mov	edi,r8d
+
+	xor	r13d,edx
+	ror	r14d,9
+	xor	edi,r9d
+
+	mov	DWORD[36+rsp],r12d
+	xor	r14d,r11d
+	and	edi,edx
+
+	ror	r13d,5
+	add	r12d,r10d
+	xor	edi,r9d
+
+	ror	r14d,11
+	xor	r13d,edx
+	add	r12d,edi
+
+	mov	edi,r11d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r11d
+
+	xor	edi,eax
+	ror	r13d,6
+	mov	r10d,eax
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r10d,r15d
+	add	ecx,r12d
+	add	r10d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[44+rsp]
+	mov	r15d,DWORD[32+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r10d,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[12+rsp]
+
+	add	r12d,DWORD[40+rsp]
+	mov	r13d,ecx
+	add	r12d,r15d
+	mov	r14d,r10d
+	ror	r13d,14
+	mov	r15d,edx
+
+	xor	r13d,ecx
+	ror	r14d,9
+	xor	r15d,r8d
+
+	mov	DWORD[40+rsp],r12d
+	xor	r14d,r10d
+	and	r15d,ecx
+
+	ror	r13d,5
+	add	r12d,r9d
+	xor	r15d,r8d
+
+	ror	r14d,11
+	xor	r13d,ecx
+	add	r12d,r15d
+
+	mov	r15d,r10d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r10d
+
+	xor	r15d,r11d
+	ror	r13d,6
+	mov	r9d,r11d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r9d,edi
+	add	ebx,r12d
+	add	r9d,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[48+rsp]
+	mov	edi,DWORD[36+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r9d,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[16+rsp]
+
+	add	r12d,DWORD[44+rsp]
+	mov	r13d,ebx
+	add	r12d,edi
+	mov	r14d,r9d
+	ror	r13d,14
+	mov	edi,ecx
+
+	xor	r13d,ebx
+	ror	r14d,9
+	xor	edi,edx
+
+	mov	DWORD[44+rsp],r12d
+	xor	r14d,r9d
+	and	edi,ebx
+
+	ror	r13d,5
+	add	r12d,r8d
+	xor	edi,edx
+
+	ror	r14d,11
+	xor	r13d,ebx
+	add	r12d,edi
+
+	mov	edi,r9d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r9d
+
+	xor	edi,r10d
+	ror	r13d,6
+	mov	r8d,r10d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	r8d,r15d
+	add	eax,r12d
+	add	r8d,r12d
+
+	lea	rbp,[20+rbp]
+	mov	r13d,DWORD[52+rsp]
+	mov	r15d,DWORD[40+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	r8d,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[20+rsp]
+
+	add	r12d,DWORD[48+rsp]
+	mov	r13d,eax
+	add	r12d,r15d
+	mov	r14d,r8d
+	ror	r13d,14
+	mov	r15d,ebx
+
+	xor	r13d,eax
+	ror	r14d,9
+	xor	r15d,ecx
+
+	mov	DWORD[48+rsp],r12d
+	xor	r14d,r8d
+	and	r15d,eax
+
+	ror	r13d,5
+	add	r12d,edx
+	xor	r15d,ecx
+
+	ror	r14d,11
+	xor	r13d,eax
+	add	r12d,r15d
+
+	mov	r15d,r8d
+	add	r12d,DWORD[rbp]
+	xor	r14d,r8d
+
+	xor	r15d,r9d
+	ror	r13d,6
+	mov	edx,r9d
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	edx,edi
+	add	r11d,r12d
+	add	edx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[56+rsp]
+	mov	edi,DWORD[44+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	edx,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[24+rsp]
+
+	add	r12d,DWORD[52+rsp]
+	mov	r13d,r11d
+	add	r12d,edi
+	mov	r14d,edx
+	ror	r13d,14
+	mov	edi,eax
+
+	xor	r13d,r11d
+	ror	r14d,9
+	xor	edi,ebx
+
+	mov	DWORD[52+rsp],r12d
+	xor	r14d,edx
+	and	edi,r11d
+
+	ror	r13d,5
+	add	r12d,ecx
+	xor	edi,ebx
+
+	ror	r14d,11
+	xor	r13d,r11d
+	add	r12d,edi
+
+	mov	edi,edx
+	add	r12d,DWORD[rbp]
+	xor	r14d,edx
+
+	xor	edi,r8d
+	ror	r13d,6
+	mov	ecx,r8d
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ecx,r15d
+	add	r10d,r12d
+	add	ecx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[60+rsp]
+	mov	r15d,DWORD[48+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	ecx,r14d
+	mov	r14d,r15d
+	ror	r15d,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	r15d,r14d
+	shr	r14d,10
+
+	ror	r15d,17
+	xor	r12d,r13d
+	xor	r15d,r14d
+	add	r12d,DWORD[28+rsp]
+
+	add	r12d,DWORD[56+rsp]
+	mov	r13d,r10d
+	add	r12d,r15d
+	mov	r14d,ecx
+	ror	r13d,14
+	mov	r15d,r11d
+
+	xor	r13d,r10d
+	ror	r14d,9
+	xor	r15d,eax
+
+	mov	DWORD[56+rsp],r12d
+	xor	r14d,ecx
+	and	r15d,r10d
+
+	ror	r13d,5
+	add	r12d,ebx
+	xor	r15d,eax
+
+	ror	r14d,11
+	xor	r13d,r10d
+	add	r12d,r15d
+
+	mov	r15d,ecx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ecx
+
+	xor	r15d,edx
+	ror	r13d,6
+	mov	ebx,edx
+
+	and	edi,r15d
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	ebx,edi
+	add	r9d,r12d
+	add	ebx,r12d
+
+	lea	rbp,[4+rbp]
+	mov	r13d,DWORD[rsp]
+	mov	edi,DWORD[52+rsp]
+
+	mov	r12d,r13d
+	ror	r13d,11
+	add	ebx,r14d
+	mov	r14d,edi
+	ror	edi,2
+
+	xor	r13d,r12d
+	shr	r12d,3
+	ror	r13d,7
+	xor	edi,r14d
+	shr	r14d,10
+
+	ror	edi,17
+	xor	r12d,r13d
+	xor	edi,r14d
+	add	r12d,DWORD[32+rsp]
+
+	add	r12d,DWORD[60+rsp]
+	mov	r13d,r9d
+	add	r12d,edi
+	mov	r14d,ebx
+	ror	r13d,14
+	mov	edi,r10d
+
+	xor	r13d,r9d
+	ror	r14d,9
+	xor	edi,r11d
+
+	mov	DWORD[60+rsp],r12d
+	xor	r14d,ebx
+	and	edi,r9d
+
+	ror	r13d,5
+	add	r12d,eax
+	xor	edi,r11d
+
+	ror	r14d,11
+	xor	r13d,r9d
+	add	r12d,edi
+
+	mov	edi,ebx
+	add	r12d,DWORD[rbp]
+	xor	r14d,ebx
+
+	xor	edi,ecx
+	ror	r13d,6
+	mov	eax,ecx
+
+	and	r15d,edi
+	ror	r14d,2
+	add	r12d,r13d
+
+	xor	eax,r15d
+	add	r8d,r12d
+	add	eax,r12d
+
+	lea	rbp,[20+rbp]
+	cmp	BYTE[3+rbp],0
+	jnz	NEAR $L$rounds_16_xx
+
+	mov	rdi,QWORD[((64+0))+rsp]
+	add	eax,r14d
+	lea	rsi,[64+rsi]
+
+	add	eax,DWORD[rdi]
+	add	ebx,DWORD[4+rdi]
+	add	ecx,DWORD[8+rdi]
+	add	edx,DWORD[12+rdi]
+	add	r8d,DWORD[16+rdi]
+	add	r9d,DWORD[20+rdi]
+	add	r10d,DWORD[24+rdi]
+	add	r11d,DWORD[28+rdi]
+
+	cmp	rsi,QWORD[((64+16))+rsp]
+
+	mov	DWORD[rdi],eax
+	mov	DWORD[4+rdi],ebx
+	mov	DWORD[8+rdi],ecx
+	mov	DWORD[12+rdi],edx
+	mov	DWORD[16+rdi],r8d
+	mov	DWORD[20+rdi],r9d
+	mov	DWORD[24+rdi],r10d
+	mov	DWORD[28+rdi],r11d
+	jb	NEAR $L$loop
+
+	mov	rsi,QWORD[88+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_sha256_block_data_order:
+ALIGN	64
+
+K256:
+	DD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	DD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	DD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	DD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	DD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	DD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	DD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	DD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	DD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	DD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	DD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	DD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	DD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	DD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	DD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	DD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	DD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	DD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	DD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	DD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	DD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	DD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	DD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	DD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	DD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	DD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	DD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	DD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	DD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	DD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	DD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	DD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+	DD	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+	DD	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+	DD	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+	DD	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+DB	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+DB	110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
+DB	52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+DB	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+DB	111,114,103,62,0
+
+ALIGN	64
+GFp_sha256_block_data_order_ssse3:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_sha256_block_data_order_ssse3:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$ssse3_shortcut:
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,160
+	lea	rdx,[rdx*4+rsi]
+	and	rsp,-64
+	mov	QWORD[((64+0))+rsp],rdi
+	mov	QWORD[((64+8))+rsp],rsi
+	mov	QWORD[((64+16))+rsp],rdx
+	mov	QWORD[88+rsp],rax
+
+	movaps	XMMWORD[(64+32)+rsp],xmm6
+	movaps	XMMWORD[(64+48)+rsp],xmm7
+	movaps	XMMWORD[(64+64)+rsp],xmm8
+	movaps	XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_ssse3:
+
+	mov	eax,DWORD[rdi]
+	mov	ebx,DWORD[4+rdi]
+	mov	ecx,DWORD[8+rdi]
+	mov	edx,DWORD[12+rdi]
+	mov	r8d,DWORD[16+rdi]
+	mov	r9d,DWORD[20+rdi]
+	mov	r10d,DWORD[24+rdi]
+	mov	r11d,DWORD[28+rdi]
+
+
+	jmp	NEAR $L$loop_ssse3
+ALIGN	16
+$L$loop_ssse3:
+	movdqa	xmm7,XMMWORD[((K256+512))]
+	movdqu	xmm0,XMMWORD[rsi]
+	movdqu	xmm1,XMMWORD[16+rsi]
+	movdqu	xmm2,XMMWORD[32+rsi]
+DB	102,15,56,0,199
+	movdqu	xmm3,XMMWORD[48+rsi]
+	lea	rbp,[K256]
+DB	102,15,56,0,207
+	movdqa	xmm4,XMMWORD[rbp]
+	movdqa	xmm5,XMMWORD[32+rbp]
+DB	102,15,56,0,215
+	paddd	xmm4,xmm0
+	movdqa	xmm6,XMMWORD[64+rbp]
+DB	102,15,56,0,223
+	movdqa	xmm7,XMMWORD[96+rbp]
+	paddd	xmm5,xmm1
+	paddd	xmm6,xmm2
+	paddd	xmm7,xmm3
+	movdqa	XMMWORD[rsp],xmm4
+	mov	r14d,eax
+	movdqa	XMMWORD[16+rsp],xmm5
+	mov	edi,ebx
+	movdqa	XMMWORD[32+rsp],xmm6
+	xor	edi,ecx
+	movdqa	XMMWORD[48+rsp],xmm7
+	mov	r13d,r8d
+	jmp	NEAR $L$ssse3_00_47
+
+ALIGN	16
+$L$ssse3_00_47:
+	sub	rbp,-128
+	ror	r13d,14
+	movdqa	xmm4,xmm1
+	mov	eax,r14d
+	mov	r12d,r9d
+	movdqa	xmm7,xmm3
+	ror	r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	ror	r13d,5
+	xor	r14d,eax
+DB	102,15,58,15,224,4
+	and	r12d,r8d
+	xor	r13d,r8d
+DB	102,15,58,15,250,4
+	add	r11d,DWORD[rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	ror	r14d,11
+	movdqa	xmm5,xmm4
+	xor	r15d,ebx
+	add	r11d,r12d
+	movdqa	xmm6,xmm4
+	ror	r13d,6
+	and	edi,r15d
+	psrld	xmm4,3
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	paddd	xmm0,xmm7
+	ror	r14d,2
+	add	edx,r11d
+	psrld	xmm6,7
+	add	r11d,edi
+	mov	r13d,edx
+	pshufd	xmm7,xmm3,250
+	add	r14d,r11d
+	ror	r13d,14
+	pslld	xmm5,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	pxor	xmm4,xmm6
+	ror	r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	ror	r13d,5
+	psrld	xmm6,11
+	xor	r14d,r11d
+	pxor	xmm4,xmm5
+	and	r12d,edx
+	xor	r13d,edx
+	pslld	xmm5,11
+	add	r10d,DWORD[4+rsp]
+	mov	edi,r11d
+	pxor	xmm4,xmm6
+	xor	r12d,r9d
+	ror	r14d,11
+	movdqa	xmm6,xmm7
+	xor	edi,eax
+	add	r10d,r12d
+	pxor	xmm4,xmm5
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	psrld	xmm7,10
+	add	r10d,r13d
+	xor	r15d,eax
+	paddd	xmm0,xmm4
+	ror	r14d,2
+	add	ecx,r10d
+	psrlq	xmm6,17
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	pxor	xmm7,xmm6
+	ror	r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	ror	r14d,9
+	psrlq	xmm6,2
+	xor	r13d,ecx
+	xor	r12d,r8d
+	pxor	xmm7,xmm6
+	ror	r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	pshufd	xmm7,xmm7,128
+	xor	r13d,ecx
+	add	r9d,DWORD[8+rsp]
+	mov	r15d,r10d
+	psrldq	xmm7,8
+	xor	r12d,r8d
+	ror	r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	ror	r13d,6
+	paddd	xmm0,xmm7
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	pshufd	xmm7,xmm0,80
+	xor	edi,r11d
+	ror	r14d,2
+	add	ebx,r9d
+	movdqa	xmm6,xmm7
+	add	r9d,edi
+	mov	r13d,ebx
+	psrld	xmm7,10
+	add	r14d,r9d
+	ror	r13d,14
+	psrlq	xmm6,17
+	mov	r9d,r14d
+	mov	r12d,ecx
+	pxor	xmm7,xmm6
+	ror	r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	ror	r13d,5
+	xor	r14d,r9d
+	psrlq	xmm6,2
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[12+rsp]
+	pxor	xmm7,xmm6
+	mov	edi,r9d
+	xor	r12d,edx
+	ror	r14d,11
+	pshufd	xmm7,xmm7,8
+	xor	edi,r10d
+	add	r8d,r12d
+	movdqa	xmm6,XMMWORD[rbp]
+	ror	r13d,6
+	and	r15d,edi
+	pslldq	xmm7,8
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	paddd	xmm0,xmm7
+	ror	r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	paddd	xmm6,xmm0
+	mov	r13d,eax
+	add	r14d,r8d
+	movdqa	XMMWORD[rsp],xmm6
+	ror	r13d,14
+	movdqa	xmm4,xmm2
+	mov	r8d,r14d
+	mov	r12d,ebx
+	movdqa	xmm7,xmm0
+	ror	r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	ror	r13d,5
+	xor	r14d,r8d
+DB	102,15,58,15,225,4
+	and	r12d,eax
+	xor	r13d,eax
+DB	102,15,58,15,251,4
+	add	edx,DWORD[16+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	ror	r14d,11
+	movdqa	xmm5,xmm4
+	xor	r15d,r9d
+	add	edx,r12d
+	movdqa	xmm6,xmm4
+	ror	r13d,6
+	and	edi,r15d
+	psrld	xmm4,3
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	paddd	xmm1,xmm7
+	ror	r14d,2
+	add	r11d,edx
+	psrld	xmm6,7
+	add	edx,edi
+	mov	r13d,r11d
+	pshufd	xmm7,xmm0,250
+	add	r14d,edx
+	ror	r13d,14
+	pslld	xmm5,14
+	mov	edx,r14d
+	mov	r12d,eax
+	pxor	xmm4,xmm6
+	ror	r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	ror	r13d,5
+	psrld	xmm6,11
+	xor	r14d,edx
+	pxor	xmm4,xmm5
+	and	r12d,r11d
+	xor	r13d,r11d
+	pslld	xmm5,11
+	add	ecx,DWORD[20+rsp]
+	mov	edi,edx
+	pxor	xmm4,xmm6
+	xor	r12d,ebx
+	ror	r14d,11
+	movdqa	xmm6,xmm7
+	xor	edi,r8d
+	add	ecx,r12d
+	pxor	xmm4,xmm5
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	psrld	xmm7,10
+	add	ecx,r13d
+	xor	r15d,r8d
+	paddd	xmm1,xmm4
+	ror	r14d,2
+	add	r10d,ecx
+	psrlq	xmm6,17
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	pxor	xmm7,xmm6
+	ror	r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	ror	r14d,9
+	psrlq	xmm6,2
+	xor	r13d,r10d
+	xor	r12d,eax
+	pxor	xmm7,xmm6
+	ror	r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	pshufd	xmm7,xmm7,128
+	xor	r13d,r10d
+	add	ebx,DWORD[24+rsp]
+	mov	r15d,ecx
+	psrldq	xmm7,8
+	xor	r12d,eax
+	ror	r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	ror	r13d,6
+	paddd	xmm1,xmm7
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	pshufd	xmm7,xmm1,80
+	xor	edi,edx
+	ror	r14d,2
+	add	r9d,ebx
+	movdqa	xmm6,xmm7
+	add	ebx,edi
+	mov	r13d,r9d
+	psrld	xmm7,10
+	add	r14d,ebx
+	ror	r13d,14
+	psrlq	xmm6,17
+	mov	ebx,r14d
+	mov	r12d,r10d
+	pxor	xmm7,xmm6
+	ror	r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	ror	r13d,5
+	xor	r14d,ebx
+	psrlq	xmm6,2
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[28+rsp]
+	pxor	xmm7,xmm6
+	mov	edi,ebx
+	xor	r12d,r11d
+	ror	r14d,11
+	pshufd	xmm7,xmm7,8
+	xor	edi,ecx
+	add	eax,r12d
+	movdqa	xmm6,XMMWORD[32+rbp]
+	ror	r13d,6
+	and	r15d,edi
+	pslldq	xmm7,8
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	paddd	xmm1,xmm7
+	ror	r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	paddd	xmm6,xmm1
+	mov	r13d,r8d
+	add	r14d,eax
+	movdqa	XMMWORD[16+rsp],xmm6
+	ror	r13d,14
+	movdqa	xmm4,xmm3
+	mov	eax,r14d
+	mov	r12d,r9d
+	movdqa	xmm7,xmm1
+	ror	r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	ror	r13d,5
+	xor	r14d,eax
+DB	102,15,58,15,226,4
+	and	r12d,r8d
+	xor	r13d,r8d
+DB	102,15,58,15,248,4
+	add	r11d,DWORD[32+rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	ror	r14d,11
+	movdqa	xmm5,xmm4
+	xor	r15d,ebx
+	add	r11d,r12d
+	movdqa	xmm6,xmm4
+	ror	r13d,6
+	and	edi,r15d
+	psrld	xmm4,3
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	paddd	xmm2,xmm7
+	ror	r14d,2
+	add	edx,r11d
+	psrld	xmm6,7
+	add	r11d,edi
+	mov	r13d,edx
+	pshufd	xmm7,xmm1,250
+	add	r14d,r11d
+	ror	r13d,14
+	pslld	xmm5,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	pxor	xmm4,xmm6
+	ror	r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	ror	r13d,5
+	psrld	xmm6,11
+	xor	r14d,r11d
+	pxor	xmm4,xmm5
+	and	r12d,edx
+	xor	r13d,edx
+	pslld	xmm5,11
+	add	r10d,DWORD[36+rsp]
+	mov	edi,r11d
+	pxor	xmm4,xmm6
+	xor	r12d,r9d
+	ror	r14d,11
+	movdqa	xmm6,xmm7
+	xor	edi,eax
+	add	r10d,r12d
+	pxor	xmm4,xmm5
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	psrld	xmm7,10
+	add	r10d,r13d
+	xor	r15d,eax
+	paddd	xmm2,xmm4
+	ror	r14d,2
+	add	ecx,r10d
+	psrlq	xmm6,17
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	pxor	xmm7,xmm6
+	ror	r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	ror	r14d,9
+	psrlq	xmm6,2
+	xor	r13d,ecx
+	xor	r12d,r8d
+	pxor	xmm7,xmm6
+	ror	r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	pshufd	xmm7,xmm7,128
+	xor	r13d,ecx
+	add	r9d,DWORD[40+rsp]
+	mov	r15d,r10d
+	psrldq	xmm7,8
+	xor	r12d,r8d
+	ror	r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	ror	r13d,6
+	paddd	xmm2,xmm7
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	pshufd	xmm7,xmm2,80
+	xor	edi,r11d
+	ror	r14d,2
+	add	ebx,r9d
+	movdqa	xmm6,xmm7
+	add	r9d,edi
+	mov	r13d,ebx
+	psrld	xmm7,10
+	add	r14d,r9d
+	ror	r13d,14
+	psrlq	xmm6,17
+	mov	r9d,r14d
+	mov	r12d,ecx
+	pxor	xmm7,xmm6
+	ror	r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	ror	r13d,5
+	xor	r14d,r9d
+	psrlq	xmm6,2
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[44+rsp]
+	pxor	xmm7,xmm6
+	mov	edi,r9d
+	xor	r12d,edx
+	ror	r14d,11
+	pshufd	xmm7,xmm7,8
+	xor	edi,r10d
+	add	r8d,r12d
+	movdqa	xmm6,XMMWORD[64+rbp]
+	ror	r13d,6
+	and	r15d,edi
+	pslldq	xmm7,8
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	paddd	xmm2,xmm7
+	ror	r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	paddd	xmm6,xmm2
+	mov	r13d,eax
+	add	r14d,r8d
+	movdqa	XMMWORD[32+rsp],xmm6
+	ror	r13d,14
+	movdqa	xmm4,xmm0
+	mov	r8d,r14d
+	mov	r12d,ebx
+	movdqa	xmm7,xmm2
+	ror	r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	ror	r13d,5
+	xor	r14d,r8d
+DB	102,15,58,15,227,4
+	and	r12d,eax
+	xor	r13d,eax
+DB	102,15,58,15,249,4
+	add	edx,DWORD[48+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	ror	r14d,11
+	movdqa	xmm5,xmm4
+	xor	r15d,r9d
+	add	edx,r12d
+	movdqa	xmm6,xmm4
+	ror	r13d,6
+	and	edi,r15d
+	psrld	xmm4,3
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	paddd	xmm3,xmm7
+	ror	r14d,2
+	add	r11d,edx
+	psrld	xmm6,7
+	add	edx,edi
+	mov	r13d,r11d
+	pshufd	xmm7,xmm2,250
+	add	r14d,edx
+	ror	r13d,14
+	pslld	xmm5,14
+	mov	edx,r14d
+	mov	r12d,eax
+	pxor	xmm4,xmm6
+	ror	r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	ror	r13d,5
+	psrld	xmm6,11
+	xor	r14d,edx
+	pxor	xmm4,xmm5
+	and	r12d,r11d
+	xor	r13d,r11d
+	pslld	xmm5,11
+	add	ecx,DWORD[52+rsp]
+	mov	edi,edx
+	pxor	xmm4,xmm6
+	xor	r12d,ebx
+	ror	r14d,11
+	movdqa	xmm6,xmm7
+	xor	edi,r8d
+	add	ecx,r12d
+	pxor	xmm4,xmm5
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	psrld	xmm7,10
+	add	ecx,r13d
+	xor	r15d,r8d
+	paddd	xmm3,xmm4
+	ror	r14d,2
+	add	r10d,ecx
+	psrlq	xmm6,17
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	pxor	xmm7,xmm6
+	ror	r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	ror	r14d,9
+	psrlq	xmm6,2
+	xor	r13d,r10d
+	xor	r12d,eax
+	pxor	xmm7,xmm6
+	ror	r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	pshufd	xmm7,xmm7,128
+	xor	r13d,r10d
+	add	ebx,DWORD[56+rsp]
+	mov	r15d,ecx
+	psrldq	xmm7,8
+	xor	r12d,eax
+	ror	r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	ror	r13d,6
+	paddd	xmm3,xmm7
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	pshufd	xmm7,xmm3,80
+	xor	edi,edx
+	ror	r14d,2
+	add	r9d,ebx
+	movdqa	xmm6,xmm7
+	add	ebx,edi
+	mov	r13d,r9d
+	psrld	xmm7,10
+	add	r14d,ebx
+	ror	r13d,14
+	psrlq	xmm6,17
+	mov	ebx,r14d
+	mov	r12d,r10d
+	pxor	xmm7,xmm6
+	ror	r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	ror	r13d,5
+	xor	r14d,ebx
+	psrlq	xmm6,2
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[60+rsp]
+	pxor	xmm7,xmm6
+	mov	edi,ebx
+	xor	r12d,r11d
+	ror	r14d,11
+	pshufd	xmm7,xmm7,8
+	xor	edi,ecx
+	add	eax,r12d
+	movdqa	xmm6,XMMWORD[96+rbp]
+	ror	r13d,6
+	and	r15d,edi
+	pslldq	xmm7,8
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	paddd	xmm3,xmm7
+	ror	r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	paddd	xmm6,xmm3
+	mov	r13d,r8d
+	add	r14d,eax
+	movdqa	XMMWORD[48+rsp],xmm6
+	cmp	BYTE[131+rbp],0
+	jne	NEAR $L$ssse3_00_47
+	ror	r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	ror	r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	ror	r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	xor	r13d,r8d
+	add	r11d,DWORD[rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	ror	r14d,11
+	xor	r15d,ebx
+	add	r11d,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	ror	r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	mov	r13d,edx
+	add	r14d,r11d
+	ror	r13d,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	ror	r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	ror	r13d,5
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	add	r10d,DWORD[4+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	ror	r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	add	r10d,r13d
+	xor	r15d,eax
+	ror	r14d,2
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	ror	r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	ror	r14d,9
+	xor	r13d,ecx
+	xor	r12d,r8d
+	ror	r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[8+rsp]
+	mov	r15d,r10d
+	xor	r12d,r8d
+	ror	r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	xor	edi,r11d
+	ror	r14d,2
+	add	ebx,r9d
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	ror	r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	ror	r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	ror	r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[12+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	ror	r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	ror	r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	ror	r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	ror	r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	ror	r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	xor	r13d,eax
+	add	edx,DWORD[16+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	ror	r14d,11
+	xor	r15d,r9d
+	add	edx,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	ror	r14d,2
+	add	r11d,edx
+	add	edx,edi
+	mov	r13d,r11d
+	add	r14d,edx
+	ror	r13d,14
+	mov	edx,r14d
+	mov	r12d,eax
+	ror	r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	ror	r13d,5
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	add	ecx,DWORD[20+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	ror	r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	add	ecx,r13d
+	xor	r15d,r8d
+	ror	r14d,2
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	ror	r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	ror	r14d,9
+	xor	r13d,r10d
+	xor	r12d,eax
+	ror	r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[24+rsp]
+	mov	r15d,ecx
+	xor	r12d,eax
+	ror	r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	xor	edi,edx
+	ror	r14d,2
+	add	r9d,ebx
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	ror	r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	ror	r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	ror	r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[28+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	ror	r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	ror	r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	ror	r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	ror	r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	ror	r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	xor	r13d,r8d
+	add	r11d,DWORD[32+rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	ror	r14d,11
+	xor	r15d,ebx
+	add	r11d,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	ror	r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	mov	r13d,edx
+	add	r14d,r11d
+	ror	r13d,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	ror	r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	ror	r13d,5
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	add	r10d,DWORD[36+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	ror	r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	add	r10d,r13d
+	xor	r15d,eax
+	ror	r14d,2
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	ror	r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	ror	r14d,9
+	xor	r13d,ecx
+	xor	r12d,r8d
+	ror	r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[40+rsp]
+	mov	r15d,r10d
+	xor	r12d,r8d
+	ror	r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	xor	edi,r11d
+	ror	r14d,2
+	add	ebx,r9d
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	ror	r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	ror	r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	ror	r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[44+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	ror	r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	ror	r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	ror	r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	ror	r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	ror	r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	xor	r13d,eax
+	add	edx,DWORD[48+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	ror	r14d,11
+	xor	r15d,r9d
+	add	edx,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	ror	r14d,2
+	add	r11d,edx
+	add	edx,edi
+	mov	r13d,r11d
+	add	r14d,edx
+	ror	r13d,14
+	mov	edx,r14d
+	mov	r12d,eax
+	ror	r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	ror	r13d,5
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	add	ecx,DWORD[52+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	ror	r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	add	ecx,r13d
+	xor	r15d,r8d
+	ror	r14d,2
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	ror	r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	ror	r14d,9
+	xor	r13d,r10d
+	xor	r12d,eax
+	ror	r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[56+rsp]
+	mov	r15d,ecx
+	xor	r12d,eax
+	ror	r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	ror	r13d,6
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	xor	edi,edx
+	ror	r14d,2
+	add	r9d,ebx
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	ror	r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	ror	r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	ror	r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[60+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	ror	r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	ror	r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	ror	r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	mov	rdi,QWORD[((64+0))+rsp]
+	mov	eax,r14d
+
+	add	eax,DWORD[rdi]
+	lea	rsi,[64+rsi]
+	add	ebx,DWORD[4+rdi]
+	add	ecx,DWORD[8+rdi]
+	add	edx,DWORD[12+rdi]
+	add	r8d,DWORD[16+rdi]
+	add	r9d,DWORD[20+rdi]
+	add	r10d,DWORD[24+rdi]
+	add	r11d,DWORD[28+rdi]
+
+	cmp	rsi,QWORD[((64+16))+rsp]
+
+	mov	DWORD[rdi],eax
+	mov	DWORD[4+rdi],ebx
+	mov	DWORD[8+rdi],ecx
+	mov	DWORD[12+rdi],edx
+	mov	DWORD[16+rdi],r8d
+	mov	DWORD[20+rdi],r9d
+	mov	DWORD[24+rdi],r10d
+	mov	DWORD[28+rdi],r11d
+	jb	NEAR $L$loop_ssse3
+
+	mov	rsi,QWORD[88+rsp]
+
+	movaps	xmm6,XMMWORD[((64+32))+rsp]
+	movaps	xmm7,XMMWORD[((64+48))+rsp]
+	movaps	xmm8,XMMWORD[((64+64))+rsp]
+	movaps	xmm9,XMMWORD[((64+80))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue_ssse3:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_sha256_block_data_order_ssse3:
+
+ALIGN	64
+GFp_sha256_block_data_order_avx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_sha256_block_data_order_avx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$avx_shortcut:
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,160
+	lea	rdx,[rdx*4+rsi]
+	and	rsp,-64
+	mov	QWORD[((64+0))+rsp],rdi
+	mov	QWORD[((64+8))+rsp],rsi
+	mov	QWORD[((64+16))+rsp],rdx
+	mov	QWORD[88+rsp],rax
+
+	movaps	XMMWORD[(64+32)+rsp],xmm6
+	movaps	XMMWORD[(64+48)+rsp],xmm7
+	movaps	XMMWORD[(64+64)+rsp],xmm8
+	movaps	XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_avx:
+
+	vzeroupper
+	mov	eax,DWORD[rdi]
+	mov	ebx,DWORD[4+rdi]
+	mov	ecx,DWORD[8+rdi]
+	mov	edx,DWORD[12+rdi]
+	mov	r8d,DWORD[16+rdi]
+	mov	r9d,DWORD[20+rdi]
+	mov	r10d,DWORD[24+rdi]
+	mov	r11d,DWORD[28+rdi]
+	vmovdqa	xmm8,XMMWORD[((K256+512+32))]
+	vmovdqa	xmm9,XMMWORD[((K256+512+64))]
+	jmp	NEAR $L$loop_avx
+ALIGN	16
+$L$loop_avx:
+	vmovdqa	xmm7,XMMWORD[((K256+512))]
+	vmovdqu	xmm0,XMMWORD[rsi]
+	vmovdqu	xmm1,XMMWORD[16+rsi]
+	vmovdqu	xmm2,XMMWORD[32+rsi]
+	vmovdqu	xmm3,XMMWORD[48+rsi]
+	vpshufb	xmm0,xmm0,xmm7
+	lea	rbp,[K256]
+	vpshufb	xmm1,xmm1,xmm7
+	vpshufb	xmm2,xmm2,xmm7
+	vpaddd	xmm4,xmm0,XMMWORD[rbp]
+	vpshufb	xmm3,xmm3,xmm7
+	vpaddd	xmm5,xmm1,XMMWORD[32+rbp]
+	vpaddd	xmm6,xmm2,XMMWORD[64+rbp]
+	vpaddd	xmm7,xmm3,XMMWORD[96+rbp]
+	vmovdqa	XMMWORD[rsp],xmm4
+	mov	r14d,eax
+	vmovdqa	XMMWORD[16+rsp],xmm5
+	mov	edi,ebx
+	vmovdqa	XMMWORD[32+rsp],xmm6
+	xor	edi,ecx
+	vmovdqa	XMMWORD[48+rsp],xmm7
+	mov	r13d,r8d
+	jmp	NEAR $L$avx_00_47
+
+ALIGN	16
+$L$avx_00_47:
+	sub	rbp,-128
+	vpalignr	xmm4,xmm1,xmm0,4
+	shrd	r13d,r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	vpalignr	xmm7,xmm3,xmm2,4
+	shrd	r14d,r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	vpsrld	xmm6,xmm4,7
+	shrd	r13d,r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	vpaddd	xmm0,xmm0,xmm7
+	xor	r13d,r8d
+	add	r11d,DWORD[rsp]
+	mov	r15d,eax
+	vpsrld	xmm7,xmm4,3
+	xor	r12d,r10d
+	shrd	r14d,r14d,11
+	xor	r15d,ebx
+	vpslld	xmm5,xmm4,14
+	add	r11d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	vpxor	xmm4,xmm7,xmm6
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	vpshufd	xmm7,xmm3,250
+	shrd	r14d,r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	vpsrld	xmm6,xmm6,11
+	mov	r13d,edx
+	add	r14d,r11d
+	shrd	r13d,r13d,14
+	vpxor	xmm4,xmm4,xmm5
+	mov	r11d,r14d
+	mov	r12d,r8d
+	shrd	r14d,r14d,9
+	vpslld	xmm5,xmm5,11
+	xor	r13d,edx
+	xor	r12d,r9d
+	shrd	r13d,r13d,5
+	vpxor	xmm4,xmm4,xmm6
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	vpsrld	xmm6,xmm7,10
+	add	r10d,DWORD[4+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	vpxor	xmm4,xmm4,xmm5
+	shrd	r14d,r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	vpsrlq	xmm7,xmm7,17
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	vpaddd	xmm0,xmm0,xmm4
+	add	r10d,r13d
+	xor	r15d,eax
+	shrd	r14d,r14d,2
+	vpxor	xmm6,xmm6,xmm7
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	vpsrlq	xmm7,xmm7,2
+	add	r14d,r10d
+	shrd	r13d,r13d,14
+	mov	r10d,r14d
+	vpxor	xmm6,xmm6,xmm7
+	mov	r12d,edx
+	shrd	r14d,r14d,9
+	xor	r13d,ecx
+	vpshufb	xmm6,xmm6,xmm8
+	xor	r12d,r8d
+	shrd	r13d,r13d,5
+	xor	r14d,r10d
+	vpaddd	xmm0,xmm0,xmm6
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[8+rsp]
+	vpshufd	xmm7,xmm0,80
+	mov	r15d,r10d
+	xor	r12d,r8d
+	shrd	r14d,r14d,11
+	vpsrld	xmm6,xmm7,10
+	xor	r15d,r11d
+	add	r9d,r12d
+	shrd	r13d,r13d,6
+	vpsrlq	xmm7,xmm7,17
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	vpxor	xmm6,xmm6,xmm7
+	xor	edi,r11d
+	shrd	r14d,r14d,2
+	add	ebx,r9d
+	vpsrlq	xmm7,xmm7,2
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	vpxor	xmm6,xmm6,xmm7
+	shrd	r13d,r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	vpshufb	xmm6,xmm6,xmm9
+	shrd	r14d,r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	vpaddd	xmm0,xmm0,xmm6
+	shrd	r13d,r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	vpaddd	xmm6,xmm0,XMMWORD[rbp]
+	xor	r13d,ebx
+	add	r8d,DWORD[12+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	shrd	r14d,r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	shrd	r14d,r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	vmovdqa	XMMWORD[rsp],xmm6
+	vpalignr	xmm4,xmm2,xmm1,4
+	shrd	r13d,r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	vpalignr	xmm7,xmm0,xmm3,4
+	shrd	r14d,r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	vpsrld	xmm6,xmm4,7
+	shrd	r13d,r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	vpaddd	xmm1,xmm1,xmm7
+	xor	r13d,eax
+	add	edx,DWORD[16+rsp]
+	mov	r15d,r8d
+	vpsrld	xmm7,xmm4,3
+	xor	r12d,ecx
+	shrd	r14d,r14d,11
+	xor	r15d,r9d
+	vpslld	xmm5,xmm4,14
+	add	edx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	vpxor	xmm4,xmm7,xmm6
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	vpshufd	xmm7,xmm0,250
+	shrd	r14d,r14d,2
+	add	r11d,edx
+	add	edx,edi
+	vpsrld	xmm6,xmm6,11
+	mov	r13d,r11d
+	add	r14d,edx
+	shrd	r13d,r13d,14
+	vpxor	xmm4,xmm4,xmm5
+	mov	edx,r14d
+	mov	r12d,eax
+	shrd	r14d,r14d,9
+	vpslld	xmm5,xmm5,11
+	xor	r13d,r11d
+	xor	r12d,ebx
+	shrd	r13d,r13d,5
+	vpxor	xmm4,xmm4,xmm6
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	vpsrld	xmm6,xmm7,10
+	add	ecx,DWORD[20+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	vpxor	xmm4,xmm4,xmm5
+	shrd	r14d,r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	vpsrlq	xmm7,xmm7,17
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	vpaddd	xmm1,xmm1,xmm4
+	add	ecx,r13d
+	xor	r15d,r8d
+	shrd	r14d,r14d,2
+	vpxor	xmm6,xmm6,xmm7
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	vpsrlq	xmm7,xmm7,2
+	add	r14d,ecx
+	shrd	r13d,r13d,14
+	mov	ecx,r14d
+	vpxor	xmm6,xmm6,xmm7
+	mov	r12d,r11d
+	shrd	r14d,r14d,9
+	xor	r13d,r10d
+	vpshufb	xmm6,xmm6,xmm8
+	xor	r12d,eax
+	shrd	r13d,r13d,5
+	xor	r14d,ecx
+	vpaddd	xmm1,xmm1,xmm6
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[24+rsp]
+	vpshufd	xmm7,xmm1,80
+	mov	r15d,ecx
+	xor	r12d,eax
+	shrd	r14d,r14d,11
+	vpsrld	xmm6,xmm7,10
+	xor	r15d,edx
+	add	ebx,r12d
+	shrd	r13d,r13d,6
+	vpsrlq	xmm7,xmm7,17
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	vpxor	xmm6,xmm6,xmm7
+	xor	edi,edx
+	shrd	r14d,r14d,2
+	add	r9d,ebx
+	vpsrlq	xmm7,xmm7,2
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	vpxor	xmm6,xmm6,xmm7
+	shrd	r13d,r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	vpshufb	xmm6,xmm6,xmm9
+	shrd	r14d,r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	vpaddd	xmm1,xmm1,xmm6
+	shrd	r13d,r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	vpaddd	xmm6,xmm1,XMMWORD[32+rbp]
+	xor	r13d,r9d
+	add	eax,DWORD[28+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	shrd	r14d,r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	shrd	r14d,r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	vmovdqa	XMMWORD[16+rsp],xmm6
+	vpalignr	xmm4,xmm3,xmm2,4
+	shrd	r13d,r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	vpalignr	xmm7,xmm1,xmm0,4
+	shrd	r14d,r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	vpsrld	xmm6,xmm4,7
+	shrd	r13d,r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	vpaddd	xmm2,xmm2,xmm7
+	xor	r13d,r8d
+	add	r11d,DWORD[32+rsp]
+	mov	r15d,eax
+	vpsrld	xmm7,xmm4,3
+	xor	r12d,r10d
+	shrd	r14d,r14d,11
+	xor	r15d,ebx
+	vpslld	xmm5,xmm4,14
+	add	r11d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	vpxor	xmm4,xmm7,xmm6
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	vpshufd	xmm7,xmm1,250
+	shrd	r14d,r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	vpsrld	xmm6,xmm6,11
+	mov	r13d,edx
+	add	r14d,r11d
+	shrd	r13d,r13d,14
+	vpxor	xmm4,xmm4,xmm5
+	mov	r11d,r14d
+	mov	r12d,r8d
+	shrd	r14d,r14d,9
+	vpslld	xmm5,xmm5,11
+	xor	r13d,edx
+	xor	r12d,r9d
+	shrd	r13d,r13d,5
+	vpxor	xmm4,xmm4,xmm6
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	vpsrld	xmm6,xmm7,10
+	add	r10d,DWORD[36+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	vpxor	xmm4,xmm4,xmm5
+	shrd	r14d,r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	vpsrlq	xmm7,xmm7,17
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	vpaddd	xmm2,xmm2,xmm4
+	add	r10d,r13d
+	xor	r15d,eax
+	shrd	r14d,r14d,2
+	vpxor	xmm6,xmm6,xmm7
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	vpsrlq	xmm7,xmm7,2
+	add	r14d,r10d
+	shrd	r13d,r13d,14
+	mov	r10d,r14d
+	vpxor	xmm6,xmm6,xmm7
+	mov	r12d,edx
+	shrd	r14d,r14d,9
+	xor	r13d,ecx
+	vpshufb	xmm6,xmm6,xmm8
+	xor	r12d,r8d
+	shrd	r13d,r13d,5
+	xor	r14d,r10d
+	vpaddd	xmm2,xmm2,xmm6
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[40+rsp]
+	vpshufd	xmm7,xmm2,80
+	mov	r15d,r10d
+	xor	r12d,r8d
+	shrd	r14d,r14d,11
+	vpsrld	xmm6,xmm7,10
+	xor	r15d,r11d
+	add	r9d,r12d
+	shrd	r13d,r13d,6
+	vpsrlq	xmm7,xmm7,17
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	vpxor	xmm6,xmm6,xmm7
+	xor	edi,r11d
+	shrd	r14d,r14d,2
+	add	ebx,r9d
+	vpsrlq	xmm7,xmm7,2
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	vpxor	xmm6,xmm6,xmm7
+	shrd	r13d,r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	vpshufb	xmm6,xmm6,xmm9
+	shrd	r14d,r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	vpaddd	xmm2,xmm2,xmm6
+	shrd	r13d,r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	vpaddd	xmm6,xmm2,XMMWORD[64+rbp]
+	xor	r13d,ebx
+	add	r8d,DWORD[44+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	shrd	r14d,r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	shrd	r14d,r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	vmovdqa	XMMWORD[32+rsp],xmm6
+	vpalignr	xmm4,xmm0,xmm3,4
+	shrd	r13d,r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	vpalignr	xmm7,xmm2,xmm1,4
+	shrd	r14d,r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	vpsrld	xmm6,xmm4,7
+	shrd	r13d,r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	vpaddd	xmm3,xmm3,xmm7
+	xor	r13d,eax
+	add	edx,DWORD[48+rsp]
+	mov	r15d,r8d
+	vpsrld	xmm7,xmm4,3
+	xor	r12d,ecx
+	shrd	r14d,r14d,11
+	xor	r15d,r9d
+	vpslld	xmm5,xmm4,14
+	add	edx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	vpxor	xmm4,xmm7,xmm6
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	vpshufd	xmm7,xmm2,250
+	shrd	r14d,r14d,2
+	add	r11d,edx
+	add	edx,edi
+	vpsrld	xmm6,xmm6,11
+	mov	r13d,r11d
+	add	r14d,edx
+	shrd	r13d,r13d,14
+	vpxor	xmm4,xmm4,xmm5
+	mov	edx,r14d
+	mov	r12d,eax
+	shrd	r14d,r14d,9
+	vpslld	xmm5,xmm5,11
+	xor	r13d,r11d
+	xor	r12d,ebx
+	shrd	r13d,r13d,5
+	vpxor	xmm4,xmm4,xmm6
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	vpsrld	xmm6,xmm7,10
+	add	ecx,DWORD[52+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	vpxor	xmm4,xmm4,xmm5
+	shrd	r14d,r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	vpsrlq	xmm7,xmm7,17
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	vpaddd	xmm3,xmm3,xmm4
+	add	ecx,r13d
+	xor	r15d,r8d
+	shrd	r14d,r14d,2
+	vpxor	xmm6,xmm6,xmm7
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	vpsrlq	xmm7,xmm7,2
+	add	r14d,ecx
+	shrd	r13d,r13d,14
+	mov	ecx,r14d
+	vpxor	xmm6,xmm6,xmm7
+	mov	r12d,r11d
+	shrd	r14d,r14d,9
+	xor	r13d,r10d
+	vpshufb	xmm6,xmm6,xmm8
+	xor	r12d,eax
+	shrd	r13d,r13d,5
+	xor	r14d,ecx
+	vpaddd	xmm3,xmm3,xmm6
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[56+rsp]
+	vpshufd	xmm7,xmm3,80
+	mov	r15d,ecx
+	xor	r12d,eax
+	shrd	r14d,r14d,11
+	vpsrld	xmm6,xmm7,10
+	xor	r15d,edx
+	add	ebx,r12d
+	shrd	r13d,r13d,6
+	vpsrlq	xmm7,xmm7,17
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	vpxor	xmm6,xmm6,xmm7
+	xor	edi,edx
+	shrd	r14d,r14d,2
+	add	r9d,ebx
+	vpsrlq	xmm7,xmm7,2
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	vpxor	xmm6,xmm6,xmm7
+	shrd	r13d,r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	vpshufb	xmm6,xmm6,xmm9
+	shrd	r14d,r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	vpaddd	xmm3,xmm3,xmm6
+	shrd	r13d,r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	vpaddd	xmm6,xmm3,XMMWORD[96+rbp]
+	xor	r13d,r9d
+	add	eax,DWORD[60+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	shrd	r14d,r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	shrd	r14d,r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	vmovdqa	XMMWORD[48+rsp],xmm6
+	cmp	BYTE[131+rbp],0
+	jne	NEAR $L$avx_00_47
+	shrd	r13d,r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	shrd	r14d,r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	shrd	r13d,r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	xor	r13d,r8d
+	add	r11d,DWORD[rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	shrd	r14d,r14d,11
+	xor	r15d,ebx
+	add	r11d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	shrd	r14d,r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	mov	r13d,edx
+	add	r14d,r11d
+	shrd	r13d,r13d,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	shrd	r14d,r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	shrd	r13d,r13d,5
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	add	r10d,DWORD[4+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	shrd	r14d,r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	add	r10d,r13d
+	xor	r15d,eax
+	shrd	r14d,r14d,2
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	shrd	r13d,r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	shrd	r14d,r14d,9
+	xor	r13d,ecx
+	xor	r12d,r8d
+	shrd	r13d,r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[8+rsp]
+	mov	r15d,r10d
+	xor	r12d,r8d
+	shrd	r14d,r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	xor	edi,r11d
+	shrd	r14d,r14d,2
+	add	ebx,r9d
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	shrd	r13d,r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	shrd	r14d,r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	shrd	r13d,r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[12+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	shrd	r14d,r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	shrd	r14d,r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	shrd	r13d,r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	shrd	r14d,r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	shrd	r13d,r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	xor	r13d,eax
+	add	edx,DWORD[16+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	shrd	r14d,r14d,11
+	xor	r15d,r9d
+	add	edx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	shrd	r14d,r14d,2
+	add	r11d,edx
+	add	edx,edi
+	mov	r13d,r11d
+	add	r14d,edx
+	shrd	r13d,r13d,14
+	mov	edx,r14d
+	mov	r12d,eax
+	shrd	r14d,r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	shrd	r13d,r13d,5
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	add	ecx,DWORD[20+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	shrd	r14d,r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	add	ecx,r13d
+	xor	r15d,r8d
+	shrd	r14d,r14d,2
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	shrd	r13d,r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	shrd	r14d,r14d,9
+	xor	r13d,r10d
+	xor	r12d,eax
+	shrd	r13d,r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[24+rsp]
+	mov	r15d,ecx
+	xor	r12d,eax
+	shrd	r14d,r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	xor	edi,edx
+	shrd	r14d,r14d,2
+	add	r9d,ebx
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	shrd	r13d,r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	shrd	r14d,r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	shrd	r13d,r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[28+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	shrd	r14d,r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	shrd	r14d,r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	shrd	r13d,r13d,14
+	mov	eax,r14d
+	mov	r12d,r9d
+	shrd	r14d,r14d,9
+	xor	r13d,r8d
+	xor	r12d,r10d
+	shrd	r13d,r13d,5
+	xor	r14d,eax
+	and	r12d,r8d
+	xor	r13d,r8d
+	add	r11d,DWORD[32+rsp]
+	mov	r15d,eax
+	xor	r12d,r10d
+	shrd	r14d,r14d,11
+	xor	r15d,ebx
+	add	r11d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,eax
+	add	r11d,r13d
+	xor	edi,ebx
+	shrd	r14d,r14d,2
+	add	edx,r11d
+	add	r11d,edi
+	mov	r13d,edx
+	add	r14d,r11d
+	shrd	r13d,r13d,14
+	mov	r11d,r14d
+	mov	r12d,r8d
+	shrd	r14d,r14d,9
+	xor	r13d,edx
+	xor	r12d,r9d
+	shrd	r13d,r13d,5
+	xor	r14d,r11d
+	and	r12d,edx
+	xor	r13d,edx
+	add	r10d,DWORD[36+rsp]
+	mov	edi,r11d
+	xor	r12d,r9d
+	shrd	r14d,r14d,11
+	xor	edi,eax
+	add	r10d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r11d
+	add	r10d,r13d
+	xor	r15d,eax
+	shrd	r14d,r14d,2
+	add	ecx,r10d
+	add	r10d,r15d
+	mov	r13d,ecx
+	add	r14d,r10d
+	shrd	r13d,r13d,14
+	mov	r10d,r14d
+	mov	r12d,edx
+	shrd	r14d,r14d,9
+	xor	r13d,ecx
+	xor	r12d,r8d
+	shrd	r13d,r13d,5
+	xor	r14d,r10d
+	and	r12d,ecx
+	xor	r13d,ecx
+	add	r9d,DWORD[40+rsp]
+	mov	r15d,r10d
+	xor	r12d,r8d
+	shrd	r14d,r14d,11
+	xor	r15d,r11d
+	add	r9d,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,r10d
+	add	r9d,r13d
+	xor	edi,r11d
+	shrd	r14d,r14d,2
+	add	ebx,r9d
+	add	r9d,edi
+	mov	r13d,ebx
+	add	r14d,r9d
+	shrd	r13d,r13d,14
+	mov	r9d,r14d
+	mov	r12d,ecx
+	shrd	r14d,r14d,9
+	xor	r13d,ebx
+	xor	r12d,edx
+	shrd	r13d,r13d,5
+	xor	r14d,r9d
+	and	r12d,ebx
+	xor	r13d,ebx
+	add	r8d,DWORD[44+rsp]
+	mov	edi,r9d
+	xor	r12d,edx
+	shrd	r14d,r14d,11
+	xor	edi,r10d
+	add	r8d,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,r9d
+	add	r8d,r13d
+	xor	r15d,r10d
+	shrd	r14d,r14d,2
+	add	eax,r8d
+	add	r8d,r15d
+	mov	r13d,eax
+	add	r14d,r8d
+	shrd	r13d,r13d,14
+	mov	r8d,r14d
+	mov	r12d,ebx
+	shrd	r14d,r14d,9
+	xor	r13d,eax
+	xor	r12d,ecx
+	shrd	r13d,r13d,5
+	xor	r14d,r8d
+	and	r12d,eax
+	xor	r13d,eax
+	add	edx,DWORD[48+rsp]
+	mov	r15d,r8d
+	xor	r12d,ecx
+	shrd	r14d,r14d,11
+	xor	r15d,r9d
+	add	edx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,r8d
+	add	edx,r13d
+	xor	edi,r9d
+	shrd	r14d,r14d,2
+	add	r11d,edx
+	add	edx,edi
+	mov	r13d,r11d
+	add	r14d,edx
+	shrd	r13d,r13d,14
+	mov	edx,r14d
+	mov	r12d,eax
+	shrd	r14d,r14d,9
+	xor	r13d,r11d
+	xor	r12d,ebx
+	shrd	r13d,r13d,5
+	xor	r14d,edx
+	and	r12d,r11d
+	xor	r13d,r11d
+	add	ecx,DWORD[52+rsp]
+	mov	edi,edx
+	xor	r12d,ebx
+	shrd	r14d,r14d,11
+	xor	edi,r8d
+	add	ecx,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,edx
+	add	ecx,r13d
+	xor	r15d,r8d
+	shrd	r14d,r14d,2
+	add	r10d,ecx
+	add	ecx,r15d
+	mov	r13d,r10d
+	add	r14d,ecx
+	shrd	r13d,r13d,14
+	mov	ecx,r14d
+	mov	r12d,r11d
+	shrd	r14d,r14d,9
+	xor	r13d,r10d
+	xor	r12d,eax
+	shrd	r13d,r13d,5
+	xor	r14d,ecx
+	and	r12d,r10d
+	xor	r13d,r10d
+	add	ebx,DWORD[56+rsp]
+	mov	r15d,ecx
+	xor	r12d,eax
+	shrd	r14d,r14d,11
+	xor	r15d,edx
+	add	ebx,r12d
+	shrd	r13d,r13d,6
+	and	edi,r15d
+	xor	r14d,ecx
+	add	ebx,r13d
+	xor	edi,edx
+	shrd	r14d,r14d,2
+	add	r9d,ebx
+	add	ebx,edi
+	mov	r13d,r9d
+	add	r14d,ebx
+	shrd	r13d,r13d,14
+	mov	ebx,r14d
+	mov	r12d,r10d
+	shrd	r14d,r14d,9
+	xor	r13d,r9d
+	xor	r12d,r11d
+	shrd	r13d,r13d,5
+	xor	r14d,ebx
+	and	r12d,r9d
+	xor	r13d,r9d
+	add	eax,DWORD[60+rsp]
+	mov	edi,ebx
+	xor	r12d,r11d
+	shrd	r14d,r14d,11
+	xor	edi,ecx
+	add	eax,r12d
+	shrd	r13d,r13d,6
+	and	r15d,edi
+	xor	r14d,ebx
+	add	eax,r13d
+	xor	r15d,ecx
+	shrd	r14d,r14d,2
+	add	r8d,eax
+	add	eax,r15d
+	mov	r13d,r8d
+	add	r14d,eax
+	mov	rdi,QWORD[((64+0))+rsp]
+	mov	eax,r14d
+
+	add	eax,DWORD[rdi]
+	lea	rsi,[64+rsi]
+	add	ebx,DWORD[4+rdi]
+	add	ecx,DWORD[8+rdi]
+	add	edx,DWORD[12+rdi]
+	add	r8d,DWORD[16+rdi]
+	add	r9d,DWORD[20+rdi]
+	add	r10d,DWORD[24+rdi]
+	add	r11d,DWORD[28+rdi]
+
+	cmp	rsi,QWORD[((64+16))+rsp]
+
+	mov	DWORD[rdi],eax
+	mov	DWORD[4+rdi],ebx
+	mov	DWORD[8+rdi],ecx
+	mov	DWORD[12+rdi],edx
+	mov	DWORD[16+rdi],r8d
+	mov	DWORD[20+rdi],r9d
+	mov	DWORD[24+rdi],r10d
+	mov	DWORD[28+rdi],r11d
+	jb	NEAR $L$loop_avx
+
+	mov	rsi,QWORD[88+rsp]
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((64+32))+rsp]
+	movaps	xmm7,XMMWORD[((64+48))+rsp]
+	movaps	xmm8,XMMWORD[((64+64))+rsp]
+	movaps	xmm9,XMMWORD[((64+80))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue_avx:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_sha256_block_data_order_avx:
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+	mov	rsi,rax
+	mov	rax,QWORD[((64+24))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+	lea	r10,[$L$epilogue]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	lea	rsi,[((64+32))+rsi]
+	lea	rdi,[512+r8]
+	mov	ecx,8
+	DD	0xa548f3fc
+
+$L$in_prologue:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_sha256_block_data_order wrt ..imagebase
+	DD	$L$SEH_end_GFp_sha256_block_data_order wrt ..imagebase
+	DD	$L$SEH_info_GFp_sha256_block_data_order wrt ..imagebase
+	DD	$L$SEH_begin_GFp_sha256_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_end_GFp_sha256_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_info_GFp_sha256_block_data_order_ssse3 wrt ..imagebase
+	DD	$L$SEH_begin_GFp_sha256_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_end_GFp_sha256_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_info_GFp_sha256_block_data_order_avx wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_GFp_sha256_block_data_order:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_GFp_sha256_block_data_order_ssse3:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
+$L$SEH_info_GFp_sha256_block_data_order_avx:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase

+ 3135 - 0
zeroidc/vendor/ring/pregenerated/tmp/sha512-x86_64-nasm.asm

@@ -0,0 +1,3135 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+
+EXTERN	GFp_ia32cap_P
+global	GFp_sha512_block_data_order
+
+ALIGN	16
+GFp_sha512_block_data_order:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_sha512_block_data_order:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+	lea	r11,[GFp_ia32cap_P]
+	mov	r9d,DWORD[r11]
+	mov	r10d,DWORD[4+r11]
+	mov	r11d,DWORD[8+r11]
+	and	r9d,1073741824
+	and	r10d,268435968
+	or	r10d,r9d
+	cmp	r10d,1342177792
+	je	NEAR $L$avx_shortcut
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,16*8+4*8
+	lea	rdx,[rdx*8+rsi]
+	and	rsp,-64
+	mov	QWORD[((128+0))+rsp],rdi
+	mov	QWORD[((128+8))+rsp],rsi
+	mov	QWORD[((128+16))+rsp],rdx
+	mov	QWORD[152+rsp],rax
+
+$L$prologue:
+
+	mov	rax,QWORD[rdi]
+	mov	rbx,QWORD[8+rdi]
+	mov	rcx,QWORD[16+rdi]
+	mov	rdx,QWORD[24+rdi]
+	mov	r8,QWORD[32+rdi]
+	mov	r9,QWORD[40+rdi]
+	mov	r10,QWORD[48+rdi]
+	mov	r11,QWORD[56+rdi]
+	jmp	NEAR $L$loop
+
+ALIGN	16
+$L$loop:
+	mov	rdi,rbx
+	lea	rbp,[K512]
+	xor	rdi,rcx
+	mov	r12,QWORD[rsi]
+	mov	r13,r8
+	mov	r14,rax
+	bswap	r12
+	ror	r13,23
+	mov	r15,r9
+
+	xor	r13,r8
+	ror	r14,5
+	xor	r15,r10
+
+	mov	QWORD[rsp],r12
+	xor	r14,rax
+	and	r15,r8
+
+	ror	r13,4
+	add	r12,r11
+	xor	r15,r10
+
+	ror	r14,6
+	xor	r13,r8
+	add	r12,r15
+
+	mov	r15,rax
+	add	r12,QWORD[rbp]
+	xor	r14,rax
+
+	xor	r15,rbx
+	ror	r13,14
+	mov	r11,rbx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r11,rdi
+	add	rdx,r12
+	add	r11,r12
+
+	lea	rbp,[8+rbp]
+	add	r11,r14
+	mov	r12,QWORD[8+rsi]
+	mov	r13,rdx
+	mov	r14,r11
+	bswap	r12
+	ror	r13,23
+	mov	rdi,r8
+
+	xor	r13,rdx
+	ror	r14,5
+	xor	rdi,r9
+
+	mov	QWORD[8+rsp],r12
+	xor	r14,r11
+	and	rdi,rdx
+
+	ror	r13,4
+	add	r12,r10
+	xor	rdi,r9
+
+	ror	r14,6
+	xor	r13,rdx
+	add	r12,rdi
+
+	mov	rdi,r11
+	add	r12,QWORD[rbp]
+	xor	r14,r11
+
+	xor	rdi,rax
+	ror	r13,14
+	mov	r10,rax
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r10,r15
+	add	rcx,r12
+	add	r10,r12
+
+	lea	rbp,[24+rbp]
+	add	r10,r14
+	mov	r12,QWORD[16+rsi]
+	mov	r13,rcx
+	mov	r14,r10
+	bswap	r12
+	ror	r13,23
+	mov	r15,rdx
+
+	xor	r13,rcx
+	ror	r14,5
+	xor	r15,r8
+
+	mov	QWORD[16+rsp],r12
+	xor	r14,r10
+	and	r15,rcx
+
+	ror	r13,4
+	add	r12,r9
+	xor	r15,r8
+
+	ror	r14,6
+	xor	r13,rcx
+	add	r12,r15
+
+	mov	r15,r10
+	add	r12,QWORD[rbp]
+	xor	r14,r10
+
+	xor	r15,r11
+	ror	r13,14
+	mov	r9,r11
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r9,rdi
+	add	rbx,r12
+	add	r9,r12
+
+	lea	rbp,[8+rbp]
+	add	r9,r14
+	mov	r12,QWORD[24+rsi]
+	mov	r13,rbx
+	mov	r14,r9
+	bswap	r12
+	ror	r13,23
+	mov	rdi,rcx
+
+	xor	r13,rbx
+	ror	r14,5
+	xor	rdi,rdx
+
+	mov	QWORD[24+rsp],r12
+	xor	r14,r9
+	and	rdi,rbx
+
+	ror	r13,4
+	add	r12,r8
+	xor	rdi,rdx
+
+	ror	r14,6
+	xor	r13,rbx
+	add	r12,rdi
+
+	mov	rdi,r9
+	add	r12,QWORD[rbp]
+	xor	r14,r9
+
+	xor	rdi,r10
+	ror	r13,14
+	mov	r8,r10
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r8,r15
+	add	rax,r12
+	add	r8,r12
+
+	lea	rbp,[24+rbp]
+	add	r8,r14
+	mov	r12,QWORD[32+rsi]
+	mov	r13,rax
+	mov	r14,r8
+	bswap	r12
+	ror	r13,23
+	mov	r15,rbx
+
+	xor	r13,rax
+	ror	r14,5
+	xor	r15,rcx
+
+	mov	QWORD[32+rsp],r12
+	xor	r14,r8
+	and	r15,rax
+
+	ror	r13,4
+	add	r12,rdx
+	xor	r15,rcx
+
+	ror	r14,6
+	xor	r13,rax
+	add	r12,r15
+
+	mov	r15,r8
+	add	r12,QWORD[rbp]
+	xor	r14,r8
+
+	xor	r15,r9
+	ror	r13,14
+	mov	rdx,r9
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rdx,rdi
+	add	r11,r12
+	add	rdx,r12
+
+	lea	rbp,[8+rbp]
+	add	rdx,r14
+	mov	r12,QWORD[40+rsi]
+	mov	r13,r11
+	mov	r14,rdx
+	bswap	r12
+	ror	r13,23
+	mov	rdi,rax
+
+	xor	r13,r11
+	ror	r14,5
+	xor	rdi,rbx
+
+	mov	QWORD[40+rsp],r12
+	xor	r14,rdx
+	and	rdi,r11
+
+	ror	r13,4
+	add	r12,rcx
+	xor	rdi,rbx
+
+	ror	r14,6
+	xor	r13,r11
+	add	r12,rdi
+
+	mov	rdi,rdx
+	add	r12,QWORD[rbp]
+	xor	r14,rdx
+
+	xor	rdi,r8
+	ror	r13,14
+	mov	rcx,r8
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rcx,r15
+	add	r10,r12
+	add	rcx,r12
+
+	lea	rbp,[24+rbp]
+	add	rcx,r14
+	mov	r12,QWORD[48+rsi]
+	mov	r13,r10
+	mov	r14,rcx
+	bswap	r12
+	ror	r13,23
+	mov	r15,r11
+
+	xor	r13,r10
+	ror	r14,5
+	xor	r15,rax
+
+	mov	QWORD[48+rsp],r12
+	xor	r14,rcx
+	and	r15,r10
+
+	ror	r13,4
+	add	r12,rbx
+	xor	r15,rax
+
+	ror	r14,6
+	xor	r13,r10
+	add	r12,r15
+
+	mov	r15,rcx
+	add	r12,QWORD[rbp]
+	xor	r14,rcx
+
+	xor	r15,rdx
+	ror	r13,14
+	mov	rbx,rdx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rbx,rdi
+	add	r9,r12
+	add	rbx,r12
+
+	lea	rbp,[8+rbp]
+	add	rbx,r14
+	mov	r12,QWORD[56+rsi]
+	mov	r13,r9
+	mov	r14,rbx
+	bswap	r12
+	ror	r13,23
+	mov	rdi,r10
+
+	xor	r13,r9
+	ror	r14,5
+	xor	rdi,r11
+
+	mov	QWORD[56+rsp],r12
+	xor	r14,rbx
+	and	rdi,r9
+
+	ror	r13,4
+	add	r12,rax
+	xor	rdi,r11
+
+	ror	r14,6
+	xor	r13,r9
+	add	r12,rdi
+
+	mov	rdi,rbx
+	add	r12,QWORD[rbp]
+	xor	r14,rbx
+
+	xor	rdi,rcx
+	ror	r13,14
+	mov	rax,rcx
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rax,r15
+	add	r8,r12
+	add	rax,r12
+
+	lea	rbp,[24+rbp]
+	add	rax,r14
+	mov	r12,QWORD[64+rsi]
+	mov	r13,r8
+	mov	r14,rax
+	bswap	r12
+	ror	r13,23
+	mov	r15,r9
+
+	xor	r13,r8
+	ror	r14,5
+	xor	r15,r10
+
+	mov	QWORD[64+rsp],r12
+	xor	r14,rax
+	and	r15,r8
+
+	ror	r13,4
+	add	r12,r11
+	xor	r15,r10
+
+	ror	r14,6
+	xor	r13,r8
+	add	r12,r15
+
+	mov	r15,rax
+	add	r12,QWORD[rbp]
+	xor	r14,rax
+
+	xor	r15,rbx
+	ror	r13,14
+	mov	r11,rbx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r11,rdi
+	add	rdx,r12
+	add	r11,r12
+
+	lea	rbp,[8+rbp]
+	add	r11,r14
+	mov	r12,QWORD[72+rsi]
+	mov	r13,rdx
+	mov	r14,r11
+	bswap	r12
+	ror	r13,23
+	mov	rdi,r8
+
+	xor	r13,rdx
+	ror	r14,5
+	xor	rdi,r9
+
+	mov	QWORD[72+rsp],r12
+	xor	r14,r11
+	and	rdi,rdx
+
+	ror	r13,4
+	add	r12,r10
+	xor	rdi,r9
+
+	ror	r14,6
+	xor	r13,rdx
+	add	r12,rdi
+
+	mov	rdi,r11
+	add	r12,QWORD[rbp]
+	xor	r14,r11
+
+	xor	rdi,rax
+	ror	r13,14
+	mov	r10,rax
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r10,r15
+	add	rcx,r12
+	add	r10,r12
+
+	lea	rbp,[24+rbp]
+	add	r10,r14
+	mov	r12,QWORD[80+rsi]
+	mov	r13,rcx
+	mov	r14,r10
+	bswap	r12
+	ror	r13,23
+	mov	r15,rdx
+
+	xor	r13,rcx
+	ror	r14,5
+	xor	r15,r8
+
+	mov	QWORD[80+rsp],r12
+	xor	r14,r10
+	and	r15,rcx
+
+	ror	r13,4
+	add	r12,r9
+	xor	r15,r8
+
+	ror	r14,6
+	xor	r13,rcx
+	add	r12,r15
+
+	mov	r15,r10
+	add	r12,QWORD[rbp]
+	xor	r14,r10
+
+	xor	r15,r11
+	ror	r13,14
+	mov	r9,r11
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r9,rdi
+	add	rbx,r12
+	add	r9,r12
+
+	lea	rbp,[8+rbp]
+	add	r9,r14
+	mov	r12,QWORD[88+rsi]
+	mov	r13,rbx
+	mov	r14,r9
+	bswap	r12
+	ror	r13,23
+	mov	rdi,rcx
+
+	xor	r13,rbx
+	ror	r14,5
+	xor	rdi,rdx
+
+	mov	QWORD[88+rsp],r12
+	xor	r14,r9
+	and	rdi,rbx
+
+	ror	r13,4
+	add	r12,r8
+	xor	rdi,rdx
+
+	ror	r14,6
+	xor	r13,rbx
+	add	r12,rdi
+
+	mov	rdi,r9
+	add	r12,QWORD[rbp]
+	xor	r14,r9
+
+	xor	rdi,r10
+	ror	r13,14
+	mov	r8,r10
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r8,r15
+	add	rax,r12
+	add	r8,r12
+
+	lea	rbp,[24+rbp]
+	add	r8,r14
+	mov	r12,QWORD[96+rsi]
+	mov	r13,rax
+	mov	r14,r8
+	bswap	r12
+	ror	r13,23
+	mov	r15,rbx
+
+	xor	r13,rax
+	ror	r14,5
+	xor	r15,rcx
+
+	mov	QWORD[96+rsp],r12
+	xor	r14,r8
+	and	r15,rax
+
+	ror	r13,4
+	add	r12,rdx
+	xor	r15,rcx
+
+	ror	r14,6
+	xor	r13,rax
+	add	r12,r15
+
+	mov	r15,r8
+	add	r12,QWORD[rbp]
+	xor	r14,r8
+
+	xor	r15,r9
+	ror	r13,14
+	mov	rdx,r9
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rdx,rdi
+	add	r11,r12
+	add	rdx,r12
+
+	lea	rbp,[8+rbp]
+	add	rdx,r14
+	mov	r12,QWORD[104+rsi]
+	mov	r13,r11
+	mov	r14,rdx
+	bswap	r12
+	ror	r13,23
+	mov	rdi,rax
+
+	xor	r13,r11
+	ror	r14,5
+	xor	rdi,rbx
+
+	mov	QWORD[104+rsp],r12
+	xor	r14,rdx
+	and	rdi,r11
+
+	ror	r13,4
+	add	r12,rcx
+	xor	rdi,rbx
+
+	ror	r14,6
+	xor	r13,r11
+	add	r12,rdi
+
+	mov	rdi,rdx
+	add	r12,QWORD[rbp]
+	xor	r14,rdx
+
+	xor	rdi,r8
+	ror	r13,14
+	mov	rcx,r8
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rcx,r15
+	add	r10,r12
+	add	rcx,r12
+
+	lea	rbp,[24+rbp]
+	add	rcx,r14
+	mov	r12,QWORD[112+rsi]
+	mov	r13,r10
+	mov	r14,rcx
+	bswap	r12
+	ror	r13,23
+	mov	r15,r11
+
+	xor	r13,r10
+	ror	r14,5
+	xor	r15,rax
+
+	mov	QWORD[112+rsp],r12
+	xor	r14,rcx
+	and	r15,r10
+
+	ror	r13,4
+	add	r12,rbx
+	xor	r15,rax
+
+	ror	r14,6
+	xor	r13,r10
+	add	r12,r15
+
+	mov	r15,rcx
+	add	r12,QWORD[rbp]
+	xor	r14,rcx
+
+	xor	r15,rdx
+	ror	r13,14
+	mov	rbx,rdx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rbx,rdi
+	add	r9,r12
+	add	rbx,r12
+
+	lea	rbp,[8+rbp]
+	add	rbx,r14
+	mov	r12,QWORD[120+rsi]
+	mov	r13,r9
+	mov	r14,rbx
+	bswap	r12
+	ror	r13,23
+	mov	rdi,r10
+
+	xor	r13,r9
+	ror	r14,5
+	xor	rdi,r11
+
+	mov	QWORD[120+rsp],r12
+	xor	r14,rbx
+	and	rdi,r9
+
+	ror	r13,4
+	add	r12,rax
+	xor	rdi,r11
+
+	ror	r14,6
+	xor	r13,r9
+	add	r12,rdi
+
+	mov	rdi,rbx
+	add	r12,QWORD[rbp]
+	xor	r14,rbx
+
+	xor	rdi,rcx
+	ror	r13,14
+	mov	rax,rcx
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rax,r15
+	add	r8,r12
+	add	rax,r12
+
+	lea	rbp,[24+rbp]
+	jmp	NEAR $L$rounds_16_xx
+ALIGN	16
+$L$rounds_16_xx:
+	mov	r13,QWORD[8+rsp]
+	mov	r15,QWORD[112+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rax,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[72+rsp]
+
+	add	r12,QWORD[rsp]
+	mov	r13,r8
+	add	r12,r15
+	mov	r14,rax
+	ror	r13,23
+	mov	r15,r9
+
+	xor	r13,r8
+	ror	r14,5
+	xor	r15,r10
+
+	mov	QWORD[rsp],r12
+	xor	r14,rax
+	and	r15,r8
+
+	ror	r13,4
+	add	r12,r11
+	xor	r15,r10
+
+	ror	r14,6
+	xor	r13,r8
+	add	r12,r15
+
+	mov	r15,rax
+	add	r12,QWORD[rbp]
+	xor	r14,rax
+
+	xor	r15,rbx
+	ror	r13,14
+	mov	r11,rbx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r11,rdi
+	add	rdx,r12
+	add	r11,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[16+rsp]
+	mov	rdi,QWORD[120+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r11,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[80+rsp]
+
+	add	r12,QWORD[8+rsp]
+	mov	r13,rdx
+	add	r12,rdi
+	mov	r14,r11
+	ror	r13,23
+	mov	rdi,r8
+
+	xor	r13,rdx
+	ror	r14,5
+	xor	rdi,r9
+
+	mov	QWORD[8+rsp],r12
+	xor	r14,r11
+	and	rdi,rdx
+
+	ror	r13,4
+	add	r12,r10
+	xor	rdi,r9
+
+	ror	r14,6
+	xor	r13,rdx
+	add	r12,rdi
+
+	mov	rdi,r11
+	add	r12,QWORD[rbp]
+	xor	r14,r11
+
+	xor	rdi,rax
+	ror	r13,14
+	mov	r10,rax
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r10,r15
+	add	rcx,r12
+	add	r10,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[24+rsp]
+	mov	r15,QWORD[rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r10,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[88+rsp]
+
+	add	r12,QWORD[16+rsp]
+	mov	r13,rcx
+	add	r12,r15
+	mov	r14,r10
+	ror	r13,23
+	mov	r15,rdx
+
+	xor	r13,rcx
+	ror	r14,5
+	xor	r15,r8
+
+	mov	QWORD[16+rsp],r12
+	xor	r14,r10
+	and	r15,rcx
+
+	ror	r13,4
+	add	r12,r9
+	xor	r15,r8
+
+	ror	r14,6
+	xor	r13,rcx
+	add	r12,r15
+
+	mov	r15,r10
+	add	r12,QWORD[rbp]
+	xor	r14,r10
+
+	xor	r15,r11
+	ror	r13,14
+	mov	r9,r11
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r9,rdi
+	add	rbx,r12
+	add	r9,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[32+rsp]
+	mov	rdi,QWORD[8+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r9,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[96+rsp]
+
+	add	r12,QWORD[24+rsp]
+	mov	r13,rbx
+	add	r12,rdi
+	mov	r14,r9
+	ror	r13,23
+	mov	rdi,rcx
+
+	xor	r13,rbx
+	ror	r14,5
+	xor	rdi,rdx
+
+	mov	QWORD[24+rsp],r12
+	xor	r14,r9
+	and	rdi,rbx
+
+	ror	r13,4
+	add	r12,r8
+	xor	rdi,rdx
+
+	ror	r14,6
+	xor	r13,rbx
+	add	r12,rdi
+
+	mov	rdi,r9
+	add	r12,QWORD[rbp]
+	xor	r14,r9
+
+	xor	rdi,r10
+	ror	r13,14
+	mov	r8,r10
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r8,r15
+	add	rax,r12
+	add	r8,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[40+rsp]
+	mov	r15,QWORD[16+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r8,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[104+rsp]
+
+	add	r12,QWORD[32+rsp]
+	mov	r13,rax
+	add	r12,r15
+	mov	r14,r8
+	ror	r13,23
+	mov	r15,rbx
+
+	xor	r13,rax
+	ror	r14,5
+	xor	r15,rcx
+
+	mov	QWORD[32+rsp],r12
+	xor	r14,r8
+	and	r15,rax
+
+	ror	r13,4
+	add	r12,rdx
+	xor	r15,rcx
+
+	ror	r14,6
+	xor	r13,rax
+	add	r12,r15
+
+	mov	r15,r8
+	add	r12,QWORD[rbp]
+	xor	r14,r8
+
+	xor	r15,r9
+	ror	r13,14
+	mov	rdx,r9
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rdx,rdi
+	add	r11,r12
+	add	rdx,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[48+rsp]
+	mov	rdi,QWORD[24+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rdx,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[112+rsp]
+
+	add	r12,QWORD[40+rsp]
+	mov	r13,r11
+	add	r12,rdi
+	mov	r14,rdx
+	ror	r13,23
+	mov	rdi,rax
+
+	xor	r13,r11
+	ror	r14,5
+	xor	rdi,rbx
+
+	mov	QWORD[40+rsp],r12
+	xor	r14,rdx
+	and	rdi,r11
+
+	ror	r13,4
+	add	r12,rcx
+	xor	rdi,rbx
+
+	ror	r14,6
+	xor	r13,r11
+	add	r12,rdi
+
+	mov	rdi,rdx
+	add	r12,QWORD[rbp]
+	xor	r14,rdx
+
+	xor	rdi,r8
+	ror	r13,14
+	mov	rcx,r8
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rcx,r15
+	add	r10,r12
+	add	rcx,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[56+rsp]
+	mov	r15,QWORD[32+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rcx,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[120+rsp]
+
+	add	r12,QWORD[48+rsp]
+	mov	r13,r10
+	add	r12,r15
+	mov	r14,rcx
+	ror	r13,23
+	mov	r15,r11
+
+	xor	r13,r10
+	ror	r14,5
+	xor	r15,rax
+
+	mov	QWORD[48+rsp],r12
+	xor	r14,rcx
+	and	r15,r10
+
+	ror	r13,4
+	add	r12,rbx
+	xor	r15,rax
+
+	ror	r14,6
+	xor	r13,r10
+	add	r12,r15
+
+	mov	r15,rcx
+	add	r12,QWORD[rbp]
+	xor	r14,rcx
+
+	xor	r15,rdx
+	ror	r13,14
+	mov	rbx,rdx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rbx,rdi
+	add	r9,r12
+	add	rbx,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[64+rsp]
+	mov	rdi,QWORD[40+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rbx,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[rsp]
+
+	add	r12,QWORD[56+rsp]
+	mov	r13,r9
+	add	r12,rdi
+	mov	r14,rbx
+	ror	r13,23
+	mov	rdi,r10
+
+	xor	r13,r9
+	ror	r14,5
+	xor	rdi,r11
+
+	mov	QWORD[56+rsp],r12
+	xor	r14,rbx
+	and	rdi,r9
+
+	ror	r13,4
+	add	r12,rax
+	xor	rdi,r11
+
+	ror	r14,6
+	xor	r13,r9
+	add	r12,rdi
+
+	mov	rdi,rbx
+	add	r12,QWORD[rbp]
+	xor	r14,rbx
+
+	xor	rdi,rcx
+	ror	r13,14
+	mov	rax,rcx
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rax,r15
+	add	r8,r12
+	add	rax,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[72+rsp]
+	mov	r15,QWORD[48+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rax,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[8+rsp]
+
+	add	r12,QWORD[64+rsp]
+	mov	r13,r8
+	add	r12,r15
+	mov	r14,rax
+	ror	r13,23
+	mov	r15,r9
+
+	xor	r13,r8
+	ror	r14,5
+	xor	r15,r10
+
+	mov	QWORD[64+rsp],r12
+	xor	r14,rax
+	and	r15,r8
+
+	ror	r13,4
+	add	r12,r11
+	xor	r15,r10
+
+	ror	r14,6
+	xor	r13,r8
+	add	r12,r15
+
+	mov	r15,rax
+	add	r12,QWORD[rbp]
+	xor	r14,rax
+
+	xor	r15,rbx
+	ror	r13,14
+	mov	r11,rbx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r11,rdi
+	add	rdx,r12
+	add	r11,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[80+rsp]
+	mov	rdi,QWORD[56+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r11,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[16+rsp]
+
+	add	r12,QWORD[72+rsp]
+	mov	r13,rdx
+	add	r12,rdi
+	mov	r14,r11
+	ror	r13,23
+	mov	rdi,r8
+
+	xor	r13,rdx
+	ror	r14,5
+	xor	rdi,r9
+
+	mov	QWORD[72+rsp],r12
+	xor	r14,r11
+	and	rdi,rdx
+
+	ror	r13,4
+	add	r12,r10
+	xor	rdi,r9
+
+	ror	r14,6
+	xor	r13,rdx
+	add	r12,rdi
+
+	mov	rdi,r11
+	add	r12,QWORD[rbp]
+	xor	r14,r11
+
+	xor	rdi,rax
+	ror	r13,14
+	mov	r10,rax
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r10,r15
+	add	rcx,r12
+	add	r10,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[88+rsp]
+	mov	r15,QWORD[64+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r10,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[24+rsp]
+
+	add	r12,QWORD[80+rsp]
+	mov	r13,rcx
+	add	r12,r15
+	mov	r14,r10
+	ror	r13,23
+	mov	r15,rdx
+
+	xor	r13,rcx
+	ror	r14,5
+	xor	r15,r8
+
+	mov	QWORD[80+rsp],r12
+	xor	r14,r10
+	and	r15,rcx
+
+	ror	r13,4
+	add	r12,r9
+	xor	r15,r8
+
+	ror	r14,6
+	xor	r13,rcx
+	add	r12,r15
+
+	mov	r15,r10
+	add	r12,QWORD[rbp]
+	xor	r14,r10
+
+	xor	r15,r11
+	ror	r13,14
+	mov	r9,r11
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	r9,rdi
+	add	rbx,r12
+	add	r9,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[96+rsp]
+	mov	rdi,QWORD[72+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r9,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[32+rsp]
+
+	add	r12,QWORD[88+rsp]
+	mov	r13,rbx
+	add	r12,rdi
+	mov	r14,r9
+	ror	r13,23
+	mov	rdi,rcx
+
+	xor	r13,rbx
+	ror	r14,5
+	xor	rdi,rdx
+
+	mov	QWORD[88+rsp],r12
+	xor	r14,r9
+	and	rdi,rbx
+
+	ror	r13,4
+	add	r12,r8
+	xor	rdi,rdx
+
+	ror	r14,6
+	xor	r13,rbx
+	add	r12,rdi
+
+	mov	rdi,r9
+	add	r12,QWORD[rbp]
+	xor	r14,r9
+
+	xor	rdi,r10
+	ror	r13,14
+	mov	r8,r10
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	r8,r15
+	add	rax,r12
+	add	r8,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[104+rsp]
+	mov	r15,QWORD[80+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	r8,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[40+rsp]
+
+	add	r12,QWORD[96+rsp]
+	mov	r13,rax
+	add	r12,r15
+	mov	r14,r8
+	ror	r13,23
+	mov	r15,rbx
+
+	xor	r13,rax
+	ror	r14,5
+	xor	r15,rcx
+
+	mov	QWORD[96+rsp],r12
+	xor	r14,r8
+	and	r15,rax
+
+	ror	r13,4
+	add	r12,rdx
+	xor	r15,rcx
+
+	ror	r14,6
+	xor	r13,rax
+	add	r12,r15
+
+	mov	r15,r8
+	add	r12,QWORD[rbp]
+	xor	r14,r8
+
+	xor	r15,r9
+	ror	r13,14
+	mov	rdx,r9
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rdx,rdi
+	add	r11,r12
+	add	rdx,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[112+rsp]
+	mov	rdi,QWORD[88+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rdx,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[48+rsp]
+
+	add	r12,QWORD[104+rsp]
+	mov	r13,r11
+	add	r12,rdi
+	mov	r14,rdx
+	ror	r13,23
+	mov	rdi,rax
+
+	xor	r13,r11
+	ror	r14,5
+	xor	rdi,rbx
+
+	mov	QWORD[104+rsp],r12
+	xor	r14,rdx
+	and	rdi,r11
+
+	ror	r13,4
+	add	r12,rcx
+	xor	rdi,rbx
+
+	ror	r14,6
+	xor	r13,r11
+	add	r12,rdi
+
+	mov	rdi,rdx
+	add	r12,QWORD[rbp]
+	xor	r14,rdx
+
+	xor	rdi,r8
+	ror	r13,14
+	mov	rcx,r8
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rcx,r15
+	add	r10,r12
+	add	rcx,r12
+
+	lea	rbp,[24+rbp]
+	mov	r13,QWORD[120+rsp]
+	mov	r15,QWORD[96+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rcx,r14
+	mov	r14,r15
+	ror	r15,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	r15,r14
+	shr	r14,6
+
+	ror	r15,19
+	xor	r12,r13
+	xor	r15,r14
+	add	r12,QWORD[56+rsp]
+
+	add	r12,QWORD[112+rsp]
+	mov	r13,r10
+	add	r12,r15
+	mov	r14,rcx
+	ror	r13,23
+	mov	r15,r11
+
+	xor	r13,r10
+	ror	r14,5
+	xor	r15,rax
+
+	mov	QWORD[112+rsp],r12
+	xor	r14,rcx
+	and	r15,r10
+
+	ror	r13,4
+	add	r12,rbx
+	xor	r15,rax
+
+	ror	r14,6
+	xor	r13,r10
+	add	r12,r15
+
+	mov	r15,rcx
+	add	r12,QWORD[rbp]
+	xor	r14,rcx
+
+	xor	r15,rdx
+	ror	r13,14
+	mov	rbx,rdx
+
+	and	rdi,r15
+	ror	r14,28
+	add	r12,r13
+
+	xor	rbx,rdi
+	add	r9,r12
+	add	rbx,r12
+
+	lea	rbp,[8+rbp]
+	mov	r13,QWORD[rsp]
+	mov	rdi,QWORD[104+rsp]
+
+	mov	r12,r13
+	ror	r13,7
+	add	rbx,r14
+	mov	r14,rdi
+	ror	rdi,42
+
+	xor	r13,r12
+	shr	r12,7
+	ror	r13,1
+	xor	rdi,r14
+	shr	r14,6
+
+	ror	rdi,19
+	xor	r12,r13
+	xor	rdi,r14
+	add	r12,QWORD[64+rsp]
+
+	add	r12,QWORD[120+rsp]
+	mov	r13,r9
+	add	r12,rdi
+	mov	r14,rbx
+	ror	r13,23
+	mov	rdi,r10
+
+	xor	r13,r9
+	ror	r14,5
+	xor	rdi,r11
+
+	mov	QWORD[120+rsp],r12
+	xor	r14,rbx
+	and	rdi,r9
+
+	ror	r13,4
+	add	r12,rax
+	xor	rdi,r11
+
+	ror	r14,6
+	xor	r13,r9
+	add	r12,rdi
+
+	mov	rdi,rbx
+	add	r12,QWORD[rbp]
+	xor	r14,rbx
+
+	xor	rdi,rcx
+	ror	r13,14
+	mov	rax,rcx
+
+	and	r15,rdi
+	ror	r14,28
+	add	r12,r13
+
+	xor	rax,r15
+	add	r8,r12
+	add	rax,r12
+
+	lea	rbp,[24+rbp]
+	cmp	BYTE[7+rbp],0
+	jnz	NEAR $L$rounds_16_xx
+
+	mov	rdi,QWORD[((128+0))+rsp]
+	add	rax,r14
+	lea	rsi,[128+rsi]
+
+	add	rax,QWORD[rdi]
+	add	rbx,QWORD[8+rdi]
+	add	rcx,QWORD[16+rdi]
+	add	rdx,QWORD[24+rdi]
+	add	r8,QWORD[32+rdi]
+	add	r9,QWORD[40+rdi]
+	add	r10,QWORD[48+rdi]
+	add	r11,QWORD[56+rdi]
+
+	cmp	rsi,QWORD[((128+16))+rsp]
+
+	mov	QWORD[rdi],rax
+	mov	QWORD[8+rdi],rbx
+	mov	QWORD[16+rdi],rcx
+	mov	QWORD[24+rdi],rdx
+	mov	QWORD[32+rdi],r8
+	mov	QWORD[40+rdi],r9
+	mov	QWORD[48+rdi],r10
+	mov	QWORD[56+rdi],r11
+	jb	NEAR $L$loop
+
+	mov	rsi,QWORD[152+rsp]
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_sha512_block_data_order:
+ALIGN	64
+
+K512:
+	DQ	0x428a2f98d728ae22,0x7137449123ef65cd
+	DQ	0x428a2f98d728ae22,0x7137449123ef65cd
+	DQ	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	DQ	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	DQ	0x3956c25bf348b538,0x59f111f1b605d019
+	DQ	0x3956c25bf348b538,0x59f111f1b605d019
+	DQ	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	DQ	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	DQ	0xd807aa98a3030242,0x12835b0145706fbe
+	DQ	0xd807aa98a3030242,0x12835b0145706fbe
+	DQ	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	DQ	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	DQ	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	DQ	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	DQ	0x9bdc06a725c71235,0xc19bf174cf692694
+	DQ	0x9bdc06a725c71235,0xc19bf174cf692694
+	DQ	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	DQ	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	DQ	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	DQ	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	DQ	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	DQ	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	DQ	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	DQ	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	DQ	0x983e5152ee66dfab,0xa831c66d2db43210
+	DQ	0x983e5152ee66dfab,0xa831c66d2db43210
+	DQ	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	DQ	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	DQ	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	DQ	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	DQ	0x06ca6351e003826f,0x142929670a0e6e70
+	DQ	0x06ca6351e003826f,0x142929670a0e6e70
+	DQ	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	DQ	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	DQ	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	DQ	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	DQ	0x650a73548baf63de,0x766a0abb3c77b2a8
+	DQ	0x650a73548baf63de,0x766a0abb3c77b2a8
+	DQ	0x81c2c92e47edaee6,0x92722c851482353b
+	DQ	0x81c2c92e47edaee6,0x92722c851482353b
+	DQ	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	DQ	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	DQ	0xc24b8b70d0f89791,0xc76c51a30654be30
+	DQ	0xc24b8b70d0f89791,0xc76c51a30654be30
+	DQ	0xd192e819d6ef5218,0xd69906245565a910
+	DQ	0xd192e819d6ef5218,0xd69906245565a910
+	DQ	0xf40e35855771202a,0x106aa07032bbd1b8
+	DQ	0xf40e35855771202a,0x106aa07032bbd1b8
+	DQ	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	DQ	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	DQ	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	DQ	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	DQ	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	DQ	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	DQ	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	DQ	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	DQ	0x748f82ee5defb2fc,0x78a5636f43172f60
+	DQ	0x748f82ee5defb2fc,0x78a5636f43172f60
+	DQ	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	DQ	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	DQ	0x90befffa23631e28,0xa4506cebde82bde9
+	DQ	0x90befffa23631e28,0xa4506cebde82bde9
+	DQ	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	DQ	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	DQ	0xca273eceea26619c,0xd186b8c721c0c207
+	DQ	0xca273eceea26619c,0xd186b8c721c0c207
+	DQ	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	DQ	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	DQ	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	DQ	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	DQ	0x113f9804bef90dae,0x1b710b35131c471b
+	DQ	0x113f9804bef90dae,0x1b710b35131c471b
+	DQ	0x28db77f523047d84,0x32caab7b40c72493
+	DQ	0x28db77f523047d84,0x32caab7b40c72493
+	DQ	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	DQ	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	DQ	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	DQ	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	DQ	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+	DQ	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+	DQ	0x0001020304050607,0x08090a0b0c0d0e0f
+	DQ	0x0001020304050607,0x08090a0b0c0d0e0f
+DB	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+DB	110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
+DB	52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+DB	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+DB	111,114,103,62,0
+
+ALIGN	64
+GFp_sha512_block_data_order_avx:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_sha512_block_data_order_avx:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$avx_shortcut:
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+	shl	rdx,4
+	sub	rsp,256
+	lea	rdx,[rdx*8+rsi]
+	and	rsp,-64
+	mov	QWORD[((128+0))+rsp],rdi
+	mov	QWORD[((128+8))+rsp],rsi
+	mov	QWORD[((128+16))+rsp],rdx
+	mov	QWORD[152+rsp],rax
+
+	movaps	XMMWORD[(128+32)+rsp],xmm6
+	movaps	XMMWORD[(128+48)+rsp],xmm7
+	movaps	XMMWORD[(128+64)+rsp],xmm8
+	movaps	XMMWORD[(128+80)+rsp],xmm9
+	movaps	XMMWORD[(128+96)+rsp],xmm10
+	movaps	XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_avx:
+
+	vzeroupper
+	mov	rax,QWORD[rdi]
+	mov	rbx,QWORD[8+rdi]
+	mov	rcx,QWORD[16+rdi]
+	mov	rdx,QWORD[24+rdi]
+	mov	r8,QWORD[32+rdi]
+	mov	r9,QWORD[40+rdi]
+	mov	r10,QWORD[48+rdi]
+	mov	r11,QWORD[56+rdi]
+	jmp	NEAR $L$loop_avx
+ALIGN	16
+$L$loop_avx:
+	vmovdqa	xmm11,XMMWORD[((K512+1280))]
+	vmovdqu	xmm0,XMMWORD[rsi]
+	lea	rbp,[((K512+128))]
+	vmovdqu	xmm1,XMMWORD[16+rsi]
+	vmovdqu	xmm2,XMMWORD[32+rsi]
+	vpshufb	xmm0,xmm0,xmm11
+	vmovdqu	xmm3,XMMWORD[48+rsi]
+	vpshufb	xmm1,xmm1,xmm11
+	vmovdqu	xmm4,XMMWORD[64+rsi]
+	vpshufb	xmm2,xmm2,xmm11
+	vmovdqu	xmm5,XMMWORD[80+rsi]
+	vpshufb	xmm3,xmm3,xmm11
+	vmovdqu	xmm6,XMMWORD[96+rsi]
+	vpshufb	xmm4,xmm4,xmm11
+	vmovdqu	xmm7,XMMWORD[112+rsi]
+	vpshufb	xmm5,xmm5,xmm11
+	vpaddq	xmm8,xmm0,XMMWORD[((-128))+rbp]
+	vpshufb	xmm6,xmm6,xmm11
+	vpaddq	xmm9,xmm1,XMMWORD[((-96))+rbp]
+	vpshufb	xmm7,xmm7,xmm11
+	vpaddq	xmm10,xmm2,XMMWORD[((-64))+rbp]
+	vpaddq	xmm11,xmm3,XMMWORD[((-32))+rbp]
+	vmovdqa	XMMWORD[rsp],xmm8
+	vpaddq	xmm8,xmm4,XMMWORD[rbp]
+	vmovdqa	XMMWORD[16+rsp],xmm9
+	vpaddq	xmm9,xmm5,XMMWORD[32+rbp]
+	vmovdqa	XMMWORD[32+rsp],xmm10
+	vpaddq	xmm10,xmm6,XMMWORD[64+rbp]
+	vmovdqa	XMMWORD[48+rsp],xmm11
+	vpaddq	xmm11,xmm7,XMMWORD[96+rbp]
+	vmovdqa	XMMWORD[64+rsp],xmm8
+	mov	r14,rax
+	vmovdqa	XMMWORD[80+rsp],xmm9
+	mov	rdi,rbx
+	vmovdqa	XMMWORD[96+rsp],xmm10
+	xor	rdi,rcx
+	vmovdqa	XMMWORD[112+rsp],xmm11
+	mov	r13,r8
+	jmp	NEAR $L$avx_00_47
+
+ALIGN	16
+$L$avx_00_47:
+	add	rbp,256
+	vpalignr	xmm8,xmm1,xmm0,8
+	shrd	r13,r13,23
+	mov	rax,r14
+	vpalignr	xmm11,xmm5,xmm4,8
+	mov	r12,r9
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,r8
+	xor	r12,r10
+	vpaddq	xmm0,xmm0,xmm11
+	shrd	r13,r13,4
+	xor	r14,rax
+	vpsrlq	xmm11,xmm8,7
+	and	r12,r8
+	xor	r13,r8
+	vpsllq	xmm9,xmm8,56
+	add	r11,QWORD[rsp]
+	mov	r15,rax
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,r10
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,rbx
+	add	r11,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,rax
+	add	r11,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,rbx
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm7,6
+	add	rdx,r11
+	add	r11,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,rdx
+	add	r14,r11
+	vpsllq	xmm10,xmm7,3
+	shrd	r13,r13,23
+	mov	r11,r14
+	vpaddq	xmm0,xmm0,xmm8
+	mov	r12,r8
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm7,19
+	xor	r13,rdx
+	xor	r12,r9
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,r11
+	vpsllq	xmm10,xmm10,42
+	and	r12,rdx
+	xor	r13,rdx
+	vpxor	xmm11,xmm11,xmm9
+	add	r10,QWORD[8+rsp]
+	mov	rdi,r11
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,r9
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,rax
+	add	r10,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm0,xmm0,xmm11
+	xor	r14,r11
+	add	r10,r13
+	vpaddq	xmm10,xmm0,XMMWORD[((-128))+rbp]
+	xor	r15,rax
+	shrd	r14,r14,28
+	add	rcx,r10
+	add	r10,r15
+	mov	r13,rcx
+	add	r14,r10
+	vmovdqa	XMMWORD[rsp],xmm10
+	vpalignr	xmm8,xmm2,xmm1,8
+	shrd	r13,r13,23
+	mov	r10,r14
+	vpalignr	xmm11,xmm6,xmm5,8
+	mov	r12,rdx
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,rcx
+	xor	r12,r8
+	vpaddq	xmm1,xmm1,xmm11
+	shrd	r13,r13,4
+	xor	r14,r10
+	vpsrlq	xmm11,xmm8,7
+	and	r12,rcx
+	xor	r13,rcx
+	vpsllq	xmm9,xmm8,56
+	add	r9,QWORD[16+rsp]
+	mov	r15,r10
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,r8
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,r11
+	add	r9,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,r10
+	add	r9,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,r11
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm0,6
+	add	rbx,r9
+	add	r9,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,rbx
+	add	r14,r9
+	vpsllq	xmm10,xmm0,3
+	shrd	r13,r13,23
+	mov	r9,r14
+	vpaddq	xmm1,xmm1,xmm8
+	mov	r12,rcx
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm0,19
+	xor	r13,rbx
+	xor	r12,rdx
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,r9
+	vpsllq	xmm10,xmm10,42
+	and	r12,rbx
+	xor	r13,rbx
+	vpxor	xmm11,xmm11,xmm9
+	add	r8,QWORD[24+rsp]
+	mov	rdi,r9
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,rdx
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,r10
+	add	r8,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm1,xmm1,xmm11
+	xor	r14,r9
+	add	r8,r13
+	vpaddq	xmm10,xmm1,XMMWORD[((-96))+rbp]
+	xor	r15,r10
+	shrd	r14,r14,28
+	add	rax,r8
+	add	r8,r15
+	mov	r13,rax
+	add	r14,r8
+	vmovdqa	XMMWORD[16+rsp],xmm10
+	vpalignr	xmm8,xmm3,xmm2,8
+	shrd	r13,r13,23
+	mov	r8,r14
+	vpalignr	xmm11,xmm7,xmm6,8
+	mov	r12,rbx
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,rax
+	xor	r12,rcx
+	vpaddq	xmm2,xmm2,xmm11
+	shrd	r13,r13,4
+	xor	r14,r8
+	vpsrlq	xmm11,xmm8,7
+	and	r12,rax
+	xor	r13,rax
+	vpsllq	xmm9,xmm8,56
+	add	rdx,QWORD[32+rsp]
+	mov	r15,r8
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,rcx
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,r9
+	add	rdx,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,r8
+	add	rdx,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,r9
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm1,6
+	add	r11,rdx
+	add	rdx,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,r11
+	add	r14,rdx
+	vpsllq	xmm10,xmm1,3
+	shrd	r13,r13,23
+	mov	rdx,r14
+	vpaddq	xmm2,xmm2,xmm8
+	mov	r12,rax
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm1,19
+	xor	r13,r11
+	xor	r12,rbx
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,rdx
+	vpsllq	xmm10,xmm10,42
+	and	r12,r11
+	xor	r13,r11
+	vpxor	xmm11,xmm11,xmm9
+	add	rcx,QWORD[40+rsp]
+	mov	rdi,rdx
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,rbx
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,r8
+	add	rcx,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm2,xmm2,xmm11
+	xor	r14,rdx
+	add	rcx,r13
+	vpaddq	xmm10,xmm2,XMMWORD[((-64))+rbp]
+	xor	r15,r8
+	shrd	r14,r14,28
+	add	r10,rcx
+	add	rcx,r15
+	mov	r13,r10
+	add	r14,rcx
+	vmovdqa	XMMWORD[32+rsp],xmm10
+	vpalignr	xmm8,xmm4,xmm3,8
+	shrd	r13,r13,23
+	mov	rcx,r14
+	vpalignr	xmm11,xmm0,xmm7,8
+	mov	r12,r11
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,r10
+	xor	r12,rax
+	vpaddq	xmm3,xmm3,xmm11
+	shrd	r13,r13,4
+	xor	r14,rcx
+	vpsrlq	xmm11,xmm8,7
+	and	r12,r10
+	xor	r13,r10
+	vpsllq	xmm9,xmm8,56
+	add	rbx,QWORD[48+rsp]
+	mov	r15,rcx
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,rax
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,rdx
+	add	rbx,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,rcx
+	add	rbx,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,rdx
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm2,6
+	add	r9,rbx
+	add	rbx,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,r9
+	add	r14,rbx
+	vpsllq	xmm10,xmm2,3
+	shrd	r13,r13,23
+	mov	rbx,r14
+	vpaddq	xmm3,xmm3,xmm8
+	mov	r12,r10
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm2,19
+	xor	r13,r9
+	xor	r12,r11
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,rbx
+	vpsllq	xmm10,xmm10,42
+	and	r12,r9
+	xor	r13,r9
+	vpxor	xmm11,xmm11,xmm9
+	add	rax,QWORD[56+rsp]
+	mov	rdi,rbx
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,r11
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,rcx
+	add	rax,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm3,xmm3,xmm11
+	xor	r14,rbx
+	add	rax,r13
+	vpaddq	xmm10,xmm3,XMMWORD[((-32))+rbp]
+	xor	r15,rcx
+	shrd	r14,r14,28
+	add	r8,rax
+	add	rax,r15
+	mov	r13,r8
+	add	r14,rax
+	vmovdqa	XMMWORD[48+rsp],xmm10
+	vpalignr	xmm8,xmm5,xmm4,8
+	shrd	r13,r13,23
+	mov	rax,r14
+	vpalignr	xmm11,xmm1,xmm0,8
+	mov	r12,r9
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,r8
+	xor	r12,r10
+	vpaddq	xmm4,xmm4,xmm11
+	shrd	r13,r13,4
+	xor	r14,rax
+	vpsrlq	xmm11,xmm8,7
+	and	r12,r8
+	xor	r13,r8
+	vpsllq	xmm9,xmm8,56
+	add	r11,QWORD[64+rsp]
+	mov	r15,rax
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,r10
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,rbx
+	add	r11,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,rax
+	add	r11,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,rbx
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm3,6
+	add	rdx,r11
+	add	r11,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,rdx
+	add	r14,r11
+	vpsllq	xmm10,xmm3,3
+	shrd	r13,r13,23
+	mov	r11,r14
+	vpaddq	xmm4,xmm4,xmm8
+	mov	r12,r8
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm3,19
+	xor	r13,rdx
+	xor	r12,r9
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,r11
+	vpsllq	xmm10,xmm10,42
+	and	r12,rdx
+	xor	r13,rdx
+	vpxor	xmm11,xmm11,xmm9
+	add	r10,QWORD[72+rsp]
+	mov	rdi,r11
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,r9
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,rax
+	add	r10,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm4,xmm4,xmm11
+	xor	r14,r11
+	add	r10,r13
+	vpaddq	xmm10,xmm4,XMMWORD[rbp]
+	xor	r15,rax
+	shrd	r14,r14,28
+	add	rcx,r10
+	add	r10,r15
+	mov	r13,rcx
+	add	r14,r10
+	vmovdqa	XMMWORD[64+rsp],xmm10
+	vpalignr	xmm8,xmm6,xmm5,8
+	shrd	r13,r13,23
+	mov	r10,r14
+	vpalignr	xmm11,xmm2,xmm1,8
+	mov	r12,rdx
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,rcx
+	xor	r12,r8
+	vpaddq	xmm5,xmm5,xmm11
+	shrd	r13,r13,4
+	xor	r14,r10
+	vpsrlq	xmm11,xmm8,7
+	and	r12,rcx
+	xor	r13,rcx
+	vpsllq	xmm9,xmm8,56
+	add	r9,QWORD[80+rsp]
+	mov	r15,r10
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,r8
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,r11
+	add	r9,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,r10
+	add	r9,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,r11
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm4,6
+	add	rbx,r9
+	add	r9,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,rbx
+	add	r14,r9
+	vpsllq	xmm10,xmm4,3
+	shrd	r13,r13,23
+	mov	r9,r14
+	vpaddq	xmm5,xmm5,xmm8
+	mov	r12,rcx
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm4,19
+	xor	r13,rbx
+	xor	r12,rdx
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,r9
+	vpsllq	xmm10,xmm10,42
+	and	r12,rbx
+	xor	r13,rbx
+	vpxor	xmm11,xmm11,xmm9
+	add	r8,QWORD[88+rsp]
+	mov	rdi,r9
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,rdx
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,r10
+	add	r8,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm5,xmm5,xmm11
+	xor	r14,r9
+	add	r8,r13
+	vpaddq	xmm10,xmm5,XMMWORD[32+rbp]
+	xor	r15,r10
+	shrd	r14,r14,28
+	add	rax,r8
+	add	r8,r15
+	mov	r13,rax
+	add	r14,r8
+	vmovdqa	XMMWORD[80+rsp],xmm10
+	vpalignr	xmm8,xmm7,xmm6,8
+	shrd	r13,r13,23
+	mov	r8,r14
+	vpalignr	xmm11,xmm3,xmm2,8
+	mov	r12,rbx
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,rax
+	xor	r12,rcx
+	vpaddq	xmm6,xmm6,xmm11
+	shrd	r13,r13,4
+	xor	r14,r8
+	vpsrlq	xmm11,xmm8,7
+	and	r12,rax
+	xor	r13,rax
+	vpsllq	xmm9,xmm8,56
+	add	rdx,QWORD[96+rsp]
+	mov	r15,r8
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,rcx
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,r9
+	add	rdx,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,r8
+	add	rdx,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,r9
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm5,6
+	add	r11,rdx
+	add	rdx,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,r11
+	add	r14,rdx
+	vpsllq	xmm10,xmm5,3
+	shrd	r13,r13,23
+	mov	rdx,r14
+	vpaddq	xmm6,xmm6,xmm8
+	mov	r12,rax
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm5,19
+	xor	r13,r11
+	xor	r12,rbx
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,rdx
+	vpsllq	xmm10,xmm10,42
+	and	r12,r11
+	xor	r13,r11
+	vpxor	xmm11,xmm11,xmm9
+	add	rcx,QWORD[104+rsp]
+	mov	rdi,rdx
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,rbx
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,r8
+	add	rcx,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm6,xmm6,xmm11
+	xor	r14,rdx
+	add	rcx,r13
+	vpaddq	xmm10,xmm6,XMMWORD[64+rbp]
+	xor	r15,r8
+	shrd	r14,r14,28
+	add	r10,rcx
+	add	rcx,r15
+	mov	r13,r10
+	add	r14,rcx
+	vmovdqa	XMMWORD[96+rsp],xmm10
+	vpalignr	xmm8,xmm0,xmm7,8
+	shrd	r13,r13,23
+	mov	rcx,r14
+	vpalignr	xmm11,xmm4,xmm3,8
+	mov	r12,r11
+	shrd	r14,r14,5
+	vpsrlq	xmm10,xmm8,1
+	xor	r13,r10
+	xor	r12,rax
+	vpaddq	xmm7,xmm7,xmm11
+	shrd	r13,r13,4
+	xor	r14,rcx
+	vpsrlq	xmm11,xmm8,7
+	and	r12,r10
+	xor	r13,r10
+	vpsllq	xmm9,xmm8,56
+	add	rbx,QWORD[112+rsp]
+	mov	r15,rcx
+	vpxor	xmm8,xmm11,xmm10
+	xor	r12,rax
+	shrd	r14,r14,6
+	vpsrlq	xmm10,xmm10,7
+	xor	r15,rdx
+	add	rbx,r12
+	vpxor	xmm8,xmm8,xmm9
+	shrd	r13,r13,14
+	and	rdi,r15
+	vpsllq	xmm9,xmm9,7
+	xor	r14,rcx
+	add	rbx,r13
+	vpxor	xmm8,xmm8,xmm10
+	xor	rdi,rdx
+	shrd	r14,r14,28
+	vpsrlq	xmm11,xmm6,6
+	add	r9,rbx
+	add	rbx,rdi
+	vpxor	xmm8,xmm8,xmm9
+	mov	r13,r9
+	add	r14,rbx
+	vpsllq	xmm10,xmm6,3
+	shrd	r13,r13,23
+	mov	rbx,r14
+	vpaddq	xmm7,xmm7,xmm8
+	mov	r12,r10
+	shrd	r14,r14,5
+	vpsrlq	xmm9,xmm6,19
+	xor	r13,r9
+	xor	r12,r11
+	vpxor	xmm11,xmm11,xmm10
+	shrd	r13,r13,4
+	xor	r14,rbx
+	vpsllq	xmm10,xmm10,42
+	and	r12,r9
+	xor	r13,r9
+	vpxor	xmm11,xmm11,xmm9
+	add	rax,QWORD[120+rsp]
+	mov	rdi,rbx
+	vpsrlq	xmm9,xmm9,42
+	xor	r12,r11
+	shrd	r14,r14,6
+	vpxor	xmm11,xmm11,xmm10
+	xor	rdi,rcx
+	add	rax,r12
+	vpxor	xmm11,xmm11,xmm9
+	shrd	r13,r13,14
+	and	r15,rdi
+	vpaddq	xmm7,xmm7,xmm11
+	xor	r14,rbx
+	add	rax,r13
+	vpaddq	xmm10,xmm7,XMMWORD[96+rbp]
+	xor	r15,rcx
+	shrd	r14,r14,28
+	add	r8,rax
+	add	rax,r15
+	mov	r13,r8
+	add	r14,rax
+	vmovdqa	XMMWORD[112+rsp],xmm10
+	cmp	BYTE[135+rbp],0
+	jne	NEAR $L$avx_00_47
+	shrd	r13,r13,23
+	mov	rax,r14
+	mov	r12,r9
+	shrd	r14,r14,5
+	xor	r13,r8
+	xor	r12,r10
+	shrd	r13,r13,4
+	xor	r14,rax
+	and	r12,r8
+	xor	r13,r8
+	add	r11,QWORD[rsp]
+	mov	r15,rax
+	xor	r12,r10
+	shrd	r14,r14,6
+	xor	r15,rbx
+	add	r11,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,rax
+	add	r11,r13
+	xor	rdi,rbx
+	shrd	r14,r14,28
+	add	rdx,r11
+	add	r11,rdi
+	mov	r13,rdx
+	add	r14,r11
+	shrd	r13,r13,23
+	mov	r11,r14
+	mov	r12,r8
+	shrd	r14,r14,5
+	xor	r13,rdx
+	xor	r12,r9
+	shrd	r13,r13,4
+	xor	r14,r11
+	and	r12,rdx
+	xor	r13,rdx
+	add	r10,QWORD[8+rsp]
+	mov	rdi,r11
+	xor	r12,r9
+	shrd	r14,r14,6
+	xor	rdi,rax
+	add	r10,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,r11
+	add	r10,r13
+	xor	r15,rax
+	shrd	r14,r14,28
+	add	rcx,r10
+	add	r10,r15
+	mov	r13,rcx
+	add	r14,r10
+	shrd	r13,r13,23
+	mov	r10,r14
+	mov	r12,rdx
+	shrd	r14,r14,5
+	xor	r13,rcx
+	xor	r12,r8
+	shrd	r13,r13,4
+	xor	r14,r10
+	and	r12,rcx
+	xor	r13,rcx
+	add	r9,QWORD[16+rsp]
+	mov	r15,r10
+	xor	r12,r8
+	shrd	r14,r14,6
+	xor	r15,r11
+	add	r9,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,r10
+	add	r9,r13
+	xor	rdi,r11
+	shrd	r14,r14,28
+	add	rbx,r9
+	add	r9,rdi
+	mov	r13,rbx
+	add	r14,r9
+	shrd	r13,r13,23
+	mov	r9,r14
+	mov	r12,rcx
+	shrd	r14,r14,5
+	xor	r13,rbx
+	xor	r12,rdx
+	shrd	r13,r13,4
+	xor	r14,r9
+	and	r12,rbx
+	xor	r13,rbx
+	add	r8,QWORD[24+rsp]
+	mov	rdi,r9
+	xor	r12,rdx
+	shrd	r14,r14,6
+	xor	rdi,r10
+	add	r8,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,r9
+	add	r8,r13
+	xor	r15,r10
+	shrd	r14,r14,28
+	add	rax,r8
+	add	r8,r15
+	mov	r13,rax
+	add	r14,r8
+	shrd	r13,r13,23
+	mov	r8,r14
+	mov	r12,rbx
+	shrd	r14,r14,5
+	xor	r13,rax
+	xor	r12,rcx
+	shrd	r13,r13,4
+	xor	r14,r8
+	and	r12,rax
+	xor	r13,rax
+	add	rdx,QWORD[32+rsp]
+	mov	r15,r8
+	xor	r12,rcx
+	shrd	r14,r14,6
+	xor	r15,r9
+	add	rdx,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,r8
+	add	rdx,r13
+	xor	rdi,r9
+	shrd	r14,r14,28
+	add	r11,rdx
+	add	rdx,rdi
+	mov	r13,r11
+	add	r14,rdx
+	shrd	r13,r13,23
+	mov	rdx,r14
+	mov	r12,rax
+	shrd	r14,r14,5
+	xor	r13,r11
+	xor	r12,rbx
+	shrd	r13,r13,4
+	xor	r14,rdx
+	and	r12,r11
+	xor	r13,r11
+	add	rcx,QWORD[40+rsp]
+	mov	rdi,rdx
+	xor	r12,rbx
+	shrd	r14,r14,6
+	xor	rdi,r8
+	add	rcx,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,rdx
+	add	rcx,r13
+	xor	r15,r8
+	shrd	r14,r14,28
+	add	r10,rcx
+	add	rcx,r15
+	mov	r13,r10
+	add	r14,rcx
+	shrd	r13,r13,23
+	mov	rcx,r14
+	mov	r12,r11
+	shrd	r14,r14,5
+	xor	r13,r10
+	xor	r12,rax
+	shrd	r13,r13,4
+	xor	r14,rcx
+	and	r12,r10
+	xor	r13,r10
+	add	rbx,QWORD[48+rsp]
+	mov	r15,rcx
+	xor	r12,rax
+	shrd	r14,r14,6
+	xor	r15,rdx
+	add	rbx,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,rcx
+	add	rbx,r13
+	xor	rdi,rdx
+	shrd	r14,r14,28
+	add	r9,rbx
+	add	rbx,rdi
+	mov	r13,r9
+	add	r14,rbx
+	shrd	r13,r13,23
+	mov	rbx,r14
+	mov	r12,r10
+	shrd	r14,r14,5
+	xor	r13,r9
+	xor	r12,r11
+	shrd	r13,r13,4
+	xor	r14,rbx
+	and	r12,r9
+	xor	r13,r9
+	add	rax,QWORD[56+rsp]
+	mov	rdi,rbx
+	xor	r12,r11
+	shrd	r14,r14,6
+	xor	rdi,rcx
+	add	rax,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,rbx
+	add	rax,r13
+	xor	r15,rcx
+	shrd	r14,r14,28
+	add	r8,rax
+	add	rax,r15
+	mov	r13,r8
+	add	r14,rax
+	shrd	r13,r13,23
+	mov	rax,r14
+	mov	r12,r9
+	shrd	r14,r14,5
+	xor	r13,r8
+	xor	r12,r10
+	shrd	r13,r13,4
+	xor	r14,rax
+	and	r12,r8
+	xor	r13,r8
+	add	r11,QWORD[64+rsp]
+	mov	r15,rax
+	xor	r12,r10
+	shrd	r14,r14,6
+	xor	r15,rbx
+	add	r11,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,rax
+	add	r11,r13
+	xor	rdi,rbx
+	shrd	r14,r14,28
+	add	rdx,r11
+	add	r11,rdi
+	mov	r13,rdx
+	add	r14,r11
+	shrd	r13,r13,23
+	mov	r11,r14
+	mov	r12,r8
+	shrd	r14,r14,5
+	xor	r13,rdx
+	xor	r12,r9
+	shrd	r13,r13,4
+	xor	r14,r11
+	and	r12,rdx
+	xor	r13,rdx
+	add	r10,QWORD[72+rsp]
+	mov	rdi,r11
+	xor	r12,r9
+	shrd	r14,r14,6
+	xor	rdi,rax
+	add	r10,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,r11
+	add	r10,r13
+	xor	r15,rax
+	shrd	r14,r14,28
+	add	rcx,r10
+	add	r10,r15
+	mov	r13,rcx
+	add	r14,r10
+	shrd	r13,r13,23
+	mov	r10,r14
+	mov	r12,rdx
+	shrd	r14,r14,5
+	xor	r13,rcx
+	xor	r12,r8
+	shrd	r13,r13,4
+	xor	r14,r10
+	and	r12,rcx
+	xor	r13,rcx
+	add	r9,QWORD[80+rsp]
+	mov	r15,r10
+	xor	r12,r8
+	shrd	r14,r14,6
+	xor	r15,r11
+	add	r9,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,r10
+	add	r9,r13
+	xor	rdi,r11
+	shrd	r14,r14,28
+	add	rbx,r9
+	add	r9,rdi
+	mov	r13,rbx
+	add	r14,r9
+	shrd	r13,r13,23
+	mov	r9,r14
+	mov	r12,rcx
+	shrd	r14,r14,5
+	xor	r13,rbx
+	xor	r12,rdx
+	shrd	r13,r13,4
+	xor	r14,r9
+	and	r12,rbx
+	xor	r13,rbx
+	add	r8,QWORD[88+rsp]
+	mov	rdi,r9
+	xor	r12,rdx
+	shrd	r14,r14,6
+	xor	rdi,r10
+	add	r8,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,r9
+	add	r8,r13
+	xor	r15,r10
+	shrd	r14,r14,28
+	add	rax,r8
+	add	r8,r15
+	mov	r13,rax
+	add	r14,r8
+	shrd	r13,r13,23
+	mov	r8,r14
+	mov	r12,rbx
+	shrd	r14,r14,5
+	xor	r13,rax
+	xor	r12,rcx
+	shrd	r13,r13,4
+	xor	r14,r8
+	and	r12,rax
+	xor	r13,rax
+	add	rdx,QWORD[96+rsp]
+	mov	r15,r8
+	xor	r12,rcx
+	shrd	r14,r14,6
+	xor	r15,r9
+	add	rdx,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,r8
+	add	rdx,r13
+	xor	rdi,r9
+	shrd	r14,r14,28
+	add	r11,rdx
+	add	rdx,rdi
+	mov	r13,r11
+	add	r14,rdx
+	shrd	r13,r13,23
+	mov	rdx,r14
+	mov	r12,rax
+	shrd	r14,r14,5
+	xor	r13,r11
+	xor	r12,rbx
+	shrd	r13,r13,4
+	xor	r14,rdx
+	and	r12,r11
+	xor	r13,r11
+	add	rcx,QWORD[104+rsp]
+	mov	rdi,rdx
+	xor	r12,rbx
+	shrd	r14,r14,6
+	xor	rdi,r8
+	add	rcx,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,rdx
+	add	rcx,r13
+	xor	r15,r8
+	shrd	r14,r14,28
+	add	r10,rcx
+	add	rcx,r15
+	mov	r13,r10
+	add	r14,rcx
+	shrd	r13,r13,23
+	mov	rcx,r14
+	mov	r12,r11
+	shrd	r14,r14,5
+	xor	r13,r10
+	xor	r12,rax
+	shrd	r13,r13,4
+	xor	r14,rcx
+	and	r12,r10
+	xor	r13,r10
+	add	rbx,QWORD[112+rsp]
+	mov	r15,rcx
+	xor	r12,rax
+	shrd	r14,r14,6
+	xor	r15,rdx
+	add	rbx,r12
+	shrd	r13,r13,14
+	and	rdi,r15
+	xor	r14,rcx
+	add	rbx,r13
+	xor	rdi,rdx
+	shrd	r14,r14,28
+	add	r9,rbx
+	add	rbx,rdi
+	mov	r13,r9
+	add	r14,rbx
+	shrd	r13,r13,23
+	mov	rbx,r14
+	mov	r12,r10
+	shrd	r14,r14,5
+	xor	r13,r9
+	xor	r12,r11
+	shrd	r13,r13,4
+	xor	r14,rbx
+	and	r12,r9
+	xor	r13,r9
+	add	rax,QWORD[120+rsp]
+	mov	rdi,rbx
+	xor	r12,r11
+	shrd	r14,r14,6
+	xor	rdi,rcx
+	add	rax,r12
+	shrd	r13,r13,14
+	and	r15,rdi
+	xor	r14,rbx
+	add	rax,r13
+	xor	r15,rcx
+	shrd	r14,r14,28
+	add	r8,rax
+	add	rax,r15
+	mov	r13,r8
+	add	r14,rax
+	mov	rdi,QWORD[((128+0))+rsp]
+	mov	rax,r14
+
+	add	rax,QWORD[rdi]
+	lea	rsi,[128+rsi]
+	add	rbx,QWORD[8+rdi]
+	add	rcx,QWORD[16+rdi]
+	add	rdx,QWORD[24+rdi]
+	add	r8,QWORD[32+rdi]
+	add	r9,QWORD[40+rdi]
+	add	r10,QWORD[48+rdi]
+	add	r11,QWORD[56+rdi]
+
+	cmp	rsi,QWORD[((128+16))+rsp]
+
+	mov	QWORD[rdi],rax
+	mov	QWORD[8+rdi],rbx
+	mov	QWORD[16+rdi],rcx
+	mov	QWORD[24+rdi],rdx
+	mov	QWORD[32+rdi],r8
+	mov	QWORD[40+rdi],r9
+	mov	QWORD[48+rdi],r10
+	mov	QWORD[56+rdi],r11
+	jb	NEAR $L$loop_avx
+
+	mov	rsi,QWORD[152+rsp]
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((128+32))+rsp]
+	movaps	xmm7,XMMWORD[((128+48))+rsp]
+	movaps	xmm8,XMMWORD[((128+64))+rsp]
+	movaps	xmm9,XMMWORD[((128+80))+rsp]
+	movaps	xmm10,XMMWORD[((128+96))+rsp]
+	movaps	xmm11,XMMWORD[((128+112))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$epilogue_avx:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_sha512_block_data_order_avx:
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+	mov	rsi,rax
+	mov	rax,QWORD[((128+24))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+	lea	r10,[$L$epilogue]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	lea	rsi,[((128+32))+rsi]
+	lea	rdi,[512+r8]
+	mov	ecx,12
+	DD	0xa548f3fc
+
+$L$in_prologue:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_sha512_block_data_order wrt ..imagebase
+	DD	$L$SEH_end_GFp_sha512_block_data_order wrt ..imagebase
+	DD	$L$SEH_info_GFp_sha512_block_data_order wrt ..imagebase
+	DD	$L$SEH_begin_GFp_sha512_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_end_GFp_sha512_block_data_order_avx wrt ..imagebase
+	DD	$L$SEH_info_GFp_sha512_block_data_order_avx wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_GFp_sha512_block_data_order:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_GFp_sha512_block_data_order_avx:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase

+ 378 - 0
zeroidc/vendor/ring/pregenerated/tmp/vpaes-x86-win32n.asm

@@ -0,0 +1,378 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
[email protected] equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+align	64
+L$_vpaes_consts:
+dd	218628480,235210255,168496130,67568393
+dd	252381056,17041926,33884169,51187212
+dd	252645135,252645135,252645135,252645135
+dd	1512730624,3266504856,1377990664,3401244816
+dd	830229760,1275146365,2969422977,3447763452
+dd	3411033600,2979783055,338359620,2782886510
+dd	4209124096,907596821,221174255,1006095553
+dd	191964160,3799684038,3164090317,1589111125
+dd	182528256,1777043520,2877432650,3265356744
+dd	1874708224,3503451415,3305285752,363511674
+dd	1606117888,3487855781,1093350906,2384367825
+dd	197121,67569157,134941193,202313229
+dd	67569157,134941193,202313229,197121
+dd	134941193,202313229,197121,67569157
+dd	202313229,197121,67569157,134941193
+dd	33619971,100992007,168364043,235736079
+dd	235736079,33619971,100992007,168364043
+dd	168364043,235736079,33619971,100992007
+dd	100992007,168364043,235736079,33619971
+dd	50462976,117835012,185207048,252579084
+dd	252314880,51251460,117574920,184942860
+dd	184682752,252054788,50987272,118359308
+dd	118099200,185467140,251790600,50727180
+dd	2946363062,528716217,1300004225,1881839624
+dd	1532713819,1532713819,1532713819,1532713819
+dd	3602276352,4288629033,3737020424,4153884961
+dd	1354558464,32357713,2958822624,3775749553
+dd	1201988352,132424512,1572796698,503232858
+dd	2213177600,1597421020,4103937655,675398315
+db	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+db	111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+db	83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+db	114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+db	118,101,114,115,105,116,121,41,0
+align	64
+align	16
+__vpaes_preheat:
+	add	ebp,DWORD [esp]
+	movdqa	xmm7,[ebp-48]
+	movdqa	xmm6,[ebp-16]
+	ret
+align	16
+__vpaes_encrypt_core:
+	mov	ecx,16
+	mov	eax,DWORD [240+edx]
+	movdqa	xmm1,xmm6
+	movdqa	xmm2,[ebp]
+	pandn	xmm1,xmm0
+	pand	xmm0,xmm6
+	movdqu	xmm5,[edx]
+db	102,15,56,0,208
+	movdqa	xmm0,[16+ebp]
+	pxor	xmm2,xmm5
+	psrld	xmm1,4
+	add	edx,16
+db	102,15,56,0,193
+	lea	ebx,[192+ebp]
+	pxor	xmm0,xmm2
+	jmp	NEAR L$000enc_entry
+align	16
+L$001enc_loop:
+	movdqa	xmm4,[32+ebp]
+	movdqa	xmm0,[48+ebp]
+db	102,15,56,0,226
+db	102,15,56,0,195
+	pxor	xmm4,xmm5
+	movdqa	xmm5,[64+ebp]
+	pxor	xmm0,xmm4
+	movdqa	xmm1,[ecx*1+ebx-64]
+db	102,15,56,0,234
+	movdqa	xmm2,[80+ebp]
+	movdqa	xmm4,[ecx*1+ebx]
+db	102,15,56,0,211
+	movdqa	xmm3,xmm0
+	pxor	xmm2,xmm5
+db	102,15,56,0,193
+	add	edx,16
+	pxor	xmm0,xmm2
+db	102,15,56,0,220
+	add	ecx,16
+	pxor	xmm3,xmm0
+db	102,15,56,0,193
+	and	ecx,48
+	sub	eax,1
+	pxor	xmm0,xmm3
+L$000enc_entry:
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,[ebp-32]
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm6
+db	102,15,56,0,232
+	movdqa	xmm3,xmm7
+	pxor	xmm0,xmm1
+db	102,15,56,0,217
+	movdqa	xmm4,xmm7
+	pxor	xmm3,xmm5
+db	102,15,56,0,224
+	movdqa	xmm2,xmm7
+	pxor	xmm4,xmm5
+db	102,15,56,0,211
+	movdqa	xmm3,xmm7
+	pxor	xmm2,xmm0
+db	102,15,56,0,220
+	movdqu	xmm5,[edx]
+	pxor	xmm3,xmm1
+	jnz	NEAR L$001enc_loop
+	movdqa	xmm4,[96+ebp]
+	movdqa	xmm0,[112+ebp]
+db	102,15,56,0,226
+	pxor	xmm4,xmm5
+db	102,15,56,0,195
+	movdqa	xmm1,[64+ecx*1+ebx]
+	pxor	xmm0,xmm4
+db	102,15,56,0,193
+	ret
+align	16
+__vpaes_schedule_core:
+	add	ebp,DWORD [esp]
+	movdqu	xmm0,[esi]
+	movdqa	xmm2,[320+ebp]
+	movdqa	xmm3,xmm0
+	lea	ebx,[ebp]
+	movdqa	[4+esp],xmm2
+	call	__vpaes_schedule_transform
+	movdqa	xmm7,xmm0
+	test	edi,edi
+	jnz	NEAR L$002schedule_am_decrypting
+	movdqu	[edx],xmm0
+	jmp	NEAR L$003schedule_go
+L$002schedule_am_decrypting:
+	movdqa	xmm1,[256+ecx*1+ebp]
+db	102,15,56,0,217
+	movdqu	[edx],xmm3
+	xor	ecx,48
+L$003schedule_go:
+	cmp	eax,192
+	ja	NEAR L$004schedule_256
+L$005schedule_128:
+	mov	eax,10
+L$006loop_schedule_128:
+	call	__vpaes_schedule_round
+	dec	eax
+	jz	NEAR L$007schedule_mangle_last
+	call	__vpaes_schedule_mangle
+	jmp	NEAR L$006loop_schedule_128
+align	16
+L$004schedule_256:
+	movdqu	xmm0,[16+esi]
+	call	__vpaes_schedule_transform
+	mov	eax,7
+L$008loop_schedule_256:
+	call	__vpaes_schedule_mangle
+	movdqa	xmm6,xmm0
+	call	__vpaes_schedule_round
+	dec	eax
+	jz	NEAR L$007schedule_mangle_last
+	call	__vpaes_schedule_mangle
+	pshufd	xmm0,xmm0,255
+	movdqa	[20+esp],xmm7
+	movdqa	xmm7,xmm6
+	call	L$_vpaes_schedule_low_round
+	movdqa	xmm7,[20+esp]
+	jmp	NEAR L$008loop_schedule_256
+align	16
+L$007schedule_mangle_last:
+	lea	ebx,[384+ebp]
+	test	edi,edi
+	jnz	NEAR L$009schedule_mangle_last_dec
+	movdqa	xmm1,[256+ecx*1+ebp]
+db	102,15,56,0,193
+	lea	ebx,[352+ebp]
+	add	edx,32
+L$009schedule_mangle_last_dec:
+	add	edx,-16
+	pxor	xmm0,[336+ebp]
+	call	__vpaes_schedule_transform
+	movdqu	[edx],xmm0
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	ret
+align	16
+__vpaes_schedule_round:
+	movdqa	xmm2,[8+esp]
+	pxor	xmm1,xmm1
+db	102,15,58,15,202,15
+db	102,15,58,15,210,15
+	pxor	xmm7,xmm1
+	pshufd	xmm0,xmm0,255
+db	102,15,58,15,192,1
+	movdqa	[8+esp],xmm2
+L$_vpaes_schedule_low_round:
+	movdqa	xmm1,xmm7
+	pslldq	xmm7,4
+	pxor	xmm7,xmm1
+	movdqa	xmm1,xmm7
+	pslldq	xmm7,8
+	pxor	xmm7,xmm1
+	pxor	xmm7,[336+ebp]
+	movdqa	xmm4,[ebp-16]
+	movdqa	xmm5,[ebp-48]
+	movdqa	xmm1,xmm4
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm4
+	movdqa	xmm2,[ebp-32]
+db	102,15,56,0,208
+	pxor	xmm0,xmm1
+	movdqa	xmm3,xmm5
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+	movdqa	xmm4,xmm5
+db	102,15,56,0,224
+	pxor	xmm4,xmm2
+	movdqa	xmm2,xmm5
+db	102,15,56,0,211
+	pxor	xmm2,xmm0
+	movdqa	xmm3,xmm5
+db	102,15,56,0,220
+	pxor	xmm3,xmm1
+	movdqa	xmm4,[32+ebp]
+db	102,15,56,0,226
+	movdqa	xmm0,[48+ebp]
+db	102,15,56,0,195
+	pxor	xmm0,xmm4
+	pxor	xmm0,xmm7
+	movdqa	xmm7,xmm0
+	ret
+align	16
+__vpaes_schedule_transform:
+	movdqa	xmm2,[ebp-16]
+	movdqa	xmm1,xmm2
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm2
+	movdqa	xmm2,[ebx]
+db	102,15,56,0,208
+	movdqa	xmm0,[16+ebx]
+db	102,15,56,0,193
+	pxor	xmm0,xmm2
+	ret
+align	16
+__vpaes_schedule_mangle:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,[128+ebp]
+	test	edi,edi
+	jnz	NEAR L$010schedule_mangle_dec
+	add	edx,16
+	pxor	xmm4,[336+ebp]
+db	102,15,56,0,229
+	movdqa	xmm3,xmm4
+db	102,15,56,0,229
+	pxor	xmm3,xmm4
+db	102,15,56,0,229
+	pxor	xmm3,xmm4
+	jmp	NEAR L$011schedule_mangle_both
+align	16
+L$010schedule_mangle_dec:
+	movdqa	xmm2,[ebp-16]
+	lea	esi,[ebp]
+	movdqa	xmm1,xmm2
+	pandn	xmm1,xmm4
+	psrld	xmm1,4
+	pand	xmm4,xmm2
+	movdqa	xmm2,[esi]
+db	102,15,56,0,212
+	movdqa	xmm3,[16+esi]
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+db	102,15,56,0,221
+	movdqa	xmm2,[32+esi]
+db	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,[48+esi]
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+db	102,15,56,0,221
+	movdqa	xmm2,[64+esi]
+db	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,[80+esi]
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+db	102,15,56,0,221
+	movdqa	xmm2,[96+esi]
+db	102,15,56,0,212
+	pxor	xmm2,xmm3
+	movdqa	xmm3,[112+esi]
+db	102,15,56,0,217
+	pxor	xmm3,xmm2
+	add	edx,-16
+L$011schedule_mangle_both:
+	movdqa	xmm1,[256+ecx*1+ebp]
+db	102,15,56,0,217
+	add	ecx,-16
+	and	ecx,48
+	movdqu	[edx],xmm3
+	ret
+global	_GFp_vpaes_set_encrypt_key
+align	16
+_GFp_vpaes_set_encrypt_key:
+L$_GFp_vpaes_set_encrypt_key_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	esi,DWORD [20+esp]
+	lea	ebx,[esp-56]
+	mov	eax,DWORD [24+esp]
+	and	ebx,-16
+	mov	edx,DWORD [28+esp]
+	xchg	ebx,esp
+	mov	DWORD [48+esp],ebx
+	mov	ebx,eax
+	shr	ebx,5
+	add	ebx,5
+	mov	DWORD [240+edx],ebx
+	mov	ecx,48
+	mov	edi,0
+	lea	ebp,[(L$_vpaes_consts+0x30-L$012pic_point)]
+	call	__vpaes_schedule_core
+L$012pic_point:
+	mov	esp,DWORD [48+esp]
+	xor	eax,eax
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+global	_GFp_vpaes_encrypt
+align	16
+_GFp_vpaes_encrypt:
+L$_GFp_vpaes_encrypt_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	lea	ebp,[(L$_vpaes_consts+0x30-L$013pic_point)]
+	call	__vpaes_preheat
+L$013pic_point:
+	mov	esi,DWORD [20+esp]
+	lea	ebx,[esp-56]
+	mov	edi,DWORD [24+esp]
+	and	ebx,-16
+	mov	edx,DWORD [28+esp]
+	xchg	ebx,esp
+	mov	DWORD [48+esp],ebx
+	movdqu	xmm0,[esi]
+	call	__vpaes_encrypt_core
+	movdqu	[edi],xmm0
+	mov	esp,DWORD [48+esp]
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret

+ 982 - 0
zeroidc/vendor/ring/pregenerated/tmp/vpaes-x86_64-nasm.asm

@@ -0,0 +1,982 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_encrypt_core:
+
+	mov	r9,rdx
+	mov	r11,16
+	mov	eax,DWORD[240+rdx]
+	movdqa	xmm1,xmm9
+	movdqa	xmm2,XMMWORD[$L$k_ipt]
+	pandn	xmm1,xmm0
+	movdqu	xmm5,XMMWORD[r9]
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+DB	102,15,56,0,208
+	movdqa	xmm0,XMMWORD[(($L$k_ipt+16))]
+DB	102,15,56,0,193
+	pxor	xmm2,xmm5
+	add	r9,16
+	pxor	xmm0,xmm2
+	lea	r10,[$L$k_mc_backward]
+	jmp	NEAR $L$enc_entry
+
+ALIGN	16
+$L$enc_loop:
+
+	movdqa	xmm4,xmm13
+	movdqa	xmm0,xmm12
+DB	102,15,56,0,226
+DB	102,15,56,0,195
+	pxor	xmm4,xmm5
+	movdqa	xmm5,xmm15
+	pxor	xmm0,xmm4
+	movdqa	xmm1,XMMWORD[((-64))+r10*1+r11]
+DB	102,15,56,0,234
+	movdqa	xmm4,XMMWORD[r10*1+r11]
+	movdqa	xmm2,xmm14
+DB	102,15,56,0,211
+	movdqa	xmm3,xmm0
+	pxor	xmm2,xmm5
+DB	102,15,56,0,193
+	add	r9,16
+	pxor	xmm0,xmm2
+DB	102,15,56,0,220
+	add	r11,16
+	pxor	xmm3,xmm0
+DB	102,15,56,0,193
+	and	r11,0x30
+	sub	rax,1
+	pxor	xmm0,xmm3
+
+$L$enc_entry:
+
+	movdqa	xmm1,xmm9
+	movdqa	xmm5,xmm11
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+DB	102,15,56,0,232
+	movdqa	xmm3,xmm10
+	pxor	xmm0,xmm1
+DB	102,15,56,0,217
+	movdqa	xmm4,xmm10
+	pxor	xmm3,xmm5
+DB	102,15,56,0,224
+	movdqa	xmm2,xmm10
+	pxor	xmm4,xmm5
+DB	102,15,56,0,211
+	movdqa	xmm3,xmm10
+	pxor	xmm2,xmm0
+DB	102,15,56,0,220
+	movdqu	xmm5,XMMWORD[r9]
+	pxor	xmm3,xmm1
+	jnz	NEAR $L$enc_loop
+
+
+	movdqa	xmm4,XMMWORD[((-96))+r10]
+	movdqa	xmm0,XMMWORD[((-80))+r10]
+DB	102,15,56,0,226
+	pxor	xmm4,xmm5
+DB	102,15,56,0,195
+	movdqa	xmm1,XMMWORD[64+r10*1+r11]
+	pxor	xmm0,xmm4
+DB	102,15,56,0,193
+	DB	0F3h,0C3h		;repret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_encrypt_core_2x:
+
+	mov	r9,rdx
+	mov	r11,16
+	mov	eax,DWORD[240+rdx]
+	movdqa	xmm1,xmm9
+	movdqa	xmm7,xmm9
+	movdqa	xmm2,XMMWORD[$L$k_ipt]
+	movdqa	xmm8,xmm2
+	pandn	xmm1,xmm0
+	pandn	xmm7,xmm6
+	movdqu	xmm5,XMMWORD[r9]
+
+	psrld	xmm1,4
+	psrld	xmm7,4
+	pand	xmm0,xmm9
+	pand	xmm6,xmm9
+DB	102,15,56,0,208
+DB	102,68,15,56,0,198
+	movdqa	xmm0,XMMWORD[(($L$k_ipt+16))]
+	movdqa	xmm6,xmm0
+DB	102,15,56,0,193
+DB	102,15,56,0,247
+	pxor	xmm2,xmm5
+	pxor	xmm8,xmm5
+	add	r9,16
+	pxor	xmm0,xmm2
+	pxor	xmm6,xmm8
+	lea	r10,[$L$k_mc_backward]
+	jmp	NEAR $L$enc2x_entry
+
+ALIGN	16
+$L$enc2x_loop:
+
+	movdqa	xmm4,XMMWORD[$L$k_sb1]
+	movdqa	xmm0,XMMWORD[(($L$k_sb1+16))]
+	movdqa	xmm12,xmm4
+	movdqa	xmm6,xmm0
+DB	102,15,56,0,226
+DB	102,69,15,56,0,224
+DB	102,15,56,0,195
+DB	102,65,15,56,0,243
+	pxor	xmm4,xmm5
+	pxor	xmm12,xmm5
+	movdqa	xmm5,XMMWORD[$L$k_sb2]
+	movdqa	xmm13,xmm5
+	pxor	xmm0,xmm4
+	pxor	xmm6,xmm12
+	movdqa	xmm1,XMMWORD[((-64))+r10*1+r11]
+
+DB	102,15,56,0,234
+DB	102,69,15,56,0,232
+	movdqa	xmm4,XMMWORD[r10*1+r11]
+
+	movdqa	xmm2,XMMWORD[(($L$k_sb2+16))]
+	movdqa	xmm8,xmm2
+DB	102,15,56,0,211
+DB	102,69,15,56,0,195
+	movdqa	xmm3,xmm0
+	movdqa	xmm11,xmm6
+	pxor	xmm2,xmm5
+	pxor	xmm8,xmm13
+DB	102,15,56,0,193
+DB	102,15,56,0,241
+	add	r9,16
+	pxor	xmm0,xmm2
+	pxor	xmm6,xmm8
+DB	102,15,56,0,220
+DB	102,68,15,56,0,220
+	add	r11,16
+	pxor	xmm3,xmm0
+	pxor	xmm11,xmm6
+DB	102,15,56,0,193
+DB	102,15,56,0,241
+	and	r11,0x30
+	sub	rax,1
+	pxor	xmm0,xmm3
+	pxor	xmm6,xmm11
+
+$L$enc2x_entry:
+
+	movdqa	xmm1,xmm9
+	movdqa	xmm7,xmm9
+	movdqa	xmm5,XMMWORD[(($L$k_inv+16))]
+	movdqa	xmm13,xmm5
+	pandn	xmm1,xmm0
+	pandn	xmm7,xmm6
+	psrld	xmm1,4
+	psrld	xmm7,4
+	pand	xmm0,xmm9
+	pand	xmm6,xmm9
+DB	102,15,56,0,232
+DB	102,68,15,56,0,238
+	movdqa	xmm3,xmm10
+	movdqa	xmm11,xmm10
+	pxor	xmm0,xmm1
+	pxor	xmm6,xmm7
+DB	102,15,56,0,217
+DB	102,68,15,56,0,223
+	movdqa	xmm4,xmm10
+	movdqa	xmm12,xmm10
+	pxor	xmm3,xmm5
+	pxor	xmm11,xmm13
+DB	102,15,56,0,224
+DB	102,68,15,56,0,230
+	movdqa	xmm2,xmm10
+	movdqa	xmm8,xmm10
+	pxor	xmm4,xmm5
+	pxor	xmm12,xmm13
+DB	102,15,56,0,211
+DB	102,69,15,56,0,195
+	movdqa	xmm3,xmm10
+	movdqa	xmm11,xmm10
+	pxor	xmm2,xmm0
+	pxor	xmm8,xmm6
+DB	102,15,56,0,220
+DB	102,69,15,56,0,220
+	movdqu	xmm5,XMMWORD[r9]
+
+	pxor	xmm3,xmm1
+	pxor	xmm11,xmm7
+	jnz	NEAR $L$enc2x_loop
+
+
+	movdqa	xmm4,XMMWORD[((-96))+r10]
+	movdqa	xmm0,XMMWORD[((-80))+r10]
+	movdqa	xmm12,xmm4
+	movdqa	xmm6,xmm0
+DB	102,15,56,0,226
+DB	102,69,15,56,0,224
+	pxor	xmm4,xmm5
+	pxor	xmm12,xmm5
+DB	102,15,56,0,195
+DB	102,65,15,56,0,243
+	movdqa	xmm1,XMMWORD[64+r10*1+r11]
+
+	pxor	xmm0,xmm4
+	pxor	xmm6,xmm12
+DB	102,15,56,0,193
+DB	102,15,56,0,241
+	DB	0F3h,0C3h		;repret
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_core:
+
+
+
+
+
+
+	call	_vpaes_preheat
+	movdqa	xmm8,XMMWORD[$L$k_rcon]
+	movdqu	xmm0,XMMWORD[rdi]
+
+
+	movdqa	xmm3,xmm0
+	lea	r11,[$L$k_ipt]
+	call	_vpaes_schedule_transform
+	movdqa	xmm7,xmm0
+
+	lea	r10,[$L$k_sr]
+
+
+	movdqu	XMMWORD[rdx],xmm0
+
+$L$schedule_go:
+	cmp	esi,192
+	ja	NEAR $L$schedule_256
+
+
+
+
+
+
+
+
+
+
+
+$L$schedule_128:
+	mov	esi,10
+
+$L$oop_schedule_128:
+	call	_vpaes_schedule_round
+	dec	rsi
+	jz	NEAR $L$schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	NEAR $L$oop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+$L$schedule_256:
+	movdqu	xmm0,XMMWORD[16+rdi]
+	call	_vpaes_schedule_transform
+	mov	esi,7
+
+$L$oop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	xmm6,xmm0
+
+
+	call	_vpaes_schedule_round
+	dec	rsi
+	jz	NEAR $L$schedule_mangle_last
+	call	_vpaes_schedule_mangle
+
+
+	pshufd	xmm0,xmm0,0xFF
+	movdqa	xmm5,xmm7
+	movdqa	xmm7,xmm6
+	call	_vpaes_schedule_low_round
+	movdqa	xmm7,xmm5
+
+	jmp	NEAR $L$oop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+$L$schedule_mangle_last:
+
+	lea	r11,[$L$k_deskew]
+
+
+	movdqa	xmm1,XMMWORD[r10*1+r8]
+DB	102,15,56,0,193
+	lea	r11,[$L$k_opt]
+	add	rdx,32
+
+$L$schedule_mangle_last_dec:
+	add	rdx,-16
+	pxor	xmm0,XMMWORD[$L$k_s63]
+	call	_vpaes_schedule_transform
+	movdqu	XMMWORD[rdx],xmm0
+
+
+	pxor	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	pxor	xmm6,xmm6
+	pxor	xmm7,xmm7
+	DB	0F3h,0C3h		;repret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_round:
+
+
+	pxor	xmm1,xmm1
+DB	102,65,15,58,15,200,15
+DB	102,69,15,58,15,192,15
+	pxor	xmm7,xmm1
+
+
+	pshufd	xmm0,xmm0,0xFF
+DB	102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+	movdqa	xmm1,xmm7
+	pslldq	xmm7,4
+	pxor	xmm7,xmm1
+	movdqa	xmm1,xmm7
+	pslldq	xmm7,8
+	pxor	xmm7,xmm1
+	pxor	xmm7,XMMWORD[$L$k_s63]
+
+
+	movdqa	xmm1,xmm9
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+	movdqa	xmm2,xmm11
+DB	102,15,56,0,208
+	pxor	xmm0,xmm1
+	movdqa	xmm3,xmm10
+DB	102,15,56,0,217
+	pxor	xmm3,xmm2
+	movdqa	xmm4,xmm10
+DB	102,15,56,0,224
+	pxor	xmm4,xmm2
+	movdqa	xmm2,xmm10
+DB	102,15,56,0,211
+	pxor	xmm2,xmm0
+	movdqa	xmm3,xmm10
+DB	102,15,56,0,220
+	pxor	xmm3,xmm1
+	movdqa	xmm4,xmm13
+DB	102,15,56,0,226
+	movdqa	xmm0,xmm12
+DB	102,15,56,0,195
+	pxor	xmm0,xmm4
+
+
+	pxor	xmm0,xmm7
+	movdqa	xmm7,xmm0
+	DB	0F3h,0C3h		;repret
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_transform:
+
+	movdqa	xmm1,xmm9
+	pandn	xmm1,xmm0
+	psrld	xmm1,4
+	pand	xmm0,xmm9
+	movdqa	xmm2,XMMWORD[r11]
+DB	102,15,56,0,208
+	movdqa	xmm0,XMMWORD[16+r11]
+DB	102,15,56,0,193
+	pxor	xmm0,xmm2
+	DB	0F3h,0C3h		;repret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_schedule_mangle:
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,XMMWORD[$L$k_mc_forward]
+
+
+	add	rdx,16
+	pxor	xmm4,XMMWORD[$L$k_s63]
+DB	102,15,56,0,229
+	movdqa	xmm3,xmm4
+DB	102,15,56,0,229
+	pxor	xmm3,xmm4
+DB	102,15,56,0,229
+	pxor	xmm3,xmm4
+
+$L$schedule_mangle_both:
+	movdqa	xmm1,XMMWORD[r10*1+r8]
+DB	102,15,56,0,217
+	add	r8,-16
+	and	r8,0x30
+	movdqu	XMMWORD[rdx],xmm3
+	DB	0F3h,0C3h		;repret
+
+
+
+
+
+
+global	GFp_vpaes_set_encrypt_key
+
+ALIGN	16
+GFp_vpaes_set_encrypt_key:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_vpaes_set_encrypt_key:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN	BORINGSSL_function_hit
+	mov	BYTE[((BORINGSSL_function_hit+5))],1
+%endif
+
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$enc_key_body:
+	mov	eax,esi
+	shr	eax,5
+	add	eax,5
+	mov	DWORD[240+rdx],eax
+
+	mov	ecx,0
+	mov	r8d,0x30
+	call	_vpaes_schedule_core
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$enc_key_epilogue:
+	xor	eax,eax
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_vpaes_set_encrypt_key:
+
+global	GFp_vpaes_encrypt
+
+ALIGN	16
+GFp_vpaes_encrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_vpaes_encrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$enc_body:
+	movdqu	xmm0,XMMWORD[rdi]
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	XMMWORD[rsi],xmm0
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$enc_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_vpaes_encrypt:
+global	GFp_vpaes_ctr32_encrypt_blocks
+
+ALIGN	16
+GFp_vpaes_ctr32_encrypt_blocks:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_vpaes_ctr32_encrypt_blocks:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+
+
+
+
+	xchg	rdx,rcx
+	test	rcx,rcx
+	jz	NEAR $L$ctr32_abort
+	lea	rsp,[((-184))+rsp]
+	movaps	XMMWORD[16+rsp],xmm6
+	movaps	XMMWORD[32+rsp],xmm7
+	movaps	XMMWORD[48+rsp],xmm8
+	movaps	XMMWORD[64+rsp],xmm9
+	movaps	XMMWORD[80+rsp],xmm10
+	movaps	XMMWORD[96+rsp],xmm11
+	movaps	XMMWORD[112+rsp],xmm12
+	movaps	XMMWORD[128+rsp],xmm13
+	movaps	XMMWORD[144+rsp],xmm14
+	movaps	XMMWORD[160+rsp],xmm15
+$L$ctr32_body:
+	movdqu	xmm0,XMMWORD[r8]
+	movdqa	xmm8,XMMWORD[$L$ctr_add_one]
+	sub	rsi,rdi
+	call	_vpaes_preheat
+	movdqa	xmm6,xmm0
+	pshufb	xmm6,XMMWORD[$L$rev_ctr]
+
+	test	rcx,1
+	jz	NEAR $L$ctr32_prep_loop
+
+
+
+	movdqu	xmm7,XMMWORD[rdi]
+	call	_vpaes_encrypt_core
+	pxor	xmm0,xmm7
+	paddd	xmm6,xmm8
+	movdqu	XMMWORD[rdi*1+rsi],xmm0
+	sub	rcx,1
+	lea	rdi,[16+rdi]
+	jz	NEAR $L$ctr32_done
+
+$L$ctr32_prep_loop:
+
+
+	movdqa	xmm14,xmm6
+	movdqa	xmm15,xmm6
+	paddd	xmm15,xmm8
+
+$L$ctr32_loop:
+	movdqa	xmm1,XMMWORD[$L$rev_ctr]
+	movdqa	xmm0,xmm14
+	movdqa	xmm6,xmm15
+DB	102,15,56,0,193
+DB	102,15,56,0,241
+	call	_vpaes_encrypt_core_2x
+	movdqu	xmm1,XMMWORD[rdi]
+	movdqu	xmm2,XMMWORD[16+rdi]
+	movdqa	xmm3,XMMWORD[$L$ctr_add_two]
+	pxor	xmm0,xmm1
+	pxor	xmm6,xmm2
+	paddd	xmm14,xmm3
+	paddd	xmm15,xmm3
+	movdqu	XMMWORD[rdi*1+rsi],xmm0
+	movdqu	XMMWORD[16+rdi*1+rsi],xmm6
+	sub	rcx,2
+	lea	rdi,[32+rdi]
+	jnz	NEAR $L$ctr32_loop
+
+$L$ctr32_done:
+	movaps	xmm6,XMMWORD[16+rsp]
+	movaps	xmm7,XMMWORD[32+rsp]
+	movaps	xmm8,XMMWORD[48+rsp]
+	movaps	xmm9,XMMWORD[64+rsp]
+	movaps	xmm10,XMMWORD[80+rsp]
+	movaps	xmm11,XMMWORD[96+rsp]
+	movaps	xmm12,XMMWORD[112+rsp]
+	movaps	xmm13,XMMWORD[128+rsp]
+	movaps	xmm14,XMMWORD[144+rsp]
+	movaps	xmm15,XMMWORD[160+rsp]
+	lea	rsp,[184+rsp]
+$L$ctr32_epilogue:
+$L$ctr32_abort:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_vpaes_ctr32_encrypt_blocks:
+
+
+
+
+
+
+
+ALIGN	16
+_vpaes_preheat:
+
+	lea	r10,[$L$k_s0F]
+	movdqa	xmm10,XMMWORD[((-32))+r10]
+	movdqa	xmm11,XMMWORD[((-16))+r10]
+	movdqa	xmm9,XMMWORD[r10]
+	movdqa	xmm13,XMMWORD[48+r10]
+	movdqa	xmm12,XMMWORD[64+r10]
+	movdqa	xmm15,XMMWORD[80+r10]
+	movdqa	xmm14,XMMWORD[96+r10]
+	DB	0F3h,0C3h		;repret
+
+
+
+
+
+
+
+
+ALIGN	64
+_vpaes_consts:
+$L$k_inv:
+	DQ	0x0E05060F0D080180,0x040703090A0B0C02
+	DQ	0x01040A060F0B0780,0x030D0E0C02050809
+
+$L$k_s0F:
+	DQ	0x0F0F0F0F0F0F0F0F,0x0F0F0F0F0F0F0F0F
+
+$L$k_ipt:
+	DQ	0xC2B2E8985A2A7000,0xCABAE09052227808
+	DQ	0x4C01307D317C4D00,0xCD80B1FCB0FDCC81
+
+$L$k_sb1:
+	DQ	0xB19BE18FCB503E00,0xA5DF7A6E142AF544
+	DQ	0x3618D415FAE22300,0x3BF7CCC10D2ED9EF
+$L$k_sb2:
+	DQ	0xE27A93C60B712400,0x5EB7E955BC982FCD
+	DQ	0x69EB88400AE12900,0xC2A163C8AB82234A
+$L$k_sbo:
+	DQ	0xD0D26D176FBDC700,0x15AABF7AC502A878
+	DQ	0xCFE474A55FBB6A00,0x8E1E90D1412B35FA
+
+$L$k_mc_forward:
+	DQ	0x0407060500030201,0x0C0F0E0D080B0A09
+	DQ	0x080B0A0904070605,0x000302010C0F0E0D
+	DQ	0x0C0F0E0D080B0A09,0x0407060500030201
+	DQ	0x000302010C0F0E0D,0x080B0A0904070605
+
+$L$k_mc_backward:
+	DQ	0x0605040702010003,0x0E0D0C0F0A09080B
+	DQ	0x020100030E0D0C0F,0x0A09080B06050407
+	DQ	0x0E0D0C0F0A09080B,0x0605040702010003
+	DQ	0x0A09080B06050407,0x020100030E0D0C0F
+
+$L$k_sr:
+	DQ	0x0706050403020100,0x0F0E0D0C0B0A0908
+	DQ	0x030E09040F0A0500,0x0B06010C07020D08
+	DQ	0x0F060D040B020900,0x070E050C030A0108
+	DQ	0x0B0E0104070A0D00,0x0306090C0F020508
+
+$L$k_rcon:
+	DQ	0x1F8391B9AF9DEEB6,0x702A98084D7C7D81
+
+$L$k_s63:
+	DQ	0x5B5B5B5B5B5B5B5B,0x5B5B5B5B5B5B5B5B
+
+$L$k_opt:
+	DQ	0xFF9F4929D6B66000,0xF7974121DEBE6808
+	DQ	0x01EDBD5150BCEC00,0xE10D5DB1B05C0CE0
+
+$L$k_deskew:
+	DQ	0x07E4A34047A4E300,0x1DFEB95A5DBEF91A
+	DQ	0x5F36B5DC83EA6900,0x2841C2ABF49D1E77
+
+
+$L$rev_ctr:
+	DQ	0x0706050403020100,0x0c0d0e0f0b0a0908
+
+
+$L$ctr_add_one:
+	DQ	0x0000000000000000,0x0000000100000000
+$L$ctr_add_two:
+	DQ	0x0000000000000000,0x0000000200000000
+
+DB	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+DB	111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54
+DB	52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97
+DB	109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32
+DB	85,110,105,118,101,114,115,105,116,121,41,0
+ALIGN	64
+
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+
+	lea	rsi,[16+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+	lea	rax,[184+rax]
+
+$L$in_prologue:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_vpaes_set_encrypt_key wrt ..imagebase
+	DD	$L$SEH_end_GFp_vpaes_set_encrypt_key wrt ..imagebase
+	DD	$L$SEH_info_GFp_vpaes_set_encrypt_key wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_vpaes_encrypt wrt ..imagebase
+	DD	$L$SEH_end_GFp_vpaes_encrypt wrt ..imagebase
+	DD	$L$SEH_info_GFp_vpaes_encrypt wrt ..imagebase
+	DD	$L$SEH_begin_GFp_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+	DD	$L$SEH_end_GFp_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+	DD	$L$SEH_info_GFp_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_GFp_vpaes_set_encrypt_key:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$enc_key_body wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase
+$L$SEH_info_GFp_vpaes_encrypt:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$enc_body wrt ..imagebase,$L$enc_epilogue wrt ..imagebase
+$L$SEH_info_GFp_vpaes_ctr32_encrypt_blocks:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+	DD	$L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase

+ 227 - 0
zeroidc/vendor/ring/pregenerated/tmp/x86-mont-win32n.asm

@@ -0,0 +1,227 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__,obj
+section	code	use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
[email protected] equ 1
+section	.text	code align=64
+%else
+section	.text	code
+%endif
+;extern	_GFp_ia32cap_P
+global	_GFp_bn_mul_mont
+align	16
+_GFp_bn_mul_mont:
+L$_GFp_bn_mul_mont_begin:
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	xor	eax,eax
+	mov	edi,DWORD [40+esp]
+	lea	esi,[20+esp]
+	lea	edx,[24+esp]
+	add	edi,2
+	neg	edi
+	lea	ebp,[edi*4+esp-32]
+	neg	edi
+	mov	eax,ebp
+	sub	eax,edx
+	and	eax,2047
+	sub	ebp,eax
+	xor	edx,ebp
+	and	edx,2048
+	xor	edx,2048
+	sub	ebp,edx
+	and	ebp,-64
+	mov	eax,esp
+	sub	eax,ebp
+	and	eax,-4096
+	mov	edx,esp
+	lea	esp,[eax*1+ebp]
+	mov	eax,DWORD [esp]
+	cmp	esp,ebp
+	ja	NEAR L$000page_walk
+	jmp	NEAR L$001page_walk_done
+align	16
+L$000page_walk:
+	lea	esp,[esp-4096]
+	mov	eax,DWORD [esp]
+	cmp	esp,ebp
+	ja	NEAR L$000page_walk
+L$001page_walk_done:
+	mov	eax,DWORD [esi]
+	mov	ebx,DWORD [4+esi]
+	mov	ecx,DWORD [8+esi]
+	mov	ebp,DWORD [12+esi]
+	mov	esi,DWORD [16+esi]
+	mov	esi,DWORD [esi]
+	mov	DWORD [4+esp],eax
+	mov	DWORD [8+esp],ebx
+	mov	DWORD [12+esp],ecx
+	mov	DWORD [16+esp],ebp
+	mov	DWORD [20+esp],esi
+	lea	ebx,[edi-3]
+	mov	DWORD [24+esp],edx
+	lea	eax,[_GFp_ia32cap_P]
+	bt	DWORD [eax],26
+	mov	eax,-1
+	movd	mm7,eax
+	mov	esi,DWORD [8+esp]
+	mov	edi,DWORD [12+esp]
+	mov	ebp,DWORD [16+esp]
+	xor	edx,edx
+	xor	ecx,ecx
+	movd	mm4,DWORD [edi]
+	movd	mm5,DWORD [esi]
+	movd	mm3,DWORD [ebp]
+	pmuludq	mm5,mm4
+	movq	mm2,mm5
+	movq	mm0,mm5
+	pand	mm0,mm7
+	pmuludq	mm5,[20+esp]
+	pmuludq	mm3,mm5
+	paddq	mm3,mm0
+	movd	mm1,DWORD [4+ebp]
+	movd	mm0,DWORD [4+esi]
+	psrlq	mm2,32
+	psrlq	mm3,32
+	inc	ecx
+align	16
+L$0021st:
+	pmuludq	mm0,mm4
+	pmuludq	mm1,mm5
+	paddq	mm2,mm0
+	paddq	mm3,mm1
+	movq	mm0,mm2
+	pand	mm0,mm7
+	movd	mm1,DWORD [4+ecx*4+ebp]
+	paddq	mm3,mm0
+	movd	mm0,DWORD [4+ecx*4+esi]
+	psrlq	mm2,32
+	movd	DWORD [28+ecx*4+esp],mm3
+	psrlq	mm3,32
+	lea	ecx,[1+ecx]
+	cmp	ecx,ebx
+	jl	NEAR L$0021st
+	pmuludq	mm0,mm4
+	pmuludq	mm1,mm5
+	paddq	mm2,mm0
+	paddq	mm3,mm1
+	movq	mm0,mm2
+	pand	mm0,mm7
+	paddq	mm3,mm0
+	movd	DWORD [28+ecx*4+esp],mm3
+	psrlq	mm2,32
+	psrlq	mm3,32
+	paddq	mm3,mm2
+	movq	[32+ebx*4+esp],mm3
+	inc	edx
+L$003outer:
+	xor	ecx,ecx
+	movd	mm4,DWORD [edx*4+edi]
+	movd	mm5,DWORD [esi]
+	movd	mm6,DWORD [32+esp]
+	movd	mm3,DWORD [ebp]
+	pmuludq	mm5,mm4
+	paddq	mm5,mm6
+	movq	mm0,mm5
+	movq	mm2,mm5
+	pand	mm0,mm7
+	pmuludq	mm5,[20+esp]
+	pmuludq	mm3,mm5
+	paddq	mm3,mm0
+	movd	mm6,DWORD [36+esp]
+	movd	mm1,DWORD [4+ebp]
+	movd	mm0,DWORD [4+esi]
+	psrlq	mm2,32
+	psrlq	mm3,32
+	paddq	mm2,mm6
+	inc	ecx
+	dec	ebx
+L$004inner:
+	pmuludq	mm0,mm4
+	pmuludq	mm1,mm5
+	paddq	mm2,mm0
+	paddq	mm3,mm1
+	movq	mm0,mm2
+	movd	mm6,DWORD [36+ecx*4+esp]
+	pand	mm0,mm7
+	movd	mm1,DWORD [4+ecx*4+ebp]
+	paddq	mm3,mm0
+	movd	mm0,DWORD [4+ecx*4+esi]
+	psrlq	mm2,32
+	movd	DWORD [28+ecx*4+esp],mm3
+	psrlq	mm3,32
+	paddq	mm2,mm6
+	dec	ebx
+	lea	ecx,[1+ecx]
+	jnz	NEAR L$004inner
+	mov	ebx,ecx
+	pmuludq	mm0,mm4
+	pmuludq	mm1,mm5
+	paddq	mm2,mm0
+	paddq	mm3,mm1
+	movq	mm0,mm2
+	pand	mm0,mm7
+	paddq	mm3,mm0
+	movd	DWORD [28+ecx*4+esp],mm3
+	psrlq	mm2,32
+	psrlq	mm3,32
+	movd	mm6,DWORD [36+ebx*4+esp]
+	paddq	mm3,mm2
+	paddq	mm3,mm6
+	movq	[32+ebx*4+esp],mm3
+	lea	edx,[1+edx]
+	cmp	edx,ebx
+	jle	NEAR L$003outer
+	emms
+align	16
+L$005common_tail:
+	mov	ebp,DWORD [16+esp]
+	mov	edi,DWORD [4+esp]
+	lea	esi,[32+esp]
+	mov	eax,DWORD [esi]
+	mov	ecx,ebx
+	xor	edx,edx
+align	16
+L$006sub:
+	sbb	eax,DWORD [edx*4+ebp]
+	mov	DWORD [edx*4+edi],eax
+	dec	ecx
+	mov	eax,DWORD [4+edx*4+esi]
+	lea	edx,[1+edx]
+	jge	NEAR L$006sub
+	sbb	eax,0
+	mov	edx,-1
+	xor	edx,eax
+	jmp	NEAR L$007copy
+align	16
+L$007copy:
+	mov	esi,DWORD [32+ebx*4+esp]
+	mov	ebp,DWORD [ebx*4+edi]
+	mov	DWORD [32+ebx*4+esp],ecx
+	and	esi,eax
+	and	ebp,edx
+	or	ebp,esi
+	mov	DWORD [ebx*4+edi],ebp
+	dec	ebx
+	jge	NEAR L$007copy
+	mov	esp,DWORD [24+esp]
+	mov	eax,1
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+db	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+db	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+db	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+db	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+db	111,114,103,62,0
+segment	.bss
+common	_GFp_ia32cap_P 16

+ 1475 - 0
zeroidc/vendor/ring/pregenerated/tmp/x86_64-mont-nasm.asm

@@ -0,0 +1,1475 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+
+EXTERN	GFp_ia32cap_P
+
+global	GFp_bn_mul_mont
+
+ALIGN	16
+GFp_bn_mul_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_bn_mul_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	r9d,r9d
+	mov	rax,rsp
+
+	test	r9d,3
+	jnz	NEAR $L$mul_enter
+	cmp	r9d,8
+	jb	NEAR $L$mul_enter
+	mov	r11d,DWORD[((GFp_ia32cap_P+8))]
+	cmp	rdx,rsi
+	jne	NEAR $L$mul4x_enter
+	test	r9d,7
+	jz	NEAR $L$sqr8x_enter
+	jmp	NEAR $L$mul4x_enter
+
+ALIGN	16
+$L$mul_enter:
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-16))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+
+
+
+
+
+
+
+
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+	jmp	NEAR $L$mul_page_walk_done
+
+ALIGN	16
+$L$mul_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+	mov	QWORD[8+r9*8+rsp],rax
+
+$L$mul_body:
+	mov	r12,rdx
+	mov	r8,QWORD[r8]
+	mov	rbx,QWORD[r12]
+	mov	rax,QWORD[rsi]
+
+	xor	r14,r14
+	xor	r15,r15
+
+	mov	rbp,r8
+	mul	rbx
+	mov	r10,rax
+	mov	rax,QWORD[rcx]
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	r13,rdx
+
+	lea	r15,[1+r15]
+	jmp	NEAR $L$1st_enter
+
+ALIGN	16
+$L$1st:
+	add	r13,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	r13,r11
+	mov	r11,r10
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+$L$1st_enter:
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	lea	r15,[1+r15]
+	mov	r10,rdx
+
+	mul	rbp
+	cmp	r15,r9
+	jne	NEAR $L$1st
+
+	add	r13,rax
+	mov	rax,QWORD[rsi]
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+	mov	r11,r10
+
+	xor	rdx,rdx
+	add	r13,r11
+	adc	rdx,0
+	mov	QWORD[((-8))+r9*8+rsp],r13
+	mov	QWORD[r9*8+rsp],rdx
+
+	lea	r14,[1+r14]
+	jmp	NEAR $L$outer
+ALIGN	16
+$L$outer:
+	mov	rbx,QWORD[r14*8+r12]
+	xor	r15,r15
+	mov	rbp,r8
+	mov	r10,QWORD[rsp]
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	r10,QWORD[8+rsp]
+	mov	r13,rdx
+
+	lea	r15,[1+r15]
+	jmp	NEAR $L$inner_enter
+
+ALIGN	16
+$L$inner:
+	add	r13,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	mov	r10,QWORD[r15*8+rsp]
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+$L$inner_enter:
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+	lea	r15,[1+r15]
+
+	mul	rbp
+	cmp	r15,r9
+	jne	NEAR $L$inner
+
+	add	r13,rax
+	mov	rax,QWORD[rsi]
+	adc	rdx,0
+	add	r13,r10
+	mov	r10,QWORD[r15*8+rsp]
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+	xor	rdx,rdx
+	add	r13,r11
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r9*8+rsp],r13
+	mov	QWORD[r9*8+rsp],rdx
+
+	lea	r14,[1+r14]
+	cmp	r14,r9
+	jb	NEAR $L$outer
+
+	xor	r14,r14
+	mov	rax,QWORD[rsp]
+	mov	r15,r9
+
+ALIGN	16
+$L$sub:	sbb	rax,QWORD[r14*8+rcx]
+	mov	QWORD[r14*8+rdi],rax
+	mov	rax,QWORD[8+r14*8+rsp]
+	lea	r14,[1+r14]
+	dec	r15
+	jnz	NEAR $L$sub
+
+	sbb	rax,0
+	mov	rbx,-1
+	xor	rbx,rax
+	xor	r14,r14
+	mov	r15,r9
+
+$L$copy:
+	mov	rcx,QWORD[r14*8+rdi]
+	mov	rdx,QWORD[r14*8+rsp]
+	and	rcx,rbx
+	and	rdx,rax
+	mov	QWORD[r14*8+rsp],r9
+	or	rdx,rcx
+	mov	QWORD[r14*8+rdi],rdx
+	lea	r14,[1+r14]
+	sub	r15,1
+	jnz	NEAR $L$copy
+
+	mov	rsi,QWORD[8+r9*8+rsp]
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mul_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_bn_mul_mont:
+
+ALIGN	16
+bn_mul4x_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mul4x_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	r9d,r9d
+	mov	rax,rsp
+
+$L$mul4x_enter:
+	and	r11d,0x80100
+	cmp	r11d,0x80100
+	je	NEAR $L$mulx4x_enter
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-32))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul4x_page_walk
+	jmp	NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
+	mov	QWORD[8+r9*8+rsp],rax
+
+$L$mul4x_body:
+	mov	QWORD[16+r9*8+rsp],rdi
+	mov	r12,rdx
+	mov	r8,QWORD[r8]
+	mov	rbx,QWORD[r12]
+	mov	rax,QWORD[rsi]
+
+	xor	r14,r14
+	xor	r15,r15
+
+	mov	rbp,r8
+	mul	rbx
+	mov	r10,rax
+	mov	rax,QWORD[rcx]
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	r15,[4+r15]
+	adc	rdx,0
+	mov	QWORD[rsp],rdi
+	mov	r13,rdx
+	jmp	NEAR $L$1st4x
+ALIGN	16
+$L$1st4x:
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+r15*8+rcx]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+r15*8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],rdi
+	mov	r13,rdx
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[8+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+r15*8+rcx]
+	adc	rdx,0
+	lea	r15,[4+r15]
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[((-16))+r15*8+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-32))+r15*8+rsp],rdi
+	mov	r13,rdx
+	cmp	r15,r9
+	jb	NEAR $L$1st4x
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+r15*8+rcx]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+r15*8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],rdi
+	mov	r13,rdx
+
+	xor	rdi,rdi
+	add	r13,r10
+	adc	rdi,0
+	mov	QWORD[((-8))+r15*8+rsp],r13
+	mov	QWORD[r15*8+rsp],rdi
+
+	lea	r14,[1+r14]
+ALIGN	4
+$L$outer4x:
+	mov	rbx,QWORD[r14*8+r12]
+	xor	r15,r15
+	mov	r10,QWORD[rsp]
+	mov	rbp,r8
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[8+rsp]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	r15,[4+r15]
+	adc	rdx,0
+	mov	QWORD[rsp],rdi
+	mov	r13,rdx
+	jmp	NEAR $L$inner4x
+ALIGN	16
+$L$inner4x:
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+r15*8+rcx]
+	adc	rdx,0
+	add	r10,QWORD[((-16))+r15*8+rsp]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+r15*8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[((-8))+r15*8+rsp]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],rdi
+	mov	r13,rdx
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	add	r10,QWORD[r15*8+rsp]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[8+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+r15*8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[8+r15*8+rsp]
+	adc	rdx,0
+	lea	r15,[4+r15]
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[((-16))+r15*8+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-32))+r15*8+rsp],rdi
+	mov	r13,rdx
+	cmp	r15,r9
+	jb	NEAR $L$inner4x
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+r15*8+rcx]
+	adc	rdx,0
+	add	r10,QWORD[((-16))+r15*8+rsp]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r15*8+rsp],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+r15*8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[((-8))+r15*8+rsp]
+	adc	rdx,0
+	lea	r14,[1+r14]
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],rdi
+	mov	r13,rdx
+
+	xor	rdi,rdi
+	add	r13,r10
+	adc	rdi,0
+	add	r13,QWORD[r9*8+rsp]
+	adc	rdi,0
+	mov	QWORD[((-8))+r15*8+rsp],r13
+	mov	QWORD[r15*8+rsp],rdi
+
+	cmp	r14,r9
+	jb	NEAR $L$outer4x
+	mov	rdi,QWORD[16+r9*8+rsp]
+	lea	r15,[((-4))+r9]
+	mov	rax,QWORD[rsp]
+	mov	rdx,QWORD[8+rsp]
+	shr	r15,2
+	lea	rsi,[rsp]
+	xor	r14,r14
+
+	sub	rax,QWORD[rcx]
+	mov	rbx,QWORD[16+rsi]
+	mov	rbp,QWORD[24+rsi]
+	sbb	rdx,QWORD[8+rcx]
+
+$L$sub4x:
+	mov	QWORD[r14*8+rdi],rax
+	mov	QWORD[8+r14*8+rdi],rdx
+	sbb	rbx,QWORD[16+r14*8+rcx]
+	mov	rax,QWORD[32+r14*8+rsi]
+	mov	rdx,QWORD[40+r14*8+rsi]
+	sbb	rbp,QWORD[24+r14*8+rcx]
+	mov	QWORD[16+r14*8+rdi],rbx
+	mov	QWORD[24+r14*8+rdi],rbp
+	sbb	rax,QWORD[32+r14*8+rcx]
+	mov	rbx,QWORD[48+r14*8+rsi]
+	mov	rbp,QWORD[56+r14*8+rsi]
+	sbb	rdx,QWORD[40+r14*8+rcx]
+	lea	r14,[4+r14]
+	dec	r15
+	jnz	NEAR $L$sub4x
+
+	mov	QWORD[r14*8+rdi],rax
+	mov	rax,QWORD[32+r14*8+rsi]
+	sbb	rbx,QWORD[16+r14*8+rcx]
+	mov	QWORD[8+r14*8+rdi],rdx
+	sbb	rbp,QWORD[24+r14*8+rcx]
+	mov	QWORD[16+r14*8+rdi],rbx
+
+	sbb	rax,0
+	mov	QWORD[24+r14*8+rdi],rbp
+	pxor	xmm0,xmm0
+DB	102,72,15,110,224
+	pcmpeqd	xmm5,xmm5
+	pshufd	xmm4,xmm4,0
+	mov	r15,r9
+	pxor	xmm5,xmm4
+	shr	r15,2
+	xor	eax,eax
+
+	jmp	NEAR $L$copy4x
+ALIGN	16
+$L$copy4x:
+	movdqa	xmm1,XMMWORD[rax*1+rsp]
+	movdqu	xmm2,XMMWORD[rax*1+rdi]
+	pand	xmm1,xmm4
+	pand	xmm2,xmm5
+	movdqa	xmm3,XMMWORD[16+rax*1+rsp]
+	movdqa	XMMWORD[rax*1+rsp],xmm0
+	por	xmm1,xmm2
+	movdqu	xmm2,XMMWORD[16+rax*1+rdi]
+	movdqu	XMMWORD[rax*1+rdi],xmm1
+	pand	xmm3,xmm4
+	pand	xmm2,xmm5
+	movdqa	XMMWORD[16+rax*1+rsp],xmm0
+	por	xmm3,xmm2
+	movdqu	XMMWORD[16+rax*1+rdi],xmm3
+	lea	rax,[32+rax]
+	dec	r15
+	jnz	NEAR $L$copy4x
+	mov	rsi,QWORD[8+r9*8+rsp]
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mul4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_bn_mul4x_mont:
+EXTERN	GFp_bn_sqrx8x_internal
+EXTERN	GFp_bn_sqr8x_internal
+
+
+ALIGN	32
+bn_sqr8x_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_sqr8x_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	rax,rsp
+
+$L$sqr8x_enter:
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$sqr8x_prologue:
+
+	mov	r10d,r9d
+	shl	r9d,3
+	shl	r10,3+2
+	neg	r9
+
+
+
+
+
+
+	lea	r11,[((-64))+r9*2+rsp]
+	mov	rbp,rsp
+	mov	r8,QWORD[r8]
+	sub	r11,rsi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$sqr8x_sp_alt
+	sub	rbp,r11
+	lea	rbp,[((-64))+r9*2+rbp]
+	jmp	NEAR $L$sqr8x_sp_done
+
+ALIGN	32
+$L$sqr8x_sp_alt:
+	lea	r10,[((4096-64))+r9*2]
+	lea	rbp,[((-64))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$sqr8x_sp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$sqr8x_page_walk
+	jmp	NEAR $L$sqr8x_page_walk_done
+
+ALIGN	16
+$L$sqr8x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$sqr8x_page_walk
+$L$sqr8x_page_walk_done:
+
+	mov	r10,r9
+	neg	r9
+
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$sqr8x_body:
+
+DB	102,72,15,110,209
+	pxor	xmm0,xmm0
+DB	102,72,15,110,207
+DB	102,73,15,110,218
+	mov	eax,DWORD[((GFp_ia32cap_P+8))]
+	and	eax,0x80100
+	cmp	eax,0x80100
+	jne	NEAR $L$sqr8x_nox
+
+	call	GFp_bn_sqrx8x_internal
+
+
+
+
+	lea	rbx,[rcx*1+r8]
+	mov	r9,rcx
+	mov	rdx,rcx
+DB	102,72,15,126,207
+	sar	rcx,3+2
+	jmp	NEAR $L$sqr8x_sub
+
+ALIGN	32
+$L$sqr8x_nox:
+	call	GFp_bn_sqr8x_internal
+
+
+
+
+	lea	rbx,[r9*1+rdi]
+	mov	rcx,r9
+	mov	rdx,r9
+DB	102,72,15,126,207
+	sar	rcx,3+2
+	jmp	NEAR $L$sqr8x_sub
+
+ALIGN	32
+$L$sqr8x_sub:
+	mov	r12,QWORD[rbx]
+	mov	r13,QWORD[8+rbx]
+	mov	r14,QWORD[16+rbx]
+	mov	r15,QWORD[24+rbx]
+	lea	rbx,[32+rbx]
+	sbb	r12,QWORD[rbp]
+	sbb	r13,QWORD[8+rbp]
+	sbb	r14,QWORD[16+rbp]
+	sbb	r15,QWORD[24+rbp]
+	lea	rbp,[32+rbp]
+	mov	QWORD[rdi],r12
+	mov	QWORD[8+rdi],r13
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+	lea	rdi,[32+rdi]
+	inc	rcx
+	jnz	NEAR $L$sqr8x_sub
+
+	sbb	rax,0
+	lea	rbx,[r9*1+rbx]
+	lea	rdi,[r9*1+rdi]
+
+DB	102,72,15,110,200
+	pxor	xmm0,xmm0
+	pshufd	xmm1,xmm1,0
+	mov	rsi,QWORD[40+rsp]
+
+	jmp	NEAR $L$sqr8x_cond_copy
+
+ALIGN	32
+$L$sqr8x_cond_copy:
+	movdqa	xmm2,XMMWORD[rbx]
+	movdqa	xmm3,XMMWORD[16+rbx]
+	lea	rbx,[32+rbx]
+	movdqu	xmm4,XMMWORD[rdi]
+	movdqu	xmm5,XMMWORD[16+rdi]
+	lea	rdi,[32+rdi]
+	movdqa	XMMWORD[(-32)+rbx],xmm0
+	movdqa	XMMWORD[(-16)+rbx],xmm0
+	movdqa	XMMWORD[(-32)+rdx*1+rbx],xmm0
+	movdqa	XMMWORD[(-16)+rdx*1+rbx],xmm0
+	pcmpeqd	xmm0,xmm1
+	pand	xmm2,xmm1
+	pand	xmm3,xmm1
+	pand	xmm4,xmm0
+	pand	xmm5,xmm0
+	pxor	xmm0,xmm0
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqu	XMMWORD[(-32)+rdi],xmm4
+	movdqu	XMMWORD[(-16)+rdi],xmm5
+	add	r9,32
+	jnz	NEAR $L$sqr8x_cond_copy
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$sqr8x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_bn_sqr8x_mont:
+
+ALIGN	32
+bn_mulx4x_mont:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mulx4x_mont:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	rax,rsp
+
+$L$mulx4x_enter:
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$mulx4x_prologue:
+
+	shl	r9d,3
+	xor	r10,r10
+	sub	r10,r9
+	mov	r8,QWORD[r8]
+	lea	rbp,[((-72))+r10*1+rsp]
+	and	rbp,-128
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mulx4x_page_walk
+	jmp	NEAR $L$mulx4x_page_walk_done
+
+ALIGN	16
+$L$mulx4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mulx4x_page_walk
+$L$mulx4x_page_walk_done:
+
+	lea	r10,[r9*1+rdx]
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	QWORD[rsp],r9
+	shr	r9,5
+	mov	QWORD[16+rsp],r10
+	sub	r9,1
+	mov	QWORD[24+rsp],r8
+	mov	QWORD[32+rsp],rdi
+	mov	QWORD[40+rsp],rax
+
+	mov	QWORD[48+rsp],r9
+	jmp	NEAR $L$mulx4x_body
+
+ALIGN	32
+$L$mulx4x_body:
+	lea	rdi,[8+rdx]
+	mov	rdx,QWORD[rdx]
+	lea	rbx,[((64+32))+rsp]
+	mov	r9,rdx
+
+	mulx	rax,r8,QWORD[rsi]
+	mulx	r14,r11,QWORD[8+rsi]
+	add	r11,rax
+	mov	QWORD[8+rsp],rdi
+	mulx	r13,r12,QWORD[16+rsi]
+	adc	r12,r14
+	adc	r13,0
+
+	mov	rdi,r8
+	imul	r8,QWORD[24+rsp]
+	xor	rbp,rbp
+
+	mulx	r14,rax,QWORD[24+rsi]
+	mov	rdx,r8
+	lea	rsi,[32+rsi]
+	adcx	r13,rax
+	adcx	r14,rbp
+
+	mulx	r10,rax,QWORD[rcx]
+	adcx	rdi,rax
+	adox	r10,r11
+	mulx	r11,rax,QWORD[8+rcx]
+	adcx	r10,rax
+	adox	r11,r12
+DB	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+	mov	rdi,QWORD[48+rsp]
+	mov	QWORD[((-32))+rbx],r10
+	adcx	r11,rax
+	adox	r12,r13
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r11
+	adcx	r12,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r12
+
+	jmp	NEAR $L$mulx4x_1st
+
+ALIGN	32
+$L$mulx4x_1st:
+	adcx	r15,rbp
+	mulx	rax,r10,QWORD[rsi]
+	adcx	r10,r14
+	mulx	r14,r11,QWORD[8+rsi]
+	adcx	r11,rax
+	mulx	rax,r12,QWORD[16+rsi]
+	adcx	r12,r14
+	mulx	r14,r13,QWORD[24+rsi]
+DB	0x67,0x67
+	mov	rdx,r8
+	adcx	r13,rax
+	adcx	r14,rbp
+	lea	rsi,[32+rsi]
+	lea	rbx,[32+rbx]
+
+	adox	r10,r15
+	mulx	r15,rax,QWORD[rcx]
+	adcx	r10,rax
+	adox	r11,r15
+	mulx	r15,rax,QWORD[8+rcx]
+	adcx	r11,rax
+	adox	r12,r15
+	mulx	r15,rax,QWORD[16+rcx]
+	mov	QWORD[((-40))+rbx],r10
+	adcx	r12,rax
+	mov	QWORD[((-32))+rbx],r11
+	adox	r13,r15
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r12
+	adcx	r13,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r13
+
+	dec	rdi
+	jnz	NEAR $L$mulx4x_1st
+
+	mov	rax,QWORD[rsp]
+	mov	rdi,QWORD[8+rsp]
+	adc	r15,rbp
+	add	r14,r15
+	sbb	r15,r15
+	mov	QWORD[((-8))+rbx],r14
+	jmp	NEAR $L$mulx4x_outer
+
+ALIGN	32
+$L$mulx4x_outer:
+	mov	rdx,QWORD[rdi]
+	lea	rdi,[8+rdi]
+	sub	rsi,rax
+	mov	QWORD[rbx],r15
+	lea	rbx,[((64+32))+rsp]
+	sub	rcx,rax
+
+	mulx	r11,r8,QWORD[rsi]
+	xor	ebp,ebp
+	mov	r9,rdx
+	mulx	r12,r14,QWORD[8+rsi]
+	adox	r8,QWORD[((-32))+rbx]
+	adcx	r11,r14
+	mulx	r13,r15,QWORD[16+rsi]
+	adox	r11,QWORD[((-24))+rbx]
+	adcx	r12,r15
+	adox	r12,QWORD[((-16))+rbx]
+	adcx	r13,rbp
+	adox	r13,rbp
+
+	mov	QWORD[8+rsp],rdi
+	mov	r15,r8
+	imul	r8,QWORD[24+rsp]
+	xor	ebp,ebp
+
+	mulx	r14,rax,QWORD[24+rsi]
+	mov	rdx,r8
+	adcx	r13,rax
+	adox	r13,QWORD[((-8))+rbx]
+	adcx	r14,rbp
+	lea	rsi,[32+rsi]
+	adox	r14,rbp
+
+	mulx	r10,rax,QWORD[rcx]
+	adcx	r15,rax
+	adox	r10,r11
+	mulx	r11,rax,QWORD[8+rcx]
+	adcx	r10,rax
+	adox	r11,r12
+	mulx	r12,rax,QWORD[16+rcx]
+	mov	QWORD[((-32))+rbx],r10
+	adcx	r11,rax
+	adox	r12,r13
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r11
+	lea	rcx,[32+rcx]
+	adcx	r12,rax
+	adox	r15,rbp
+	mov	rdi,QWORD[48+rsp]
+	mov	QWORD[((-16))+rbx],r12
+
+	jmp	NEAR $L$mulx4x_inner
+
+ALIGN	32
+$L$mulx4x_inner:
+	mulx	rax,r10,QWORD[rsi]
+	adcx	r15,rbp
+	adox	r10,r14
+	mulx	r14,r11,QWORD[8+rsi]
+	adcx	r10,QWORD[rbx]
+	adox	r11,rax
+	mulx	rax,r12,QWORD[16+rsi]
+	adcx	r11,QWORD[8+rbx]
+	adox	r12,r14
+	mulx	r14,r13,QWORD[24+rsi]
+	mov	rdx,r8
+	adcx	r12,QWORD[16+rbx]
+	adox	r13,rax
+	adcx	r13,QWORD[24+rbx]
+	adox	r14,rbp
+	lea	rsi,[32+rsi]
+	lea	rbx,[32+rbx]
+	adcx	r14,rbp
+
+	adox	r10,r15
+	mulx	r15,rax,QWORD[rcx]
+	adcx	r10,rax
+	adox	r11,r15
+	mulx	r15,rax,QWORD[8+rcx]
+	adcx	r11,rax
+	adox	r12,r15
+	mulx	r15,rax,QWORD[16+rcx]
+	mov	QWORD[((-40))+rbx],r10
+	adcx	r12,rax
+	adox	r13,r15
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-32))+rbx],r11
+	mov	QWORD[((-24))+rbx],r12
+	adcx	r13,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r13
+
+	dec	rdi
+	jnz	NEAR $L$mulx4x_inner
+
+	mov	rax,QWORD[rsp]
+	mov	rdi,QWORD[8+rsp]
+	adc	r15,rbp
+	sub	rbp,QWORD[rbx]
+	adc	r14,r15
+	sbb	r15,r15
+	mov	QWORD[((-8))+rbx],r14
+
+	cmp	rdi,QWORD[16+rsp]
+	jne	NEAR $L$mulx4x_outer
+
+	lea	rbx,[64+rsp]
+	sub	rcx,rax
+	neg	r15
+	mov	rdx,rax
+	shr	rax,3+2
+	mov	rdi,QWORD[32+rsp]
+	jmp	NEAR $L$mulx4x_sub
+
+ALIGN	32
+$L$mulx4x_sub:
+	mov	r11,QWORD[rbx]
+	mov	r12,QWORD[8+rbx]
+	mov	r13,QWORD[16+rbx]
+	mov	r14,QWORD[24+rbx]
+	lea	rbx,[32+rbx]
+	sbb	r11,QWORD[rcx]
+	sbb	r12,QWORD[8+rcx]
+	sbb	r13,QWORD[16+rcx]
+	sbb	r14,QWORD[24+rcx]
+	lea	rcx,[32+rcx]
+	mov	QWORD[rdi],r11
+	mov	QWORD[8+rdi],r12
+	mov	QWORD[16+rdi],r13
+	mov	QWORD[24+rdi],r14
+	lea	rdi,[32+rdi]
+	dec	rax
+	jnz	NEAR $L$mulx4x_sub
+
+	sbb	r15,0
+	lea	rbx,[64+rsp]
+	sub	rdi,rdx
+
+DB	102,73,15,110,207
+	pxor	xmm0,xmm0
+	pshufd	xmm1,xmm1,0
+	mov	rsi,QWORD[40+rsp]
+
+	jmp	NEAR $L$mulx4x_cond_copy
+
+ALIGN	32
+$L$mulx4x_cond_copy:
+	movdqa	xmm2,XMMWORD[rbx]
+	movdqa	xmm3,XMMWORD[16+rbx]
+	lea	rbx,[32+rbx]
+	movdqu	xmm4,XMMWORD[rdi]
+	movdqu	xmm5,XMMWORD[16+rdi]
+	lea	rdi,[32+rdi]
+	movdqa	XMMWORD[(-32)+rbx],xmm0
+	movdqa	XMMWORD[(-16)+rbx],xmm0
+	pcmpeqd	xmm0,xmm1
+	pand	xmm2,xmm1
+	pand	xmm3,xmm1
+	pand	xmm4,xmm0
+	pand	xmm5,xmm0
+	pxor	xmm0,xmm0
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqu	XMMWORD[(-32)+rdi],xmm4
+	movdqu	XMMWORD[(-16)+rdi],xmm5
+	sub	rdx,32
+	jnz	NEAR $L$mulx4x_cond_copy
+
+	mov	QWORD[rbx],rdx
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mulx4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_bn_mulx4x_mont:
+DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+DB	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+DB	54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83
+DB	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+DB	115,108,46,111,114,103,62,0
+ALIGN	16
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+mul_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	r10,QWORD[192+r8]
+	mov	rax,QWORD[8+r10*8+rax]
+
+	jmp	NEAR $L$common_pop_regs
+
+
+
+ALIGN	16
+sqr_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_pop_regs
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[8+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[40+rax]
+
+$L$common_pop_regs:
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_GFp_bn_mul_mont wrt ..imagebase
+	DD	$L$SEH_end_GFp_bn_mul_mont wrt ..imagebase
+	DD	$L$SEH_info_GFp_bn_mul_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_mul4x_mont wrt ..imagebase
+	DD	$L$SEH_end_bn_mul4x_mont wrt ..imagebase
+	DD	$L$SEH_info_bn_mul4x_mont wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_sqr8x_mont wrt ..imagebase
+	DD	$L$SEH_end_bn_sqr8x_mont wrt ..imagebase
+	DD	$L$SEH_info_bn_sqr8x_mont wrt ..imagebase
+	DD	$L$SEH_begin_bn_mulx4x_mont wrt ..imagebase
+	DD	$L$SEH_end_bn_mulx4x_mont wrt ..imagebase
+	DD	$L$SEH_info_bn_mulx4x_mont wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_GFp_bn_mul_mont:
+DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+$L$SEH_info_bn_mul4x_mont:
+DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
+$L$SEH_info_bn_sqr8x_mont:
+DB	9,0,0,0
+	DD	sqr_handler wrt ..imagebase
+	DD	$L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_mulx4x_mont:
+DB	9,0,0,0
+	DD	sqr_handler wrt ..imagebase
+	DD	$L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
+ALIGN	8

+ 4031 - 0
zeroidc/vendor/ring/pregenerated/tmp/x86_64-mont5-nasm.asm

@@ -0,0 +1,4031 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+default	rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section	.text code align=64
+
+
+EXTERN	GFp_ia32cap_P
+
+global	GFp_bn_mul_mont_gather5
+
+ALIGN	64
+GFp_bn_mul_mont_gather5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_bn_mul_mont_gather5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	r9d,r9d
+	mov	rax,rsp
+
+	test	r9d,7
+	jnz	NEAR $L$mul_enter
+	lea	r11,[GFp_ia32cap_P]
+	mov	r11d,DWORD[8+r11]
+	jmp	NEAR $L$mul4x_enter
+
+ALIGN	16
+$L$mul_enter:
+	movd	xmm5,DWORD[56+rsp]
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-280))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+
+
+
+
+
+
+
+
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+	jmp	NEAR $L$mul_page_walk_done
+
+$L$mul_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+	lea	r10,[$L$inc]
+	mov	QWORD[8+r9*8+rsp],rax
+
+$L$mul_body:
+
+	lea	r12,[128+rdx]
+	movdqa	xmm0,XMMWORD[r10]
+	movdqa	xmm1,XMMWORD[16+r10]
+	lea	r10,[((24-112))+r9*8+rsp]
+	and	r10,-16
+
+	pshufd	xmm5,xmm5,0
+	movdqa	xmm4,xmm1
+	movdqa	xmm2,xmm1
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+DB	0x67
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[112+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[128+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[144+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[160+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[176+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[192+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[208+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[224+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[240+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[256+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[272+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[288+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[304+r10],xmm0
+
+	paddd	xmm3,xmm2
+DB	0x67
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[320+r10],xmm1
+
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[336+r10],xmm2
+	pand	xmm0,XMMWORD[64+r12]
+
+	pand	xmm1,XMMWORD[80+r12]
+	pand	xmm2,XMMWORD[96+r12]
+	movdqa	XMMWORD[352+r10],xmm3
+	pand	xmm3,XMMWORD[112+r12]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-128))+r12]
+	movdqa	xmm5,XMMWORD[((-112))+r12]
+	movdqa	xmm2,XMMWORD[((-96))+r12]
+	pand	xmm4,XMMWORD[112+r10]
+	movdqa	xmm3,XMMWORD[((-80))+r12]
+	pand	xmm5,XMMWORD[128+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[144+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[160+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-64))+r12]
+	movdqa	xmm5,XMMWORD[((-48))+r12]
+	movdqa	xmm2,XMMWORD[((-32))+r12]
+	pand	xmm4,XMMWORD[176+r10]
+	movdqa	xmm3,XMMWORD[((-16))+r12]
+	pand	xmm5,XMMWORD[192+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[208+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[224+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[r12]
+	movdqa	xmm5,XMMWORD[16+r12]
+	movdqa	xmm2,XMMWORD[32+r12]
+	pand	xmm4,XMMWORD[240+r10]
+	movdqa	xmm3,XMMWORD[48+r12]
+	pand	xmm5,XMMWORD[256+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[272+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[288+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	por	xmm0,xmm1
+	pshufd	xmm1,xmm0,0x4e
+	por	xmm0,xmm1
+	lea	r12,[256+r12]
+DB	102,72,15,126,195
+
+	mov	r8,QWORD[r8]
+	mov	rax,QWORD[rsi]
+
+	xor	r14,r14
+	xor	r15,r15
+
+	mov	rbp,r8
+	mul	rbx
+	mov	r10,rax
+	mov	rax,QWORD[rcx]
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	r13,rdx
+
+	lea	r15,[1+r15]
+	jmp	NEAR $L$1st_enter
+
+ALIGN	16
+$L$1st:
+	add	r13,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	r13,r11
+	mov	r11,r10
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+$L$1st_enter:
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	lea	r15,[1+r15]
+	mov	r10,rdx
+
+	mul	rbp
+	cmp	r15,r9
+	jne	NEAR $L$1st
+
+
+	add	r13,rax
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r9*8+rsp],r13
+	mov	r13,rdx
+	mov	r11,r10
+
+	xor	rdx,rdx
+	add	r13,r11
+	adc	rdx,0
+	mov	QWORD[((-8))+r9*8+rsp],r13
+	mov	QWORD[r9*8+rsp],rdx
+
+	lea	r14,[1+r14]
+	jmp	NEAR $L$outer
+ALIGN	16
+$L$outer:
+	lea	rdx,[((24+128))+r9*8+rsp]
+	and	rdx,-16
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movdqa	xmm0,XMMWORD[((-128))+r12]
+	movdqa	xmm1,XMMWORD[((-112))+r12]
+	movdqa	xmm2,XMMWORD[((-96))+r12]
+	movdqa	xmm3,XMMWORD[((-80))+r12]
+	pand	xmm0,XMMWORD[((-128))+rdx]
+	pand	xmm1,XMMWORD[((-112))+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-96))+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-80))+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[((-64))+r12]
+	movdqa	xmm1,XMMWORD[((-48))+r12]
+	movdqa	xmm2,XMMWORD[((-32))+r12]
+	movdqa	xmm3,XMMWORD[((-16))+r12]
+	pand	xmm0,XMMWORD[((-64))+rdx]
+	pand	xmm1,XMMWORD[((-48))+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-32))+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-16))+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[r12]
+	movdqa	xmm1,XMMWORD[16+r12]
+	movdqa	xmm2,XMMWORD[32+r12]
+	movdqa	xmm3,XMMWORD[48+r12]
+	pand	xmm0,XMMWORD[rdx]
+	pand	xmm1,XMMWORD[16+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[32+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[48+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[64+r12]
+	movdqa	xmm1,XMMWORD[80+r12]
+	movdqa	xmm2,XMMWORD[96+r12]
+	movdqa	xmm3,XMMWORD[112+r12]
+	pand	xmm0,XMMWORD[64+rdx]
+	pand	xmm1,XMMWORD[80+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[96+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[112+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	por	xmm4,xmm5
+	pshufd	xmm0,xmm4,0x4e
+	por	xmm0,xmm4
+	lea	r12,[256+r12]
+
+	mov	rax,QWORD[rsi]
+DB	102,72,15,126,195
+
+	xor	r15,r15
+	mov	rbp,r8
+	mov	r10,QWORD[rsp]
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+
+	imul	rbp,r10
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+rsi]
+	adc	rdx,0
+	mov	r10,QWORD[8+rsp]
+	mov	r13,rdx
+
+	lea	r15,[1+r15]
+	jmp	NEAR $L$inner_enter
+
+ALIGN	16
+$L$inner:
+	add	r13,rax
+	mov	rax,QWORD[r15*8+rsi]
+	adc	rdx,0
+	add	r13,r10
+	mov	r10,QWORD[r15*8+rsp]
+	adc	rdx,0
+	mov	QWORD[((-16))+r15*8+rsp],r13
+	mov	r13,rdx
+
+$L$inner_enter:
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[r15*8+rcx]
+	adc	rdx,0
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+	lea	r15,[1+r15]
+
+	mul	rbp
+	cmp	r15,r9
+	jne	NEAR $L$inner
+
+	add	r13,rax
+	adc	rdx,0
+	add	r13,r10
+	mov	r10,QWORD[r9*8+rsp]
+	adc	rdx,0
+	mov	QWORD[((-16))+r9*8+rsp],r13
+	mov	r13,rdx
+
+	xor	rdx,rdx
+	add	r13,r11
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r9*8+rsp],r13
+	mov	QWORD[r9*8+rsp],rdx
+
+	lea	r14,[1+r14]
+	cmp	r14,r9
+	jb	NEAR $L$outer
+
+	xor	r14,r14
+	mov	rax,QWORD[rsp]
+	lea	rsi,[rsp]
+	mov	r15,r9
+	jmp	NEAR $L$sub
+ALIGN	16
+$L$sub:	sbb	rax,QWORD[r14*8+rcx]
+	mov	QWORD[r14*8+rdi],rax
+	mov	rax,QWORD[8+r14*8+rsi]
+	lea	r14,[1+r14]
+	dec	r15
+	jnz	NEAR $L$sub
+
+	sbb	rax,0
+	mov	rbx,-1
+	xor	rbx,rax
+	xor	r14,r14
+	mov	r15,r9
+
+$L$copy:
+	mov	rcx,QWORD[r14*8+rdi]
+	mov	rdx,QWORD[r14*8+rsp]
+	and	rcx,rbx
+	and	rdx,rax
+	mov	QWORD[r14*8+rsp],r14
+	or	rdx,rcx
+	mov	QWORD[r14*8+rdi],rdx
+	lea	r14,[1+r14]
+	sub	r15,1
+	jnz	NEAR $L$copy
+
+	mov	rsi,QWORD[8+r9*8+rsp]
+
+	mov	rax,1
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mul_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_bn_mul_mont_gather5:
+
+ALIGN	32
+bn_mul4x_mont_gather5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mul4x_mont_gather5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+DB	0x67
+	mov	rax,rsp
+
+$L$mul4x_enter:
+	and	r11d,0x80108
+	cmp	r11d,0x80108
+	je	NEAR $L$mulx4x_enter
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$mul4x_prologue:
+
+DB	0x67
+	shl	r9d,3
+	lea	r10,[r9*2+r9]
+	neg	r9
+
+
+
+
+
+
+
+
+
+
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$mul4xsp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$mul4xsp_done
+
+ALIGN	32
+$L$mul4xsp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$mul4xsp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mul4x_page_walk
+	jmp	NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
+	neg	r9
+
+	mov	QWORD[40+rsp],rax
+
+$L$mul4x_body:
+
+	call	mul4x_internal
+
+	mov	rsi,QWORD[40+rsp]
+
+	mov	rax,1
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mul4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_bn_mul4x_mont_gather5:
+
+
+ALIGN	32
+mul4x_internal:
+
+	shl	r9,5
+	movd	xmm5,DWORD[56+rax]
+	lea	rax,[$L$inc]
+	lea	r13,[128+r9*1+rdx]
+	shr	r9,5
+	movdqa	xmm0,XMMWORD[rax]
+	movdqa	xmm1,XMMWORD[16+rax]
+	lea	r10,[((88-112))+r9*1+rsp]
+	lea	r12,[128+rdx]
+
+	pshufd	xmm5,xmm5,0
+	movdqa	xmm4,xmm1
+DB	0x67,0x67
+	movdqa	xmm2,xmm1
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+DB	0x67
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[112+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[128+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[144+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[160+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[176+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[192+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[208+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[224+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[240+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[256+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[272+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[288+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[304+r10],xmm0
+
+	paddd	xmm3,xmm2
+DB	0x67
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[320+r10],xmm1
+
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[336+r10],xmm2
+	pand	xmm0,XMMWORD[64+r12]
+
+	pand	xmm1,XMMWORD[80+r12]
+	pand	xmm2,XMMWORD[96+r12]
+	movdqa	XMMWORD[352+r10],xmm3
+	pand	xmm3,XMMWORD[112+r12]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-128))+r12]
+	movdqa	xmm5,XMMWORD[((-112))+r12]
+	movdqa	xmm2,XMMWORD[((-96))+r12]
+	pand	xmm4,XMMWORD[112+r10]
+	movdqa	xmm3,XMMWORD[((-80))+r12]
+	pand	xmm5,XMMWORD[128+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[144+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[160+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-64))+r12]
+	movdqa	xmm5,XMMWORD[((-48))+r12]
+	movdqa	xmm2,XMMWORD[((-32))+r12]
+	pand	xmm4,XMMWORD[176+r10]
+	movdqa	xmm3,XMMWORD[((-16))+r12]
+	pand	xmm5,XMMWORD[192+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[208+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[224+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[r12]
+	movdqa	xmm5,XMMWORD[16+r12]
+	movdqa	xmm2,XMMWORD[32+r12]
+	pand	xmm4,XMMWORD[240+r10]
+	movdqa	xmm3,XMMWORD[48+r12]
+	pand	xmm5,XMMWORD[256+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[272+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[288+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	por	xmm0,xmm1
+	pshufd	xmm1,xmm0,0x4e
+	por	xmm0,xmm1
+	lea	r12,[256+r12]
+DB	102,72,15,126,195
+
+	mov	QWORD[((16+8))+rsp],r13
+	mov	QWORD[((56+8))+rsp],rdi
+
+	mov	r8,QWORD[r8]
+	mov	rax,QWORD[rsi]
+	lea	rsi,[r9*1+rsi]
+	neg	r9
+
+	mov	rbp,r8
+	mul	rbx
+	mov	r10,rax
+	mov	rax,QWORD[rcx]
+
+	imul	rbp,r10
+	lea	r14,[((64+8))+rsp]
+	mov	r11,rdx
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+r9*1+rsi]
+	adc	rdx,0
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+r9*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	r15,[32+r9]
+	lea	rcx,[32+rcx]
+	adc	rdx,0
+	mov	QWORD[r14],rdi
+	mov	r13,rdx
+	jmp	NEAR $L$1st4x
+
+ALIGN	32
+$L$1st4x:
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+rcx]
+	lea	r14,[32+r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*1+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r14],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r15*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r14],rdi
+	mov	r13,rdx
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[8+r15*1+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-8))+r14],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+r15*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	rcx,[32+rcx]
+	adc	rdx,0
+	mov	QWORD[r14],rdi
+	mov	r13,rdx
+
+	add	r15,32
+	jnz	NEAR $L$1st4x
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+rcx]
+	lea	r14,[32+r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-24))+r14],r13
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+rcx]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r9*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-16))+r14],rdi
+	mov	r13,rdx
+
+	lea	rcx,[r9*1+rcx]
+
+	xor	rdi,rdi
+	add	r13,r10
+	adc	rdi,0
+	mov	QWORD[((-8))+r14],r13
+
+	jmp	NEAR $L$outer4x
+
+ALIGN	32
+$L$outer4x:
+	lea	rdx,[((16+128))+r14]
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movdqa	xmm0,XMMWORD[((-128))+r12]
+	movdqa	xmm1,XMMWORD[((-112))+r12]
+	movdqa	xmm2,XMMWORD[((-96))+r12]
+	movdqa	xmm3,XMMWORD[((-80))+r12]
+	pand	xmm0,XMMWORD[((-128))+rdx]
+	pand	xmm1,XMMWORD[((-112))+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-96))+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-80))+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[((-64))+r12]
+	movdqa	xmm1,XMMWORD[((-48))+r12]
+	movdqa	xmm2,XMMWORD[((-32))+r12]
+	movdqa	xmm3,XMMWORD[((-16))+r12]
+	pand	xmm0,XMMWORD[((-64))+rdx]
+	pand	xmm1,XMMWORD[((-48))+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-32))+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-16))+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[r12]
+	movdqa	xmm1,XMMWORD[16+r12]
+	movdqa	xmm2,XMMWORD[32+r12]
+	movdqa	xmm3,XMMWORD[48+r12]
+	pand	xmm0,XMMWORD[rdx]
+	pand	xmm1,XMMWORD[16+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[32+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[48+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[64+r12]
+	movdqa	xmm1,XMMWORD[80+r12]
+	movdqa	xmm2,XMMWORD[96+r12]
+	movdqa	xmm3,XMMWORD[112+r12]
+	pand	xmm0,XMMWORD[64+rdx]
+	pand	xmm1,XMMWORD[80+rdx]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[96+rdx]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[112+rdx]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	por	xmm4,xmm5
+	pshufd	xmm0,xmm4,0x4e
+	por	xmm0,xmm4
+	lea	r12,[256+r12]
+DB	102,72,15,126,195
+
+	mov	r10,QWORD[r9*1+r14]
+	mov	rbp,r8
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+
+	imul	rbp,r10
+	mov	r11,rdx
+	mov	QWORD[r14],rdi
+
+	lea	r14,[r9*1+r14]
+
+	mul	rbp
+	add	r10,rax
+	mov	rax,QWORD[8+r9*1+rsi]
+	adc	rdx,0
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[8+r14]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+r9*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	r15,[32+r9]
+	lea	rcx,[32+rcx]
+	adc	rdx,0
+	mov	r13,rdx
+	jmp	NEAR $L$inner4x
+
+ALIGN	32
+$L$inner4x:
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+rcx]
+	adc	rdx,0
+	add	r10,QWORD[16+r14]
+	lea	r14,[32+r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+r15*1+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-32))+r14],rdi
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[((-8))+rcx]
+	adc	rdx,0
+	add	r11,QWORD[((-8))+r14]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r15*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-24))+r14],r13
+	mov	r13,rdx
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[rcx]
+	adc	rdx,0
+	add	r10,QWORD[r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[8+r15*1+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-16))+r14],rdi
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[8+rcx]
+	adc	rdx,0
+	add	r11,QWORD[8+r14]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[16+r15*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	lea	rcx,[32+rcx]
+	adc	rdx,0
+	mov	QWORD[((-8))+r14],r13
+	mov	r13,rdx
+
+	add	r15,32
+	jnz	NEAR $L$inner4x
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[((-16))+rcx]
+	adc	rdx,0
+	add	r10,QWORD[16+r14]
+	lea	r14,[32+r14]
+	adc	rdx,0
+	mov	r11,rdx
+
+	mul	rbp
+	add	r13,rax
+	mov	rax,QWORD[((-8))+rsi]
+	adc	rdx,0
+	add	r13,r10
+	adc	rdx,0
+	mov	QWORD[((-32))+r14],rdi
+	mov	rdi,rdx
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,rbp
+	mov	rbp,QWORD[((-8))+rcx]
+	adc	rdx,0
+	add	r11,QWORD[((-8))+r14]
+	adc	rdx,0
+	mov	r10,rdx
+
+	mul	rbp
+	add	rdi,rax
+	mov	rax,QWORD[r9*1+rsi]
+	adc	rdx,0
+	add	rdi,r11
+	adc	rdx,0
+	mov	QWORD[((-24))+r14],r13
+	mov	r13,rdx
+
+	mov	QWORD[((-16))+r14],rdi
+	lea	rcx,[r9*1+rcx]
+
+	xor	rdi,rdi
+	add	r13,r10
+	adc	rdi,0
+	add	r13,QWORD[r14]
+	adc	rdi,0
+	mov	QWORD[((-8))+r14],r13
+
+	cmp	r12,QWORD[((16+8))+rsp]
+	jb	NEAR $L$outer4x
+	xor	rax,rax
+	sub	rbp,r13
+	adc	r15,r15
+	or	rdi,r15
+	sub	rax,rdi
+	lea	rbx,[r9*1+r14]
+	mov	r12,QWORD[rcx]
+	lea	rbp,[rcx]
+	mov	rcx,r9
+	sar	rcx,3+2
+	mov	rdi,QWORD[((56+8))+rsp]
+	dec	r12
+	xor	r10,r10
+	mov	r13,QWORD[8+rbp]
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+	jmp	NEAR $L$sqr4x_sub_entry
+
+
+global	GFp_bn_power5
+
+ALIGN	32
+GFp_bn_power5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_GFp_bn_power5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	rax,rsp
+
+	lea	r11,[GFp_ia32cap_P]
+	mov	r11d,DWORD[8+r11]
+	and	r11d,0x80108
+	cmp	r11d,0x80108
+	je	NEAR $L$powerx5_enter
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$power5_prologue:
+
+	shl	r9d,3
+	lea	r10d,[r9*2+r9]
+	neg	r9
+	mov	r8,QWORD[r8]
+
+
+
+
+
+
+
+
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$pwr_sp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$pwr_sp_done
+
+ALIGN	32
+$L$pwr_sp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$pwr_sp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwr_page_walk
+	jmp	NEAR $L$pwr_page_walk_done
+
+$L$pwr_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwr_page_walk
+$L$pwr_page_walk_done:
+
+	mov	r10,r9
+	neg	r9
+
+
+
+
+
+
+
+
+
+
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$power5_body:
+DB	102,72,15,110,207
+DB	102,72,15,110,209
+DB	102,73,15,110,218
+DB	102,72,15,110,226
+
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+
+DB	102,72,15,126,209
+DB	102,72,15,126,226
+	mov	rdi,rsi
+	mov	rax,QWORD[40+rsp]
+	lea	r8,[32+rsp]
+
+	call	mul4x_internal
+
+	mov	rsi,QWORD[40+rsp]
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$power5_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_GFp_bn_power5:
+
+global	GFp_bn_sqr8x_internal
+
+
+ALIGN	32
+GFp_bn_sqr8x_internal:
+__bn_sqr8x_internal:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	lea	rbp,[32+r10]
+	lea	rsi,[r9*1+rsi]
+
+	mov	rcx,r9
+
+
+	mov	r14,QWORD[((-32))+rbp*1+rsi]
+	lea	rdi,[((48+8))+r9*2+rsp]
+	mov	rax,QWORD[((-24))+rbp*1+rsi]
+	lea	rdi,[((-32))+rbp*1+rdi]
+	mov	rbx,QWORD[((-16))+rbp*1+rsi]
+	mov	r15,rax
+
+	mul	r14
+	mov	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	mov	QWORD[((-24))+rbp*1+rdi],r10
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	adc	rdx,0
+	mov	QWORD[((-16))+rbp*1+rdi],r11
+	mov	r10,rdx
+
+
+	mov	rbx,QWORD[((-8))+rbp*1+rsi]
+	mul	r15
+	mov	r12,rax
+	mov	rax,rbx
+	mov	r13,rdx
+
+	lea	rcx,[rbp]
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	adc	r11,0
+	add	r10,r12
+	adc	r11,0
+	mov	QWORD[((-8))+rcx*1+rdi],r10
+	jmp	NEAR $L$sqr4x_1st
+
+ALIGN	32
+$L$sqr4x_1st:
+	mov	rbx,QWORD[rcx*1+rsi]
+	mul	r15
+	add	r13,rax
+	mov	rax,rbx
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	mov	rbx,QWORD[8+rcx*1+rsi]
+	mov	r10,rdx
+	adc	r10,0
+	add	r11,r13
+	adc	r10,0
+
+
+	mul	r15
+	add	r12,rax
+	mov	rax,rbx
+	mov	QWORD[rcx*1+rdi],r11
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	rbx,QWORD[16+rcx*1+rsi]
+	mov	r11,rdx
+	adc	r11,0
+	add	r10,r12
+	adc	r11,0
+
+	mul	r15
+	add	r13,rax
+	mov	rax,rbx
+	mov	QWORD[8+rcx*1+rdi],r10
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	mov	rbx,QWORD[24+rcx*1+rsi]
+	mov	r10,rdx
+	adc	r10,0
+	add	r11,r13
+	adc	r10,0
+
+
+	mul	r15
+	add	r12,rax
+	mov	rax,rbx
+	mov	QWORD[16+rcx*1+rdi],r11
+	mov	r13,rdx
+	adc	r13,0
+	lea	rcx,[32+rcx]
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	adc	r11,0
+	add	r10,r12
+	adc	r11,0
+	mov	QWORD[((-8))+rcx*1+rdi],r10
+
+	cmp	rcx,0
+	jne	NEAR $L$sqr4x_1st
+
+	mul	r15
+	add	r13,rax
+	lea	rbp,[16+rbp]
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+
+	mov	QWORD[rdi],r13
+	mov	r12,rdx
+	mov	QWORD[8+rdi],rdx
+	jmp	NEAR $L$sqr4x_outer
+
+ALIGN	32
+$L$sqr4x_outer:
+	mov	r14,QWORD[((-32))+rbp*1+rsi]
+	lea	rdi,[((48+8))+r9*2+rsp]
+	mov	rax,QWORD[((-24))+rbp*1+rsi]
+	lea	rdi,[((-32))+rbp*1+rdi]
+	mov	rbx,QWORD[((-16))+rbp*1+rsi]
+	mov	r15,rax
+
+	mul	r14
+	mov	r10,QWORD[((-24))+rbp*1+rdi]
+	add	r10,rax
+	mov	rax,rbx
+	adc	rdx,0
+	mov	QWORD[((-24))+rbp*1+rdi],r10
+	mov	r11,rdx
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	adc	rdx,0
+	add	r11,QWORD[((-16))+rbp*1+rdi]
+	mov	r10,rdx
+	adc	r10,0
+	mov	QWORD[((-16))+rbp*1+rdi],r11
+
+	xor	r12,r12
+
+	mov	rbx,QWORD[((-8))+rbp*1+rsi]
+	mul	r15
+	add	r12,rax
+	mov	rax,rbx
+	adc	rdx,0
+	add	r12,QWORD[((-8))+rbp*1+rdi]
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	adc	rdx,0
+	add	r10,r12
+	mov	r11,rdx
+	adc	r11,0
+	mov	QWORD[((-8))+rbp*1+rdi],r10
+
+	lea	rcx,[rbp]
+	jmp	NEAR $L$sqr4x_inner
+
+ALIGN	32
+$L$sqr4x_inner:
+	mov	rbx,QWORD[rcx*1+rsi]
+	mul	r15
+	add	r13,rax
+	mov	rax,rbx
+	mov	r12,rdx
+	adc	r12,0
+	add	r13,QWORD[rcx*1+rdi]
+	adc	r12,0
+
+DB	0x67
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	mov	rbx,QWORD[8+rcx*1+rsi]
+	mov	r10,rdx
+	adc	r10,0
+	add	r11,r13
+	adc	r10,0
+
+	mul	r15
+	add	r12,rax
+	mov	QWORD[rcx*1+rdi],r11
+	mov	rax,rbx
+	mov	r13,rdx
+	adc	r13,0
+	add	r12,QWORD[8+rcx*1+rdi]
+	lea	rcx,[16+rcx]
+	adc	r13,0
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	adc	rdx,0
+	add	r10,r12
+	mov	r11,rdx
+	adc	r11,0
+	mov	QWORD[((-8))+rcx*1+rdi],r10
+
+	cmp	rcx,0
+	jne	NEAR $L$sqr4x_inner
+
+DB	0x67
+	mul	r15
+	add	r13,rax
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+
+	mov	QWORD[rdi],r13
+	mov	r12,rdx
+	mov	QWORD[8+rdi],rdx
+
+	add	rbp,16
+	jnz	NEAR $L$sqr4x_outer
+
+
+	mov	r14,QWORD[((-32))+rsi]
+	lea	rdi,[((48+8))+r9*2+rsp]
+	mov	rax,QWORD[((-24))+rsi]
+	lea	rdi,[((-32))+rbp*1+rdi]
+	mov	rbx,QWORD[((-16))+rsi]
+	mov	r15,rax
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	adc	r11,0
+
+	mul	r14
+	add	r11,rax
+	mov	rax,rbx
+	mov	QWORD[((-24))+rdi],r10
+	mov	r10,rdx
+	adc	r10,0
+	add	r11,r13
+	mov	rbx,QWORD[((-8))+rsi]
+	adc	r10,0
+
+	mul	r15
+	add	r12,rax
+	mov	rax,rbx
+	mov	QWORD[((-16))+rdi],r11
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	r14
+	add	r10,rax
+	mov	rax,rbx
+	mov	r11,rdx
+	adc	r11,0
+	add	r10,r12
+	adc	r11,0
+	mov	QWORD[((-8))+rdi],r10
+
+	mul	r15
+	add	r13,rax
+	mov	rax,QWORD[((-16))+rsi]
+	adc	rdx,0
+	add	r13,r11
+	adc	rdx,0
+
+	mov	QWORD[rdi],r13
+	mov	r12,rdx
+	mov	QWORD[8+rdi],rdx
+
+	mul	rbx
+	add	rbp,16
+	xor	r14,r14
+	sub	rbp,r9
+	xor	r15,r15
+
+	add	rax,r12
+	adc	rdx,0
+	mov	QWORD[8+rdi],rax
+	mov	QWORD[16+rdi],rdx
+	mov	QWORD[24+rdi],r15
+
+	mov	rax,QWORD[((-16))+rbp*1+rsi]
+	lea	rdi,[((48+8))+rsp]
+	xor	r10,r10
+	mov	r11,QWORD[8+rdi]
+
+	lea	r12,[r10*2+r14]
+	shr	r10,63
+	lea	r13,[r11*2+rcx]
+	shr	r11,63
+	or	r13,r10
+	mov	r10,QWORD[16+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[24+rdi]
+	adc	r12,rax
+	mov	rax,QWORD[((-8))+rbp*1+rsi]
+	mov	QWORD[rdi],r12
+	adc	r13,rdx
+
+	lea	rbx,[r10*2+r14]
+	mov	QWORD[8+rdi],r13
+	sbb	r15,r15
+	shr	r10,63
+	lea	r8,[r11*2+rcx]
+	shr	r11,63
+	or	r8,r10
+	mov	r10,QWORD[32+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[40+rdi]
+	adc	rbx,rax
+	mov	rax,QWORD[rbp*1+rsi]
+	mov	QWORD[16+rdi],rbx
+	adc	r8,rdx
+	lea	rbp,[16+rbp]
+	mov	QWORD[24+rdi],r8
+	sbb	r15,r15
+	lea	rdi,[64+rdi]
+	jmp	NEAR $L$sqr4x_shift_n_add
+
+ALIGN	32
+$L$sqr4x_shift_n_add:
+	lea	r12,[r10*2+r14]
+	shr	r10,63
+	lea	r13,[r11*2+rcx]
+	shr	r11,63
+	or	r13,r10
+	mov	r10,QWORD[((-16))+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[((-8))+rdi]
+	adc	r12,rax
+	mov	rax,QWORD[((-8))+rbp*1+rsi]
+	mov	QWORD[((-32))+rdi],r12
+	adc	r13,rdx
+
+	lea	rbx,[r10*2+r14]
+	mov	QWORD[((-24))+rdi],r13
+	sbb	r15,r15
+	shr	r10,63
+	lea	r8,[r11*2+rcx]
+	shr	r11,63
+	or	r8,r10
+	mov	r10,QWORD[rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[8+rdi]
+	adc	rbx,rax
+	mov	rax,QWORD[rbp*1+rsi]
+	mov	QWORD[((-16))+rdi],rbx
+	adc	r8,rdx
+
+	lea	r12,[r10*2+r14]
+	mov	QWORD[((-8))+rdi],r8
+	sbb	r15,r15
+	shr	r10,63
+	lea	r13,[r11*2+rcx]
+	shr	r11,63
+	or	r13,r10
+	mov	r10,QWORD[16+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[24+rdi]
+	adc	r12,rax
+	mov	rax,QWORD[8+rbp*1+rsi]
+	mov	QWORD[rdi],r12
+	adc	r13,rdx
+
+	lea	rbx,[r10*2+r14]
+	mov	QWORD[8+rdi],r13
+	sbb	r15,r15
+	shr	r10,63
+	lea	r8,[r11*2+rcx]
+	shr	r11,63
+	or	r8,r10
+	mov	r10,QWORD[32+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[40+rdi]
+	adc	rbx,rax
+	mov	rax,QWORD[16+rbp*1+rsi]
+	mov	QWORD[16+rdi],rbx
+	adc	r8,rdx
+	mov	QWORD[24+rdi],r8
+	sbb	r15,r15
+	lea	rdi,[64+rdi]
+	add	rbp,32
+	jnz	NEAR $L$sqr4x_shift_n_add
+
+	lea	r12,[r10*2+r14]
+DB	0x67
+	shr	r10,63
+	lea	r13,[r11*2+rcx]
+	shr	r11,63
+	or	r13,r10
+	mov	r10,QWORD[((-16))+rdi]
+	mov	r14,r11
+	mul	rax
+	neg	r15
+	mov	r11,QWORD[((-8))+rdi]
+	adc	r12,rax
+	mov	rax,QWORD[((-8))+rsi]
+	mov	QWORD[((-32))+rdi],r12
+	adc	r13,rdx
+
+	lea	rbx,[r10*2+r14]
+	mov	QWORD[((-24))+rdi],r13
+	sbb	r15,r15
+	shr	r10,63
+	lea	r8,[r11*2+rcx]
+	shr	r11,63
+	or	r8,r10
+	mul	rax
+	neg	r15
+	adc	rbx,rax
+	adc	r8,rdx
+	mov	QWORD[((-16))+rdi],rbx
+	mov	QWORD[((-8))+rdi],r8
+DB	102,72,15,126,213
+__bn_sqr8x_reduction:
+	xor	rax,rax
+	lea	rcx,[rbp*1+r9]
+	lea	rdx,[((48+8))+r9*2+rsp]
+	mov	QWORD[((0+8))+rsp],rcx
+	lea	rdi,[((48+8))+r9*1+rsp]
+	mov	QWORD[((8+8))+rsp],rdx
+	neg	r9
+	jmp	NEAR $L$8x_reduction_loop
+
+ALIGN	32
+$L$8x_reduction_loop:
+	lea	rdi,[r9*1+rdi]
+DB	0x66
+	mov	rbx,QWORD[rdi]
+	mov	r9,QWORD[8+rdi]
+	mov	r10,QWORD[16+rdi]
+	mov	r11,QWORD[24+rdi]
+	mov	r12,QWORD[32+rdi]
+	mov	r13,QWORD[40+rdi]
+	mov	r14,QWORD[48+rdi]
+	mov	r15,QWORD[56+rdi]
+	mov	QWORD[rdx],rax
+	lea	rdi,[64+rdi]
+
+DB	0x67
+	mov	r8,rbx
+	imul	rbx,QWORD[((32+8))+rsp]
+	mov	rax,QWORD[rbp]
+	mov	ecx,8
+	jmp	NEAR $L$8x_reduce
+
+ALIGN	32
+$L$8x_reduce:
+	mul	rbx
+	mov	rax,QWORD[8+rbp]
+	neg	r8
+	mov	r8,rdx
+	adc	r8,0
+
+	mul	rbx
+	add	r9,rax
+	mov	rax,QWORD[16+rbp]
+	adc	rdx,0
+	add	r8,r9
+	mov	QWORD[((48-8+8))+rcx*8+rsp],rbx
+	mov	r9,rdx
+	adc	r9,0
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[24+rbp]
+	adc	rdx,0
+	add	r9,r10
+	mov	rsi,QWORD[((32+8))+rsp]
+	mov	r10,rdx
+	adc	r10,0
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[32+rbp]
+	adc	rdx,0
+	imul	rsi,r8
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+
+	mul	rbx
+	add	r12,rax
+	mov	rax,QWORD[40+rbp]
+	adc	rdx,0
+	add	r11,r12
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	rbx
+	add	r13,rax
+	mov	rax,QWORD[48+rbp]
+	adc	rdx,0
+	add	r12,r13
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	rbx
+	add	r14,rax
+	mov	rax,QWORD[56+rbp]
+	adc	rdx,0
+	add	r13,r14
+	mov	r14,rdx
+	adc	r14,0
+
+	mul	rbx
+	mov	rbx,rsi
+	add	r15,rax
+	mov	rax,QWORD[rbp]
+	adc	rdx,0
+	add	r14,r15
+	mov	r15,rdx
+	adc	r15,0
+
+	dec	ecx
+	jnz	NEAR $L$8x_reduce
+
+	lea	rbp,[64+rbp]
+	xor	rax,rax
+	mov	rdx,QWORD[((8+8))+rsp]
+	cmp	rbp,QWORD[((0+8))+rsp]
+	jae	NEAR $L$8x_no_tail
+
+DB	0x66
+	add	r8,QWORD[rdi]
+	adc	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	sbb	rsi,rsi
+
+	mov	rbx,QWORD[((48+56+8))+rsp]
+	mov	ecx,8
+	mov	rax,QWORD[rbp]
+	jmp	NEAR $L$8x_tail
+
+ALIGN	32
+$L$8x_tail:
+	mul	rbx
+	add	r8,rax
+	mov	rax,QWORD[8+rbp]
+	mov	QWORD[rdi],r8
+	mov	r8,rdx
+	adc	r8,0
+
+	mul	rbx
+	add	r9,rax
+	mov	rax,QWORD[16+rbp]
+	adc	rdx,0
+	add	r8,r9
+	lea	rdi,[8+rdi]
+	mov	r9,rdx
+	adc	r9,0
+
+	mul	rbx
+	add	r10,rax
+	mov	rax,QWORD[24+rbp]
+	adc	rdx,0
+	add	r9,r10
+	mov	r10,rdx
+	adc	r10,0
+
+	mul	rbx
+	add	r11,rax
+	mov	rax,QWORD[32+rbp]
+	adc	rdx,0
+	add	r10,r11
+	mov	r11,rdx
+	adc	r11,0
+
+	mul	rbx
+	add	r12,rax
+	mov	rax,QWORD[40+rbp]
+	adc	rdx,0
+	add	r11,r12
+	mov	r12,rdx
+	adc	r12,0
+
+	mul	rbx
+	add	r13,rax
+	mov	rax,QWORD[48+rbp]
+	adc	rdx,0
+	add	r12,r13
+	mov	r13,rdx
+	adc	r13,0
+
+	mul	rbx
+	add	r14,rax
+	mov	rax,QWORD[56+rbp]
+	adc	rdx,0
+	add	r13,r14
+	mov	r14,rdx
+	adc	r14,0
+
+	mul	rbx
+	mov	rbx,QWORD[((48-16+8))+rcx*8+rsp]
+	add	r15,rax
+	adc	rdx,0
+	add	r14,r15
+	mov	rax,QWORD[rbp]
+	mov	r15,rdx
+	adc	r15,0
+
+	dec	ecx
+	jnz	NEAR $L$8x_tail
+
+	lea	rbp,[64+rbp]
+	mov	rdx,QWORD[((8+8))+rsp]
+	cmp	rbp,QWORD[((0+8))+rsp]
+	jae	NEAR $L$8x_tail_done
+
+	mov	rbx,QWORD[((48+56+8))+rsp]
+	neg	rsi
+	mov	rax,QWORD[rbp]
+	adc	r8,QWORD[rdi]
+	adc	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	sbb	rsi,rsi
+
+	mov	ecx,8
+	jmp	NEAR $L$8x_tail
+
+ALIGN	32
+$L$8x_tail_done:
+	xor	rax,rax
+	add	r8,QWORD[rdx]
+	adc	r9,0
+	adc	r10,0
+	adc	r11,0
+	adc	r12,0
+	adc	r13,0
+	adc	r14,0
+	adc	r15,0
+	adc	rax,0
+
+	neg	rsi
+$L$8x_no_tail:
+	adc	r8,QWORD[rdi]
+	adc	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	adc	rax,0
+	mov	rcx,QWORD[((-8))+rbp]
+	xor	rsi,rsi
+
+DB	102,72,15,126,213
+
+	mov	QWORD[rdi],r8
+	mov	QWORD[8+rdi],r9
+DB	102,73,15,126,217
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+	mov	QWORD[32+rdi],r12
+	mov	QWORD[40+rdi],r13
+	mov	QWORD[48+rdi],r14
+	mov	QWORD[56+rdi],r15
+	lea	rdi,[64+rdi]
+
+	cmp	rdi,rdx
+	jb	NEAR $L$8x_reduction_loop
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	32
+__bn_post4x_internal:
+
+	mov	r12,QWORD[rbp]
+	lea	rbx,[r9*1+rdi]
+	mov	rcx,r9
+DB	102,72,15,126,207
+	neg	rax
+DB	102,72,15,126,206
+	sar	rcx,3+2
+	dec	r12
+	xor	r10,r10
+	mov	r13,QWORD[8+rbp]
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+	jmp	NEAR $L$sqr4x_sub_entry
+
+ALIGN	16
+$L$sqr4x_sub:
+	mov	r12,QWORD[rbp]
+	mov	r13,QWORD[8+rbp]
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+$L$sqr4x_sub_entry:
+	lea	rbp,[32+rbp]
+	not	r12
+	not	r13
+	not	r14
+	not	r15
+	and	r12,rax
+	and	r13,rax
+	and	r14,rax
+	and	r15,rax
+
+	neg	r10
+	adc	r12,QWORD[rbx]
+	adc	r13,QWORD[8+rbx]
+	adc	r14,QWORD[16+rbx]
+	adc	r15,QWORD[24+rbx]
+	mov	QWORD[rdi],r12
+	lea	rbx,[32+rbx]
+	mov	QWORD[8+rdi],r13
+	sbb	r10,r10
+	mov	QWORD[16+rdi],r14
+	mov	QWORD[24+rdi],r15
+	lea	rdi,[32+rdi]
+
+	inc	rcx
+	jnz	NEAR $L$sqr4x_sub
+
+	mov	r10,r9
+	neg	r9
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_bn_from_montgomery
+
+ALIGN	32
+GFp_bn_from_montgomery:
+
+	test	DWORD[48+rsp],7
+	jz	NEAR bn_from_mont8x
+	xor	eax,eax
+	DB	0F3h,0C3h		;repret
+
+
+
+
+ALIGN	32
+bn_from_mont8x:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_from_mont8x:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+DB	0x67
+	mov	rax,rsp
+
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$from_prologue:
+
+	shl	r9d,3
+	lea	r10,[r9*2+r9]
+	neg	r9
+	mov	r8,QWORD[r8]
+
+
+
+
+
+
+
+
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$from_sp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$from_sp_done
+
+ALIGN	32
+$L$from_sp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$from_sp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$from_page_walk
+	jmp	NEAR $L$from_page_walk_done
+
+$L$from_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$from_page_walk
+$L$from_page_walk_done:
+
+	mov	r10,r9
+	neg	r9
+
+
+
+
+
+
+
+
+
+
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$from_body:
+	mov	r11,r9
+	lea	rax,[48+rsp]
+	pxor	xmm0,xmm0
+	jmp	NEAR $L$mul_by_1
+
+ALIGN	32
+$L$mul_by_1:
+	movdqu	xmm1,XMMWORD[rsi]
+	movdqu	xmm2,XMMWORD[16+rsi]
+	movdqu	xmm3,XMMWORD[32+rsi]
+	movdqa	XMMWORD[r9*1+rax],xmm0
+	movdqu	xmm4,XMMWORD[48+rsi]
+	movdqa	XMMWORD[16+r9*1+rax],xmm0
+DB	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
+	movdqa	XMMWORD[rax],xmm1
+	movdqa	XMMWORD[32+r9*1+rax],xmm0
+	movdqa	XMMWORD[16+rax],xmm2
+	movdqa	XMMWORD[48+r9*1+rax],xmm0
+	movdqa	XMMWORD[32+rax],xmm3
+	movdqa	XMMWORD[48+rax],xmm4
+	lea	rax,[64+rax]
+	sub	r11,64
+	jnz	NEAR $L$mul_by_1
+
+DB	102,72,15,110,207
+DB	102,72,15,110,209
+DB	0x67
+	mov	rbp,rcx
+DB	102,73,15,110,218
+	lea	r11,[GFp_ia32cap_P]
+	mov	r11d,DWORD[8+r11]
+	and	r11d,0x80108
+	cmp	r11d,0x80108
+	jne	NEAR $L$from_mont_nox
+
+	lea	rdi,[r9*1+rax]
+	call	__bn_sqrx8x_reduction
+	call	__bn_postx4x_internal
+
+	pxor	xmm0,xmm0
+	lea	rax,[48+rsp]
+	jmp	NEAR $L$from_mont_zero
+
+ALIGN	32
+$L$from_mont_nox:
+	call	__bn_sqr8x_reduction
+	call	__bn_post4x_internal
+
+	pxor	xmm0,xmm0
+	lea	rax,[48+rsp]
+	jmp	NEAR $L$from_mont_zero
+
+ALIGN	32
+$L$from_mont_zero:
+	mov	rsi,QWORD[40+rsp]
+
+	movdqa	XMMWORD[rax],xmm0
+	movdqa	XMMWORD[16+rax],xmm0
+	movdqa	XMMWORD[32+rax],xmm0
+	movdqa	XMMWORD[48+rax],xmm0
+	lea	rax,[64+rax]
+	sub	r9,32
+	jnz	NEAR $L$from_mont_zero
+
+	mov	rax,1
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$from_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_bn_from_mont8x:
+
+ALIGN	32
+bn_mulx4x_mont_gather5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_mulx4x_mont_gather5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	rax,rsp
+
+$L$mulx4x_enter:
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$mulx4x_prologue:
+
+	shl	r9d,3
+	lea	r10,[r9*2+r9]
+	neg	r9
+	mov	r8,QWORD[r8]
+
+
+
+
+
+
+
+
+
+
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$mulx4xsp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$mulx4xsp_done
+
+$L$mulx4xsp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$mulx4xsp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mulx4x_page_walk
+	jmp	NEAR $L$mulx4x_page_walk_done
+
+$L$mulx4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mulx4x_page_walk
+$L$mulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$mulx4x_body:
+	call	mulx4x_internal
+
+	mov	rsi,QWORD[40+rsp]
+
+	mov	rax,1
+
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$mulx4x_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_bn_mulx4x_mont_gather5:
+
+
+ALIGN	32
+;------------------------------------------------------------------------
+; mulx4x_internal: core of the BMI2/ADX Montgomery multiplication path.
+; Processes four limbs per iteration using mulx with the adcx/adox dual
+; carry chains.  The next multiplier word b[i] is selected from a
+; 256-byte-strided table: sixteen pcmpeqd-generated masks are applied
+; with pand/por across EVERY table line, so the memory access pattern is
+; independent of the secret index (constant-time gather; see the
+; CRYPTOGAMS credit string at the end of this file).
+; NOTE(review): register contract (rsi = a, rcx = modulus n, rdi = result,
+; r9 = limb count, 32(rsp) = n0) is inferred from the generating
+; x86_64-mont5.pl script's conventions -- confirm against that script.
+; Do not hand-modify: file is generated output.
+;------------------------------------------------------------------------
+mulx4x_internal:
+
+	mov	QWORD[8+rsp],r9
+	mov	r10,r9
+	neg	r9
+	shl	r9,5
+	neg	r10
+	lea	r13,[128+r9*1+rdx]
+	shr	r9,5+5
+	movd	xmm5,DWORD[56+rax]
+	sub	r9,1
+	lea	rax,[$L$inc]
+	mov	QWORD[((16+8))+rsp],r13
+	mov	QWORD[((24+8))+rsp],r9
+	mov	QWORD[((56+8))+rsp],rdi
+; Build the sixteen 128-bit selection masks (index == 0..15) from the
+; $L$inc increment constants; each mask is all-ones only for the wanted
+; table line.
+	movdqa	xmm0,XMMWORD[rax]
+	movdqa	xmm1,XMMWORD[16+rax]
+	lea	r10,[((88-112))+r10*1+rsp]
+	lea	rdi,[128+rdx]
+
+	pshufd	xmm5,xmm5,0
+	movdqa	xmm4,xmm1
+DB	0x67
+	movdqa	xmm2,xmm1
+DB	0x67
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[112+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[128+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[144+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[160+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[176+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[192+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[208+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[224+r10],xmm3
+	movdqa	xmm3,xmm4
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[240+r10],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[256+r10],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[272+r10],xmm2
+	movdqa	xmm2,xmm4
+
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[288+r10],xmm3
+	movdqa	xmm3,xmm4
+DB	0x67
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[304+r10],xmm0
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[320+r10],xmm1
+
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[336+r10],xmm2
+
+; First gather: AND every table line with its mask and OR the results
+; together; only the selected line survives.
+	pand	xmm0,XMMWORD[64+rdi]
+	pand	xmm1,XMMWORD[80+rdi]
+	pand	xmm2,XMMWORD[96+rdi]
+	movdqa	XMMWORD[352+r10],xmm3
+	pand	xmm3,XMMWORD[112+rdi]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-128))+rdi]
+	movdqa	xmm5,XMMWORD[((-112))+rdi]
+	movdqa	xmm2,XMMWORD[((-96))+rdi]
+	pand	xmm4,XMMWORD[112+r10]
+	movdqa	xmm3,XMMWORD[((-80))+rdi]
+	pand	xmm5,XMMWORD[128+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[144+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[160+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[((-64))+rdi]
+	movdqa	xmm5,XMMWORD[((-48))+rdi]
+	movdqa	xmm2,XMMWORD[((-32))+rdi]
+	pand	xmm4,XMMWORD[176+r10]
+	movdqa	xmm3,XMMWORD[((-16))+rdi]
+	pand	xmm5,XMMWORD[192+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[208+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[224+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	movdqa	xmm4,XMMWORD[rdi]
+	movdqa	xmm5,XMMWORD[16+rdi]
+	movdqa	xmm2,XMMWORD[32+rdi]
+	pand	xmm4,XMMWORD[240+r10]
+	movdqa	xmm3,XMMWORD[48+rdi]
+	pand	xmm5,XMMWORD[256+r10]
+	por	xmm0,xmm4
+	pand	xmm2,XMMWORD[272+r10]
+	por	xmm1,xmm5
+	pand	xmm3,XMMWORD[288+r10]
+	por	xmm0,xmm2
+	por	xmm1,xmm3
+	pxor	xmm0,xmm1
+	pshufd	xmm1,xmm0,0x4e
+	por	xmm0,xmm1
+	lea	rdi,[256+rdi]
+DB	102,72,15,126,194		; movq rdx,xmm0 (gathered b-word)
+	lea	rbx,[((64+32+8))+rsp]
+
+	mov	r9,rdx
+	mulx	rax,r8,QWORD[rsi]
+	mulx	r12,r11,QWORD[8+rsi]
+	add	r11,rax
+	mulx	r13,rax,QWORD[16+rsi]
+	adc	r12,rax
+	adc	r13,0
+	mulx	r14,rax,QWORD[24+rsi]
+
+	mov	r15,r8
+	imul	r8,QWORD[((32+8))+rsp]
+	xor	rbp,rbp
+	mov	rdx,r8
+
+	mov	QWORD[((8+8))+rsp],rdi
+
+	lea	rsi,[32+rsi]
+	adcx	r13,rax
+	adcx	r14,rbp
+
+	mulx	r10,rax,QWORD[rcx]
+	adcx	r15,rax
+	adox	r10,r11
+	mulx	r11,rax,QWORD[8+rcx]
+	adcx	r10,rax
+	adox	r11,r12
+	mulx	r12,rax,QWORD[16+rcx]
+	mov	rdi,QWORD[((24+8))+rsp]
+	mov	QWORD[((-32))+rbx],r10
+	adcx	r11,rax
+	adox	r12,r13
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r11
+	adcx	r12,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r12
+	jmp	NEAR $L$mulx4x_1st
+
+ALIGN	32
+; First pass over a[]: rdx alternates between the gathered b-word (r9)
+; and the reduction multiplier m0 (r8); rdi counts 4-limb groups.
+$L$mulx4x_1st:
+	adcx	r15,rbp
+	mulx	rax,r10,QWORD[rsi]
+	adcx	r10,r14
+	mulx	r14,r11,QWORD[8+rsi]
+	adcx	r11,rax
+	mulx	rax,r12,QWORD[16+rsi]
+	adcx	r12,r14
+	mulx	r14,r13,QWORD[24+rsi]
+DB	0x67,0x67
+	mov	rdx,r8
+	adcx	r13,rax
+	adcx	r14,rbp
+	lea	rsi,[32+rsi]
+	lea	rbx,[32+rbx]
+
+	adox	r10,r15
+	mulx	r15,rax,QWORD[rcx]
+	adcx	r10,rax
+	adox	r11,r15
+	mulx	r15,rax,QWORD[8+rcx]
+	adcx	r11,rax
+	adox	r12,r15
+	mulx	r15,rax,QWORD[16+rcx]
+	mov	QWORD[((-40))+rbx],r10
+	adcx	r12,rax
+	mov	QWORD[((-32))+rbx],r11
+	adox	r13,r15
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	QWORD[((-24))+rbx],r12
+	adcx	r13,rax
+	adox	r15,rbp
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-16))+rbx],r13
+
+	dec	rdi
+	jnz	NEAR $L$mulx4x_1st
+
+	mov	rax,QWORD[8+rsp]
+	adc	r15,rbp
+	lea	rsi,[rax*1+rsi]
+	add	r14,r15
+	mov	rdi,QWORD[((8+8))+rsp]
+	adc	rbp,rbp
+	mov	QWORD[((-8))+rbx],r14
+	jmp	NEAR $L$mulx4x_outer
+
+ALIGN	32
+; One outer iteration per remaining b-word: gather the next multiplier
+; (same masked pand/por scan of all table lines), then accumulate
+; a[]*b[i] into the t[] scratch and reduce.
+$L$mulx4x_outer:
+	lea	r10,[((16-256))+rbx]
+	pxor	xmm4,xmm4
+DB	0x67,0x67
+	pxor	xmm5,xmm5
+	movdqa	xmm0,XMMWORD[((-128))+rdi]
+	movdqa	xmm1,XMMWORD[((-112))+rdi]
+	movdqa	xmm2,XMMWORD[((-96))+rdi]
+	pand	xmm0,XMMWORD[256+r10]
+	movdqa	xmm3,XMMWORD[((-80))+rdi]
+	pand	xmm1,XMMWORD[272+r10]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[288+r10]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[304+r10]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[((-64))+rdi]
+	movdqa	xmm1,XMMWORD[((-48))+rdi]
+	movdqa	xmm2,XMMWORD[((-32))+rdi]
+	pand	xmm0,XMMWORD[320+r10]
+	movdqa	xmm3,XMMWORD[((-16))+rdi]
+	pand	xmm1,XMMWORD[336+r10]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[352+r10]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[368+r10]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[rdi]
+	movdqa	xmm1,XMMWORD[16+rdi]
+	movdqa	xmm2,XMMWORD[32+rdi]
+	pand	xmm0,XMMWORD[384+r10]
+	movdqa	xmm3,XMMWORD[48+rdi]
+	pand	xmm1,XMMWORD[400+r10]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[416+r10]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[432+r10]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[64+rdi]
+	movdqa	xmm1,XMMWORD[80+rdi]
+	movdqa	xmm2,XMMWORD[96+rdi]
+	pand	xmm0,XMMWORD[448+r10]
+	movdqa	xmm3,XMMWORD[112+rdi]
+	pand	xmm1,XMMWORD[464+r10]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[480+r10]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[496+r10]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	por	xmm4,xmm5
+	pshufd	xmm0,xmm4,0x4e
+	por	xmm0,xmm4
+	lea	rdi,[256+rdi]
+DB	102,72,15,126,194		; movq rdx,xmm0 (next b-word)
+
+	mov	QWORD[rbx],rbp
+	lea	rbx,[32+rax*1+rbx]
+	mulx	r11,r8,QWORD[rsi]
+	xor	rbp,rbp
+	mov	r9,rdx
+	mulx	r12,r14,QWORD[8+rsi]
+	adox	r8,QWORD[((-32))+rbx]
+	adcx	r11,r14
+	mulx	r13,r15,QWORD[16+rsi]
+	adox	r11,QWORD[((-24))+rbx]
+	adcx	r12,r15
+	mulx	r14,rdx,QWORD[24+rsi]
+	adox	r12,QWORD[((-16))+rbx]
+	adcx	r13,rdx
+	lea	rcx,[rax*1+rcx]
+	lea	rsi,[32+rsi]
+	adox	r13,QWORD[((-8))+rbx]
+	adcx	r14,rbp
+	adox	r14,rbp
+
+	mov	r15,r8
+	imul	r8,QWORD[((32+8))+rsp]
+
+	mov	rdx,r8
+	xor	rbp,rbp
+	mov	QWORD[((8+8))+rsp],rdi
+
+	mulx	r10,rax,QWORD[rcx]
+	adcx	r15,rax
+	adox	r10,r11
+	mulx	r11,rax,QWORD[8+rcx]
+	adcx	r10,rax
+	adox	r11,r12
+	mulx	r12,rax,QWORD[16+rcx]
+	adcx	r11,rax
+	adox	r12,r13
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	mov	rdi,QWORD[((24+8))+rsp]
+	mov	QWORD[((-32))+rbx],r10
+	adcx	r12,rax
+	mov	QWORD[((-24))+rbx],r11
+	adox	r15,rbp
+	mov	QWORD[((-16))+rbx],r12
+	lea	rcx,[32+rcx]
+	jmp	NEAR $L$mulx4x_inner
+
+ALIGN	32
+; Inner loop of the outer passes: like $L$mulx4x_1st but also adds the
+; previously accumulated t[] words from [rbx].
+$L$mulx4x_inner:
+	mulx	rax,r10,QWORD[rsi]
+	adcx	r15,rbp
+	adox	r10,r14
+	mulx	r14,r11,QWORD[8+rsi]
+	adcx	r10,QWORD[rbx]
+	adox	r11,rax
+	mulx	rax,r12,QWORD[16+rsi]
+	adcx	r11,QWORD[8+rbx]
+	adox	r12,r14
+	mulx	r14,r13,QWORD[24+rsi]
+	mov	rdx,r8
+	adcx	r12,QWORD[16+rbx]
+	adox	r13,rax
+	adcx	r13,QWORD[24+rbx]
+	adox	r14,rbp
+	lea	rsi,[32+rsi]
+	lea	rbx,[32+rbx]
+	adcx	r14,rbp
+
+	adox	r10,r15
+	mulx	r15,rax,QWORD[rcx]
+	adcx	r10,rax
+	adox	r11,r15
+	mulx	r15,rax,QWORD[8+rcx]
+	adcx	r11,rax
+	adox	r12,r15
+	mulx	r15,rax,QWORD[16+rcx]
+	mov	QWORD[((-40))+rbx],r10
+	adcx	r12,rax
+	adox	r13,r15
+	mov	QWORD[((-32))+rbx],r11
+	mulx	r15,rax,QWORD[24+rcx]
+	mov	rdx,r9
+	lea	rcx,[32+rcx]
+	mov	QWORD[((-24))+rbx],r12
+	adcx	r13,rax
+	adox	r15,rbp
+	mov	QWORD[((-16))+rbx],r13
+
+	dec	rdi
+	jnz	NEAR $L$mulx4x_inner
+
+	mov	rax,QWORD[((0+8))+rsp]
+	adc	r15,rbp
+	sub	rdi,QWORD[rbx]
+	mov	rdi,QWORD[((8+8))+rsp]
+	mov	r10,QWORD[((16+8))+rsp]
+	adc	r14,r15
+	lea	rsi,[rax*1+rsi]
+	adc	rbp,rbp
+	mov	QWORD[((-8))+rbx],r14
+
+	cmp	rdi,r10
+	jb	NEAR $L$mulx4x_outer
+
+; All b-words consumed: set up the borrow mask and tail-jump into the
+; shared conditional-subtraction loop ($L$sqrx4x_sub_entry below).
+	mov	r10,QWORD[((-8))+rcx]
+	mov	r8,rbp
+	mov	r12,QWORD[rax*1+rcx]
+	lea	rbp,[rax*1+rcx]
+	mov	rcx,rax
+	lea	rdi,[rax*1+rbx]
+	xor	eax,eax
+	xor	r15,r15
+	sub	r10,r14
+	adc	r15,r15
+	or	r8,r15
+	sar	rcx,3+2
+	sub	rax,r8
+	mov	rdx,QWORD[((56+8))+rsp]
+	dec	r12
+	mov	r13,QWORD[8+rbp]
+	xor	r8,r8
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+	jmp	NEAR $L$sqrx4x_sub_entry
+
+
+
+ALIGN	32
+;------------------------------------------------------------------------
+; bn_powerx5: BMI2/ADX variant of the power-5 primitive.  Performs five
+; back-to-back squarings (__bn_sqrx8x_internal + __bn_postx4x_internal,
+; called five times below) followed by one multiplication
+; (mulx4x_internal).  Win64 prologue: rdi/rsi are spilled to the
+; caller's home space and the SysV-style argument registers are loaded
+; from rcx/rdx/r8/r9 and the stack.
+;------------------------------------------------------------------------
+bn_powerx5:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_bn_powerx5:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+
+	mov	rax,rsp
+
+$L$powerx5_enter:
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+$L$powerx5_prologue:
+
+	shl	r9d,3
+	lea	r10,[r9*2+r9]
+	neg	r9
+	mov	r8,QWORD[r8]
+
+
+
+
+
+
+
+
+; Carve out the scratch frame below rsp, choosing the alternate layout
+; when it would otherwise alias the output buffer (rdi) within a page.
+	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
+	sub	r11,rdi
+	and	r11,4095
+	cmp	r10,r11
+	jb	NEAR $L$pwrx_sp_alt
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
+	jmp	NEAR $L$pwrx_sp_done
+
+ALIGN	32
+$L$pwrx_sp_alt:
+	lea	r10,[((4096-320))+r9*2]
+	lea	rbp,[((-320))+r9*2+rbp]
+	sub	r11,r10
+	mov	r10,0
+	cmovc	r11,r10
+	sub	rbp,r11
+$L$pwrx_sp_done:
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwrx_page_walk
+	jmp	NEAR $L$pwrx_page_walk_done
+
+; Touch each 4K page while lowering rsp so the OS guard page is grown
+; one page at a time (required on Windows).
+$L$pwrx_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwrx_page_walk
+$L$pwrx_page_walk_done:
+
+	mov	r10,r9
+	neg	r9
+
+
+
+
+
+
+
+
+
+
+
+
+; Stash pointer arguments in xmm registers so they survive the calls
+; below without touching the stack frame.
+	pxor	xmm0,xmm0
+DB	102,72,15,110,207
+DB	102,72,15,110,209
+DB	102,73,15,110,218
+DB	102,72,15,110,226
+	mov	QWORD[32+rsp],r8
+	mov	QWORD[40+rsp],rax
+
+$L$powerx5_body:
+
+; x^32 = ((((x^2)^2)^2)^2)^2 -- five squarings, each followed by the
+; post-processing/conditional-subtraction pass, then one final multiply.
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+
+	mov	r9,r10
+	mov	rdi,rsi
+DB	102,72,15,126,209
+DB	102,72,15,126,226
+	mov	rax,QWORD[40+rsp]
+
+	call	mulx4x_internal
+
+	mov	rsi,QWORD[40+rsp]
+
+	mov	rax,1
+
+; Restore callee-saved registers from the saved frame pointer (rsi).
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
+$L$powerx5_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_bn_powerx5:
+
+global	GFp_bn_sqrx8x_internal
+
+ALIGN	32
+;------------------------------------------------------------------------
+; GFp_bn_sqrx8x_internal / __bn_sqrx8x_internal: 8-limb-wide squaring
+; using mulx with dual adcx/adox carry chains, followed by the
+; Montgomery reduction pass (__bn_sqrx8x_reduction, which execution
+; falls into below).  The off-diagonal products are computed first,
+; then doubled and combined with the squared diagonal terms in
+; $L$sqrx4x_shift_n_add.  NOTE(review): exact register/stack contract
+; (rsi = input, rbp = modulus, r9 = size, scratch at 48+8(rsp)) is
+; inferred from the generating Perl script -- confirm there.
+; Generated code: do not edit by hand.
+;------------------------------------------------------------------------
+GFp_bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+; Zero the 8*n-byte accumulation area on the stack in 64-byte strides.
+	lea	rdi,[((48+8))+rsp]
+	lea	rbp,[r9*1+rsi]
+	mov	QWORD[((0+8))+rsp],r9
+	mov	QWORD[((8+8))+rsp],rbp
+	jmp	NEAR $L$sqr8x_zero_start
+
+ALIGN	32
+DB	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+$L$sqrx8x_zero:
+DB	0x3e
+	movdqa	XMMWORD[rdi],xmm0
+	movdqa	XMMWORD[16+rdi],xmm0
+	movdqa	XMMWORD[32+rdi],xmm0
+	movdqa	XMMWORD[48+rdi],xmm0
+$L$sqr8x_zero_start:
+	movdqa	XMMWORD[64+rdi],xmm0
+	movdqa	XMMWORD[80+rdi],xmm0
+	movdqa	XMMWORD[96+rdi],xmm0
+	movdqa	XMMWORD[112+rdi],xmm0
+	lea	rdi,[128+rdi]
+	sub	r9,64
+	jnz	NEAR $L$sqrx8x_zero
+
+	mov	rdx,QWORD[rsi]
+
+	xor	r10,r10
+	xor	r11,r11
+	xor	r12,r12
+	xor	r13,r13
+	xor	r14,r14
+	xor	r15,r15
+	lea	rdi,[((48+8))+rsp]
+	xor	rbp,rbp
+	jmp	NEAR $L$sqrx8x_outer_loop
+
+ALIGN	32
+; Off-diagonal products a[i]*a[j] (i<j) for one 8-limb band.
+$L$sqrx8x_outer_loop:
+	mulx	rax,r8,QWORD[8+rsi]
+	adcx	r8,r9
+	adox	r10,rax
+	mulx	rax,r9,QWORD[16+rsi]
+	adcx	r9,r10
+	adox	r11,rax
+DB	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+	adcx	r10,r11
+	adox	r12,rax
+DB	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+	adcx	r11,r12
+	adox	r13,rax
+	mulx	rax,r12,QWORD[40+rsi]
+	adcx	r12,r13
+	adox	r14,rax
+	mulx	rax,r13,QWORD[48+rsi]
+	adcx	r13,r14
+	adox	rax,r15
+	mulx	r15,r14,QWORD[56+rsi]
+	mov	rdx,QWORD[8+rsi]
+	adcx	r14,rax
+	adox	r15,rbp
+	adc	r15,QWORD[64+rdi]
+	mov	QWORD[8+rdi],r8
+	mov	QWORD[16+rdi],r9
+	sbb	rcx,rcx
+	xor	rbp,rbp
+
+
+	mulx	rbx,r8,QWORD[16+rsi]
+	mulx	rax,r9,QWORD[24+rsi]
+	adcx	r8,r10
+	adox	r9,rbx
+	mulx	rbx,r10,QWORD[32+rsi]
+	adcx	r9,r11
+	adox	r10,rax
+DB	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+	adcx	r10,r12
+	adox	r11,rbx
+DB	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+	adcx	r11,r13
+	adox	r12,r14
+DB	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+	mov	rdx,QWORD[16+rsi]
+	adcx	r12,rax
+	adox	r13,rbx
+	adcx	r13,r15
+	adox	r14,rbp
+	adcx	r14,rbp
+
+	mov	QWORD[24+rdi],r8
+	mov	QWORD[32+rdi],r9
+
+	mulx	rbx,r8,QWORD[24+rsi]
+	mulx	rax,r9,QWORD[32+rsi]
+	adcx	r8,r10
+	adox	r9,rbx
+	mulx	rbx,r10,QWORD[40+rsi]
+	adcx	r9,r11
+	adox	r10,rax
+DB	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+	adcx	r10,r12
+	adox	r11,r13
+DB	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+DB	0x3e
+	mov	rdx,QWORD[24+rsi]
+	adcx	r11,rbx
+	adox	r12,rax
+	adcx	r12,r14
+	mov	QWORD[40+rdi],r8
+	mov	QWORD[48+rdi],r9
+	mulx	rax,r8,QWORD[32+rsi]
+	adox	r13,rbp
+	adcx	r13,rbp
+
+	mulx	rbx,r9,QWORD[40+rsi]
+	adcx	r8,r10
+	adox	r9,rax
+	mulx	rax,r10,QWORD[48+rsi]
+	adcx	r9,r11
+	adox	r10,r12
+	mulx	r12,r11,QWORD[56+rsi]
+	mov	rdx,QWORD[32+rsi]
+	mov	r14,QWORD[40+rsi]
+	adcx	r10,rbx
+	adox	r11,rax
+	mov	r15,QWORD[48+rsi]
+	adcx	r11,r13
+	adox	r12,rbp
+	adcx	r12,rbp
+
+	mov	QWORD[56+rdi],r8
+	mov	QWORD[64+rdi],r9
+
+	mulx	rax,r9,r14
+	mov	r8,QWORD[56+rsi]
+	adcx	r9,r10
+	mulx	rbx,r10,r15
+	adox	r10,rax
+	adcx	r10,r11
+	mulx	rax,r11,r8
+	mov	rdx,r14
+	adox	r11,rbx
+	adcx	r11,r12
+
+	adcx	rax,rbp
+
+	mulx	rbx,r14,r15
+	mulx	r13,r12,r8
+	mov	rdx,r15
+	lea	rsi,[64+rsi]
+	adcx	r11,r14
+	adox	r12,rbx
+	adcx	r12,rax
+	adox	r13,rbp
+
+DB	0x67,0x67
+	mulx	r14,r8,r8
+	adcx	r13,r8
+	adcx	r14,rbp
+
+	cmp	rsi,QWORD[((8+8))+rsp]
+	je	NEAR $L$sqrx8x_outer_break
+
+	neg	rcx
+	mov	rcx,-8
+	mov	r15,rbp
+	mov	r8,QWORD[64+rdi]
+	adcx	r9,QWORD[72+rdi]
+	adcx	r10,QWORD[80+rdi]
+	adcx	r11,QWORD[88+rdi]
+	adc	r12,QWORD[96+rdi]
+	adc	r13,QWORD[104+rdi]
+	adc	r14,QWORD[112+rdi]
+	adc	r15,QWORD[120+rdi]
+	lea	rbp,[rsi]
+	lea	rdi,[128+rdi]
+	sbb	rax,rax
+
+	mov	rdx,QWORD[((-64))+rsi]
+	mov	QWORD[((16+8))+rsp],rax
+	mov	QWORD[((24+8))+rsp],rdi
+
+
+	xor	eax,eax
+	jmp	NEAR $L$sqrx8x_loop
+
+ALIGN	32
+; Inner 8x8 multiply-accumulate band: rcx counts -8..0.
+$L$sqrx8x_loop:
+	mov	rbx,r8
+	mulx	r8,rax,QWORD[rbp]
+	adcx	rbx,rax
+	adox	r8,r9
+
+	mulx	r9,rax,QWORD[8+rbp]
+	adcx	r8,rax
+	adox	r9,r10
+
+	mulx	r10,rax,QWORD[16+rbp]
+	adcx	r9,rax
+	adox	r10,r11
+
+	mulx	r11,rax,QWORD[24+rbp]
+	adcx	r10,rax
+	adox	r11,r12
+
+DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcx	r11,rax
+	adox	r12,r13
+
+	mulx	r13,rax,QWORD[40+rbp]
+	adcx	r12,rax
+	adox	r13,r14
+
+	mulx	r14,rax,QWORD[48+rbp]
+	mov	QWORD[rcx*8+rdi],rbx
+	mov	ebx,0
+	adcx	r13,rax
+	adox	r14,r15
+
+DB	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+	mov	rdx,QWORD[8+rcx*8+rsi]
+	adcx	r14,rax
+	adox	r15,rbx
+	adcx	r15,rbx
+
+DB	0x67
+	inc	rcx
+	jnz	NEAR $L$sqrx8x_loop
+
+	lea	rbp,[64+rbp]
+	mov	rcx,-8
+	cmp	rbp,QWORD[((8+8))+rsp]
+	je	NEAR $L$sqrx8x_break
+
+	sub	rbx,QWORD[((16+8))+rsp]
+DB	0x66
+	mov	rdx,QWORD[((-64))+rsi]
+	adcx	r8,QWORD[rdi]
+	adcx	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	lea	rdi,[64+rdi]
+DB	0x67
+	sbb	rax,rax
+	xor	ebx,ebx
+	mov	QWORD[((16+8))+rsp],rax
+	jmp	NEAR $L$sqrx8x_loop
+
+ALIGN	32
+$L$sqrx8x_break:
+	xor	rbp,rbp
+	sub	rbx,QWORD[((16+8))+rsp]
+	adcx	r8,rbp
+	mov	rcx,QWORD[((24+8))+rsp]
+	adcx	r9,rbp
+	mov	rdx,QWORD[rsi]
+	adc	r10,0
+	mov	QWORD[rdi],r8
+	adc	r11,0
+	adc	r12,0
+	adc	r13,0
+	adc	r14,0
+	adc	r15,0
+	cmp	rdi,rcx
+	je	NEAR $L$sqrx8x_outer_loop
+
+	mov	QWORD[8+rdi],r9
+	mov	r9,QWORD[8+rcx]
+	mov	QWORD[16+rdi],r10
+	mov	r10,QWORD[16+rcx]
+	mov	QWORD[24+rdi],r11
+	mov	r11,QWORD[24+rcx]
+	mov	QWORD[32+rdi],r12
+	mov	r12,QWORD[32+rcx]
+	mov	QWORD[40+rdi],r13
+	mov	r13,QWORD[40+rcx]
+	mov	QWORD[48+rdi],r14
+	mov	r14,QWORD[48+rcx]
+	mov	QWORD[56+rdi],r15
+	mov	r15,QWORD[56+rcx]
+	mov	rdi,rcx
+	jmp	NEAR $L$sqrx8x_outer_loop
+
+ALIGN	32
+; Off-diagonal half done: flush the last band, then double it and add
+; the squared diagonal terms a[i]^2 (shift-and-add phase).
+$L$sqrx8x_outer_break:
+	mov	QWORD[72+rdi],r9
+DB	102,72,15,126,217
+	mov	QWORD[80+rdi],r10
+	mov	QWORD[88+rdi],r11
+	mov	QWORD[96+rdi],r12
+	mov	QWORD[104+rdi],r13
+	mov	QWORD[112+rdi],r14
+	lea	rdi,[((48+8))+rsp]
+	mov	rdx,QWORD[rcx*1+rsi]
+
+	mov	r11,QWORD[8+rdi]
+	xor	r10,r10
+	mov	r9,QWORD[((0+8))+rsp]
+	adox	r11,r11
+	mov	r12,QWORD[16+rdi]
+	mov	r13,QWORD[24+rdi]
+
+
+ALIGN	32
+; t[] = 2*t[] + a[i]^2 : adox doubles the off-diagonal words while
+; adcx folds in each mulx rdx*rdx square.
+$L$sqrx4x_shift_n_add:
+	mulx	rbx,rax,rdx
+	adox	r12,r12
+	adcx	rax,r10
+DB	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+DB	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+	adox	r13,r13
+	adcx	rbx,r11
+	mov	r11,QWORD[40+rdi]
+	mov	QWORD[rdi],rax
+	mov	QWORD[8+rdi],rbx
+
+	mulx	rbx,rax,rdx
+	adox	r10,r10
+	adcx	rax,r12
+	mov	rdx,QWORD[16+rcx*1+rsi]
+	mov	r12,QWORD[48+rdi]
+	adox	r11,r11
+	adcx	rbx,r13
+	mov	r13,QWORD[56+rdi]
+	mov	QWORD[16+rdi],rax
+	mov	QWORD[24+rdi],rbx
+
+	mulx	rbx,rax,rdx
+	adox	r12,r12
+	adcx	rax,r10
+	mov	rdx,QWORD[24+rcx*1+rsi]
+	lea	rcx,[32+rcx]
+	mov	r10,QWORD[64+rdi]
+	adox	r13,r13
+	adcx	rbx,r11
+	mov	r11,QWORD[72+rdi]
+	mov	QWORD[32+rdi],rax
+	mov	QWORD[40+rdi],rbx
+
+	mulx	rbx,rax,rdx
+	adox	r10,r10
+	adcx	rax,r12
+	jrcxz	$L$sqrx4x_shift_n_add_break
+DB	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+	adox	r11,r11
+	adcx	rbx,r13
+	mov	r12,QWORD[80+rdi]
+	mov	r13,QWORD[88+rdi]
+	mov	QWORD[48+rdi],rax
+	mov	QWORD[56+rdi],rbx
+	lea	rdi,[64+rdi]
+	nop
+	jmp	NEAR $L$sqrx4x_shift_n_add
+
+ALIGN	32
+$L$sqrx4x_shift_n_add_break:
+	adcx	rbx,r13
+	mov	QWORD[48+rdi],rax
+	mov	QWORD[56+rdi],rbx
+	lea	rdi,[64+rdi]
+DB	102,72,15,126,213
+; Montgomery reduction of the double-width square; falls through from
+; the squaring above.
+__bn_sqrx8x_reduction:
+	xor	eax,eax
+	mov	rbx,QWORD[((32+8))+rsp]
+	mov	rdx,QWORD[((48+8))+rsp]
+	lea	rcx,[((-64))+r9*1+rbp]
+
+	mov	QWORD[((0+8))+rsp],rcx
+	mov	QWORD[((8+8))+rsp],rdi
+
+	lea	rdi,[((48+8))+rsp]
+	jmp	NEAR $L$sqrx8x_reduction_loop
+
+ALIGN	32
+$L$sqrx8x_reduction_loop:
+	mov	r9,QWORD[8+rdi]
+	mov	r10,QWORD[16+rdi]
+	mov	r11,QWORD[24+rdi]
+	mov	r12,QWORD[32+rdi]
+	mov	r8,rdx
+	imul	rdx,rbx
+	mov	r13,QWORD[40+rdi]
+	mov	r14,QWORD[48+rdi]
+	mov	r15,QWORD[56+rdi]
+	mov	QWORD[((24+8))+rsp],rax
+
+	lea	rdi,[64+rdi]
+	xor	rsi,rsi
+	mov	rcx,-8
+	jmp	NEAR $L$sqrx8x_reduce
+
+ALIGN	32
+; One reduction step per limb: subtract m*n (computed via mulx against
+; the modulus at [rbp]) and derive the next m from n0 at 32+8(rsp).
+$L$sqrx8x_reduce:
+	mov	rbx,r8
+	mulx	r8,rax,QWORD[rbp]
+	adcx	rax,rbx
+	adox	r8,r9
+
+	mulx	r9,rbx,QWORD[8+rbp]
+	adcx	r8,rbx
+	adox	r9,r10
+
+	mulx	r10,rbx,QWORD[16+rbp]
+	adcx	r9,rbx
+	adox	r10,r11
+
+	mulx	r11,rbx,QWORD[24+rbp]
+	adcx	r10,rbx
+	adox	r11,r12
+
+DB	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+	mov	rax,rdx
+	mov	rdx,r8
+	adcx	r11,rbx
+	adox	r12,r13
+
+	mulx	rdx,rbx,QWORD[((32+8))+rsp]
+	mov	rdx,rax
+	mov	QWORD[((64+48+8))+rcx*8+rsp],rax
+
+	mulx	r13,rax,QWORD[40+rbp]
+	adcx	r12,rax
+	adox	r13,r14
+
+	mulx	r14,rax,QWORD[48+rbp]
+	adcx	r13,rax
+	adox	r14,r15
+
+	mulx	r15,rax,QWORD[56+rbp]
+	mov	rdx,rbx
+	adcx	r14,rax
+	adox	r15,rsi
+	adcx	r15,rsi
+
+DB	0x67,0x67,0x67
+	inc	rcx
+	jnz	NEAR $L$sqrx8x_reduce
+
+	mov	rax,rsi
+	cmp	rbp,QWORD[((0+8))+rsp]
+	jae	NEAR $L$sqrx8x_no_tail
+
+	mov	rdx,QWORD[((48+8))+rsp]
+	add	r8,QWORD[rdi]
+	lea	rbp,[64+rbp]
+	mov	rcx,-8
+	adcx	r9,QWORD[8+rdi]
+	adcx	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	lea	rdi,[64+rdi]
+	sbb	rax,rax
+
+	xor	rsi,rsi
+	mov	QWORD[((16+8))+rsp],rax
+	jmp	NEAR $L$sqrx8x_tail
+
+ALIGN	32
+$L$sqrx8x_tail:
+	mov	rbx,r8
+	mulx	r8,rax,QWORD[rbp]
+	adcx	rbx,rax
+	adox	r8,r9
+
+	mulx	r9,rax,QWORD[8+rbp]
+	adcx	r8,rax
+	adox	r9,r10
+
+	mulx	r10,rax,QWORD[16+rbp]
+	adcx	r9,rax
+	adox	r10,r11
+
+	mulx	r11,rax,QWORD[24+rbp]
+	adcx	r10,rax
+	adox	r11,r12
+
+DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcx	r11,rax
+	adox	r12,r13
+
+	mulx	r13,rax,QWORD[40+rbp]
+	adcx	r12,rax
+	adox	r13,r14
+
+	mulx	r14,rax,QWORD[48+rbp]
+	adcx	r13,rax
+	adox	r14,r15
+
+	mulx	r15,rax,QWORD[56+rbp]
+	mov	rdx,QWORD[((72+48+8))+rcx*8+rsp]
+	adcx	r14,rax
+	adox	r15,rsi
+	mov	QWORD[rcx*8+rdi],rbx
+	mov	rbx,r8
+	adcx	r15,rsi
+
+	inc	rcx
+	jnz	NEAR $L$sqrx8x_tail
+
+	cmp	rbp,QWORD[((0+8))+rsp]
+	jae	NEAR $L$sqrx8x_tail_done
+
+	sub	rsi,QWORD[((16+8))+rsp]
+	mov	rdx,QWORD[((48+8))+rsp]
+	lea	rbp,[64+rbp]
+	adc	r8,QWORD[rdi]
+	adc	r9,QWORD[8+rdi]
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	lea	rdi,[64+rdi]
+	sbb	rax,rax
+	sub	rcx,8
+
+	xor	rsi,rsi
+	mov	QWORD[((16+8))+rsp],rax
+	jmp	NEAR $L$sqrx8x_tail
+
+ALIGN	32
+$L$sqrx8x_tail_done:
+	xor	rax,rax
+	add	r8,QWORD[((24+8))+rsp]
+	adc	r9,0
+	adc	r10,0
+	adc	r11,0
+	adc	r12,0
+	adc	r13,0
+	adc	r14,0
+	adc	r15,0
+	adc	rax,0
+
+	sub	rsi,QWORD[((16+8))+rsp]
+$L$sqrx8x_no_tail:
+	adc	r8,QWORD[rdi]
+DB	102,72,15,126,217
+	adc	r9,QWORD[8+rdi]
+	mov	rsi,QWORD[56+rbp]
+DB	102,72,15,126,213
+	adc	r10,QWORD[16+rdi]
+	adc	r11,QWORD[24+rdi]
+	adc	r12,QWORD[32+rdi]
+	adc	r13,QWORD[40+rdi]
+	adc	r14,QWORD[48+rdi]
+	adc	r15,QWORD[56+rdi]
+	adc	rax,0
+
+	mov	rbx,QWORD[((32+8))+rsp]
+	mov	rdx,QWORD[64+rcx*1+rdi]
+
+	mov	QWORD[rdi],r8
+	lea	r8,[64+rdi]
+	mov	QWORD[8+rdi],r9
+	mov	QWORD[16+rdi],r10
+	mov	QWORD[24+rdi],r11
+	mov	QWORD[32+rdi],r12
+	mov	QWORD[40+rdi],r13
+	mov	QWORD[48+rdi],r14
+	mov	QWORD[56+rdi],r15
+
+	lea	rdi,[64+rcx*1+rdi]
+	cmp	r8,QWORD[((8+8))+rsp]
+	jb	NEAR $L$sqrx8x_reduction_loop
+	DB	0F3h,0C3h		;repret
+
+
+ALIGN	32
+
+;------------------------------------------------------------------------
+; __bn_postx4x_internal: final conditional subtraction after the
+; squaring/reduction pass.  rax holds an all-ones/all-zeros borrow mask;
+; `andn rX,rX,rax` masks the modulus words so the subtraction (via the
+; adc chain with inverted carry r8) is performed branch-free -- the
+; result is reduced mod n without a data-dependent branch.
+; $L$sqrx4x_sub_entry is also the tail target of mulx4x_internal above.
+;------------------------------------------------------------------------
+__bn_postx4x_internal:
+
+	mov	r12,QWORD[rbp]
+	mov	r10,rcx
+	mov	r9,rcx
+	neg	rax
+	sar	rcx,3+2
+
+DB	102,72,15,126,202
+DB	102,72,15,126,206
+	dec	r12
+	mov	r13,QWORD[8+rbp]
+	xor	r8,r8
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+	jmp	NEAR $L$sqrx4x_sub_entry
+
+ALIGN	16
+$L$sqrx4x_sub:
+	mov	r12,QWORD[rbp]
+	mov	r13,QWORD[8+rbp]
+	mov	r14,QWORD[16+rbp]
+	mov	r15,QWORD[24+rbp]
+$L$sqrx4x_sub_entry:
+	andn	r12,r12,rax
+	lea	rbp,[32+rbp]
+	andn	r13,r13,rax
+	andn	r14,r14,rax
+	andn	r15,r15,rax
+
+	neg	r8
+	adc	r12,QWORD[rdi]
+	adc	r13,QWORD[8+rdi]
+	adc	r14,QWORD[16+rdi]
+	adc	r15,QWORD[24+rdi]
+	mov	QWORD[rdx],r12
+	lea	rdi,[32+rdi]
+	mov	QWORD[8+rdx],r13
+	sbb	r8,r8
+	mov	QWORD[16+rdx],r14
+	mov	QWORD[24+rdx],r15
+	lea	rdx,[32+rdx]
+
+	inc	rcx
+	jnz	NEAR $L$sqrx4x_sub
+
+	neg	r9
+
+	DB	0F3h,0C3h		;repret
+
+
+global	GFp_bn_scatter5
+
+ALIGN	16
+;------------------------------------------------------------------------
+; GFp_bn_scatter5: store edx qwords from [rcx] into the table at r8 at a
+; stride of 256 bytes, starting at slot r9 (r8 += r9*8).  The wide
+; stride interleaves table entries so the later constant-time gather
+; touches every cache line per limb.  Win64 args: rcx=src, edx=count,
+; r8=table, r9=index.
+;------------------------------------------------------------------------
+GFp_bn_scatter5:
+
+	cmp	edx,0
+	jz	NEAR $L$scatter_epilogue
+	lea	r8,[r9*8+r8]
+$L$scatter:
+	mov	rax,QWORD[rcx]
+	lea	rcx,[8+rcx]
+	mov	QWORD[r8],rax
+	lea	r8,[256+r8]
+	sub	edx,1
+	jnz	NEAR $L$scatter
+$L$scatter_epilogue:
+	DB	0F3h,0C3h		;repret
+
+
+
+global	GFp_bn_gather5
+
+ALIGN	32
+;------------------------------------------------------------------------
+; GFp_bn_gather5: inverse of GFp_bn_scatter5.  Builds sixteen pcmpeqd
+; masks from the requested index (r9d) against the $L$inc constants,
+; then for each output qword reads ALL sixteen 256-byte-strided table
+; lines and combines them with pand/por -- the access pattern is
+; independent of the secret index (cache-timing defense).
+; Win64 args: rcx=dst, edx=count, r8=table, r9d=index.
+;------------------------------------------------------------------------
+GFp_bn_gather5:
+
+$L$SEH_begin_GFp_bn_gather5:
+
+DB	0x4c,0x8d,0x14,0x24		; lea r10,[rsp] (saved for epilogue)
+
+DB	0x48,0x81,0xec,0x08,0x01,0x00,0x00	; sub rsp,0x108
+	lea	rax,[$L$inc]
+	and	rsp,-16
+
+	movd	xmm5,r9d
+	movdqa	xmm0,XMMWORD[rax]
+	movdqa	xmm1,XMMWORD[16+rax]
+	lea	r11,[128+r8]
+	lea	rax,[128+rsp]
+
+; Generate the sixteen index-equality masks on the stack.
+	pshufd	xmm5,xmm5,0
+	movdqa	xmm4,xmm1
+	movdqa	xmm2,xmm1
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	xmm3,xmm4
+
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[(-128)+rax],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[(-112)+rax],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[(-96)+rax],xmm2
+	movdqa	xmm2,xmm4
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[(-80)+rax],xmm3
+	movdqa	xmm3,xmm4
+
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[(-64)+rax],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[(-48)+rax],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[(-32)+rax],xmm2
+	movdqa	xmm2,xmm4
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[(-16)+rax],xmm3
+	movdqa	xmm3,xmm4
+
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[rax],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[16+rax],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[32+rax],xmm2
+	movdqa	xmm2,xmm4
+	paddd	xmm1,xmm0
+	pcmpeqd	xmm0,xmm5
+	movdqa	XMMWORD[48+rax],xmm3
+	movdqa	xmm3,xmm4
+
+	paddd	xmm2,xmm1
+	pcmpeqd	xmm1,xmm5
+	movdqa	XMMWORD[64+rax],xmm0
+	movdqa	xmm0,xmm4
+
+	paddd	xmm3,xmm2
+	pcmpeqd	xmm2,xmm5
+	movdqa	XMMWORD[80+rax],xmm1
+	movdqa	xmm1,xmm4
+
+	paddd	xmm0,xmm3
+	pcmpeqd	xmm3,xmm5
+	movdqa	XMMWORD[96+rax],xmm2
+	movdqa	xmm2,xmm4
+	movdqa	XMMWORD[112+rax],xmm3
+	jmp	NEAR $L$gather
+
+ALIGN	32
+; Per-qword gather: scan all 16 table lines, mask, OR, emit one qword.
+$L$gather:
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movdqa	xmm0,XMMWORD[((-128))+r11]
+	movdqa	xmm1,XMMWORD[((-112))+r11]
+	movdqa	xmm2,XMMWORD[((-96))+r11]
+	pand	xmm0,XMMWORD[((-128))+rax]
+	movdqa	xmm3,XMMWORD[((-80))+r11]
+	pand	xmm1,XMMWORD[((-112))+rax]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-96))+rax]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-80))+rax]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[((-64))+r11]
+	movdqa	xmm1,XMMWORD[((-48))+r11]
+	movdqa	xmm2,XMMWORD[((-32))+r11]
+	pand	xmm0,XMMWORD[((-64))+rax]
+	movdqa	xmm3,XMMWORD[((-16))+r11]
+	pand	xmm1,XMMWORD[((-48))+rax]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[((-32))+rax]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[((-16))+rax]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[r11]
+	movdqa	xmm1,XMMWORD[16+r11]
+	movdqa	xmm2,XMMWORD[32+r11]
+	pand	xmm0,XMMWORD[rax]
+	movdqa	xmm3,XMMWORD[48+r11]
+	pand	xmm1,XMMWORD[16+rax]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[32+rax]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[48+rax]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	movdqa	xmm0,XMMWORD[64+r11]
+	movdqa	xmm1,XMMWORD[80+r11]
+	movdqa	xmm2,XMMWORD[96+r11]
+	pand	xmm0,XMMWORD[64+rax]
+	movdqa	xmm3,XMMWORD[112+r11]
+	pand	xmm1,XMMWORD[80+rax]
+	por	xmm4,xmm0
+	pand	xmm2,XMMWORD[96+rax]
+	por	xmm5,xmm1
+	pand	xmm3,XMMWORD[112+rax]
+	por	xmm4,xmm2
+	por	xmm5,xmm3
+	por	xmm4,xmm5
+	lea	r11,[256+r11]
+	pshufd	xmm0,xmm4,0x4e
+	por	xmm0,xmm4
+	movq	QWORD[rcx],xmm0
+	lea	rcx,[8+rcx]
+	sub	edx,1
+	jnz	NEAR $L$gather
+
+	lea	rsp,[r10]
+
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_GFp_bn_gather5:
+
+
+ALIGN	64
+; $L$inc: {0,0,1,1} / {2,2,2,2} dword increments used to enumerate the
+; sixteen candidate indices when building the pcmpeqd selection masks.
+$L$inc:
+	DD	0,0,1,1
+	DD	2,2,2,2
+; ASCII credit string: "Montgomery Multiplication with scatter/gather
+; for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+DB	112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
+DB	99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
+DB	114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
+DB	71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
+DB	112,101,110,115,115,108,46,111,114,103,62,0
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+;------------------------------------------------------------------------
+; mul_handler: Windows x64 SEH language handler shared by the routines
+; in this file.  r8 = CONTEXT, r9 = DISPATCHER_CONTEXT.  It compares the
+; faulting RIP (CONTEXT.Rip at [120+r8] / [248+r8]) against the
+; prologue/body/epilogue RVAs stored in the unwind HandlerData ([56+r9],
+; read as three DWORDs below), restores the saved nonvolatile registers
+; into the CONTEXT, then chains to RtlVirtualUnwind.
+;------------------------------------------------------------------------
+mul_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+; Before the prologue completed?  Nothing saved yet.
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_pop_regs
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[8+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	r10,[$L$mul_epilogue]
+	cmp	rbx,r10
+	ja	NEAR $L$body_40
+
+	mov	r10,QWORD[192+r8]
+	mov	rax,QWORD[8+r10*8+rax]
+
+	jmp	NEAR $L$common_pop_regs
+
+$L$body_40:
+	mov	rax,QWORD[40+rax]
+$L$common_pop_regs:
+; rax = saved frame pointer; reload the six nonvolatile GPRs spilled by
+; the function prologues and publish them back into the CONTEXT record.
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc		; cld; rep movsq -- copy CONTEXT (154 qwords)
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+section	.pdata rdata align=4
+; Win64 exception function table: one (begin, end, unwind-info) RVA
+; triple per SEH-annotated routine in this file.
+ALIGN	4
+	DD	$L$SEH_begin_GFp_bn_mul_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_end_GFp_bn_mul_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_info_GFp_bn_mul_mont_gather5 wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
+
+	DD	$L$SEH_begin_GFp_bn_power5 wrt ..imagebase
+	DD	$L$SEH_end_GFp_bn_power5 wrt ..imagebase
+	DD	$L$SEH_info_GFp_bn_power5 wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_from_mont8x wrt ..imagebase
+	DD	$L$SEH_end_bn_from_mont8x wrt ..imagebase
+	DD	$L$SEH_info_bn_from_mont8x wrt ..imagebase
+	DD	$L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
+
+	DD	$L$SEH_begin_bn_powerx5 wrt ..imagebase
+	DD	$L$SEH_end_bn_powerx5 wrt ..imagebase
+	DD	$L$SEH_info_GFp_bn_powerx5 wrt ..imagebase
+	DD	$L$SEH_begin_GFp_bn_gather5 wrt ..imagebase
+	DD	$L$SEH_end_GFp_bn_gather5 wrt ..imagebase
+	DD	$L$SEH_info_GFp_bn_gather5 wrt ..imagebase
+
+section	.xdata rdata align=8
+; Win64 unwind info.  The `DB 9,0,0,0` records register a language
+; handler (mul_handler); the three DD label RVAs after the handler are
+; the prologue/body/epilogue markers that mul_handler reads as its
+; HandlerData.  GFp_bn_gather5 instead uses raw unwind opcodes.
+ALIGN	8
+$L$SEH_info_GFp_bn_mul_mont_gather5:
+DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_mul4x_mont_gather5:
+DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_GFp_bn_power5:
+DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_from_mont8x:
+DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_bn_mulx4x_mont_gather5:
+DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_GFp_bn_powerx5:
+DB	9,0,0,0
+	DD	mul_handler wrt ..imagebase
+	DD	$L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
+ALIGN	8
+$L$SEH_info_GFp_bn_gather5:
+; Raw unwind codes (no handler) -- NOTE(review): encoding taken as-is
+; from the generator; verify against the Win64 UNWIND_INFO format.
+DB	0x01,0x0b,0x03,0x0a
+DB	0x0b,0x01,0x21,0x00
+DB	0x04,0xa3,0x00,0x00