summary | refs | log | tree | commit | diff
path: root/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha
diff options
context:
space:
mode:
Diffstat (limited to 'CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha')
-rw-r--r-- CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm 527
-rw-r--r-- CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm 7610
-rw-r--r-- CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm 5766
-rw-r--r-- CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm 8291
-rw-r--r-- CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm 5711
-rw-r--r-- CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm 5665
6 files changed, 33570 insertions, 0 deletions
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm
new file mode 100644
index 0000000..af4b87d
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm
@@ -0,0 +1,527 @@
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section .text code align=64
+
+
+
+ALIGN 32 ;__KeccakF1600: internal round loop; no registers are saved here, callers do that
+__KeccakF1600: ;in: rdi,rsi = two state buffers, each biased by +100 (lane k at [8*k-100+ptr], k=0..24); r15 = constant pointer
+;each pass reads lanes via rdi, writes new lanes via rsi, then swaps rdi/rsi; modifies rax,rbx,rcx,rdx,rbp,r8-r14 and flags
+ mov rax,QWORD[60+rdi] ;preload lanes 20..24 into rax,rbx,rcx,rdx,rbp
+ mov rbx,QWORD[68+rdi]
+ mov rcx,QWORD[76+rdi]
+ mov rdx,QWORD[84+rdi]
+ mov rbp,QWORD[92+rdi]
+ jmp NEAR $L$oop
+
+ALIGN 32
+$L$oop: ;one pass per 8-byte constant consumed from [r15]
+ mov r8,QWORD[((-100))+rdi] ;r8..r11 = lanes 0,6,12,18 (inputs of the first output row)
+ mov r9,QWORD[((-52))+rdi]
+ mov r10,QWORD[((-4))+rdi]
+ mov r11,QWORD[44+rdi]
+;column parities: each of rax,rbx,rcx,rdx,rbp accumulates the xor of the five lanes k, k+5, k+10, k+15, k+20
+ xor rcx,QWORD[((-84))+rdi]
+ xor rdx,QWORD[((-76))+rdi]
+ xor rax,r8
+ xor rbx,QWORD[((-92))+rdi]
+ xor rcx,QWORD[((-44))+rdi]
+ xor rax,QWORD[((-60))+rdi]
+ mov r12,rbp ;r12 = lane 24, copied before rbp is folded into a column sum
+ xor rbp,QWORD[((-68))+rdi]
+
+ xor rcx,r10
+ xor rax,QWORD[((-20))+rdi]
+ xor rdx,QWORD[((-36))+rdi]
+ xor rbx,r9
+ xor rbp,QWORD[((-28))+rdi]
+
+ xor rcx,QWORD[36+rdi] ;rcx = lanes 2^7^12^17^22
+ xor rax,QWORD[20+rdi] ;rax = lanes 0^5^10^15^20
+ xor rdx,QWORD[4+rdi]
+ xor rbx,QWORD[((-12))+rdi]
+ xor rbp,QWORD[12+rdi]
+
+ mov r13,rcx ;keep the unrotated rcx sum for the final combination
+ rol rcx,1
+ xor rcx,rax ;rcx = rol(sum2,1) ^ sum0
+ xor rdx,r11 ;rdx = lanes 3^8^13^18^23
+
+ rol rax,1
+ xor rax,rdx ;rax = rol(sum0,1) ^ sum3
+ xor rbx,QWORD[28+rdi] ;rbx = lanes 1^6^11^16^21
+
+ rol rdx,1
+ xor rdx,rbx ;rdx = rol(sum3,1) ^ sum1
+ xor rbp,QWORD[52+rdi] ;rbp = lanes 4^9^14^19^24
+
+ rol rbx,1
+ xor rbx,rbp ;rbx = rol(sum1,1) ^ sum4
+
+ rol rbp,1
+ xor rbp,r13 ;rbp = rol(sum4,1) ^ sum2
+ xor r9,rcx ;first row: mix the per-column values into lanes 0,6,12,18,24 and rotate
+ xor r10,rdx
+ rol r9,44
+ xor r11,rbp
+ xor r12,rax
+ rol r10,43
+ xor r8,rbx
+ mov r13,r9
+ rol r11,21
+ or r9,r10
+ xor r9,r8
+ rol r12,14
+
+ xor r9,QWORD[r15] ;fold in the current 64-bit constant
+ lea r15,[8+r15] ;advance to the next constant (lea leaves flags untouched)
+
+ mov r14,r12
+ and r12,r11
+ mov QWORD[((-100))+rsi],r9 ;store output lane 0
+ xor r12,r10
+ not r10
+ mov QWORD[((-84))+rsi],r12 ;store output lane 2
+
+ or r10,r11
+ mov r12,QWORD[76+rdi] ;start loading inputs for the next row (lane 22)
+ xor r10,r13
+ mov QWORD[((-92))+rsi],r10 ;store output lane 1
+
+ and r13,r8
+ mov r9,QWORD[((-28))+rdi] ;lane 9
+ xor r13,r14
+ mov r10,QWORD[((-20))+rdi] ;lane 10
+ mov QWORD[((-68))+rsi],r13 ;store output lane 4
+
+ or r14,r8
+ mov r8,QWORD[((-76))+rdi] ;lane 3
+ xor r14,r11
+ mov r11,QWORD[28+rdi] ;lane 16
+ mov QWORD[((-76))+rsi],r14 ;store output lane 3
+
+;second row: built from input lanes 3,9,10,16,22
+ xor r8,rbp
+ xor r12,rdx
+ rol r8,28
+ xor r11,rcx
+ xor r9,rax
+ rol r12,61
+ rol r11,45
+ xor r10,rbx
+ rol r9,20
+ mov r13,r8
+ or r8,r12
+ rol r10,3
+
+ xor r8,r11
+ mov QWORD[((-36))+rsi],r8 ;store output lane 8
+
+ mov r14,r9
+ and r9,r13
+ mov r8,QWORD[((-92))+rdi] ;lane 1 for the next row
+ xor r9,r12
+ not r12
+ mov QWORD[((-28))+rsi],r9 ;store output lane 9
+
+ or r12,r11
+ mov r9,QWORD[((-44))+rdi] ;lane 7
+ xor r12,r10
+ mov QWORD[((-44))+rsi],r12 ;store output lane 7
+
+ and r11,r10
+ mov r12,QWORD[60+rdi] ;lane 20
+ xor r11,r14
+ mov QWORD[((-52))+rsi],r11 ;store output lane 6
+
+ or r14,r10
+ mov r10,QWORD[4+rdi] ;lane 13
+ xor r14,r13
+ mov r11,QWORD[52+rdi] ;lane 19
+ mov QWORD[((-60))+rsi],r14 ;store output lane 5
+
+;third row: built from input lanes 1,7,13,19,20
+ xor r10,rbp
+ xor r11,rax
+ rol r10,25
+ xor r9,rdx
+ rol r11,8
+ xor r12,rbx
+ rol r9,6
+ xor r8,rcx
+ rol r12,18
+ mov r13,r10
+ and r10,r11
+ rol r8,1
+
+ not r11
+ xor r10,r9
+ mov QWORD[((-12))+rsi],r10 ;store output lane 11
+
+ mov r14,r12
+ and r12,r11
+ mov r10,QWORD[((-12))+rdi] ;lane 11 for the next row
+ xor r12,r13
+ mov QWORD[((-4))+rsi],r12 ;store output lane 12
+
+ or r13,r9
+ mov r12,QWORD[84+rdi] ;lane 23
+ xor r13,r8
+ mov QWORD[((-20))+rsi],r13 ;store output lane 10
+
+ and r9,r8
+ xor r9,r14
+ mov QWORD[12+rsi],r9 ;store output lane 14
+
+ or r14,r8
+ mov r9,QWORD[((-60))+rdi] ;lane 5
+ xor r14,r11
+ mov r11,QWORD[36+rdi] ;lane 17
+ mov QWORD[4+rsi],r14 ;store output lane 13
+
+;fourth row: built from input lanes 4,5,11,17,23
+ mov r8,QWORD[((-68))+rdi] ;lane 4
+
+ xor r10,rcx
+ xor r11,rdx
+ rol r10,10
+ xor r9,rbx
+ rol r11,15
+ xor r12,rbp
+ rol r9,36
+ xor r8,rax
+ rol r12,56
+ mov r13,r10
+ or r10,r11
+ rol r8,27
+
+ not r11
+ xor r10,r9
+ mov QWORD[28+rsi],r10 ;store output lane 16
+
+ mov r14,r12
+ or r12,r11
+ xor r12,r13
+ mov QWORD[36+rsi],r12 ;store output lane 17
+
+ and r13,r9
+ xor r13,r8
+ mov QWORD[20+rsi],r13 ;store output lane 15
+
+ or r9,r8
+ xor r9,r14
+ mov QWORD[52+rsi],r9 ;store output lane 19
+
+ and r8,r14
+ xor r8,r11
+ mov QWORD[44+rsi],r8 ;store output lane 18
+
+;fifth row: the per-column values in rdx,rbp,rcx,rax,rbx are combined in place with input lanes 2,8,21,14,15
+ xor rdx,QWORD[((-84))+rdi]
+ xor rbp,QWORD[((-36))+rdi]
+ rol rdx,62
+ xor rcx,QWORD[68+rdi]
+ rol rbp,55
+ xor rax,QWORD[12+rdi]
+ rol rcx,2
+ xor rbx,QWORD[20+rdi]
+ xchg rdi,rsi ;swap buffers: this pass's output becomes the next pass's input
+ rol rax,39
+ rol rbx,41
+ mov r13,rdx
+ and rdx,rbp
+ not rbp
+ xor rdx,rcx
+ mov QWORD[92+rdi],rdx ;store output lane 24 (rdi now names the output buffer)
+
+ mov r14,rax
+ and rax,rbp
+ xor rax,r13
+ mov QWORD[60+rdi],rax ;store output lane 20
+
+ or r13,rcx
+ xor r13,rbx
+ mov QWORD[84+rdi],r13 ;store output lane 23
+
+ and rcx,rbx
+ xor rcx,r14
+ mov QWORD[76+rdi],rcx ;store output lane 22
+
+ or rbx,r14
+ xor rbx,rbp
+ mov QWORD[68+rdi],rbx ;store output lane 21
+
+ mov rbp,rdx ;restore the loop-entry layout: rbp = lane 24,
+ mov rdx,r13 ;rdx = lane 23 (rax,rbx,rcx already hold lanes 20..22)
+
+ test r15,255 ;continue until the low byte of r15 is zero
+ jnz NEAR $L$oop
+
+ lea r15,[((-192))+r15] ;rewind r15 by 192 bytes, i.e. 24 eight-byte constants
+ DB 0F3h,0C3h ;repret (F3 C3 = "rep ret")
+
+
+
+
+ALIGN 32 ;KeccakF1600: register-saving wrapper around __KeccakF1600 for a state addressed by rdi
+KeccakF1600: ;in: rdi = state pointer (unbiased); no "global" directive is visible for this label
+;saves and restores rbx,rbp,r12-r15; rsi and the other registers written by __KeccakF1600 are not preserved
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+
+ lea rdi,[100+rdi] ;bias the pointer so the 25 lanes sit at displacements -100..+92 (all within a signed byte)
+ sub rsp,200 ;200-byte stack buffer handed to __KeccakF1600 as the second state copy
+
+;complement lanes 1,2,8,12,17,20 before the call; the same six lanes are complemented again afterwards
+ not QWORD[((-92))+rdi]
+ not QWORD[((-84))+rdi]
+ not QWORD[((-36))+rdi]
+ not QWORD[((-4))+rdi]
+ not QWORD[36+rdi]
+ not QWORD[60+rdi]
+
+ lea r15,[iotas] ;r15 = start of the constant table
+ lea rsi,[100+rsp] ;rsi = stack buffer, biased the same way as rdi
+
+ call __KeccakF1600
+
+ not QWORD[((-92))+rdi] ;undo the complement applied above
+ not QWORD[((-84))+rdi]
+ not QWORD[((-36))+rdi]
+ not QWORD[((-4))+rdi]
+ not QWORD[36+rdi]
+ not QWORD[60+rdi]
+ lea rdi,[((-100))+rdi] ;remove the +100 bias so rdi again holds the caller's pointer
+
+ add rsp,200
+
+
+ pop r15
+
+ pop r14
+
+ pop r13
+
+ pop r12
+
+ pop rbp
+
+ pop rbx
+
+ DB 0F3h,0C3h ;repret (F3 C3 = "rep ret")
+
+
+global SHA3_absorb
+;SHA3_absorb (Win64 entry): rcx = state, rdx = input pointer, r8 = input length in bytes, r9 = block size in bytes; returns in rax the number of input bytes left unprocessed
+ALIGN 32
+SHA3_absorb:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue: rdi/rsi are callee-saved on Win64, park them in the caller's home slots
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp ;copy of the entry rsp; rax is overwritten before being read in this routine
+$L$SEH_begin_SHA3_absorb:
+ mov rdi,rcx ;remap the Win64 argument registers onto rdi,rsi,rdx,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+
+ lea rdi,[100+rdi] ;bias the state pointer as __KeccakF1600 expects
+ sub rsp,232 ;200-byte second state buffer plus spill slots at rsp+200, +208 and +216
+
+
+ mov r9,rsi ;r9 = input cursor (rsi is reused for the stack buffer)
+ lea rsi,[100+rsp]
+
+ not QWORD[((-92))+rdi] ;complement lanes 1,2,8,12,17,20 for the duration of the absorb loop
+ not QWORD[((-84))+rdi]
+ not QWORD[((-36))+rdi]
+ not QWORD[((-4))+rdi]
+ not QWORD[36+rdi]
+ not QWORD[60+rdi]
+ lea r15,[iotas]
+
+ mov QWORD[((216-100))+rsi],rcx ;spill the block size (rsp+216); rcx is used as a counter below
+
+$L$oop_absorb:
+ cmp rdx,rcx
+ jc NEAR $L$done_absorb ;fewer bytes left than one block: stop and report the remainder
+
+ shr rcx,3 ;block size in bytes -> count of 8-byte lanes; NOTE(review): assumes a nonzero multiple of 8, a value below 8 would make the counter wrap - confirm callers
+ lea r8,[((-100))+rdi] ;r8 = unbiased start of the state
+
+$L$block_absorb:
+ mov rax,QWORD[r9] ;xor one 8-byte input word into the next state lane
+ lea r9,[8+r9]
+ xor rax,QWORD[r8]
+ lea r8,[8+r8]
+ sub rdx,8
+ mov QWORD[((-8))+r8],rax
+ sub rcx,1
+ jnz NEAR $L$block_absorb
+
+ mov QWORD[((200-100))+rsi],r9 ;spill input cursor and remaining length: __KeccakF1600 overwrites r9 and rdx
+ mov QWORD[((208-100))+rsi],rdx
+ call __KeccakF1600
+ mov r9,QWORD[((200-100))+rsi] ;reload cursor, remaining length and block size
+ mov rdx,QWORD[((208-100))+rsi]
+ mov rcx,QWORD[((216-100))+rsi]
+ jmp NEAR $L$oop_absorb
+
+ALIGN 32
+$L$done_absorb:
+ mov rax,rdx ;return value = bytes not absorbed
+
+ not QWORD[((-92))+rdi] ;undo the lane complement applied on entry
+ not QWORD[((-84))+rdi]
+ not QWORD[((-36))+rdi]
+ not QWORD[((-4))+rdi]
+ not QWORD[36+rdi]
+ not QWORD[60+rdi]
+
+ add rsp,232
+
+
+ pop r15
+
+ pop r14
+
+ pop r13
+
+ pop r12
+
+ pop rbp
+
+ pop rbx
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue: restore rdi/rsi from the home slots written in the prologue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret (F3 C3 = "rep ret")
+
+$L$SEH_end_SHA3_absorb:
+global SHA3_squeeze
+;SHA3_squeeze (Win64 entry): rcx = state, rdx = output pointer, r8 = output length in bytes, r9 = block size in bytes
+ALIGN 32
+SHA3_squeeze:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue: park callee-saved rdi/rsi in the caller's home slots
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_SHA3_squeeze:
+ mov rdi,rcx ;remap the Win64 argument registers onto rdi,rsi,rdx,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+ push r12 ;r12-r14 hold the values that must survive the KeccakF1600 call (it saves and restores them)
+
+ push r13
+
+ push r14
+
+
+ shr rcx,3 ;block size in bytes -> 8-byte lanes per block
+ mov r8,rdi ;r8 = read cursor inside the state
+ mov r12,rsi ;r12 = output cursor
+ mov r13,rdx ;r13 = output bytes still to produce
+ mov r14,rcx ;r14 = lanes per block, used to reload rcx after each permutation
+ jmp NEAR $L$oop_squeeze
+
+ALIGN 32
+$L$oop_squeeze:
+ cmp r13,8
+ jb NEAR $L$tail_squeeze ;fewer than 8 bytes wanted: finish with a byte copy
+
+ mov rax,QWORD[r8] ;copy one full lane from the state to the output
+ lea r8,[8+r8]
+ mov QWORD[r12],rax
+ lea r12,[8+r12]
+ sub r13,8
+ jz NEAR $L$done_squeeze ;request satisfied exactly
+
+ sub rcx,1
+ jnz NEAR $L$oop_squeeze ;lanes remain in the current block
+
+ call KeccakF1600 ;block exhausted: permute the state (rdi is returned unchanged by KeccakF1600)
+ mov r8,rdi ;restart reading at the beginning of the state
+ mov rcx,r14
+ jmp NEAR $L$oop_squeeze
+
+$L$tail_squeeze:
+ mov rsi,r8 ;source = current position in the state
+ mov rdi,r12 ;destination = output cursor
+ mov rcx,r13 ;count = remaining bytes (0..7)
+DB 0xf3,0xa4 ;rep movsb
+
+$L$done_squeeze:
+ pop r14
+
+ pop r13
+
+ pop r12
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue: restore rdi/rsi from the home slots
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret (F3 C3 = "rep ret")
+
+$L$SEH_end_SHA3_squeeze:
+ALIGN 256 ;with the 64 bytes of padding below, the end of the 24-entry table (iotas+192) lands on a 256-byte boundary
+ DQ 0,0,0,0,0,0,0,0 ;padding only; __KeccakF1600 stops when the low byte of its table pointer becomes zero
+;24 eight-byte constants, one consumed per pass of the __KeccakF1600 loop
+iotas:
+ DQ 0x0000000000000001 ;first constant (pass 0)
+ DQ 0x0000000000008082
+ DQ 0x800000000000808a
+ DQ 0x8000000080008000
+ DQ 0x000000000000808b
+ DQ 0x0000000080000001
+ DQ 0x8000000080008081
+ DQ 0x8000000000008009
+ DQ 0x000000000000008a
+ DQ 0x0000000000000088
+ DQ 0x0000000080008009
+ DQ 0x000000008000000a
+ DQ 0x000000008000808b
+ DQ 0x800000000000008b
+ DQ 0x8000000000008089
+ DQ 0x8000000000008003
+ DQ 0x8000000000008002
+ DQ 0x8000000000000080
+ DQ 0x000000000000800a
+ DQ 0x800000008000000a
+ DQ 0x8000000080008081
+ DQ 0x8000000000008080
+ DQ 0x0000000080000001
+ DQ 0x8000000080008008 ;last constant (pass 23)
+;NUL-terminated ASCII: "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+DB 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111
+DB 114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102
+DB 111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84
+DB 79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64
+DB 111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm
new file mode 100644
index 0000000..9018065
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm
@@ -0,0 +1,7610 @@
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section .text code align=64
+
+
+EXTERN OPENSSL_ia32cap_P
+
+global sha1_multi_block
+
+ALIGN 32
+sha1_multi_block:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_multi_block:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+ mov rcx,QWORD[((OPENSSL_ia32cap_P+4))]
+ bt rcx,61
+ jc NEAR _shaext_shortcut
+ test ecx,268435456
+ jnz NEAR _avx_shortcut
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ lea rsp,[((-168))+rsp]
+ movaps XMMWORD[rsp],xmm6
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[(-120)+rax],xmm10
+ movaps XMMWORD[(-104)+rax],xmm11
+ movaps XMMWORD[(-88)+rax],xmm12
+ movaps XMMWORD[(-72)+rax],xmm13
+ movaps XMMWORD[(-56)+rax],xmm14
+ movaps XMMWORD[(-40)+rax],xmm15
+ sub rsp,288
+ and rsp,-256
+ mov QWORD[272+rsp],rax
+
+$L$body:
+ lea rbp,[K_XX_XX]
+ lea rbx,[256+rsp]
+
+$L$oop_grande:
+ mov DWORD[280+rsp],edx
+ xor edx,edx
+
+ mov r8,QWORD[rsi]
+
+ mov ecx,DWORD[8+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[rbx],ecx
+ cmovle r8,rbp
+
+ mov r9,QWORD[16+rsi]
+
+ mov ecx,DWORD[24+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[4+rbx],ecx
+ cmovle r9,rbp
+
+ mov r10,QWORD[32+rsi]
+
+ mov ecx,DWORD[40+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[8+rbx],ecx
+ cmovle r10,rbp
+
+ mov r11,QWORD[48+rsi]
+
+ mov ecx,DWORD[56+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[12+rbx],ecx
+ cmovle r11,rbp
+ test edx,edx
+ jz NEAR $L$done
+
+ movdqu xmm10,XMMWORD[rdi]
+ lea rax,[128+rsp]
+ movdqu xmm11,XMMWORD[32+rdi]
+ movdqu xmm12,XMMWORD[64+rdi]
+ movdqu xmm13,XMMWORD[96+rdi]
+ movdqu xmm14,XMMWORD[128+rdi]
+ movdqa xmm5,XMMWORD[96+rbp]
+ movdqa xmm15,XMMWORD[((-32))+rbp]
+ jmp NEAR $L$oop
+
+ALIGN 32
+$L$oop:
+ movd xmm0,DWORD[r8]
+ lea r8,[64+r8]
+ movd xmm2,DWORD[r9]
+ lea r9,[64+r9]
+ movd xmm3,DWORD[r10]
+ lea r10,[64+r10]
+ movd xmm4,DWORD[r11]
+ lea r11,[64+r11]
+ punpckldq xmm0,xmm3
+ movd xmm1,DWORD[((-60))+r8]
+ punpckldq xmm2,xmm4
+ movd xmm9,DWORD[((-60))+r9]
+ punpckldq xmm0,xmm2
+ movd xmm8,DWORD[((-60))+r10]
+DB 102,15,56,0,197
+ movd xmm7,DWORD[((-60))+r11]
+ punpckldq xmm1,xmm8
+ movdqa xmm8,xmm10
+ paddd xmm14,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm11
+ movdqa xmm6,xmm11
+ pslld xmm8,5
+ pandn xmm7,xmm13
+ pand xmm6,xmm12
+ punpckldq xmm1,xmm9
+ movdqa xmm9,xmm10
+
+ movdqa XMMWORD[(0-128)+rax],xmm0
+ paddd xmm14,xmm0
+ movd xmm2,DWORD[((-56))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm11
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-56))+r9]
+ pslld xmm7,30
+ paddd xmm14,xmm6
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+DB 102,15,56,0,205
+ movd xmm8,DWORD[((-56))+r10]
+ por xmm11,xmm7
+ movd xmm7,DWORD[((-56))+r11]
+ punpckldq xmm2,xmm8
+ movdqa xmm8,xmm14
+ paddd xmm13,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm10
+ movdqa xmm6,xmm10
+ pslld xmm8,5
+ pandn xmm7,xmm12
+ pand xmm6,xmm11
+ punpckldq xmm2,xmm9
+ movdqa xmm9,xmm14
+
+ movdqa XMMWORD[(16-128)+rax],xmm1
+ paddd xmm13,xmm1
+ movd xmm3,DWORD[((-52))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm10
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-52))+r9]
+ pslld xmm7,30
+ paddd xmm13,xmm6
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+DB 102,15,56,0,213
+ movd xmm8,DWORD[((-52))+r10]
+ por xmm10,xmm7
+ movd xmm7,DWORD[((-52))+r11]
+ punpckldq xmm3,xmm8
+ movdqa xmm8,xmm13
+ paddd xmm12,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pslld xmm8,5
+ pandn xmm7,xmm11
+ pand xmm6,xmm10
+ punpckldq xmm3,xmm9
+ movdqa xmm9,xmm13
+
+ movdqa XMMWORD[(32-128)+rax],xmm2
+ paddd xmm12,xmm2
+ movd xmm4,DWORD[((-48))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm14
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-48))+r9]
+ pslld xmm7,30
+ paddd xmm12,xmm6
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+DB 102,15,56,0,221
+ movd xmm8,DWORD[((-48))+r10]
+ por xmm14,xmm7
+ movd xmm7,DWORD[((-48))+r11]
+ punpckldq xmm4,xmm8
+ movdqa xmm8,xmm12
+ paddd xmm11,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm13
+ movdqa xmm6,xmm13
+ pslld xmm8,5
+ pandn xmm7,xmm10
+ pand xmm6,xmm14
+ punpckldq xmm4,xmm9
+ movdqa xmm9,xmm12
+
+ movdqa XMMWORD[(48-128)+rax],xmm3
+ paddd xmm11,xmm3
+ movd xmm0,DWORD[((-44))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm13
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-44))+r9]
+ pslld xmm7,30
+ paddd xmm11,xmm6
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+DB 102,15,56,0,229
+ movd xmm8,DWORD[((-44))+r10]
+ por xmm13,xmm7
+ movd xmm7,DWORD[((-44))+r11]
+ punpckldq xmm0,xmm8
+ movdqa xmm8,xmm11
+ paddd xmm10,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm12
+ movdqa xmm6,xmm12
+ pslld xmm8,5
+ pandn xmm7,xmm14
+ pand xmm6,xmm13
+ punpckldq xmm0,xmm9
+ movdqa xmm9,xmm11
+
+ movdqa XMMWORD[(64-128)+rax],xmm4
+ paddd xmm10,xmm4
+ movd xmm1,DWORD[((-40))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm12
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-40))+r9]
+ pslld xmm7,30
+ paddd xmm10,xmm6
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+DB 102,15,56,0,197
+ movd xmm8,DWORD[((-40))+r10]
+ por xmm12,xmm7
+ movd xmm7,DWORD[((-40))+r11]
+ punpckldq xmm1,xmm8
+ movdqa xmm8,xmm10
+ paddd xmm14,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm11
+ movdqa xmm6,xmm11
+ pslld xmm8,5
+ pandn xmm7,xmm13
+ pand xmm6,xmm12
+ punpckldq xmm1,xmm9
+ movdqa xmm9,xmm10
+
+ movdqa XMMWORD[(80-128)+rax],xmm0
+ paddd xmm14,xmm0
+ movd xmm2,DWORD[((-36))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm11
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-36))+r9]
+ pslld xmm7,30
+ paddd xmm14,xmm6
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+DB 102,15,56,0,205
+ movd xmm8,DWORD[((-36))+r10]
+ por xmm11,xmm7
+ movd xmm7,DWORD[((-36))+r11]
+ punpckldq xmm2,xmm8
+ movdqa xmm8,xmm14
+ paddd xmm13,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm10
+ movdqa xmm6,xmm10
+ pslld xmm8,5
+ pandn xmm7,xmm12
+ pand xmm6,xmm11
+ punpckldq xmm2,xmm9
+ movdqa xmm9,xmm14
+
+ movdqa XMMWORD[(96-128)+rax],xmm1
+ paddd xmm13,xmm1
+ movd xmm3,DWORD[((-32))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm10
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-32))+r9]
+ pslld xmm7,30
+ paddd xmm13,xmm6
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+DB 102,15,56,0,213
+ movd xmm8,DWORD[((-32))+r10]
+ por xmm10,xmm7
+ movd xmm7,DWORD[((-32))+r11]
+ punpckldq xmm3,xmm8
+ movdqa xmm8,xmm13
+ paddd xmm12,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pslld xmm8,5
+ pandn xmm7,xmm11
+ pand xmm6,xmm10
+ punpckldq xmm3,xmm9
+ movdqa xmm9,xmm13
+
+ movdqa XMMWORD[(112-128)+rax],xmm2
+ paddd xmm12,xmm2
+ movd xmm4,DWORD[((-28))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm14
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-28))+r9]
+ pslld xmm7,30
+ paddd xmm12,xmm6
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+DB 102,15,56,0,221
+ movd xmm8,DWORD[((-28))+r10]
+ por xmm14,xmm7
+ movd xmm7,DWORD[((-28))+r11]
+ punpckldq xmm4,xmm8
+ movdqa xmm8,xmm12
+ paddd xmm11,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm13
+ movdqa xmm6,xmm13
+ pslld xmm8,5
+ pandn xmm7,xmm10
+ pand xmm6,xmm14
+ punpckldq xmm4,xmm9
+ movdqa xmm9,xmm12
+
+ movdqa XMMWORD[(128-128)+rax],xmm3
+ paddd xmm11,xmm3
+ movd xmm0,DWORD[((-24))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm13
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-24))+r9]
+ pslld xmm7,30
+ paddd xmm11,xmm6
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+DB 102,15,56,0,229
+ movd xmm8,DWORD[((-24))+r10]
+ por xmm13,xmm7
+ movd xmm7,DWORD[((-24))+r11]
+ punpckldq xmm0,xmm8
+ movdqa xmm8,xmm11
+ paddd xmm10,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm12
+ movdqa xmm6,xmm12
+ pslld xmm8,5
+ pandn xmm7,xmm14
+ pand xmm6,xmm13
+ punpckldq xmm0,xmm9
+ movdqa xmm9,xmm11
+
+ movdqa XMMWORD[(144-128)+rax],xmm4
+ paddd xmm10,xmm4
+ movd xmm1,DWORD[((-20))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm12
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-20))+r9]
+ pslld xmm7,30
+ paddd xmm10,xmm6
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+DB 102,15,56,0,197
+ movd xmm8,DWORD[((-20))+r10]
+ por xmm12,xmm7
+ movd xmm7,DWORD[((-20))+r11]
+ punpckldq xmm1,xmm8
+ movdqa xmm8,xmm10
+ paddd xmm14,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm11
+ movdqa xmm6,xmm11
+ pslld xmm8,5
+ pandn xmm7,xmm13
+ pand xmm6,xmm12
+ punpckldq xmm1,xmm9
+ movdqa xmm9,xmm10
+
+ movdqa XMMWORD[(160-128)+rax],xmm0
+ paddd xmm14,xmm0
+ movd xmm2,DWORD[((-16))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm11
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-16))+r9]
+ pslld xmm7,30
+ paddd xmm14,xmm6
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+DB 102,15,56,0,205
+ movd xmm8,DWORD[((-16))+r10]
+ por xmm11,xmm7
+ movd xmm7,DWORD[((-16))+r11]
+ punpckldq xmm2,xmm8
+ movdqa xmm8,xmm14
+ paddd xmm13,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm10
+ movdqa xmm6,xmm10
+ pslld xmm8,5
+ pandn xmm7,xmm12
+ pand xmm6,xmm11
+ punpckldq xmm2,xmm9
+ movdqa xmm9,xmm14
+
+ movdqa XMMWORD[(176-128)+rax],xmm1
+ paddd xmm13,xmm1
+ movd xmm3,DWORD[((-12))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm10
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-12))+r9]
+ pslld xmm7,30
+ paddd xmm13,xmm6
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+DB 102,15,56,0,213
+ movd xmm8,DWORD[((-12))+r10]
+ por xmm10,xmm7
+ movd xmm7,DWORD[((-12))+r11]
+ punpckldq xmm3,xmm8
+ movdqa xmm8,xmm13
+ paddd xmm12,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pslld xmm8,5
+ pandn xmm7,xmm11
+ pand xmm6,xmm10
+ punpckldq xmm3,xmm9
+ movdqa xmm9,xmm13
+
+ movdqa XMMWORD[(192-128)+rax],xmm2
+ paddd xmm12,xmm2
+ movd xmm4,DWORD[((-8))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm14
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-8))+r9]
+ pslld xmm7,30
+ paddd xmm12,xmm6
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+DB 102,15,56,0,221
+ movd xmm8,DWORD[((-8))+r10]
+ por xmm14,xmm7
+ movd xmm7,DWORD[((-8))+r11]
+ punpckldq xmm4,xmm8
+ movdqa xmm8,xmm12
+ paddd xmm11,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm13
+ movdqa xmm6,xmm13
+ pslld xmm8,5
+ pandn xmm7,xmm10
+ pand xmm6,xmm14
+ punpckldq xmm4,xmm9
+ movdqa xmm9,xmm12
+
+ movdqa XMMWORD[(208-128)+rax],xmm3
+ paddd xmm11,xmm3
+ movd xmm0,DWORD[((-4))+r8]
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm13
+
+ por xmm8,xmm9
+ movd xmm9,DWORD[((-4))+r9]
+ pslld xmm7,30
+ paddd xmm11,xmm6
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+DB 102,15,56,0,229
+ movd xmm8,DWORD[((-4))+r10]
+ por xmm13,xmm7
+ movdqa xmm1,XMMWORD[((0-128))+rax]
+ movd xmm7,DWORD[((-4))+r11]
+ punpckldq xmm0,xmm8
+ movdqa xmm8,xmm11
+ paddd xmm10,xmm15
+ punpckldq xmm9,xmm7
+ movdqa xmm7,xmm12
+ movdqa xmm6,xmm12
+ pslld xmm8,5
+ prefetcht0 [63+r8]
+ pandn xmm7,xmm14
+ pand xmm6,xmm13
+ punpckldq xmm0,xmm9
+ movdqa xmm9,xmm11
+
+ movdqa XMMWORD[(224-128)+rax],xmm4
+ paddd xmm10,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm12
+ prefetcht0 [63+r9]
+
+ por xmm8,xmm9
+ pslld xmm7,30
+ paddd xmm10,xmm6
+ prefetcht0 [63+r10]
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+DB 102,15,56,0,197
+ prefetcht0 [63+r11]
+ por xmm12,xmm7
+ movdqa xmm2,XMMWORD[((16-128))+rax]
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((32-128))+rax]
+
+ movdqa xmm8,xmm10
+ pxor xmm1,XMMWORD[((128-128))+rax]
+ paddd xmm14,xmm15
+ movdqa xmm7,xmm11
+ pslld xmm8,5
+ pxor xmm1,xmm3
+ movdqa xmm6,xmm11
+ pandn xmm7,xmm13
+ movdqa xmm5,xmm1
+ pand xmm6,xmm12
+ movdqa xmm9,xmm10
+ psrld xmm5,31
+ paddd xmm1,xmm1
+
+ movdqa XMMWORD[(240-128)+rax],xmm0
+ paddd xmm14,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm7
+
+ movdqa xmm7,xmm11
+ por xmm8,xmm9
+ pslld xmm7,30
+ paddd xmm14,xmm6
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((48-128))+rax]
+
+ movdqa xmm8,xmm14
+ pxor xmm2,XMMWORD[((144-128))+rax]
+ paddd xmm13,xmm15
+ movdqa xmm7,xmm10
+ pslld xmm8,5
+ pxor xmm2,xmm4
+ movdqa xmm6,xmm10
+ pandn xmm7,xmm12
+ movdqa xmm5,xmm2
+ pand xmm6,xmm11
+ movdqa xmm9,xmm14
+ psrld xmm5,31
+ paddd xmm2,xmm2
+
+ movdqa XMMWORD[(0-128)+rax],xmm1
+ paddd xmm13,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm7
+
+ movdqa xmm7,xmm10
+ por xmm8,xmm9
+ pslld xmm7,30
+ paddd xmm13,xmm6
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((64-128))+rax]
+
+ movdqa xmm8,xmm13
+ pxor xmm3,XMMWORD[((160-128))+rax]
+ paddd xmm12,xmm15
+ movdqa xmm7,xmm14
+ pslld xmm8,5
+ pxor xmm3,xmm0
+ movdqa xmm6,xmm14
+ pandn xmm7,xmm11
+ movdqa xmm5,xmm3
+ pand xmm6,xmm10
+ movdqa xmm9,xmm13
+ psrld xmm5,31
+ paddd xmm3,xmm3
+
+ movdqa XMMWORD[(16-128)+rax],xmm2
+ paddd xmm12,xmm2
+ psrld xmm9,27
+ pxor xmm6,xmm7
+
+ movdqa xmm7,xmm14
+ por xmm8,xmm9
+ pslld xmm7,30
+ paddd xmm12,xmm6
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((80-128))+rax]
+
+ movdqa xmm8,xmm12
+ pxor xmm4,XMMWORD[((176-128))+rax]
+ paddd xmm11,xmm15
+ movdqa xmm7,xmm13
+ pslld xmm8,5
+ pxor xmm4,xmm1
+ movdqa xmm6,xmm13
+ pandn xmm7,xmm10
+ movdqa xmm5,xmm4
+ pand xmm6,xmm14
+ movdqa xmm9,xmm12
+ psrld xmm5,31
+ paddd xmm4,xmm4
+
+ movdqa XMMWORD[(32-128)+rax],xmm3
+ paddd xmm11,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm7
+
+ movdqa xmm7,xmm13
+ por xmm8,xmm9
+ pslld xmm7,30
+ paddd xmm11,xmm6
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((96-128))+rax]
+
+ movdqa xmm8,xmm11
+ pxor xmm0,XMMWORD[((192-128))+rax]
+ paddd xmm10,xmm15
+ movdqa xmm7,xmm12
+ pslld xmm8,5
+ pxor xmm0,xmm2
+ movdqa xmm6,xmm12
+ pandn xmm7,xmm14
+ movdqa xmm5,xmm0
+ pand xmm6,xmm13
+ movdqa xmm9,xmm11
+ psrld xmm5,31
+ paddd xmm0,xmm0
+
+ movdqa XMMWORD[(48-128)+rax],xmm4
+ paddd xmm10,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm7
+
+ movdqa xmm7,xmm12
+ por xmm8,xmm9
+ pslld xmm7,30
+ paddd xmm10,xmm6
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ movdqa xmm15,XMMWORD[rbp]
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((112-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm6,xmm13
+ pxor xmm1,XMMWORD[((208-128))+rax]
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm11
+
+ movdqa xmm9,xmm10
+ movdqa XMMWORD[(64-128)+rax],xmm0
+ paddd xmm14,xmm0
+ pxor xmm1,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm12
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ movdqa xmm5,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm14,xmm6
+ paddd xmm1,xmm1
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((128-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm6,xmm12
+ pxor xmm2,XMMWORD[((224-128))+rax]
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm10
+
+ movdqa xmm9,xmm14
+ movdqa XMMWORD[(80-128)+rax],xmm1
+ paddd xmm13,xmm1
+ pxor xmm2,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm11
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ movdqa xmm5,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm13,xmm6
+ paddd xmm2,xmm2
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((144-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm6,xmm11
+ pxor xmm3,XMMWORD[((240-128))+rax]
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm14
+
+ movdqa xmm9,xmm13
+ movdqa XMMWORD[(96-128)+rax],xmm2
+ paddd xmm12,xmm2
+ pxor xmm3,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm10
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ movdqa xmm5,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm12,xmm6
+ paddd xmm3,xmm3
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((160-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm6,xmm10
+ pxor xmm4,XMMWORD[((0-128))+rax]
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm13
+
+ movdqa xmm9,xmm12
+ movdqa XMMWORD[(112-128)+rax],xmm3
+ paddd xmm11,xmm3
+ pxor xmm4,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm14
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ movdqa xmm5,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm11,xmm6
+ paddd xmm4,xmm4
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((176-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm6,xmm14
+ pxor xmm0,XMMWORD[((16-128))+rax]
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm12
+
+ movdqa xmm9,xmm11
+ movdqa XMMWORD[(128-128)+rax],xmm4
+ paddd xmm10,xmm4
+ pxor xmm0,xmm2
+ psrld xmm9,27
+ pxor xmm6,xmm13
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ movdqa xmm5,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm10,xmm6
+ paddd xmm0,xmm0
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((192-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm6,xmm13
+ pxor xmm1,XMMWORD[((32-128))+rax]
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm11
+
+ movdqa xmm9,xmm10
+ movdqa XMMWORD[(144-128)+rax],xmm0
+ paddd xmm14,xmm0
+ pxor xmm1,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm12
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ movdqa xmm5,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm14,xmm6
+ paddd xmm1,xmm1
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((208-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm6,xmm12
+ pxor xmm2,XMMWORD[((48-128))+rax]
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm10
+
+ movdqa xmm9,xmm14
+ movdqa XMMWORD[(160-128)+rax],xmm1
+ paddd xmm13,xmm1
+ pxor xmm2,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm11
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ movdqa xmm5,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm13,xmm6
+ paddd xmm2,xmm2
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((224-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm6,xmm11
+ pxor xmm3,XMMWORD[((64-128))+rax]
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm14
+
+ movdqa xmm9,xmm13
+ movdqa XMMWORD[(176-128)+rax],xmm2
+ paddd xmm12,xmm2
+ pxor xmm3,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm10
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ movdqa xmm5,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm12,xmm6
+ paddd xmm3,xmm3
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((240-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm6,xmm10
+ pxor xmm4,XMMWORD[((80-128))+rax]
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm13
+
+ movdqa xmm9,xmm12
+ movdqa XMMWORD[(192-128)+rax],xmm3
+ paddd xmm11,xmm3
+ pxor xmm4,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm14
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ movdqa xmm5,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm11,xmm6
+ paddd xmm4,xmm4
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((0-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm6,xmm14
+ pxor xmm0,XMMWORD[((96-128))+rax]
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm12
+
+ movdqa xmm9,xmm11
+ movdqa XMMWORD[(208-128)+rax],xmm4
+ paddd xmm10,xmm4
+ pxor xmm0,xmm2
+ psrld xmm9,27
+ pxor xmm6,xmm13
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ movdqa xmm5,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm10,xmm6
+ paddd xmm0,xmm0
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((16-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm6,xmm13
+ pxor xmm1,XMMWORD[((112-128))+rax]
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm11
+
+ movdqa xmm9,xmm10
+ movdqa XMMWORD[(224-128)+rax],xmm0
+ paddd xmm14,xmm0
+ pxor xmm1,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm12
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ movdqa xmm5,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm14,xmm6
+ paddd xmm1,xmm1
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((32-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm6,xmm12
+ pxor xmm2,XMMWORD[((128-128))+rax]
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm10
+
+ movdqa xmm9,xmm14
+ movdqa XMMWORD[(240-128)+rax],xmm1
+ paddd xmm13,xmm1
+ pxor xmm2,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm11
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ movdqa xmm5,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm13,xmm6
+ paddd xmm2,xmm2
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((48-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm6,xmm11
+ pxor xmm3,XMMWORD[((144-128))+rax]
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm14
+
+ movdqa xmm9,xmm13
+ movdqa XMMWORD[(0-128)+rax],xmm2
+ paddd xmm12,xmm2
+ pxor xmm3,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm10
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ movdqa xmm5,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm12,xmm6
+ paddd xmm3,xmm3
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((64-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm6,xmm10
+ pxor xmm4,XMMWORD[((160-128))+rax]
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm13
+
+ movdqa xmm9,xmm12
+ movdqa XMMWORD[(16-128)+rax],xmm3
+ paddd xmm11,xmm3
+ pxor xmm4,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm14
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ movdqa xmm5,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm11,xmm6
+ paddd xmm4,xmm4
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((80-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm6,xmm14
+ pxor xmm0,XMMWORD[((176-128))+rax]
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm12
+
+ movdqa xmm9,xmm11
+ movdqa XMMWORD[(32-128)+rax],xmm4
+ paddd xmm10,xmm4
+ pxor xmm0,xmm2
+ psrld xmm9,27
+ pxor xmm6,xmm13
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ movdqa xmm5,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm10,xmm6
+ paddd xmm0,xmm0
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((96-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm6,xmm13
+ pxor xmm1,XMMWORD[((192-128))+rax]
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm11
+
+ movdqa xmm9,xmm10
+ movdqa XMMWORD[(48-128)+rax],xmm0
+ paddd xmm14,xmm0
+ pxor xmm1,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm12
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ movdqa xmm5,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm14,xmm6
+ paddd xmm1,xmm1
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((112-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm6,xmm12
+ pxor xmm2,XMMWORD[((208-128))+rax]
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm10
+
+ movdqa xmm9,xmm14
+ movdqa XMMWORD[(64-128)+rax],xmm1
+ paddd xmm13,xmm1
+ pxor xmm2,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm11
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ movdqa xmm5,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm13,xmm6
+ paddd xmm2,xmm2
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((128-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm6,xmm11
+ pxor xmm3,XMMWORD[((224-128))+rax]
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm14
+
+ movdqa xmm9,xmm13
+ movdqa XMMWORD[(80-128)+rax],xmm2
+ paddd xmm12,xmm2
+ pxor xmm3,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm10
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ movdqa xmm5,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm12,xmm6
+ paddd xmm3,xmm3
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((144-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm6,xmm10
+ pxor xmm4,XMMWORD[((240-128))+rax]
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm13
+
+ movdqa xmm9,xmm12
+ movdqa XMMWORD[(96-128)+rax],xmm3
+ paddd xmm11,xmm3
+ pxor xmm4,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm14
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ movdqa xmm5,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm11,xmm6
+ paddd xmm4,xmm4
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((160-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm6,xmm14
+ pxor xmm0,XMMWORD[((0-128))+rax]
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm12
+
+ movdqa xmm9,xmm11
+ movdqa XMMWORD[(112-128)+rax],xmm4
+ paddd xmm10,xmm4
+ pxor xmm0,xmm2
+ psrld xmm9,27
+ pxor xmm6,xmm13
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ movdqa xmm5,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm10,xmm6
+ paddd xmm0,xmm0
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ movdqa xmm15,XMMWORD[32+rbp]
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((176-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm7,xmm13
+ pxor xmm1,XMMWORD[((16-128))+rax]
+ pxor xmm1,xmm3
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm10
+ pand xmm7,xmm12
+
+ movdqa xmm6,xmm13
+ movdqa xmm5,xmm1
+ psrld xmm9,27
+ paddd xmm14,xmm7
+ pxor xmm6,xmm12
+
+ movdqa XMMWORD[(128-128)+rax],xmm0
+ paddd xmm14,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm11
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ paddd xmm1,xmm1
+ paddd xmm14,xmm6
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((192-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm7,xmm12
+ pxor xmm2,XMMWORD[((32-128))+rax]
+ pxor xmm2,xmm4
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm14
+ pand xmm7,xmm11
+
+ movdqa xmm6,xmm12
+ movdqa xmm5,xmm2
+ psrld xmm9,27
+ paddd xmm13,xmm7
+ pxor xmm6,xmm11
+
+ movdqa XMMWORD[(144-128)+rax],xmm1
+ paddd xmm13,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm10
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ paddd xmm2,xmm2
+ paddd xmm13,xmm6
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((208-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm7,xmm11
+ pxor xmm3,XMMWORD[((48-128))+rax]
+ pxor xmm3,xmm0
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm13
+ pand xmm7,xmm10
+
+ movdqa xmm6,xmm11
+ movdqa xmm5,xmm3
+ psrld xmm9,27
+ paddd xmm12,xmm7
+ pxor xmm6,xmm10
+
+ movdqa XMMWORD[(160-128)+rax],xmm2
+ paddd xmm12,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm14
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ paddd xmm3,xmm3
+ paddd xmm12,xmm6
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((224-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm7,xmm10
+ pxor xmm4,XMMWORD[((64-128))+rax]
+ pxor xmm4,xmm1
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm12
+ pand xmm7,xmm14
+
+ movdqa xmm6,xmm10
+ movdqa xmm5,xmm4
+ psrld xmm9,27
+ paddd xmm11,xmm7
+ pxor xmm6,xmm14
+
+ movdqa XMMWORD[(176-128)+rax],xmm3
+ paddd xmm11,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm13
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ paddd xmm4,xmm4
+ paddd xmm11,xmm6
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((240-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm7,xmm14
+ pxor xmm0,XMMWORD[((80-128))+rax]
+ pxor xmm0,xmm2
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm11
+ pand xmm7,xmm13
+
+ movdqa xmm6,xmm14
+ movdqa xmm5,xmm0
+ psrld xmm9,27
+ paddd xmm10,xmm7
+ pxor xmm6,xmm13
+
+ movdqa XMMWORD[(192-128)+rax],xmm4
+ paddd xmm10,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm12
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ paddd xmm0,xmm0
+ paddd xmm10,xmm6
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((0-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm7,xmm13
+ pxor xmm1,XMMWORD[((96-128))+rax]
+ pxor xmm1,xmm3
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm10
+ pand xmm7,xmm12
+
+ movdqa xmm6,xmm13
+ movdqa xmm5,xmm1
+ psrld xmm9,27
+ paddd xmm14,xmm7
+ pxor xmm6,xmm12
+
+ movdqa XMMWORD[(208-128)+rax],xmm0
+ paddd xmm14,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm11
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ paddd xmm1,xmm1
+ paddd xmm14,xmm6
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((16-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm7,xmm12
+ pxor xmm2,XMMWORD[((112-128))+rax]
+ pxor xmm2,xmm4
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm14
+ pand xmm7,xmm11
+
+ movdqa xmm6,xmm12
+ movdqa xmm5,xmm2
+ psrld xmm9,27
+ paddd xmm13,xmm7
+ pxor xmm6,xmm11
+
+ movdqa XMMWORD[(224-128)+rax],xmm1
+ paddd xmm13,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm10
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ paddd xmm2,xmm2
+ paddd xmm13,xmm6
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((32-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm7,xmm11
+ pxor xmm3,XMMWORD[((128-128))+rax]
+ pxor xmm3,xmm0
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm13
+ pand xmm7,xmm10
+
+ movdqa xmm6,xmm11
+ movdqa xmm5,xmm3
+ psrld xmm9,27
+ paddd xmm12,xmm7
+ pxor xmm6,xmm10
+
+ movdqa XMMWORD[(240-128)+rax],xmm2
+ paddd xmm12,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm14
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ paddd xmm3,xmm3
+ paddd xmm12,xmm6
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((48-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm7,xmm10
+ pxor xmm4,XMMWORD[((144-128))+rax]
+ pxor xmm4,xmm1
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm12
+ pand xmm7,xmm14
+
+ movdqa xmm6,xmm10
+ movdqa xmm5,xmm4
+ psrld xmm9,27
+ paddd xmm11,xmm7
+ pxor xmm6,xmm14
+
+ movdqa XMMWORD[(0-128)+rax],xmm3
+ paddd xmm11,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm13
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ paddd xmm4,xmm4
+ paddd xmm11,xmm6
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((64-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm7,xmm14
+ pxor xmm0,XMMWORD[((160-128))+rax]
+ pxor xmm0,xmm2
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm11
+ pand xmm7,xmm13
+
+ movdqa xmm6,xmm14
+ movdqa xmm5,xmm0
+ psrld xmm9,27
+ paddd xmm10,xmm7
+ pxor xmm6,xmm13
+
+ movdqa XMMWORD[(16-128)+rax],xmm4
+ paddd xmm10,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm12
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ paddd xmm0,xmm0
+ paddd xmm10,xmm6
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((80-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm7,xmm13
+ pxor xmm1,XMMWORD[((176-128))+rax]
+ pxor xmm1,xmm3
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm10
+ pand xmm7,xmm12
+
+ movdqa xmm6,xmm13
+ movdqa xmm5,xmm1
+ psrld xmm9,27
+ paddd xmm14,xmm7
+ pxor xmm6,xmm12
+
+ movdqa XMMWORD[(32-128)+rax],xmm0
+ paddd xmm14,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm11
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ paddd xmm1,xmm1
+ paddd xmm14,xmm6
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((96-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm7,xmm12
+ pxor xmm2,XMMWORD[((192-128))+rax]
+ pxor xmm2,xmm4
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm14
+ pand xmm7,xmm11
+
+ movdqa xmm6,xmm12
+ movdqa xmm5,xmm2
+ psrld xmm9,27
+ paddd xmm13,xmm7
+ pxor xmm6,xmm11
+
+ movdqa XMMWORD[(48-128)+rax],xmm1
+ paddd xmm13,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm10
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ paddd xmm2,xmm2
+ paddd xmm13,xmm6
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((112-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm7,xmm11
+ pxor xmm3,XMMWORD[((208-128))+rax]
+ pxor xmm3,xmm0
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm13
+ pand xmm7,xmm10
+
+ movdqa xmm6,xmm11
+ movdqa xmm5,xmm3
+ psrld xmm9,27
+ paddd xmm12,xmm7
+ pxor xmm6,xmm10
+
+ movdqa XMMWORD[(64-128)+rax],xmm2
+ paddd xmm12,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm14
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ paddd xmm3,xmm3
+ paddd xmm12,xmm6
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((128-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm7,xmm10
+ pxor xmm4,XMMWORD[((224-128))+rax]
+ pxor xmm4,xmm1
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm12
+ pand xmm7,xmm14
+
+ movdqa xmm6,xmm10
+ movdqa xmm5,xmm4
+ psrld xmm9,27
+ paddd xmm11,xmm7
+ pxor xmm6,xmm14
+
+ movdqa XMMWORD[(80-128)+rax],xmm3
+ paddd xmm11,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm13
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ paddd xmm4,xmm4
+ paddd xmm11,xmm6
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((144-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm7,xmm14
+ pxor xmm0,XMMWORD[((240-128))+rax]
+ pxor xmm0,xmm2
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm11
+ pand xmm7,xmm13
+
+ movdqa xmm6,xmm14
+ movdqa xmm5,xmm0
+ psrld xmm9,27
+ paddd xmm10,xmm7
+ pxor xmm6,xmm13
+
+ movdqa XMMWORD[(96-128)+rax],xmm4
+ paddd xmm10,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm12
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ paddd xmm0,xmm0
+ paddd xmm10,xmm6
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((160-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm7,xmm13
+ pxor xmm1,XMMWORD[((0-128))+rax]
+ pxor xmm1,xmm3
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm10
+ pand xmm7,xmm12
+
+ movdqa xmm6,xmm13
+ movdqa xmm5,xmm1
+ psrld xmm9,27
+ paddd xmm14,xmm7
+ pxor xmm6,xmm12
+
+ movdqa XMMWORD[(112-128)+rax],xmm0
+ paddd xmm14,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm11
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ paddd xmm1,xmm1
+ paddd xmm14,xmm6
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((176-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm7,xmm12
+ pxor xmm2,XMMWORD[((16-128))+rax]
+ pxor xmm2,xmm4
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm14
+ pand xmm7,xmm11
+
+ movdqa xmm6,xmm12
+ movdqa xmm5,xmm2
+ psrld xmm9,27
+ paddd xmm13,xmm7
+ pxor xmm6,xmm11
+
+ movdqa XMMWORD[(128-128)+rax],xmm1
+ paddd xmm13,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm10
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ paddd xmm2,xmm2
+ paddd xmm13,xmm6
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((192-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm7,xmm11
+ pxor xmm3,XMMWORD[((32-128))+rax]
+ pxor xmm3,xmm0
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm13
+ pand xmm7,xmm10
+
+ movdqa xmm6,xmm11
+ movdqa xmm5,xmm3
+ psrld xmm9,27
+ paddd xmm12,xmm7
+ pxor xmm6,xmm10
+
+ movdqa XMMWORD[(144-128)+rax],xmm2
+ paddd xmm12,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm14
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ paddd xmm3,xmm3
+ paddd xmm12,xmm6
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((208-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm7,xmm10
+ pxor xmm4,XMMWORD[((48-128))+rax]
+ pxor xmm4,xmm1
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm12
+ pand xmm7,xmm14
+
+ movdqa xmm6,xmm10
+ movdqa xmm5,xmm4
+ psrld xmm9,27
+ paddd xmm11,xmm7
+ pxor xmm6,xmm14
+
+ movdqa XMMWORD[(160-128)+rax],xmm3
+ paddd xmm11,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm13
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ paddd xmm4,xmm4
+ paddd xmm11,xmm6
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((224-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm7,xmm14
+ pxor xmm0,XMMWORD[((64-128))+rax]
+ pxor xmm0,xmm2
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ movdqa xmm9,xmm11
+ pand xmm7,xmm13
+
+ movdqa xmm6,xmm14
+ movdqa xmm5,xmm0
+ psrld xmm9,27
+ paddd xmm10,xmm7
+ pxor xmm6,xmm13
+
+ movdqa XMMWORD[(176-128)+rax],xmm4
+ paddd xmm10,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ pand xmm6,xmm12
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ paddd xmm0,xmm0
+ paddd xmm10,xmm6
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ movdqa xmm15,XMMWORD[64+rbp]
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((240-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm6,xmm13
+ pxor xmm1,XMMWORD[((80-128))+rax]
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm11
+
+ movdqa xmm9,xmm10
+ movdqa XMMWORD[(192-128)+rax],xmm0
+ paddd xmm14,xmm0
+ pxor xmm1,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm12
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ movdqa xmm5,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm14,xmm6
+ paddd xmm1,xmm1
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((0-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm6,xmm12
+ pxor xmm2,XMMWORD[((96-128))+rax]
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm10
+
+ movdqa xmm9,xmm14
+ movdqa XMMWORD[(208-128)+rax],xmm1
+ paddd xmm13,xmm1
+ pxor xmm2,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm11
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ movdqa xmm5,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm13,xmm6
+ paddd xmm2,xmm2
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((16-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm6,xmm11
+ pxor xmm3,XMMWORD[((112-128))+rax]
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm14
+
+ movdqa xmm9,xmm13
+ movdqa XMMWORD[(224-128)+rax],xmm2
+ paddd xmm12,xmm2
+ pxor xmm3,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm10
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ movdqa xmm5,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm12,xmm6
+ paddd xmm3,xmm3
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((32-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm6,xmm10
+ pxor xmm4,XMMWORD[((128-128))+rax]
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm13
+
+ movdqa xmm9,xmm12
+ movdqa XMMWORD[(240-128)+rax],xmm3
+ paddd xmm11,xmm3
+ pxor xmm4,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm14
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ movdqa xmm5,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm11,xmm6
+ paddd xmm4,xmm4
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((48-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm6,xmm14
+ pxor xmm0,XMMWORD[((144-128))+rax]
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm12
+
+ movdqa xmm9,xmm11
+ movdqa XMMWORD[(0-128)+rax],xmm4
+ paddd xmm10,xmm4
+ pxor xmm0,xmm2
+ psrld xmm9,27
+ pxor xmm6,xmm13
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ movdqa xmm5,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm10,xmm6
+ paddd xmm0,xmm0
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((64-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm6,xmm13
+ pxor xmm1,XMMWORD[((160-128))+rax]
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm11
+
+ movdqa xmm9,xmm10
+ movdqa XMMWORD[(16-128)+rax],xmm0
+ paddd xmm14,xmm0
+ pxor xmm1,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm12
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ movdqa xmm5,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm14,xmm6
+ paddd xmm1,xmm1
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((80-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm6,xmm12
+ pxor xmm2,XMMWORD[((176-128))+rax]
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm10
+
+ movdqa xmm9,xmm14
+ movdqa XMMWORD[(32-128)+rax],xmm1
+ paddd xmm13,xmm1
+ pxor xmm2,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm11
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ movdqa xmm5,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm13,xmm6
+ paddd xmm2,xmm2
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((96-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm6,xmm11
+ pxor xmm3,XMMWORD[((192-128))+rax]
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm14
+
+ movdqa xmm9,xmm13
+ movdqa XMMWORD[(48-128)+rax],xmm2
+ paddd xmm12,xmm2
+ pxor xmm3,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm10
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ movdqa xmm5,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm12,xmm6
+ paddd xmm3,xmm3
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((112-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm6,xmm10
+ pxor xmm4,XMMWORD[((208-128))+rax]
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm13
+
+ movdqa xmm9,xmm12
+ movdqa XMMWORD[(64-128)+rax],xmm3
+ paddd xmm11,xmm3
+ pxor xmm4,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm14
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ movdqa xmm5,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm11,xmm6
+ paddd xmm4,xmm4
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((128-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm6,xmm14
+ pxor xmm0,XMMWORD[((224-128))+rax]
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm12
+
+ movdqa xmm9,xmm11
+ movdqa XMMWORD[(80-128)+rax],xmm4
+ paddd xmm10,xmm4
+ pxor xmm0,xmm2
+ psrld xmm9,27
+ pxor xmm6,xmm13
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ movdqa xmm5,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm10,xmm6
+ paddd xmm0,xmm0
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((144-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm6,xmm13
+ pxor xmm1,XMMWORD[((240-128))+rax]
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm11
+
+ movdqa xmm9,xmm10
+ movdqa XMMWORD[(96-128)+rax],xmm0
+ paddd xmm14,xmm0
+ pxor xmm1,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm12
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ movdqa xmm5,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm14,xmm6
+ paddd xmm1,xmm1
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((160-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm6,xmm12
+ pxor xmm2,XMMWORD[((0-128))+rax]
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm10
+
+ movdqa xmm9,xmm14
+ movdqa XMMWORD[(112-128)+rax],xmm1
+ paddd xmm13,xmm1
+ pxor xmm2,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm11
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ movdqa xmm5,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm13,xmm6
+ paddd xmm2,xmm2
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((176-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm6,xmm11
+ pxor xmm3,XMMWORD[((16-128))+rax]
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm14
+
+ movdqa xmm9,xmm13
+ paddd xmm12,xmm2
+ pxor xmm3,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm10
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ movdqa xmm5,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm12,xmm6
+ paddd xmm3,xmm3
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((192-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm6,xmm10
+ pxor xmm4,XMMWORD[((32-128))+rax]
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm13
+
+ movdqa xmm9,xmm12
+ paddd xmm11,xmm3
+ pxor xmm4,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm14
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ movdqa xmm5,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm11,xmm6
+ paddd xmm4,xmm4
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ pxor xmm0,xmm2
+ movdqa xmm2,XMMWORD[((208-128))+rax]
+
+ movdqa xmm8,xmm11
+ movdqa xmm6,xmm14
+ pxor xmm0,XMMWORD[((48-128))+rax]
+ paddd xmm10,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm12
+
+ movdqa xmm9,xmm11
+ paddd xmm10,xmm4
+ pxor xmm0,xmm2
+ psrld xmm9,27
+ pxor xmm6,xmm13
+ movdqa xmm7,xmm12
+
+ pslld xmm7,30
+ movdqa xmm5,xmm0
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm10,xmm6
+ paddd xmm0,xmm0
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm0,xmm5
+ por xmm12,xmm7
+ pxor xmm1,xmm3
+ movdqa xmm3,XMMWORD[((224-128))+rax]
+
+ movdqa xmm8,xmm10
+ movdqa xmm6,xmm13
+ pxor xmm1,XMMWORD[((64-128))+rax]
+ paddd xmm14,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm11
+
+ movdqa xmm9,xmm10
+ paddd xmm14,xmm0
+ pxor xmm1,xmm3
+ psrld xmm9,27
+ pxor xmm6,xmm12
+ movdqa xmm7,xmm11
+
+ pslld xmm7,30
+ movdqa xmm5,xmm1
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm14,xmm6
+ paddd xmm1,xmm1
+
+ psrld xmm11,2
+ paddd xmm14,xmm8
+ por xmm1,xmm5
+ por xmm11,xmm7
+ pxor xmm2,xmm4
+ movdqa xmm4,XMMWORD[((240-128))+rax]
+
+ movdqa xmm8,xmm14
+ movdqa xmm6,xmm12
+ pxor xmm2,XMMWORD[((80-128))+rax]
+ paddd xmm13,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm10
+
+ movdqa xmm9,xmm14
+ paddd xmm13,xmm1
+ pxor xmm2,xmm4
+ psrld xmm9,27
+ pxor xmm6,xmm11
+ movdqa xmm7,xmm10
+
+ pslld xmm7,30
+ movdqa xmm5,xmm2
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm13,xmm6
+ paddd xmm2,xmm2
+
+ psrld xmm10,2
+ paddd xmm13,xmm8
+ por xmm2,xmm5
+ por xmm10,xmm7
+ pxor xmm3,xmm0
+ movdqa xmm0,XMMWORD[((0-128))+rax]
+
+ movdqa xmm8,xmm13
+ movdqa xmm6,xmm11
+ pxor xmm3,XMMWORD[((96-128))+rax]
+ paddd xmm12,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm14
+
+ movdqa xmm9,xmm13
+ paddd xmm12,xmm2
+ pxor xmm3,xmm0
+ psrld xmm9,27
+ pxor xmm6,xmm10
+ movdqa xmm7,xmm14
+
+ pslld xmm7,30
+ movdqa xmm5,xmm3
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm12,xmm6
+ paddd xmm3,xmm3
+
+ psrld xmm14,2
+ paddd xmm12,xmm8
+ por xmm3,xmm5
+ por xmm14,xmm7
+ pxor xmm4,xmm1
+ movdqa xmm1,XMMWORD[((16-128))+rax]
+
+ movdqa xmm8,xmm12
+ movdqa xmm6,xmm10
+ pxor xmm4,XMMWORD[((112-128))+rax]
+ paddd xmm11,xmm15
+ pslld xmm8,5
+ pxor xmm6,xmm13
+
+ movdqa xmm9,xmm12
+ paddd xmm11,xmm3
+ pxor xmm4,xmm1
+ psrld xmm9,27
+ pxor xmm6,xmm14
+ movdqa xmm7,xmm13
+
+ pslld xmm7,30
+ movdqa xmm5,xmm4
+ por xmm8,xmm9
+ psrld xmm5,31
+ paddd xmm11,xmm6
+ paddd xmm4,xmm4
+
+ psrld xmm13,2
+ paddd xmm11,xmm8
+ por xmm4,xmm5
+ por xmm13,xmm7
+ movdqa xmm8,xmm11
+ paddd xmm10,xmm15
+ movdqa xmm6,xmm14
+ pslld xmm8,5
+ pxor xmm6,xmm12
+
+ movdqa xmm9,xmm11
+ paddd xmm10,xmm4
+ psrld xmm9,27
+ movdqa xmm7,xmm12
+ pxor xmm6,xmm13
+
+ pslld xmm7,30
+ por xmm8,xmm9
+ paddd xmm10,xmm6
+
+ psrld xmm12,2
+ paddd xmm10,xmm8
+ por xmm12,xmm7
+ movdqa xmm0,XMMWORD[rbx]
+ mov ecx,1
+ cmp ecx,DWORD[rbx]
+ pxor xmm8,xmm8
+ cmovge r8,rbp
+ cmp ecx,DWORD[4+rbx]
+ movdqa xmm1,xmm0
+ cmovge r9,rbp
+ cmp ecx,DWORD[8+rbx]
+ pcmpgtd xmm1,xmm8
+ cmovge r10,rbp
+ cmp ecx,DWORD[12+rbx]
+ paddd xmm0,xmm1
+ cmovge r11,rbp
+
+ movdqu xmm6,XMMWORD[rdi]
+ pand xmm10,xmm1
+ movdqu xmm7,XMMWORD[32+rdi]
+ pand xmm11,xmm1
+ paddd xmm10,xmm6
+ movdqu xmm8,XMMWORD[64+rdi]
+ pand xmm12,xmm1
+ paddd xmm11,xmm7
+ movdqu xmm9,XMMWORD[96+rdi]
+ pand xmm13,xmm1
+ paddd xmm12,xmm8
+ movdqu xmm5,XMMWORD[128+rdi]
+ pand xmm14,xmm1
+ movdqu XMMWORD[rdi],xmm10
+ paddd xmm13,xmm9
+ movdqu XMMWORD[32+rdi],xmm11
+ paddd xmm14,xmm5
+ movdqu XMMWORD[64+rdi],xmm12
+ movdqu XMMWORD[96+rdi],xmm13
+ movdqu XMMWORD[128+rdi],xmm14
+
+ movdqa XMMWORD[rbx],xmm0
+ movdqa xmm5,XMMWORD[96+rbp]
+ movdqa xmm15,XMMWORD[((-32))+rbp]
+ dec edx
+ jnz NEAR $L$oop
+
+ mov edx,DWORD[280+rsp]
+ lea rdi,[16+rdi]
+ lea rsi,[64+rsi]
+ dec edx
+ jnz NEAR $L$oop_grande
+
+$L$done:
+ mov rax,QWORD[272+rsp]
+
+ movaps xmm6,XMMWORD[((-184))+rax]
+ movaps xmm7,XMMWORD[((-168))+rax]
+ movaps xmm8,XMMWORD[((-152))+rax]
+ movaps xmm9,XMMWORD[((-136))+rax]
+ movaps xmm10,XMMWORD[((-120))+rax]
+ movaps xmm11,XMMWORD[((-104))+rax]
+ movaps xmm12,XMMWORD[((-88))+rax]
+ movaps xmm13,XMMWORD[((-72))+rax]
+ movaps xmm14,XMMWORD[((-56))+rax]
+ movaps xmm15,XMMWORD[((-40))+rax]
+ mov rbp,QWORD[((-16))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+
+ lea rsp,[rax]
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_multi_block:
+
+ALIGN 32
+sha1_multi_block_shaext: ; SHA-1 over two interleaved input streams using SHA-extension opcodes (emitted below as DB bytes); Win64 entry, arguments arrive in rcx, rdx, r8
+ mov QWORD[8+rsp],rdi ;WIN64 prologue: rdi/rsi are callee-saved on Win64, park them in the slots just above the return address
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_multi_block_shaext:
+ mov rdi,rcx ; arg1 -> rdi (state/context base, see the loads at (k-64)+rdi below)
+ mov rsi,rdx ; arg2 -> rsi (array of 16-byte input descriptors: pointer at +0, dword count at +8)
+ mov rdx,r8 ; arg3 -> edx (count, doubled below)
+
+
+
+_shaext_shortcut: ; NOTE(review): presumably a second entry reached by a jump from elsewhere in the file with rdi/rsi/rdx already set - confirm
+ mov rax,rsp ; rax = frame anchor; never modified again in this routine, the epilogue restores relative to it
+
+ push rbx ; saved at [rax-8]
+
+ push rbp ; saved at [rax-16]; the body below does not modify rbp
+
+ lea rsp,[((-168))+rsp] ; rsp = rax-184: room for xmm6..xmm15 (10 x 16 bytes) plus 8 bytes
+ movaps XMMWORD[rsp],xmm6 ; xmm6-xmm15 are callee-saved on Win64; xmm6 lands at [rax-184]
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[(-120)+rax],xmm10 ; same save area, addressed from rax ([rax-120] == [rsp+64] here)
+ movaps XMMWORD[(-104)+rax],xmm11
+ movaps XMMWORD[(-88)+rax],xmm12
+ movaps XMMWORD[(-72)+rax],xmm13
+ movaps XMMWORD[(-56)+rax],xmm14
+ movaps XMMWORD[(-40)+rax],xmm15
+ sub rsp,288 ; scratch frame (state copies, counters, saved loop count)
+ shl edx,1 ; edx = 2*arg3 = number of outer passes; each pass consumes two descriptors
+ and rsp,-256 ; align the scratch frame down to 256 bytes
+ lea rdi,[64+rdi] ; bias the context pointer so rows are addressed as (k-64)+rdi
+ mov QWORD[272+rsp],rax ; record the frame anchor in the frame (this routine's epilogue uses rax directly)
+$L$body_shaext:
+ lea rbx,[256+rsp] ; rbx -> two dword block counters, one per stream
+ movdqa xmm3,XMMWORD[((K_XX_XX+128))] ; shuffle mask used by every pshufb below (table defined outside this block; presumably a byte-order mask - confirm)
+
+$L$oop_grande_shaext: ; outer loop: one pass per pair of input descriptors
+ mov DWORD[280+rsp],edx ; stash the outer pass counter; edx is reused as the inner block count
+ xor edx,edx ; edx = running maximum of the two block counts
+
+ mov r8,QWORD[rsi] ; stream 0 input pointer
+
+ mov ecx,DWORD[8+rsi] ; stream 0 block count
+ cmp ecx,edx
+ cmovg edx,ecx ; edx = max(edx, count), signed compare
+ test ecx,ecx
+ mov DWORD[rbx],ecx ; counter[0] = count (mov leaves the flags from test intact)
+ cmovle r8,rsp ; count <= 0: read from the stack instead of the supplied pointer
+
+ mov r9,QWORD[16+rsi] ; stream 1 input pointer
+
+ mov ecx,DWORD[24+rsi] ; stream 1 block count
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[4+rbx],ecx ; counter[1] = count
+ cmovle r9,rsp
+ test edx,edx
+ jz NEAR $L$done_shaext ; neither stream has blocks: leave without touching the context
+
+ movq xmm0,QWORD[((0-64))+rdi] ; load 8 bytes (two dwords, one per stream) from each of five context rows 32 bytes apart
+ movq xmm4,QWORD[((32-64))+rdi]
+ movq xmm5,QWORD[((64-64))+rdi]
+ movq xmm6,QWORD[((96-64))+rdi]
+ movq xmm7,QWORD[((128-64))+rdi] ; fifth row; movq zeroes the upper 64 bits
+
+ punpckldq xmm0,xmm4 ; interleave rows 0/1 per stream
+ punpckldq xmm5,xmm6 ; interleave rows 2/3 per stream
+
+ movdqa xmm8,xmm0
+ punpcklqdq xmm0,xmm5 ; xmm0 = rows 0..3 of stream 0
+ punpckhqdq xmm8,xmm5 ; xmm8 = rows 0..3 of stream 1
+
+ pshufd xmm1,xmm7,63 ; xmm1 = stream 0's fifth-row dword in the top lane, zeros below
+ pshufd xmm9,xmm7,127 ; xmm9 = stream 1's fifth-row dword in the top lane, zeros below
+ pshufd xmm0,xmm0,27 ; reverse dword order (27 = 0x1B) for the sha1rnds4 destination operand
+ pshufd xmm8,xmm8,27
+ jmp NEAR $L$oop_shaext
+
+ALIGN 32
+$L$oop_shaext: ; inner loop: one 64-byte block per stream per iteration, the two streams interleaved instruction by instruction
+ movdqu xmm4,XMMWORD[r8] ; stream 0 message words go to xmm4..xmm7
+ movdqu xmm11,XMMWORD[r9] ; stream 1 message words go to xmm11..xmm14
+ movdqu xmm5,XMMWORD[16+r8]
+ movdqu xmm12,XMMWORD[16+r9]
+ movdqu xmm6,XMMWORD[32+r8]
+DB 102,15,56,0,227 ; pshufb xmm4,xmm3
+ movdqu xmm13,XMMWORD[32+r9]
+DB 102,68,15,56,0,219 ; pshufb xmm11,xmm3
+ movdqu xmm7,XMMWORD[48+r8]
+ lea r8,[64+r8] ; advance stream 0 by one block
+DB 102,15,56,0,235 ; pshufb xmm5,xmm3
+ movdqu xmm14,XMMWORD[48+r9]
+ lea r9,[64+r9] ; advance stream 1 by one block
+DB 102,68,15,56,0,227 ; pshufb xmm12,xmm3
+
+ movdqa XMMWORD[80+rsp],xmm1 ; keep the incoming state of both streams for the final add
+ paddd xmm1,xmm4
+ movdqa XMMWORD[112+rsp],xmm9
+ paddd xmm9,xmm11
+ movdqa XMMWORD[64+rsp],xmm0
+ movdqa xmm2,xmm0
+ movdqa XMMWORD[96+rsp],xmm8
+ movdqa xmm10,xmm8
+DB 15,58,204,193,0 ; sha1rnds4 xmm0,xmm1,0
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+DB 69,15,58,204,193,0 ; sha1rnds4 xmm8,xmm9,0
+DB 69,15,56,200,212 ; sha1nexte xmm10,xmm12
+DB 102,15,56,0,243 ; pshufb xmm6,xmm3
+ prefetcht0 [127+r8] ; prefetch further along stream 0
+DB 15,56,201,229 ; sha1msg1 xmm4,xmm5
+DB 102,68,15,56,0,235 ; pshufb xmm13,xmm3
+ prefetcht0 [127+r9] ; prefetch further along stream 1
+DB 69,15,56,201,220 ; sha1msg1 xmm11,xmm12
+
+DB 102,15,56,0,251 ; pshufb xmm7,xmm3
+ movdqa xmm1,xmm0
+DB 102,68,15,56,0,243 ; pshufb xmm14,xmm3
+ movdqa xmm9,xmm8
+DB 15,58,204,194,0 ; sha1rnds4 xmm0,xmm2,0
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+DB 69,15,58,204,194,0 ; sha1rnds4 xmm8,xmm10,0
+DB 69,15,56,200,205 ; sha1nexte xmm9,xmm13
+ pxor xmm4,xmm6
+DB 15,56,201,238 ; sha1msg1 xmm5,xmm6
+ pxor xmm11,xmm13
+DB 69,15,56,201,229 ; sha1msg1 xmm12,xmm13
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,0 ; sha1rnds4 xmm0,xmm1,0
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+DB 69,15,58,204,193,0 ; sha1rnds4 xmm8,xmm9,0
+DB 69,15,56,200,214 ; sha1nexte xmm10,xmm14
+DB 15,56,202,231 ; sha1msg2 xmm4,xmm7
+DB 69,15,56,202,222 ; sha1msg2 xmm11,xmm14
+ pxor xmm5,xmm7
+DB 15,56,201,247 ; sha1msg1 xmm6,xmm7
+ pxor xmm12,xmm14
+DB 69,15,56,201,238 ; sha1msg1 xmm13,xmm14
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,0 ; sha1rnds4 xmm0,xmm2,0
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4
+DB 69,15,58,204,194,0 ; sha1rnds4 xmm8,xmm10,0
+DB 69,15,56,200,203 ; sha1nexte xmm9,xmm11
+DB 15,56,202,236 ; sha1msg2 xmm5,xmm4
+DB 69,15,56,202,227 ; sha1msg2 xmm12,xmm11
+ pxor xmm6,xmm4
+DB 15,56,201,252 ; sha1msg1 xmm7,xmm4
+ pxor xmm13,xmm11
+DB 69,15,56,201,243 ; sha1msg1 xmm14,xmm11
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,0 ; sha1rnds4 xmm0,xmm1,0 (fifth and last group with selector 0)
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+DB 69,15,58,204,193,0 ; sha1rnds4 xmm8,xmm9,0
+DB 69,15,56,200,212 ; sha1nexte xmm10,xmm12
+DB 15,56,202,245 ; sha1msg2 xmm6,xmm5
+DB 69,15,56,202,236 ; sha1msg2 xmm13,xmm12
+ pxor xmm7,xmm5
+DB 15,56,201,229 ; sha1msg1 xmm4,xmm5
+ pxor xmm14,xmm12
+DB 69,15,56,201,220 ; sha1msg1 xmm11,xmm12
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,1 ; sha1rnds4 xmm0,xmm2,1 (selector switches to 1)
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+DB 69,15,58,204,194,1 ; sha1rnds4 xmm8,xmm10,1
+DB 69,15,56,200,205 ; sha1nexte xmm9,xmm13
+DB 15,56,202,254 ; sha1msg2 xmm7,xmm6
+DB 69,15,56,202,245 ; sha1msg2 xmm14,xmm13
+ pxor xmm4,xmm6
+DB 15,56,201,238 ; sha1msg1 xmm5,xmm6
+ pxor xmm11,xmm13
+DB 69,15,56,201,229 ; sha1msg1 xmm12,xmm13
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,1 ; sha1rnds4 xmm0,xmm1,1
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+DB 69,15,58,204,193,1 ; sha1rnds4 xmm8,xmm9,1
+DB 69,15,56,200,214 ; sha1nexte xmm10,xmm14
+DB 15,56,202,231 ; sha1msg2 xmm4,xmm7
+DB 69,15,56,202,222 ; sha1msg2 xmm11,xmm14
+ pxor xmm5,xmm7
+DB 15,56,201,247 ; sha1msg1 xmm6,xmm7
+ pxor xmm12,xmm14
+DB 69,15,56,201,238 ; sha1msg1 xmm13,xmm14
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,1 ; sha1rnds4 xmm0,xmm2,1
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4
+DB 69,15,58,204,194,1 ; sha1rnds4 xmm8,xmm10,1
+DB 69,15,56,200,203 ; sha1nexte xmm9,xmm11
+DB 15,56,202,236 ; sha1msg2 xmm5,xmm4
+DB 69,15,56,202,227 ; sha1msg2 xmm12,xmm11
+ pxor xmm6,xmm4
+DB 15,56,201,252 ; sha1msg1 xmm7,xmm4
+ pxor xmm13,xmm11
+DB 69,15,56,201,243 ; sha1msg1 xmm14,xmm11
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,1 ; sha1rnds4 xmm0,xmm1,1
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+DB 69,15,58,204,193,1 ; sha1rnds4 xmm8,xmm9,1
+DB 69,15,56,200,212 ; sha1nexte xmm10,xmm12
+DB 15,56,202,245 ; sha1msg2 xmm6,xmm5
+DB 69,15,56,202,236 ; sha1msg2 xmm13,xmm12
+ pxor xmm7,xmm5
+DB 15,56,201,229 ; sha1msg1 xmm4,xmm5
+ pxor xmm14,xmm12
+DB 69,15,56,201,220 ; sha1msg1 xmm11,xmm12
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,1 ; sha1rnds4 xmm0,xmm2,1 (fifth and last group with selector 1)
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+DB 69,15,58,204,194,1 ; sha1rnds4 xmm8,xmm10,1
+DB 69,15,56,200,205 ; sha1nexte xmm9,xmm13
+DB 15,56,202,254 ; sha1msg2 xmm7,xmm6
+DB 69,15,56,202,245 ; sha1msg2 xmm14,xmm13
+ pxor xmm4,xmm6
+DB 15,56,201,238 ; sha1msg1 xmm5,xmm6
+ pxor xmm11,xmm13
+DB 69,15,56,201,229 ; sha1msg1 xmm12,xmm13
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,2 ; sha1rnds4 xmm0,xmm1,2 (selector switches to 2)
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+DB 69,15,58,204,193,2 ; sha1rnds4 xmm8,xmm9,2
+DB 69,15,56,200,214 ; sha1nexte xmm10,xmm14
+DB 15,56,202,231 ; sha1msg2 xmm4,xmm7
+DB 69,15,56,202,222 ; sha1msg2 xmm11,xmm14
+ pxor xmm5,xmm7
+DB 15,56,201,247 ; sha1msg1 xmm6,xmm7
+ pxor xmm12,xmm14
+DB 69,15,56,201,238 ; sha1msg1 xmm13,xmm14
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,2 ; sha1rnds4 xmm0,xmm2,2
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4
+DB 69,15,58,204,194,2 ; sha1rnds4 xmm8,xmm10,2
+DB 69,15,56,200,203 ; sha1nexte xmm9,xmm11
+DB 15,56,202,236 ; sha1msg2 xmm5,xmm4
+DB 69,15,56,202,227 ; sha1msg2 xmm12,xmm11
+ pxor xmm6,xmm4
+DB 15,56,201,252 ; sha1msg1 xmm7,xmm4
+ pxor xmm13,xmm11
+DB 69,15,56,201,243 ; sha1msg1 xmm14,xmm11
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,2 ; sha1rnds4 xmm0,xmm1,2
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+DB 69,15,58,204,193,2 ; sha1rnds4 xmm8,xmm9,2
+DB 69,15,56,200,212 ; sha1nexte xmm10,xmm12
+DB 15,56,202,245 ; sha1msg2 xmm6,xmm5
+DB 69,15,56,202,236 ; sha1msg2 xmm13,xmm12
+ pxor xmm7,xmm5
+DB 15,56,201,229 ; sha1msg1 xmm4,xmm5
+ pxor xmm14,xmm12
+DB 69,15,56,201,220 ; sha1msg1 xmm11,xmm12
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,2 ; sha1rnds4 xmm0,xmm2,2
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+DB 69,15,58,204,194,2 ; sha1rnds4 xmm8,xmm10,2
+DB 69,15,56,200,205 ; sha1nexte xmm9,xmm13
+DB 15,56,202,254 ; sha1msg2 xmm7,xmm6
+DB 69,15,56,202,245 ; sha1msg2 xmm14,xmm13
+ pxor xmm4,xmm6
+DB 15,56,201,238 ; sha1msg1 xmm5,xmm6
+ pxor xmm11,xmm13
+DB 69,15,56,201,229 ; sha1msg1 xmm12,xmm13
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,2 ; sha1rnds4 xmm0,xmm1,2 (fifth and last group with selector 2)
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+DB 69,15,58,204,193,2 ; sha1rnds4 xmm8,xmm9,2
+DB 69,15,56,200,214 ; sha1nexte xmm10,xmm14
+DB 15,56,202,231 ; sha1msg2 xmm4,xmm7
+DB 69,15,56,202,222 ; sha1msg2 xmm11,xmm14
+ pxor xmm5,xmm7
+DB 15,56,201,247 ; sha1msg1 xmm6,xmm7
+ pxor xmm12,xmm14
+DB 69,15,56,201,238 ; sha1msg1 xmm13,xmm14
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,3 ; sha1rnds4 xmm0,xmm2,3 (selector switches to 3)
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4
+DB 69,15,58,204,194,3 ; sha1rnds4 xmm8,xmm10,3
+DB 69,15,56,200,203 ; sha1nexte xmm9,xmm11
+DB 15,56,202,236 ; sha1msg2 xmm5,xmm4
+DB 69,15,56,202,227 ; sha1msg2 xmm12,xmm11
+ pxor xmm6,xmm4
+DB 15,56,201,252 ; sha1msg1 xmm7,xmm4
+ pxor xmm13,xmm11
+DB 69,15,56,201,243 ; sha1msg1 xmm14,xmm11
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,3 ; sha1rnds4 xmm0,xmm1,3
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+DB 69,15,58,204,193,3 ; sha1rnds4 xmm8,xmm9,3
+DB 69,15,56,200,212 ; sha1nexte xmm10,xmm12
+DB 15,56,202,245 ; sha1msg2 xmm6,xmm5
+DB 69,15,56,202,236 ; sha1msg2 xmm13,xmm12
+ pxor xmm7,xmm5
+ pxor xmm14,xmm12
+
+ mov ecx,1 ; loop bookkeeping is interleaved with the remaining rounds from here on
+ pxor xmm4,xmm4 ; xmm4 = 0: zero for the compares below and for the last two sha1nexte
+ cmp ecx,DWORD[rbx]
+ cmovge r8,rsp ; counter[0] <= 1: stream 0 has no further block, aim its next loads at the stack
+
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,3 ; sha1rnds4 xmm0,xmm2,3
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+DB 69,15,58,204,194,3 ; sha1rnds4 xmm8,xmm10,3
+DB 69,15,56,200,205 ; sha1nexte xmm9,xmm13
+DB 15,56,202,254 ; sha1msg2 xmm7,xmm6
+DB 69,15,56,202,245 ; sha1msg2 xmm14,xmm13
+
+ cmp ecx,DWORD[4+rbx]
+ cmovge r9,rsp ; same redirection for stream 1
+ movq xmm6,QWORD[rbx] ; xmm6 = both counters (its message use ended with the sha1msg2 above)
+
+ movdqa xmm2,xmm0
+ movdqa xmm10,xmm8
+DB 15,58,204,193,3 ; sha1rnds4 xmm0,xmm1,3
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+DB 69,15,58,204,193,3 ; sha1rnds4 xmm8,xmm9,3
+DB 69,15,56,200,214 ; sha1nexte xmm10,xmm14
+
+ pshufd xmm11,xmm6,0x00 ; broadcast counter[0]
+ pshufd xmm12,xmm6,0x55 ; broadcast counter[1]
+ movdqa xmm7,xmm6
+ pcmpgtd xmm11,xmm4 ; all-ones where counter[0] > 0
+ pcmpgtd xmm12,xmm4 ; all-ones where counter[1] > 0
+
+ movdqa xmm1,xmm0
+ movdqa xmm9,xmm8
+DB 15,58,204,194,3 ; sha1rnds4 xmm0,xmm2,3 (fifth and last group with selector 3)
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4 (xmm4 is zero here)
+DB 69,15,58,204,194,3 ; sha1rnds4 xmm8,xmm10,3
+DB 68,15,56,200,204 ; sha1nexte xmm9,xmm4 (prefix 68 extends only the destination; source is the zeroed xmm4)
+
+ pcmpgtd xmm7,xmm4 ; per-counter mask: -1 where the counter is still positive
+ pand xmm0,xmm11 ; zero this block's result for a stream whose counter is not positive
+ pand xmm1,xmm11
+ pand xmm8,xmm12
+ pand xmm9,xmm12
+ paddd xmm6,xmm7 ; decrement only the counters that are still positive
+
+ paddd xmm0,XMMWORD[64+rsp] ; add the state saved at the top of the iteration; a masked stream keeps its old state
+ paddd xmm1,XMMWORD[80+rsp]
+ paddd xmm8,XMMWORD[96+rsp]
+ paddd xmm9,XMMWORD[112+rsp]
+
+ movq QWORD[rbx],xmm6 ; write both counters back
+ dec edx
+ jnz NEAR $L$oop_shaext ; edx = max of the two block counts for this pass
+
+ mov edx,DWORD[280+rsp] ; recover the outer pass counter
+
+ pshufd xmm0,xmm0,27 ; undo the dword reversal applied when the state was loaded
+ pshufd xmm8,xmm8,27
+
+ movdqa xmm6,xmm0
+ punpckldq xmm0,xmm8 ; re-interleave the two streams row by row
+ punpckhdq xmm6,xmm8
+ punpckhdq xmm1,xmm9 ; top dwords of xmm1/xmm9 pair up in the high 64 bits
+ movq QWORD[(0-64)+rdi],xmm0 ; store 8 bytes per context row, same five rows that were loaded
+ psrldq xmm0,8
+ movq QWORD[(64-64)+rdi],xmm6
+ psrldq xmm6,8
+ movq QWORD[(32-64)+rdi],xmm0
+ psrldq xmm1,8 ; bring the fifth-row pair down to the low 64 bits
+ movq QWORD[(96-64)+rdi],xmm6
+ movq QWORD[(128-64)+rdi],xmm1
+
+ lea rdi,[8+rdi] ; next pair of dword columns in the context
+ lea rsi,[32+rsi] ; next two 16-byte input descriptors
+ dec edx
+ jnz NEAR $L$oop_grande_shaext
+
+$L$done_shaext: ; rax still holds the frame anchor set at _shaext_shortcut
+
+ movaps xmm6,XMMWORD[((-184))+rax] ; restore xmm6..xmm15 from the offsets used in the prologue
+ movaps xmm7,XMMWORD[((-168))+rax]
+ movaps xmm8,XMMWORD[((-152))+rax]
+ movaps xmm9,XMMWORD[((-136))+rax]
+ movaps xmm10,XMMWORD[((-120))+rax]
+ movaps xmm11,XMMWORD[((-104))+rax]
+ movaps xmm12,XMMWORD[((-88))+rax]
+ movaps xmm13,XMMWORD[((-72))+rax]
+ movaps xmm14,XMMWORD[((-56))+rax]
+ movaps xmm15,XMMWORD[((-40))+rax]
+ mov rbp,QWORD[((-16))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+
+ lea rsp,[rax] ; drop the aligned scratch frame and the save area in one step
+
+$L$epilogue_shaext:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue: reload rdi/rsi from the slots written at entry
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_multi_block_shaext:
+
+ALIGN 32
+sha1_multi_block_avx: ; Win64 entry: arguments arrive in rcx, rdx, r8 and are moved to rdi, rsi, rdx below
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi ; rdi/rsi are callee-saved on Win64; keep them at [rsp+8]/[rsp+16] above the return address
+ mov rax,rsp
+$L$SEH_begin_sha1_multi_block_avx:
+ mov rdi,rcx ; rdi = 1st argument
+ mov rsi,rdx ; rsi = 2nd argument
+ mov rdx,r8 ; rdx = 3rd argument
+
+
+
+_avx_shortcut: ; NOTE(review): rcx is not reloaded after this label; presumably code outside this view jumps here with a CPU capability word in rcx - confirm
+ shr rcx,32 ; keep only the upper 32 bits of rcx
+ cmp edx,2
+ jb NEAR $L$avx ; 3rd argument < 2 (unsigned): stay on the path below
+ test ecx,32 ; bit 5 of the shifted value selects the AVX2 variant
+ jnz NEAR _avx2_shortcut
+ jmp NEAR $L$avx
+ALIGN 32
+$L$avx: ; build the stack frame: save rbx, rbp and xmm6..xmm15, then carve out an aligned scratch area
+ mov rax,rsp ; rax = rsp before any pushes; used as the frame anchor below
+
+ push rbx
+
+ push rbp
+
+ lea rsp,[((-168))+rsp] ; reserve 168 bytes; rsp is now rax-184
+ movaps XMMWORD[rsp],xmm6 ; xmm6..xmm9 go to rsp+0..rsp+48, i.e. rax-184..rax-136
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[(-120)+rax],xmm10 ; xmm10..xmm15 continue the same save area, addressed from rax
+ movaps XMMWORD[(-104)+rax],xmm11
+ movaps XMMWORD[(-88)+rax],xmm12
+ movaps XMMWORD[(-72)+rax],xmm13
+ movaps XMMWORD[(-56)+rax],xmm14
+ movaps XMMWORD[(-40)+rax],xmm15
+ sub rsp,288 ; 288-byte scratch area
+ and rsp,-256 ; round rsp down to a 256-byte boundary
+ mov QWORD[272+rsp],rax ; remember the pre-push rsp, since rsp can no longer be recomputed after the alignment
+
+$L$body_avx:
+ lea rbp,[K_XX_XX] ; rbp = address of the K_XX_XX table (defined outside this view)
+ lea rbx,[256+rsp] ; rbx = four dword counters at rsp+256
+
+ vzeroupper
+$L$oop_grande_avx: ; per-pass setup: read four (pointer, count) descriptors at rsi+0/16/32/48
+ mov DWORD[280+rsp],edx ; stash the incoming edx (3rd argument on the first pass); its reload is outside this view
+ xor edx,edx ; edx becomes the running maximum of the four counts
+
+ mov r8,QWORD[rsi] ; descriptor 0: pointer
+
+ mov ecx,DWORD[8+rsi] ; descriptor 0: count
+ cmp ecx,edx
+ cmovg edx,ecx ; edx = max(edx, count), signed compare
+ test ecx,ecx
+ mov DWORD[rbx],ecx ; record the count in counter slot 0 (mov leaves the flags from test intact)
+ cmovle r8,rbp ; count <= 0: point this lane at K_XX_XX so the loop's loads read the table instead
+
+ mov r9,QWORD[16+rsi] ; descriptor 1: same steps, pointer in r9, counter slot 1
+
+ mov ecx,DWORD[24+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[4+rbx],ecx
+ cmovle r9,rbp
+
+ mov r10,QWORD[32+rsi] ; descriptor 2: pointer in r10, counter slot 2
+
+ mov ecx,DWORD[40+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[8+rbx],ecx
+ cmovle r10,rbp
+
+ mov r11,QWORD[48+rsi] ; descriptor 3: pointer in r11, counter slot 3
+
+ mov ecx,DWORD[56+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[12+rbx],ecx
+ cmovle r11,rbp
+ test edx,edx
+ jz NEAR $L$done_avx ; maximum count is zero: nothing to process
+
+ vmovdqu xmm10,XMMWORD[rdi] ; load five 16-byte vectors from rdi at a 32-byte stride; presumably the SHA-1 state words A..E, one dword per lane
+ lea rax,[128+rsp] ; rax = scratch base + 128; the loop addresses its 16 slots as [(n-128)+rax]
+ vmovdqu xmm11,XMMWORD[32+rdi]
+ vmovdqu xmm12,XMMWORD[64+rdi]
+ vmovdqu xmm13,XMMWORD[96+rdi]
+ vmovdqu xmm14,XMMWORD[128+rdi]
+ vmovdqu xmm5,XMMWORD[96+rbp] ; xmm5 = vpshufb control used on each loaded word in the loop; presumably a byte-swap mask
+ jmp NEAR $L$oop_avx
+
+ALIGN 32
+$L$oop_avx:
+ vmovdqa xmm15,XMMWORD[((-32))+rbp]
+ vmovd xmm0,DWORD[r8]
+ lea r8,[64+r8]
+ vmovd xmm2,DWORD[r9]
+ lea r9,[64+r9]
+ vpinsrd xmm0,xmm0,DWORD[r10],1
+ lea r10,[64+r10]
+ vpinsrd xmm2,xmm2,DWORD[r11],1
+ lea r11,[64+r11]
+ vmovd xmm1,DWORD[((-60))+r8]
+ vpunpckldq xmm0,xmm0,xmm2
+ vmovd xmm9,DWORD[((-60))+r9]
+ vpshufb xmm0,xmm0,xmm5
+ vpinsrd xmm1,xmm1,DWORD[((-60))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-60))+r11],1
+ vpaddd xmm14,xmm14,xmm15
+ vpslld xmm8,xmm10,5
+ vpandn xmm7,xmm11,xmm13
+ vpand xmm6,xmm11,xmm12
+
+ vmovdqa XMMWORD[(0-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpunpckldq xmm1,xmm1,xmm9
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm2,DWORD[((-56))+r8]
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-56))+r9]
+ vpaddd xmm14,xmm14,xmm6
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpshufb xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpinsrd xmm2,xmm2,DWORD[((-56))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-56))+r11],1
+ vpaddd xmm13,xmm13,xmm15
+ vpslld xmm8,xmm14,5
+ vpandn xmm7,xmm10,xmm12
+ vpand xmm6,xmm10,xmm11
+
+ vmovdqa XMMWORD[(16-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpunpckldq xmm2,xmm2,xmm9
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm3,DWORD[((-52))+r8]
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-52))+r9]
+ vpaddd xmm13,xmm13,xmm6
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpshufb xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpinsrd xmm3,xmm3,DWORD[((-52))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-52))+r11],1
+ vpaddd xmm12,xmm12,xmm15
+ vpslld xmm8,xmm13,5
+ vpandn xmm7,xmm14,xmm11
+ vpand xmm6,xmm14,xmm10
+
+ vmovdqa XMMWORD[(32-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpunpckldq xmm3,xmm3,xmm9
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm4,DWORD[((-48))+r8]
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-48))+r9]
+ vpaddd xmm12,xmm12,xmm6
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpshufb xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpinsrd xmm4,xmm4,DWORD[((-48))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-48))+r11],1
+ vpaddd xmm11,xmm11,xmm15
+ vpslld xmm8,xmm12,5
+ vpandn xmm7,xmm13,xmm10
+ vpand xmm6,xmm13,xmm14
+
+ vmovdqa XMMWORD[(48-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpunpckldq xmm4,xmm4,xmm9
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm0,DWORD[((-44))+r8]
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-44))+r9]
+ vpaddd xmm11,xmm11,xmm6
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpshufb xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpinsrd xmm0,xmm0,DWORD[((-44))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-44))+r11],1
+ vpaddd xmm10,xmm10,xmm15
+ vpslld xmm8,xmm11,5
+ vpandn xmm7,xmm12,xmm14
+ vpand xmm6,xmm12,xmm13
+
+ vmovdqa XMMWORD[(64-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpunpckldq xmm0,xmm0,xmm9
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm1,DWORD[((-40))+r8]
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-40))+r9]
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpshufb xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpinsrd xmm1,xmm1,DWORD[((-40))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-40))+r11],1
+ vpaddd xmm14,xmm14,xmm15
+ vpslld xmm8,xmm10,5
+ vpandn xmm7,xmm11,xmm13
+ vpand xmm6,xmm11,xmm12
+
+ vmovdqa XMMWORD[(80-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpunpckldq xmm1,xmm1,xmm9
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm2,DWORD[((-36))+r8]
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-36))+r9]
+ vpaddd xmm14,xmm14,xmm6
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpshufb xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpinsrd xmm2,xmm2,DWORD[((-36))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-36))+r11],1
+ vpaddd xmm13,xmm13,xmm15
+ vpslld xmm8,xmm14,5
+ vpandn xmm7,xmm10,xmm12
+ vpand xmm6,xmm10,xmm11
+
+ vmovdqa XMMWORD[(96-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpunpckldq xmm2,xmm2,xmm9
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm3,DWORD[((-32))+r8]
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-32))+r9]
+ vpaddd xmm13,xmm13,xmm6
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpshufb xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpinsrd xmm3,xmm3,DWORD[((-32))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-32))+r11],1
+ vpaddd xmm12,xmm12,xmm15
+ vpslld xmm8,xmm13,5
+ vpandn xmm7,xmm14,xmm11
+ vpand xmm6,xmm14,xmm10
+
+ vmovdqa XMMWORD[(112-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpunpckldq xmm3,xmm3,xmm9
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm4,DWORD[((-28))+r8]
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-28))+r9]
+ vpaddd xmm12,xmm12,xmm6
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpshufb xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpinsrd xmm4,xmm4,DWORD[((-28))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-28))+r11],1
+ vpaddd xmm11,xmm11,xmm15
+ vpslld xmm8,xmm12,5
+ vpandn xmm7,xmm13,xmm10
+ vpand xmm6,xmm13,xmm14
+
+ vmovdqa XMMWORD[(128-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpunpckldq xmm4,xmm4,xmm9
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm0,DWORD[((-24))+r8]
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-24))+r9]
+ vpaddd xmm11,xmm11,xmm6
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpshufb xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpinsrd xmm0,xmm0,DWORD[((-24))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-24))+r11],1
+ vpaddd xmm10,xmm10,xmm15
+ vpslld xmm8,xmm11,5
+ vpandn xmm7,xmm12,xmm14
+ vpand xmm6,xmm12,xmm13
+
+ vmovdqa XMMWORD[(144-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpunpckldq xmm0,xmm0,xmm9
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm1,DWORD[((-20))+r8]
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-20))+r9]
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpshufb xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpinsrd xmm1,xmm1,DWORD[((-20))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-20))+r11],1
+ vpaddd xmm14,xmm14,xmm15
+ vpslld xmm8,xmm10,5
+ vpandn xmm7,xmm11,xmm13
+ vpand xmm6,xmm11,xmm12
+
+ vmovdqa XMMWORD[(160-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpunpckldq xmm1,xmm1,xmm9
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm2,DWORD[((-16))+r8]
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-16))+r9]
+ vpaddd xmm14,xmm14,xmm6
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpshufb xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpinsrd xmm2,xmm2,DWORD[((-16))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-16))+r11],1
+ vpaddd xmm13,xmm13,xmm15
+ vpslld xmm8,xmm14,5
+ vpandn xmm7,xmm10,xmm12
+ vpand xmm6,xmm10,xmm11
+
+ vmovdqa XMMWORD[(176-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpunpckldq xmm2,xmm2,xmm9
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm3,DWORD[((-12))+r8]
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-12))+r9]
+ vpaddd xmm13,xmm13,xmm6
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpshufb xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpinsrd xmm3,xmm3,DWORD[((-12))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-12))+r11],1
+ vpaddd xmm12,xmm12,xmm15
+ vpslld xmm8,xmm13,5
+ vpandn xmm7,xmm14,xmm11
+ vpand xmm6,xmm14,xmm10
+
+ vmovdqa XMMWORD[(192-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpunpckldq xmm3,xmm3,xmm9
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm4,DWORD[((-8))+r8]
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-8))+r9]
+ vpaddd xmm12,xmm12,xmm6
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpshufb xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpinsrd xmm4,xmm4,DWORD[((-8))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-8))+r11],1
+ vpaddd xmm11,xmm11,xmm15
+ vpslld xmm8,xmm12,5
+ vpandn xmm7,xmm13,xmm10
+ vpand xmm6,xmm13,xmm14
+
+ vmovdqa XMMWORD[(208-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpunpckldq xmm4,xmm4,xmm9
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm7
+ vmovd xmm0,DWORD[((-4))+r8]
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vmovd xmm9,DWORD[((-4))+r9]
+ vpaddd xmm11,xmm11,xmm6
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpshufb xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vmovdqa xmm1,XMMWORD[((0-128))+rax]
+ vpinsrd xmm0,xmm0,DWORD[((-4))+r10],1
+ vpinsrd xmm9,xmm9,DWORD[((-4))+r11],1
+ vpaddd xmm10,xmm10,xmm15
+ prefetcht0 [63+r8]
+ vpslld xmm8,xmm11,5
+ vpandn xmm7,xmm12,xmm14
+ vpand xmm6,xmm12,xmm13
+
+ vmovdqa XMMWORD[(224-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpunpckldq xmm0,xmm0,xmm9
+ vpsrld xmm9,xmm11,27
+ prefetcht0 [63+r9]
+ vpxor xmm6,xmm6,xmm7
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ prefetcht0 [63+r10]
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ prefetcht0 [63+r11]
+ vpshufb xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vmovdqa xmm2,XMMWORD[((16-128))+rax]
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((32-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm15
+ vpslld xmm8,xmm10,5
+ vpandn xmm7,xmm11,xmm13
+
+ vpand xmm6,xmm11,xmm12
+
+ vmovdqa XMMWORD[(240-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((128-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm7
+ vpxor xmm1,xmm1,xmm3
+
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((48-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm15
+ vpslld xmm8,xmm14,5
+ vpandn xmm7,xmm10,xmm12
+
+ vpand xmm6,xmm10,xmm11
+
+ vmovdqa XMMWORD[(0-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((144-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm7
+ vpxor xmm2,xmm2,xmm4
+
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((64-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm15
+ vpslld xmm8,xmm13,5
+ vpandn xmm7,xmm14,xmm11
+
+ vpand xmm6,xmm14,xmm10
+
+ vmovdqa XMMWORD[(16-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((160-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm7
+ vpxor xmm3,xmm3,xmm0
+
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((80-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm15
+ vpslld xmm8,xmm12,5
+ vpandn xmm7,xmm13,xmm10
+
+ vpand xmm6,xmm13,xmm14
+
+ vmovdqa XMMWORD[(32-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((176-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm7
+ vpxor xmm4,xmm4,xmm1
+
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((96-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm15
+ vpslld xmm8,xmm11,5
+ vpandn xmm7,xmm12,xmm14
+
+ vpand xmm6,xmm12,xmm13
+
+ vmovdqa XMMWORD[(48-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm0,xmm0,XMMWORD[((192-128))+rax]
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm7
+ vpxor xmm0,xmm0,xmm2
+
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm5,xmm0,31
+ vpaddd xmm0,xmm0,xmm0
+
+ vpsrld xmm12,xmm12,2
+
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vmovdqa xmm15,XMMWORD[rbp]
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((112-128))+rax]
+
+ vpslld xmm8,xmm10,5
+ vpaddd xmm14,xmm14,xmm15
+ vpxor xmm6,xmm13,xmm11
+ vmovdqa XMMWORD[(64-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((208-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((128-128))+rax]
+
+ vpslld xmm8,xmm14,5
+ vpaddd xmm13,xmm13,xmm15
+ vpxor xmm6,xmm12,xmm10
+ vmovdqa XMMWORD[(80-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((224-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((144-128))+rax]
+
+ vpslld xmm8,xmm13,5
+ vpaddd xmm12,xmm12,xmm15
+ vpxor xmm6,xmm11,xmm14
+ vmovdqa XMMWORD[(96-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((240-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((160-128))+rax]
+
+ vpslld xmm8,xmm12,5
+ vpaddd xmm11,xmm11,xmm15
+ vpxor xmm6,xmm10,xmm13
+ vmovdqa XMMWORD[(112-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((0-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((176-128))+rax]
+
+ vpslld xmm8,xmm11,5
+ vpaddd xmm10,xmm10,xmm15
+ vpxor xmm6,xmm14,xmm12
+ vmovdqa XMMWORD[(128-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm0,xmm0,XMMWORD[((16-128))+rax]
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+ vpsrld xmm5,xmm0,31
+ vpaddd xmm0,xmm0,xmm0
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((192-128))+rax]
+
+ vpslld xmm8,xmm10,5
+ vpaddd xmm14,xmm14,xmm15
+ vpxor xmm6,xmm13,xmm11
+ vmovdqa XMMWORD[(144-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((32-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((208-128))+rax]
+
+ vpslld xmm8,xmm14,5
+ vpaddd xmm13,xmm13,xmm15
+ vpxor xmm6,xmm12,xmm10
+ vmovdqa XMMWORD[(160-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((48-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((224-128))+rax]
+
+ vpslld xmm8,xmm13,5
+ vpaddd xmm12,xmm12,xmm15
+ vpxor xmm6,xmm11,xmm14
+ vmovdqa XMMWORD[(176-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((64-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((240-128))+rax]
+
+ vpslld xmm8,xmm12,5
+ vpaddd xmm11,xmm11,xmm15
+ vpxor xmm6,xmm10,xmm13
+ vmovdqa XMMWORD[(192-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((80-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((0-128))+rax]
+
+ vpslld xmm8,xmm11,5
+ vpaddd xmm10,xmm10,xmm15
+ vpxor xmm6,xmm14,xmm12
+ vmovdqa XMMWORD[(208-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm0,xmm0,XMMWORD[((96-128))+rax]
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+ vpsrld xmm5,xmm0,31
+ vpaddd xmm0,xmm0,xmm0
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((16-128))+rax]
+
+ vpslld xmm8,xmm10,5
+ vpaddd xmm14,xmm14,xmm15
+ vpxor xmm6,xmm13,xmm11
+ vmovdqa XMMWORD[(224-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((112-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((32-128))+rax]
+
+ vpslld xmm8,xmm14,5
+ vpaddd xmm13,xmm13,xmm15
+ vpxor xmm6,xmm12,xmm10
+ vmovdqa XMMWORD[(240-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((128-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((48-128))+rax]
+
+ vpslld xmm8,xmm13,5
+ vpaddd xmm12,xmm12,xmm15
+ vpxor xmm6,xmm11,xmm14
+ vmovdqa XMMWORD[(0-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((144-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((64-128))+rax]
+
+ vpslld xmm8,xmm12,5
+ vpaddd xmm11,xmm11,xmm15
+ vpxor xmm6,xmm10,xmm13
+ vmovdqa XMMWORD[(16-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((160-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((80-128))+rax]
+
+ vpslld xmm8,xmm11,5
+ vpaddd xmm10,xmm10,xmm15
+ vpxor xmm6,xmm14,xmm12
+ vmovdqa XMMWORD[(32-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm0,xmm0,XMMWORD[((176-128))+rax]
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+ vpsrld xmm5,xmm0,31
+ vpaddd xmm0,xmm0,xmm0
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((96-128))+rax]
+
+ vpslld xmm8,xmm10,5
+ vpaddd xmm14,xmm14,xmm15
+ vpxor xmm6,xmm13,xmm11
+ vmovdqa XMMWORD[(48-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((192-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((112-128))+rax]
+
+ vpslld xmm8,xmm14,5
+ vpaddd xmm13,xmm13,xmm15
+ vpxor xmm6,xmm12,xmm10
+ vmovdqa XMMWORD[(64-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((208-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((128-128))+rax]
+
+ vpslld xmm8,xmm13,5
+ vpaddd xmm12,xmm12,xmm15
+ vpxor xmm6,xmm11,xmm14
+ vmovdqa XMMWORD[(80-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((224-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((144-128))+rax]
+
+ vpslld xmm8,xmm12,5
+ vpaddd xmm11,xmm11,xmm15
+ vpxor xmm6,xmm10,xmm13
+ vmovdqa XMMWORD[(96-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((240-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((160-128))+rax]
+
+ vpslld xmm8,xmm11,5
+ vpaddd xmm10,xmm10,xmm15
+ vpxor xmm6,xmm14,xmm12
+ vmovdqa XMMWORD[(112-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm0,xmm0,XMMWORD[((0-128))+rax]
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+ vpsrld xmm5,xmm0,31
+ vpaddd xmm0,xmm0,xmm0
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vmovdqa xmm15,XMMWORD[32+rbp]
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((176-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm15
+ vpslld xmm8,xmm10,5
+ vpand xmm7,xmm13,xmm12
+ vpxor xmm1,xmm1,XMMWORD[((16-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm7
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm13,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vmovdqu XMMWORD[(128-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm1,31
+ vpand xmm6,xmm6,xmm11
+ vpaddd xmm1,xmm1,xmm1
+
+ vpslld xmm7,xmm11,30
+ vpaddd xmm14,xmm14,xmm6
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((192-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm15
+ vpslld xmm8,xmm14,5
+ vpand xmm7,xmm12,xmm11
+ vpxor xmm2,xmm2,XMMWORD[((32-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm7
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm12,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vmovdqu XMMWORD[(144-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm2,31
+ vpand xmm6,xmm6,xmm10
+ vpaddd xmm2,xmm2,xmm2
+
+ vpslld xmm7,xmm10,30
+ vpaddd xmm13,xmm13,xmm6
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((208-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm15
+ vpslld xmm8,xmm13,5
+ vpand xmm7,xmm11,xmm10
+ vpxor xmm3,xmm3,XMMWORD[((48-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm7
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm11,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vmovdqu XMMWORD[(160-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm3,31
+ vpand xmm6,xmm6,xmm14
+ vpaddd xmm3,xmm3,xmm3
+
+ vpslld xmm7,xmm14,30
+ vpaddd xmm12,xmm12,xmm6
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((224-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm15
+ vpslld xmm8,xmm12,5
+ vpand xmm7,xmm10,xmm14
+ vpxor xmm4,xmm4,XMMWORD[((64-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm7
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm10,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vmovdqu XMMWORD[(176-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm4,31
+ vpand xmm6,xmm6,xmm13
+ vpaddd xmm4,xmm4,xmm4
+
+ vpslld xmm7,xmm13,30
+ vpaddd xmm11,xmm11,xmm6
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((240-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm15
+ vpslld xmm8,xmm11,5
+ vpand xmm7,xmm14,xmm13
+ vpxor xmm0,xmm0,XMMWORD[((80-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm7
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm14,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vmovdqu XMMWORD[(192-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm0,31
+ vpand xmm6,xmm6,xmm12
+ vpaddd xmm0,xmm0,xmm0
+
+ vpslld xmm7,xmm12,30
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((0-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm15
+ vpslld xmm8,xmm10,5
+ vpand xmm7,xmm13,xmm12
+ vpxor xmm1,xmm1,XMMWORD[((96-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm7
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm13,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vmovdqu XMMWORD[(208-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm1,31
+ vpand xmm6,xmm6,xmm11
+ vpaddd xmm1,xmm1,xmm1
+
+ vpslld xmm7,xmm11,30
+ vpaddd xmm14,xmm14,xmm6
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((16-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm15
+ vpslld xmm8,xmm14,5
+ vpand xmm7,xmm12,xmm11
+ vpxor xmm2,xmm2,XMMWORD[((112-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm7
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm12,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vmovdqu XMMWORD[(224-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm2,31
+ vpand xmm6,xmm6,xmm10
+ vpaddd xmm2,xmm2,xmm2
+
+ vpslld xmm7,xmm10,30
+ vpaddd xmm13,xmm13,xmm6
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((32-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm15
+ vpslld xmm8,xmm13,5
+ vpand xmm7,xmm11,xmm10
+ vpxor xmm3,xmm3,XMMWORD[((128-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm7
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm11,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vmovdqu XMMWORD[(240-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm3,31
+ vpand xmm6,xmm6,xmm14
+ vpaddd xmm3,xmm3,xmm3
+
+ vpslld xmm7,xmm14,30
+ vpaddd xmm12,xmm12,xmm6
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((48-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm15
+ vpslld xmm8,xmm12,5
+ vpand xmm7,xmm10,xmm14
+ vpxor xmm4,xmm4,XMMWORD[((144-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm7
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm10,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vmovdqu XMMWORD[(0-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm4,31
+ vpand xmm6,xmm6,xmm13
+ vpaddd xmm4,xmm4,xmm4
+
+ vpslld xmm7,xmm13,30
+ vpaddd xmm11,xmm11,xmm6
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((64-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm15
+ vpslld xmm8,xmm11,5
+ vpand xmm7,xmm14,xmm13
+ vpxor xmm0,xmm0,XMMWORD[((160-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm7
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm14,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vmovdqu XMMWORD[(16-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm0,31
+ vpand xmm6,xmm6,xmm12
+ vpaddd xmm0,xmm0,xmm0
+
+ vpslld xmm7,xmm12,30
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((80-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm15
+ vpslld xmm8,xmm10,5
+ vpand xmm7,xmm13,xmm12
+ vpxor xmm1,xmm1,XMMWORD[((176-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm7
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm13,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vmovdqu XMMWORD[(32-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm1,31
+ vpand xmm6,xmm6,xmm11
+ vpaddd xmm1,xmm1,xmm1
+
+ vpslld xmm7,xmm11,30
+ vpaddd xmm14,xmm14,xmm6
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((96-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm15
+ vpslld xmm8,xmm14,5
+ vpand xmm7,xmm12,xmm11
+ vpxor xmm2,xmm2,XMMWORD[((192-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm7
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm12,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vmovdqu XMMWORD[(48-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm2,31
+ vpand xmm6,xmm6,xmm10
+ vpaddd xmm2,xmm2,xmm2
+
+ vpslld xmm7,xmm10,30
+ vpaddd xmm13,xmm13,xmm6
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((112-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm15
+ vpslld xmm8,xmm13,5
+ vpand xmm7,xmm11,xmm10
+ vpxor xmm3,xmm3,XMMWORD[((208-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm7
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm11,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vmovdqu XMMWORD[(64-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm3,31
+ vpand xmm6,xmm6,xmm14
+ vpaddd xmm3,xmm3,xmm3
+
+ vpslld xmm7,xmm14,30
+ vpaddd xmm12,xmm12,xmm6
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((128-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm15
+ vpslld xmm8,xmm12,5
+ vpand xmm7,xmm10,xmm14
+ vpxor xmm4,xmm4,XMMWORD[((224-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm7
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm10,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vmovdqu XMMWORD[(80-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm4,31
+ vpand xmm6,xmm6,xmm13
+ vpaddd xmm4,xmm4,xmm4
+
+ vpslld xmm7,xmm13,30
+ vpaddd xmm11,xmm11,xmm6
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((144-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm15
+ vpslld xmm8,xmm11,5
+ vpand xmm7,xmm14,xmm13
+ vpxor xmm0,xmm0,XMMWORD[((240-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm7
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm14,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vmovdqu XMMWORD[(96-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm0,31
+ vpand xmm6,xmm6,xmm12
+ vpaddd xmm0,xmm0,xmm0
+
+ vpslld xmm7,xmm12,30
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((160-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm15
+ vpslld xmm8,xmm10,5
+ vpand xmm7,xmm13,xmm12
+ vpxor xmm1,xmm1,XMMWORD[((0-128))+rax]
+
+ vpaddd xmm14,xmm14,xmm7
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm13,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vmovdqu XMMWORD[(112-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm1,31
+ vpand xmm6,xmm6,xmm11
+ vpaddd xmm1,xmm1,xmm1
+
+ vpslld xmm7,xmm11,30
+ vpaddd xmm14,xmm14,xmm6
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((176-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm15
+ vpslld xmm8,xmm14,5
+ vpand xmm7,xmm12,xmm11
+ vpxor xmm2,xmm2,XMMWORD[((16-128))+rax]
+
+ vpaddd xmm13,xmm13,xmm7
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm12,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vmovdqu XMMWORD[(128-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm2,31
+ vpand xmm6,xmm6,xmm10
+ vpaddd xmm2,xmm2,xmm2
+
+ vpslld xmm7,xmm10,30
+ vpaddd xmm13,xmm13,xmm6
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((192-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm15
+ vpslld xmm8,xmm13,5
+ vpand xmm7,xmm11,xmm10
+ vpxor xmm3,xmm3,XMMWORD[((32-128))+rax]
+
+ vpaddd xmm12,xmm12,xmm7
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm11,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vmovdqu XMMWORD[(144-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm3,31
+ vpand xmm6,xmm6,xmm14
+ vpaddd xmm3,xmm3,xmm3
+
+ vpslld xmm7,xmm14,30
+ vpaddd xmm12,xmm12,xmm6
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((208-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm15
+ vpslld xmm8,xmm12,5
+ vpand xmm7,xmm10,xmm14
+ vpxor xmm4,xmm4,XMMWORD[((48-128))+rax]
+
+ vpaddd xmm11,xmm11,xmm7
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm10,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vmovdqu XMMWORD[(160-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm4,31
+ vpand xmm6,xmm6,xmm13
+ vpaddd xmm4,xmm4,xmm4
+
+ vpslld xmm7,xmm13,30
+ vpaddd xmm11,xmm11,xmm6
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((224-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm15
+ vpslld xmm8,xmm11,5
+ vpand xmm7,xmm14,xmm13
+ vpxor xmm0,xmm0,XMMWORD[((64-128))+rax]
+
+ vpaddd xmm10,xmm10,xmm7
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm14,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vmovdqu XMMWORD[(176-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpor xmm8,xmm8,xmm9
+ vpsrld xmm5,xmm0,31
+ vpand xmm6,xmm6,xmm12
+ vpaddd xmm0,xmm0,xmm0
+
+ vpslld xmm7,xmm12,30
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vmovdqa xmm15,XMMWORD[64+rbp]
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((240-128))+rax]
+
+ vpslld xmm8,xmm10,5
+ vpaddd xmm14,xmm14,xmm15
+ vpxor xmm6,xmm13,xmm11
+ vmovdqa XMMWORD[(192-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((80-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((0-128))+rax]
+
+ vpslld xmm8,xmm14,5
+ vpaddd xmm13,xmm13,xmm15
+ vpxor xmm6,xmm12,xmm10
+ vmovdqa XMMWORD[(208-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((96-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((16-128))+rax]
+
+ vpslld xmm8,xmm13,5
+ vpaddd xmm12,xmm12,xmm15
+ vpxor xmm6,xmm11,xmm14
+ vmovdqa XMMWORD[(224-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((112-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((32-128))+rax]
+
+ vpslld xmm8,xmm12,5
+ vpaddd xmm11,xmm11,xmm15
+ vpxor xmm6,xmm10,xmm13
+ vmovdqa XMMWORD[(240-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((128-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((48-128))+rax]
+
+ vpslld xmm8,xmm11,5
+ vpaddd xmm10,xmm10,xmm15
+ vpxor xmm6,xmm14,xmm12
+ vmovdqa XMMWORD[(0-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm0,xmm0,XMMWORD[((144-128))+rax]
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+ vpsrld xmm5,xmm0,31
+ vpaddd xmm0,xmm0,xmm0
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((64-128))+rax]
+
+ vpslld xmm8,xmm10,5
+ vpaddd xmm14,xmm14,xmm15
+ vpxor xmm6,xmm13,xmm11
+ vmovdqa XMMWORD[(16-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((160-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((80-128))+rax]
+
+ vpslld xmm8,xmm14,5
+ vpaddd xmm13,xmm13,xmm15
+ vpxor xmm6,xmm12,xmm10
+ vmovdqa XMMWORD[(32-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((176-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((96-128))+rax]
+
+ vpslld xmm8,xmm13,5
+ vpaddd xmm12,xmm12,xmm15
+ vpxor xmm6,xmm11,xmm14
+ vmovdqa XMMWORD[(48-128)+rax],xmm2
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((192-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((112-128))+rax]
+
+ vpslld xmm8,xmm12,5
+ vpaddd xmm11,xmm11,xmm15
+ vpxor xmm6,xmm10,xmm13
+ vmovdqa XMMWORD[(64-128)+rax],xmm3
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((208-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((128-128))+rax]
+
+ vpslld xmm8,xmm11,5
+ vpaddd xmm10,xmm10,xmm15
+ vpxor xmm6,xmm14,xmm12
+ vmovdqa XMMWORD[(80-128)+rax],xmm4
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm0,xmm0,XMMWORD[((224-128))+rax]
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+ vpsrld xmm5,xmm0,31
+ vpaddd xmm0,xmm0,xmm0
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((144-128))+rax]
+
+ vpslld xmm8,xmm10,5
+ vpaddd xmm14,xmm14,xmm15
+ vpxor xmm6,xmm13,xmm11
+ vmovdqa XMMWORD[(96-128)+rax],xmm0
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((240-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((160-128))+rax]
+
+ vpslld xmm8,xmm14,5
+ vpaddd xmm13,xmm13,xmm15
+ vpxor xmm6,xmm12,xmm10
+ vmovdqa XMMWORD[(112-128)+rax],xmm1
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((0-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((176-128))+rax]
+
+ vpslld xmm8,xmm13,5
+ vpaddd xmm12,xmm12,xmm15
+ vpxor xmm6,xmm11,xmm14
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((16-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((192-128))+rax]
+
+ vpslld xmm8,xmm12,5
+ vpaddd xmm11,xmm11,xmm15
+ vpxor xmm6,xmm10,xmm13
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((32-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm2,XMMWORD[((208-128))+rax]
+
+ vpslld xmm8,xmm11,5
+ vpaddd xmm10,xmm10,xmm15
+ vpxor xmm6,xmm14,xmm12
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm0,xmm0,XMMWORD[((48-128))+rax]
+ vpsrld xmm9,xmm11,27
+ vpxor xmm6,xmm6,xmm13
+ vpxor xmm0,xmm0,xmm2
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+ vpsrld xmm5,xmm0,31
+ vpaddd xmm0,xmm0,xmm0
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm0,xmm0,xmm5
+ vpor xmm12,xmm12,xmm7
+ vpxor xmm1,xmm1,xmm3
+ vmovdqa xmm3,XMMWORD[((224-128))+rax]
+
+ vpslld xmm8,xmm10,5
+ vpaddd xmm14,xmm14,xmm15
+ vpxor xmm6,xmm13,xmm11
+ vpaddd xmm14,xmm14,xmm0
+ vpxor xmm1,xmm1,XMMWORD[((64-128))+rax]
+ vpsrld xmm9,xmm10,27
+ vpxor xmm6,xmm6,xmm12
+ vpxor xmm1,xmm1,xmm3
+
+ vpslld xmm7,xmm11,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm14,xmm14,xmm6
+ vpsrld xmm5,xmm1,31
+ vpaddd xmm1,xmm1,xmm1
+
+ vpsrld xmm11,xmm11,2
+ vpaddd xmm14,xmm14,xmm8
+ vpor xmm1,xmm1,xmm5
+ vpor xmm11,xmm11,xmm7
+ vpxor xmm2,xmm2,xmm4
+ vmovdqa xmm4,XMMWORD[((240-128))+rax]
+
+ vpslld xmm8,xmm14,5
+ vpaddd xmm13,xmm13,xmm15
+ vpxor xmm6,xmm12,xmm10
+ vpaddd xmm13,xmm13,xmm1
+ vpxor xmm2,xmm2,XMMWORD[((80-128))+rax]
+ vpsrld xmm9,xmm14,27
+ vpxor xmm6,xmm6,xmm11
+ vpxor xmm2,xmm2,xmm4
+
+ vpslld xmm7,xmm10,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm13,xmm13,xmm6
+ vpsrld xmm5,xmm2,31
+ vpaddd xmm2,xmm2,xmm2
+
+ vpsrld xmm10,xmm10,2
+ vpaddd xmm13,xmm13,xmm8
+ vpor xmm2,xmm2,xmm5
+ vpor xmm10,xmm10,xmm7
+ vpxor xmm3,xmm3,xmm0
+ vmovdqa xmm0,XMMWORD[((0-128))+rax]
+
+ vpslld xmm8,xmm13,5
+ vpaddd xmm12,xmm12,xmm15
+ vpxor xmm6,xmm11,xmm14
+ vpaddd xmm12,xmm12,xmm2
+ vpxor xmm3,xmm3,XMMWORD[((96-128))+rax]
+ vpsrld xmm9,xmm13,27
+ vpxor xmm6,xmm6,xmm10
+ vpxor xmm3,xmm3,xmm0
+
+ vpslld xmm7,xmm14,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm12,xmm12,xmm6
+ vpsrld xmm5,xmm3,31
+ vpaddd xmm3,xmm3,xmm3
+
+ vpsrld xmm14,xmm14,2
+ vpaddd xmm12,xmm12,xmm8
+ vpor xmm3,xmm3,xmm5
+ vpor xmm14,xmm14,xmm7
+ vpxor xmm4,xmm4,xmm1
+ vmovdqa xmm1,XMMWORD[((16-128))+rax]
+
+ vpslld xmm8,xmm12,5
+ vpaddd xmm11,xmm11,xmm15
+ vpxor xmm6,xmm10,xmm13
+ vpaddd xmm11,xmm11,xmm3
+ vpxor xmm4,xmm4,XMMWORD[((112-128))+rax]
+ vpsrld xmm9,xmm12,27
+ vpxor xmm6,xmm6,xmm14
+ vpxor xmm4,xmm4,xmm1
+
+ vpslld xmm7,xmm13,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm11,xmm11,xmm6
+ vpsrld xmm5,xmm4,31
+ vpaddd xmm4,xmm4,xmm4
+
+ vpsrld xmm13,xmm13,2
+ vpaddd xmm11,xmm11,xmm8
+ vpor xmm4,xmm4,xmm5
+ vpor xmm13,xmm13,xmm7
+ vpslld xmm8,xmm11,5
+ vpaddd xmm10,xmm10,xmm15
+ vpxor xmm6,xmm14,xmm12
+
+ vpsrld xmm9,xmm11,27
+ vpaddd xmm10,xmm10,xmm4
+ vpxor xmm6,xmm6,xmm13
+
+ vpslld xmm7,xmm12,30
+ vpor xmm8,xmm8,xmm9
+ vpaddd xmm10,xmm10,xmm6
+
+ vpsrld xmm12,xmm12,2
+ vpaddd xmm10,xmm10,xmm8
+ vpor xmm12,xmm12,xmm7
+ mov ecx,1
+ cmp ecx,DWORD[rbx]
+ cmovge r8,rbp
+ cmp ecx,DWORD[4+rbx]
+ cmovge r9,rbp
+ cmp ecx,DWORD[8+rbx]
+ cmovge r10,rbp
+ cmp ecx,DWORD[12+rbx]
+ cmovge r11,rbp
+ vmovdqu xmm6,XMMWORD[rbx]
+ vpxor xmm8,xmm8,xmm8
+ vmovdqa xmm7,xmm6
+ vpcmpgtd xmm7,xmm7,xmm8
+ vpaddd xmm6,xmm6,xmm7
+
+ vpand xmm10,xmm10,xmm7
+ vpand xmm11,xmm11,xmm7
+ vpaddd xmm10,xmm10,XMMWORD[rdi]
+ vpand xmm12,xmm12,xmm7
+ vpaddd xmm11,xmm11,XMMWORD[32+rdi]
+ vpand xmm13,xmm13,xmm7
+ vpaddd xmm12,xmm12,XMMWORD[64+rdi]
+ vpand xmm14,xmm14,xmm7
+ vpaddd xmm13,xmm13,XMMWORD[96+rdi]
+ vpaddd xmm14,xmm14,XMMWORD[128+rdi]
+ vmovdqu XMMWORD[rdi],xmm10
+ vmovdqu XMMWORD[32+rdi],xmm11
+ vmovdqu XMMWORD[64+rdi],xmm12
+ vmovdqu XMMWORD[96+rdi],xmm13
+ vmovdqu XMMWORD[128+rdi],xmm14
+
+ vmovdqu XMMWORD[rbx],xmm6
+ vmovdqu xmm5,XMMWORD[96+rbp]
+ dec edx
+ jnz NEAR $L$oop_avx
+
+ mov edx,DWORD[280+rsp]
+ lea rdi,[16+rdi]
+ lea rsi,[64+rsi]
+ dec edx
+ jnz NEAR $L$oop_grande_avx
+
+$L$done_avx:
+ mov rax,QWORD[272+rsp] ;rax = frame base kept at [272+rsp] (presumably stored by the prologue, which is outside this view)
+
+ vzeroupper ;zero the upper YMM halves before the legacy-SSE movaps below
+ movaps xmm6,XMMWORD[((-184))+rax] ;reload xmm6-xmm15 (callee-saved under the Microsoft x64 ABI) from below rax
+ movaps xmm7,XMMWORD[((-168))+rax]
+ movaps xmm8,XMMWORD[((-152))+rax]
+ movaps xmm9,XMMWORD[((-136))+rax]
+ movaps xmm10,XMMWORD[((-120))+rax]
+ movaps xmm11,XMMWORD[((-104))+rax]
+ movaps xmm12,XMMWORD[((-88))+rax]
+ movaps xmm13,XMMWORD[((-72))+rax]
+ movaps xmm14,XMMWORD[((-56))+rax]
+ movaps xmm15,XMMWORD[((-40))+rax]
+ mov rbp,QWORD[((-16))+rax] ;reload rbp from the frame
+
+ mov rbx,QWORD[((-8))+rax] ;reload rbx from the frame
+
+ lea rsp,[rax] ;rsp = rax: drop the whole frame in one step
+
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue: reload rdi and rsi from [8+rsp] and [16+rsp]
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret (bytes F3 C3 = rep ret)
+
+$L$SEH_end_sha1_multi_block_avx:
+
+ALIGN 32
+sha1_multi_block_avx2:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue: rdi/rsi are callee-saved on Win64, park them at [8+rsp] and [16+rsp]
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_multi_block_avx2:
+ mov rdi,rcx ;move the Win64 arguments (rcx,rdx,r8) into rdi,rsi,rdx, which the body below uses
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_avx2_shortcut: ;secondary entry label; NOTE(review): presumably jumped to from a dispatcher outside this view
+ mov rax,rsp ;rax = rsp before any push; base for the negative-offset saves below
+
+ push rbx ;save the callee-saved GPRs rbx, rbp, r12-r15 (6 pushes = 48 bytes)
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ lea rsp,[((-168))+rsp] ;reserve 168 bytes: 10*16 for xmm6-xmm15 plus 8; lea leaves flags untouched
+ movaps XMMWORD[rsp],xmm6 ;save xmm6-xmm11 relative to rsp ...
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[64+rsp],xmm10
+ movaps XMMWORD[80+rsp],xmm11
+ movaps XMMWORD[(-120)+rax],xmm12 ;... and xmm12-xmm15 relative to rax; rsp = rax-216 here, so these slots follow xmm11 contiguously
+ movaps XMMWORD[(-104)+rax],xmm13
+ movaps XMMWORD[(-88)+rax],xmm14
+ movaps XMMWORD[(-72)+rax],xmm15
+ sub rsp,576 ;reserve 576 more bytes for the working frame
+ and rsp,-256 ;round rsp down to a 256-byte boundary
+ mov QWORD[544+rsp],rax ;keep the pre-push rsp at [544+rsp]
+
+$L$body_avx2:
+ lea rbp,[K_XX_XX] ;rbp = address of K_XX_XX (defined outside this view)
+ shr edx,1 ;halve the count in edx; NOTE(review): presumably because each pass here covers 8 lanes - confirm against the caller
+
+ vzeroupper
+$L$oop_grande_avx2: ;per-pass setup: read 8 (pointer,count) descriptors spaced 16 bytes apart at rsi
+ mov DWORD[552+rsp],edx ;park the outer count at [552+rsp] so edx can be reused
+ xor edx,edx ;edx = running maximum of the lane counts, starts at 0
+ lea rbx,[512+rsp] ;rbx -> 8 dword slots at [512+rsp] receiving the lane counts
+
+ mov r12,QWORD[rsi] ;lane 0: r12 = pointer from descriptor 0
+
+ mov ecx,DWORD[8+rsi] ;lane 0: ecx = count from descriptor 0
+ cmp ecx,edx
+ cmovg edx,ecx ;edx = max(edx, ecx), signed compare
+ test ecx,ecx ;set flags on the count for the cmovle below
+ mov DWORD[rbx],ecx ;record the count (mov leaves the flags from test intact)
+ cmovle r12,rbp ;count <= 0: point this lane at K_XX_XX (rbp) instead of the descriptor's pointer
+
+ mov r13,QWORD[16+rsi] ;lane 1: same pattern, pointer in r13, count stored at [4+rbx]
+
+ mov ecx,DWORD[24+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[4+rbx],ecx
+ cmovle r13,rbp
+
+ mov r14,QWORD[32+rsi] ;lane 2: pointer in r14, count stored at [8+rbx]
+
+ mov ecx,DWORD[40+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[8+rbx],ecx
+ cmovle r14,rbp
+
+ mov r15,QWORD[48+rsi] ;lane 3: pointer in r15, count stored at [12+rbx]
+
+ mov ecx,DWORD[56+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[12+rbx],ecx
+ cmovle r15,rbp
+
+ mov r8,QWORD[64+rsi] ;lane 4: pointer in r8, count stored at [16+rbx]
+
+ mov ecx,DWORD[72+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[16+rbx],ecx
+ cmovle r8,rbp
+
+ mov r9,QWORD[80+rsi] ;lane 5: pointer in r9, count stored at [20+rbx]
+
+ mov ecx,DWORD[88+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[20+rbx],ecx
+ cmovle r9,rbp
+
+ mov r10,QWORD[96+rsi] ;lane 6: pointer in r10, count stored at [24+rbx]
+
+ mov ecx,DWORD[104+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[24+rbx],ecx
+ cmovle r10,rbp
+
+ mov r11,QWORD[112+rsi] ;lane 7: pointer in r11, count stored at [28+rbx]
+
+ mov ecx,DWORD[120+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[28+rbx],ecx
+ cmovle r11,rbp
+ vmovdqu ymm0,YMMWORD[rdi] ;load five 32-byte vectors from rdi at 0/32/64/96/128 into ymm0-ymm4
+ lea rax,[128+rsp] ;rax = rsp+128: biased base, the loop addresses its scratch as (off-128)+rax
+ vmovdqu ymm1,YMMWORD[32+rdi]
+ lea rbx,[((256+128))+rsp] ;rbx re-pointed to rsp+384: biased base for scratch offsets 256 and up
+ vmovdqu ymm2,YMMWORD[64+rdi]
+ vmovdqu ymm3,YMMWORD[96+rdi]
+ vmovdqu ymm4,YMMWORD[128+rdi]
+ vmovdqu ymm9,YMMWORD[96+rbp] ;ymm9 = constant at K_XX_XX+96, used as the vpshufb control in the loop
+ jmp NEAR $L$oop_avx2
+
+ALIGN 32
+$L$oop_avx2:
+ vmovdqa ymm15,YMMWORD[((-32))+rbp]
+ vmovd xmm10,DWORD[r12]
+ lea r12,[64+r12]
+ vmovd xmm12,DWORD[r8]
+ lea r8,[64+r8]
+ vmovd xmm7,DWORD[r13]
+ lea r13,[64+r13]
+ vmovd xmm6,DWORD[r9]
+ lea r9,[64+r9]
+ vpinsrd xmm10,xmm10,DWORD[r14],1
+ lea r14,[64+r14]
+ vpinsrd xmm12,xmm12,DWORD[r10],1
+ lea r10,[64+r10]
+ vpinsrd xmm7,xmm7,DWORD[r15],1
+ lea r15,[64+r15]
+ vpunpckldq ymm10,ymm10,ymm7
+ vpinsrd xmm6,xmm6,DWORD[r11],1
+ lea r11,[64+r11]
+ vpunpckldq ymm12,ymm12,ymm6
+ vmovd xmm11,DWORD[((-60))+r12]
+ vinserti128 ymm10,ymm10,xmm12,1
+ vmovd xmm8,DWORD[((-60))+r8]
+ vpshufb ymm10,ymm10,ymm9
+ vmovd xmm7,DWORD[((-60))+r13]
+ vmovd xmm6,DWORD[((-60))+r9]
+ vpinsrd xmm11,xmm11,DWORD[((-60))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-60))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-60))+r15],1
+ vpunpckldq ymm11,ymm11,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-60))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm4,ymm4,ymm15
+ vpslld ymm7,ymm0,5
+ vpandn ymm6,ymm1,ymm3
+ vpand ymm5,ymm1,ymm2
+
+ vmovdqa YMMWORD[(0-128)+rax],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vinserti128 ymm11,ymm11,xmm8,1
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm12,DWORD[((-56))+r12]
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-56))+r8]
+ vpaddd ymm4,ymm4,ymm5
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpshufb ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vmovd xmm7,DWORD[((-56))+r13]
+ vmovd xmm6,DWORD[((-56))+r9]
+ vpinsrd xmm12,xmm12,DWORD[((-56))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-56))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-56))+r15],1
+ vpunpckldq ymm12,ymm12,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-56))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm3,ymm3,ymm15
+ vpslld ymm7,ymm4,5
+ vpandn ymm6,ymm0,ymm2
+ vpand ymm5,ymm0,ymm1
+
+ vmovdqa YMMWORD[(32-128)+rax],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vinserti128 ymm12,ymm12,xmm8,1
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm13,DWORD[((-52))+r12]
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-52))+r8]
+ vpaddd ymm3,ymm3,ymm5
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpshufb ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vmovd xmm7,DWORD[((-52))+r13]
+ vmovd xmm6,DWORD[((-52))+r9]
+ vpinsrd xmm13,xmm13,DWORD[((-52))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-52))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-52))+r15],1
+ vpunpckldq ymm13,ymm13,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-52))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm2,ymm2,ymm15
+ vpslld ymm7,ymm3,5
+ vpandn ymm6,ymm4,ymm1
+ vpand ymm5,ymm4,ymm0
+
+ vmovdqa YMMWORD[(64-128)+rax],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vinserti128 ymm13,ymm13,xmm8,1
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm14,DWORD[((-48))+r12]
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-48))+r8]
+ vpaddd ymm2,ymm2,ymm5
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpshufb ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vmovd xmm7,DWORD[((-48))+r13]
+ vmovd xmm6,DWORD[((-48))+r9]
+ vpinsrd xmm14,xmm14,DWORD[((-48))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-48))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-48))+r15],1
+ vpunpckldq ymm14,ymm14,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-48))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm1,ymm1,ymm15
+ vpslld ymm7,ymm2,5
+ vpandn ymm6,ymm3,ymm0
+ vpand ymm5,ymm3,ymm4
+
+ vmovdqa YMMWORD[(96-128)+rax],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vinserti128 ymm14,ymm14,xmm8,1
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm10,DWORD[((-44))+r12]
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-44))+r8]
+ vpaddd ymm1,ymm1,ymm5
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpshufb ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vmovd xmm7,DWORD[((-44))+r13]
+ vmovd xmm6,DWORD[((-44))+r9]
+ vpinsrd xmm10,xmm10,DWORD[((-44))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-44))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-44))+r15],1
+ vpunpckldq ymm10,ymm10,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-44))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm0,ymm0,ymm15
+ vpslld ymm7,ymm1,5
+ vpandn ymm6,ymm2,ymm4
+ vpand ymm5,ymm2,ymm3
+
+ vmovdqa YMMWORD[(128-128)+rax],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vinserti128 ymm10,ymm10,xmm8,1
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm11,DWORD[((-40))+r12]
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-40))+r8]
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpshufb ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vmovd xmm7,DWORD[((-40))+r13]
+ vmovd xmm6,DWORD[((-40))+r9]
+ vpinsrd xmm11,xmm11,DWORD[((-40))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-40))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-40))+r15],1
+ vpunpckldq ymm11,ymm11,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-40))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm4,ymm4,ymm15
+ vpslld ymm7,ymm0,5
+ vpandn ymm6,ymm1,ymm3
+ vpand ymm5,ymm1,ymm2
+
+ vmovdqa YMMWORD[(160-128)+rax],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vinserti128 ymm11,ymm11,xmm8,1
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm12,DWORD[((-36))+r12]
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-36))+r8]
+ vpaddd ymm4,ymm4,ymm5
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpshufb ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vmovd xmm7,DWORD[((-36))+r13]
+ vmovd xmm6,DWORD[((-36))+r9]
+ vpinsrd xmm12,xmm12,DWORD[((-36))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-36))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-36))+r15],1
+ vpunpckldq ymm12,ymm12,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-36))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm3,ymm3,ymm15
+ vpslld ymm7,ymm4,5
+ vpandn ymm6,ymm0,ymm2
+ vpand ymm5,ymm0,ymm1
+
+ vmovdqa YMMWORD[(192-128)+rax],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vinserti128 ymm12,ymm12,xmm8,1
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm13,DWORD[((-32))+r12]
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-32))+r8]
+ vpaddd ymm3,ymm3,ymm5
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpshufb ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vmovd xmm7,DWORD[((-32))+r13]
+ vmovd xmm6,DWORD[((-32))+r9]
+ vpinsrd xmm13,xmm13,DWORD[((-32))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-32))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-32))+r15],1
+ vpunpckldq ymm13,ymm13,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-32))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm2,ymm2,ymm15
+ vpslld ymm7,ymm3,5
+ vpandn ymm6,ymm4,ymm1
+ vpand ymm5,ymm4,ymm0
+
+ vmovdqa YMMWORD[(224-128)+rax],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vinserti128 ymm13,ymm13,xmm8,1
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm14,DWORD[((-28))+r12]
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-28))+r8]
+ vpaddd ymm2,ymm2,ymm5
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpshufb ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vmovd xmm7,DWORD[((-28))+r13]
+ vmovd xmm6,DWORD[((-28))+r9]
+ vpinsrd xmm14,xmm14,DWORD[((-28))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-28))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-28))+r15],1
+ vpunpckldq ymm14,ymm14,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-28))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm1,ymm1,ymm15
+ vpslld ymm7,ymm2,5
+ vpandn ymm6,ymm3,ymm0
+ vpand ymm5,ymm3,ymm4
+
+ vmovdqa YMMWORD[(256-256-128)+rbx],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vinserti128 ymm14,ymm14,xmm8,1
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm10,DWORD[((-24))+r12]
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-24))+r8]
+ vpaddd ymm1,ymm1,ymm5
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpshufb ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vmovd xmm7,DWORD[((-24))+r13]
+ vmovd xmm6,DWORD[((-24))+r9]
+ vpinsrd xmm10,xmm10,DWORD[((-24))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-24))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-24))+r15],1
+ vpunpckldq ymm10,ymm10,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-24))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm0,ymm0,ymm15
+ vpslld ymm7,ymm1,5
+ vpandn ymm6,ymm2,ymm4
+ vpand ymm5,ymm2,ymm3
+
+ vmovdqa YMMWORD[(288-256-128)+rbx],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vinserti128 ymm10,ymm10,xmm8,1
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm11,DWORD[((-20))+r12]
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-20))+r8]
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpshufb ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vmovd xmm7,DWORD[((-20))+r13]
+ vmovd xmm6,DWORD[((-20))+r9]
+ vpinsrd xmm11,xmm11,DWORD[((-20))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-20))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-20))+r15],1
+ vpunpckldq ymm11,ymm11,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-20))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm4,ymm4,ymm15
+ vpslld ymm7,ymm0,5
+ vpandn ymm6,ymm1,ymm3
+ vpand ymm5,ymm1,ymm2
+
+ vmovdqa YMMWORD[(320-256-128)+rbx],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vinserti128 ymm11,ymm11,xmm8,1
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm12,DWORD[((-16))+r12]
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-16))+r8]
+ vpaddd ymm4,ymm4,ymm5
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpshufb ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vmovd xmm7,DWORD[((-16))+r13]
+ vmovd xmm6,DWORD[((-16))+r9]
+ vpinsrd xmm12,xmm12,DWORD[((-16))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-16))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-16))+r15],1
+ vpunpckldq ymm12,ymm12,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-16))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm3,ymm3,ymm15
+ vpslld ymm7,ymm4,5
+ vpandn ymm6,ymm0,ymm2
+ vpand ymm5,ymm0,ymm1
+
+ vmovdqa YMMWORD[(352-256-128)+rbx],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vinserti128 ymm12,ymm12,xmm8,1
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm13,DWORD[((-12))+r12]
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-12))+r8]
+ vpaddd ymm3,ymm3,ymm5
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpshufb ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vmovd xmm7,DWORD[((-12))+r13]
+ vmovd xmm6,DWORD[((-12))+r9]
+ vpinsrd xmm13,xmm13,DWORD[((-12))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-12))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-12))+r15],1
+ vpunpckldq ymm13,ymm13,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-12))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm2,ymm2,ymm15
+ vpslld ymm7,ymm3,5
+ vpandn ymm6,ymm4,ymm1
+ vpand ymm5,ymm4,ymm0
+
+ vmovdqa YMMWORD[(384-256-128)+rbx],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vinserti128 ymm13,ymm13,xmm8,1
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm14,DWORD[((-8))+r12]
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-8))+r8]
+ vpaddd ymm2,ymm2,ymm5
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpshufb ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vmovd xmm7,DWORD[((-8))+r13]
+ vmovd xmm6,DWORD[((-8))+r9]
+ vpinsrd xmm14,xmm14,DWORD[((-8))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-8))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-8))+r15],1
+ vpunpckldq ymm14,ymm14,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-8))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm1,ymm1,ymm15
+ vpslld ymm7,ymm2,5
+ vpandn ymm6,ymm3,ymm0
+ vpand ymm5,ymm3,ymm4
+
+ vmovdqa YMMWORD[(416-256-128)+rbx],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vinserti128 ymm14,ymm14,xmm8,1
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm6
+ vmovd xmm10,DWORD[((-4))+r12]
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vmovd xmm8,DWORD[((-4))+r8]
+ vpaddd ymm1,ymm1,ymm5
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpshufb ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vmovdqa ymm11,YMMWORD[((0-128))+rax]
+ vmovd xmm7,DWORD[((-4))+r13]
+ vmovd xmm6,DWORD[((-4))+r9]
+ vpinsrd xmm10,xmm10,DWORD[((-4))+r14],1
+ vpinsrd xmm8,xmm8,DWORD[((-4))+r10],1
+ vpinsrd xmm7,xmm7,DWORD[((-4))+r15],1
+ vpunpckldq ymm10,ymm10,ymm7
+ vpinsrd xmm6,xmm6,DWORD[((-4))+r11],1
+ vpunpckldq ymm8,ymm8,ymm6
+ vpaddd ymm0,ymm0,ymm15
+ prefetcht0 [63+r12]
+ vpslld ymm7,ymm1,5
+ vpandn ymm6,ymm2,ymm4
+ vpand ymm5,ymm2,ymm3
+
+ vmovdqa YMMWORD[(448-256-128)+rbx],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vinserti128 ymm10,ymm10,xmm8,1
+ vpsrld ymm8,ymm1,27
+ prefetcht0 [63+r13]
+ vpxor ymm5,ymm5,ymm6
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ prefetcht0 [63+r14]
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ prefetcht0 [63+r15]
+ vpshufb ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vmovdqa ymm12,YMMWORD[((32-128))+rax]
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((64-128))+rax]
+
+ vpaddd ymm4,ymm4,ymm15
+ vpslld ymm7,ymm0,5
+ vpandn ymm6,ymm1,ymm3
+ prefetcht0 [63+r8]
+ vpand ymm5,ymm1,ymm2
+
+ vmovdqa YMMWORD[(480-256-128)+rbx],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((256-256-128))+rbx]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm6
+ vpxor ymm11,ymm11,ymm13
+ prefetcht0 [63+r9]
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ prefetcht0 [63+r10]
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ prefetcht0 [63+r11]
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((96-128))+rax]
+
+ vpaddd ymm3,ymm3,ymm15
+ vpslld ymm7,ymm4,5
+ vpandn ymm6,ymm0,ymm2
+
+ vpand ymm5,ymm0,ymm1
+
+ vmovdqa YMMWORD[(0-128)+rax],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((288-256-128))+rbx]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm6
+ vpxor ymm12,ymm12,ymm14
+
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((128-128))+rax]
+
+ vpaddd ymm2,ymm2,ymm15
+ vpslld ymm7,ymm3,5
+ vpandn ymm6,ymm4,ymm1
+
+ vpand ymm5,ymm4,ymm0
+
+ vmovdqa YMMWORD[(32-128)+rax],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((320-256-128))+rbx]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm6
+ vpxor ymm13,ymm13,ymm10
+
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((160-128))+rax]
+
+ vpaddd ymm1,ymm1,ymm15
+ vpslld ymm7,ymm2,5
+ vpandn ymm6,ymm3,ymm0
+
+ vpand ymm5,ymm3,ymm4
+
+ vmovdqa YMMWORD[(64-128)+rax],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((352-256-128))+rbx]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm6
+ vpxor ymm14,ymm14,ymm11
+
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((192-128))+rax]
+
+ vpaddd ymm0,ymm0,ymm15
+ vpslld ymm7,ymm1,5
+ vpandn ymm6,ymm2,ymm4
+
+ vpand ymm5,ymm2,ymm3
+
+ vmovdqa YMMWORD[(96-128)+rax],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm10,ymm10,YMMWORD[((384-256-128))+rbx]
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm6
+ vpxor ymm10,ymm10,ymm12
+
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm9,ymm10,31
+ vpaddd ymm10,ymm10,ymm10
+
+ vpsrld ymm2,ymm2,2
+
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vmovdqa ymm15,YMMWORD[rbp]
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((224-128))+rax]
+
+ vpslld ymm7,ymm0,5
+ vpaddd ymm4,ymm4,ymm15
+ vpxor ymm5,ymm3,ymm1
+ vmovdqa YMMWORD[(128-128)+rax],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((416-256-128))+rbx]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((256-256-128))+rbx]
+
+ vpslld ymm7,ymm4,5
+ vpaddd ymm3,ymm3,ymm15
+ vpxor ymm5,ymm2,ymm0
+ vmovdqa YMMWORD[(160-128)+rax],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((448-256-128))+rbx]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((288-256-128))+rbx]
+
+ vpslld ymm7,ymm3,5
+ vpaddd ymm2,ymm2,ymm15
+ vpxor ymm5,ymm1,ymm4
+ vmovdqa YMMWORD[(192-128)+rax],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((480-256-128))+rbx]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((320-256-128))+rbx]
+
+ vpslld ymm7,ymm2,5
+ vpaddd ymm1,ymm1,ymm15
+ vpxor ymm5,ymm0,ymm3
+ vmovdqa YMMWORD[(224-128)+rax],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((0-128))+rax]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((352-256-128))+rbx]
+
+ vpslld ymm7,ymm1,5
+ vpaddd ymm0,ymm0,ymm15
+ vpxor ymm5,ymm4,ymm2
+ vmovdqa YMMWORD[(256-256-128)+rbx],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm10,ymm10,YMMWORD[((32-128))+rax]
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+ vpsrld ymm9,ymm10,31
+ vpaddd ymm10,ymm10,ymm10
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((384-256-128))+rbx]
+
+ vpslld ymm7,ymm0,5
+ vpaddd ymm4,ymm4,ymm15
+ vpxor ymm5,ymm3,ymm1
+ vmovdqa YMMWORD[(288-256-128)+rbx],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((64-128))+rax]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((416-256-128))+rbx]
+
+ vpslld ymm7,ymm4,5
+ vpaddd ymm3,ymm3,ymm15
+ vpxor ymm5,ymm2,ymm0
+ vmovdqa YMMWORD[(320-256-128)+rbx],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((96-128))+rax]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((448-256-128))+rbx]
+
+ vpslld ymm7,ymm3,5
+ vpaddd ymm2,ymm2,ymm15
+ vpxor ymm5,ymm1,ymm4
+ vmovdqa YMMWORD[(352-256-128)+rbx],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((128-128))+rax]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((480-256-128))+rbx]
+
+ vpslld ymm7,ymm2,5
+ vpaddd ymm1,ymm1,ymm15
+ vpxor ymm5,ymm0,ymm3
+ vmovdqa YMMWORD[(384-256-128)+rbx],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((160-128))+rax]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((0-128))+rax]
+
+ vpslld ymm7,ymm1,5
+ vpaddd ymm0,ymm0,ymm15
+ vpxor ymm5,ymm4,ymm2
+ vmovdqa YMMWORD[(416-256-128)+rbx],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm10,ymm10,YMMWORD[((192-128))+rax]
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+ vpsrld ymm9,ymm10,31
+ vpaddd ymm10,ymm10,ymm10
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((32-128))+rax]
+
+ vpslld ymm7,ymm0,5
+ vpaddd ymm4,ymm4,ymm15
+ vpxor ymm5,ymm3,ymm1
+ vmovdqa YMMWORD[(448-256-128)+rbx],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((224-128))+rax]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((64-128))+rax]
+
+ vpslld ymm7,ymm4,5
+ vpaddd ymm3,ymm3,ymm15
+ vpxor ymm5,ymm2,ymm0
+ vmovdqa YMMWORD[(480-256-128)+rbx],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((256-256-128))+rbx]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((96-128))+rax]
+
+ vpslld ymm7,ymm3,5
+ vpaddd ymm2,ymm2,ymm15
+ vpxor ymm5,ymm1,ymm4
+ vmovdqa YMMWORD[(0-128)+rax],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((288-256-128))+rbx]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((128-128))+rax]
+
+ vpslld ymm7,ymm2,5
+ vpaddd ymm1,ymm1,ymm15
+ vpxor ymm5,ymm0,ymm3
+ vmovdqa YMMWORD[(32-128)+rax],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((320-256-128))+rbx]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((160-128))+rax]
+
+ vpslld ymm7,ymm1,5
+ vpaddd ymm0,ymm0,ymm15
+ vpxor ymm5,ymm4,ymm2
+ vmovdqa YMMWORD[(64-128)+rax],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm10,ymm10,YMMWORD[((352-256-128))+rbx]
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+ vpsrld ymm9,ymm10,31
+ vpaddd ymm10,ymm10,ymm10
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((192-128))+rax]
+
+ vpslld ymm7,ymm0,5
+ vpaddd ymm4,ymm4,ymm15
+ vpxor ymm5,ymm3,ymm1
+ vmovdqa YMMWORD[(96-128)+rax],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((384-256-128))+rbx]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((224-128))+rax]
+
+ vpslld ymm7,ymm4,5
+ vpaddd ymm3,ymm3,ymm15
+ vpxor ymm5,ymm2,ymm0
+ vmovdqa YMMWORD[(128-128)+rax],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((416-256-128))+rbx]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((256-256-128))+rbx]
+
+ vpslld ymm7,ymm3,5
+ vpaddd ymm2,ymm2,ymm15
+ vpxor ymm5,ymm1,ymm4
+ vmovdqa YMMWORD[(160-128)+rax],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((448-256-128))+rbx]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((288-256-128))+rbx]
+
+ vpslld ymm7,ymm2,5
+ vpaddd ymm1,ymm1,ymm15
+ vpxor ymm5,ymm0,ymm3
+ vmovdqa YMMWORD[(192-128)+rax],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((480-256-128))+rbx]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((320-256-128))+rbx]
+
+ vpslld ymm7,ymm1,5
+ vpaddd ymm0,ymm0,ymm15
+ vpxor ymm5,ymm4,ymm2
+ vmovdqa YMMWORD[(224-128)+rax],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm10,ymm10,YMMWORD[((0-128))+rax]
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+ vpsrld ymm9,ymm10,31
+ vpaddd ymm10,ymm10,ymm10
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vmovdqa ymm15,YMMWORD[32+rbp]
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((352-256-128))+rbx]
+
+ vpaddd ymm4,ymm4,ymm15
+ vpslld ymm7,ymm0,5
+ vpand ymm6,ymm3,ymm2
+ vpxor ymm11,ymm11,YMMWORD[((32-128))+rax]
+
+ vpaddd ymm4,ymm4,ymm6
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm3,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vmovdqu YMMWORD[(256-256-128)+rbx],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm11,31
+ vpand ymm5,ymm5,ymm1
+ vpaddd ymm11,ymm11,ymm11
+
+ vpslld ymm6,ymm1,30
+ vpaddd ymm4,ymm4,ymm5
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((384-256-128))+rbx]
+
+ vpaddd ymm3,ymm3,ymm15
+ vpslld ymm7,ymm4,5
+ vpand ymm6,ymm2,ymm1
+ vpxor ymm12,ymm12,YMMWORD[((64-128))+rax]
+
+ vpaddd ymm3,ymm3,ymm6
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm2,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vmovdqu YMMWORD[(288-256-128)+rbx],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm12,31
+ vpand ymm5,ymm5,ymm0
+ vpaddd ymm12,ymm12,ymm12
+
+ vpslld ymm6,ymm0,30
+ vpaddd ymm3,ymm3,ymm5
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((416-256-128))+rbx]
+
+ vpaddd ymm2,ymm2,ymm15
+ vpslld ymm7,ymm3,5
+ vpand ymm6,ymm1,ymm0
+ vpxor ymm13,ymm13,YMMWORD[((96-128))+rax]
+
+ vpaddd ymm2,ymm2,ymm6
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm1,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vmovdqu YMMWORD[(320-256-128)+rbx],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm13,31
+ vpand ymm5,ymm5,ymm4
+ vpaddd ymm13,ymm13,ymm13
+
+ vpslld ymm6,ymm4,30
+ vpaddd ymm2,ymm2,ymm5
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((448-256-128))+rbx]
+
+ vpaddd ymm1,ymm1,ymm15
+ vpslld ymm7,ymm2,5
+ vpand ymm6,ymm0,ymm4
+ vpxor ymm14,ymm14,YMMWORD[((128-128))+rax]
+
+ vpaddd ymm1,ymm1,ymm6
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm0,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vmovdqu YMMWORD[(352-256-128)+rbx],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm14,31
+ vpand ymm5,ymm5,ymm3
+ vpaddd ymm14,ymm14,ymm14
+
+ vpslld ymm6,ymm3,30
+ vpaddd ymm1,ymm1,ymm5
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((480-256-128))+rbx]
+
+ vpaddd ymm0,ymm0,ymm15
+ vpslld ymm7,ymm1,5
+ vpand ymm6,ymm4,ymm3
+ vpxor ymm10,ymm10,YMMWORD[((160-128))+rax]
+
+ vpaddd ymm0,ymm0,ymm6
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm4,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vmovdqu YMMWORD[(384-256-128)+rbx],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm10,31
+ vpand ymm5,ymm5,ymm2
+ vpaddd ymm10,ymm10,ymm10
+
+ vpslld ymm6,ymm2,30
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((0-128))+rax]
+
+ vpaddd ymm4,ymm4,ymm15
+ vpslld ymm7,ymm0,5
+ vpand ymm6,ymm3,ymm2
+ vpxor ymm11,ymm11,YMMWORD[((192-128))+rax]
+
+ vpaddd ymm4,ymm4,ymm6
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm3,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vmovdqu YMMWORD[(416-256-128)+rbx],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm11,31
+ vpand ymm5,ymm5,ymm1
+ vpaddd ymm11,ymm11,ymm11
+
+ vpslld ymm6,ymm1,30
+ vpaddd ymm4,ymm4,ymm5
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((32-128))+rax]
+
+ vpaddd ymm3,ymm3,ymm15
+ vpslld ymm7,ymm4,5
+ vpand ymm6,ymm2,ymm1
+ vpxor ymm12,ymm12,YMMWORD[((224-128))+rax]
+
+ vpaddd ymm3,ymm3,ymm6
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm2,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vmovdqu YMMWORD[(448-256-128)+rbx],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm12,31
+ vpand ymm5,ymm5,ymm0
+ vpaddd ymm12,ymm12,ymm12
+
+ vpslld ymm6,ymm0,30
+ vpaddd ymm3,ymm3,ymm5
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((64-128))+rax]
+
+ vpaddd ymm2,ymm2,ymm15
+ vpslld ymm7,ymm3,5
+ vpand ymm6,ymm1,ymm0
+ vpxor ymm13,ymm13,YMMWORD[((256-256-128))+rbx]
+
+ vpaddd ymm2,ymm2,ymm6
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm1,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vmovdqu YMMWORD[(480-256-128)+rbx],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm13,31
+ vpand ymm5,ymm5,ymm4
+ vpaddd ymm13,ymm13,ymm13
+
+ vpslld ymm6,ymm4,30
+ vpaddd ymm2,ymm2,ymm5
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((96-128))+rax]
+
+ vpaddd ymm1,ymm1,ymm15
+ vpslld ymm7,ymm2,5
+ vpand ymm6,ymm0,ymm4
+ vpxor ymm14,ymm14,YMMWORD[((288-256-128))+rbx]
+
+ vpaddd ymm1,ymm1,ymm6
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm0,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vmovdqu YMMWORD[(0-128)+rax],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm14,31
+ vpand ymm5,ymm5,ymm3
+ vpaddd ymm14,ymm14,ymm14
+
+ vpslld ymm6,ymm3,30
+ vpaddd ymm1,ymm1,ymm5
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((128-128))+rax]
+
+ vpaddd ymm0,ymm0,ymm15
+ vpslld ymm7,ymm1,5
+ vpand ymm6,ymm4,ymm3
+ vpxor ymm10,ymm10,YMMWORD[((320-256-128))+rbx]
+
+ vpaddd ymm0,ymm0,ymm6
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm4,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vmovdqu YMMWORD[(32-128)+rax],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm10,31
+ vpand ymm5,ymm5,ymm2
+ vpaddd ymm10,ymm10,ymm10
+
+ vpslld ymm6,ymm2,30
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((160-128))+rax]
+
+ vpaddd ymm4,ymm4,ymm15
+ vpslld ymm7,ymm0,5
+ vpand ymm6,ymm3,ymm2
+ vpxor ymm11,ymm11,YMMWORD[((352-256-128))+rbx]
+
+ vpaddd ymm4,ymm4,ymm6
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm3,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vmovdqu YMMWORD[(64-128)+rax],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm11,31
+ vpand ymm5,ymm5,ymm1
+ vpaddd ymm11,ymm11,ymm11
+
+ vpslld ymm6,ymm1,30
+ vpaddd ymm4,ymm4,ymm5
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((192-128))+rax]
+
+ vpaddd ymm3,ymm3,ymm15
+ vpslld ymm7,ymm4,5
+ vpand ymm6,ymm2,ymm1
+ vpxor ymm12,ymm12,YMMWORD[((384-256-128))+rbx]
+
+ vpaddd ymm3,ymm3,ymm6
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm2,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vmovdqu YMMWORD[(96-128)+rax],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm12,31
+ vpand ymm5,ymm5,ymm0
+ vpaddd ymm12,ymm12,ymm12
+
+ vpslld ymm6,ymm0,30
+ vpaddd ymm3,ymm3,ymm5
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((224-128))+rax]
+
+ vpaddd ymm2,ymm2,ymm15
+ vpslld ymm7,ymm3,5
+ vpand ymm6,ymm1,ymm0
+ vpxor ymm13,ymm13,YMMWORD[((416-256-128))+rbx]
+
+ vpaddd ymm2,ymm2,ymm6
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm1,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vmovdqu YMMWORD[(128-128)+rax],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm13,31
+ vpand ymm5,ymm5,ymm4
+ vpaddd ymm13,ymm13,ymm13
+
+ vpslld ymm6,ymm4,30
+ vpaddd ymm2,ymm2,ymm5
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((256-256-128))+rbx]
+
+ vpaddd ymm1,ymm1,ymm15
+ vpslld ymm7,ymm2,5
+ vpand ymm6,ymm0,ymm4
+ vpxor ymm14,ymm14,YMMWORD[((448-256-128))+rbx]
+
+ vpaddd ymm1,ymm1,ymm6
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm0,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vmovdqu YMMWORD[(160-128)+rax],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm14,31
+ vpand ymm5,ymm5,ymm3
+ vpaddd ymm14,ymm14,ymm14
+
+ vpslld ymm6,ymm3,30
+ vpaddd ymm1,ymm1,ymm5
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((288-256-128))+rbx]
+
+ vpaddd ymm0,ymm0,ymm15
+ vpslld ymm7,ymm1,5
+ vpand ymm6,ymm4,ymm3
+ vpxor ymm10,ymm10,YMMWORD[((480-256-128))+rbx]
+
+ vpaddd ymm0,ymm0,ymm6
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm4,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vmovdqu YMMWORD[(192-128)+rax],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm10,31
+ vpand ymm5,ymm5,ymm2
+ vpaddd ymm10,ymm10,ymm10
+
+ vpslld ymm6,ymm2,30
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((320-256-128))+rbx]
+
+ vpaddd ymm4,ymm4,ymm15
+ vpslld ymm7,ymm0,5
+ vpand ymm6,ymm3,ymm2
+ vpxor ymm11,ymm11,YMMWORD[((0-128))+rax]
+
+ vpaddd ymm4,ymm4,ymm6
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm3,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vmovdqu YMMWORD[(224-128)+rax],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm11,31
+ vpand ymm5,ymm5,ymm1
+ vpaddd ymm11,ymm11,ymm11
+
+ vpslld ymm6,ymm1,30
+ vpaddd ymm4,ymm4,ymm5
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((352-256-128))+rbx]
+
+ vpaddd ymm3,ymm3,ymm15
+ vpslld ymm7,ymm4,5
+ vpand ymm6,ymm2,ymm1
+ vpxor ymm12,ymm12,YMMWORD[((32-128))+rax]
+
+ vpaddd ymm3,ymm3,ymm6
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm2,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vmovdqu YMMWORD[(256-256-128)+rbx],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm12,31
+ vpand ymm5,ymm5,ymm0
+ vpaddd ymm12,ymm12,ymm12
+
+ vpslld ymm6,ymm0,30
+ vpaddd ymm3,ymm3,ymm5
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((384-256-128))+rbx]
+
+ vpaddd ymm2,ymm2,ymm15
+ vpslld ymm7,ymm3,5
+ vpand ymm6,ymm1,ymm0
+ vpxor ymm13,ymm13,YMMWORD[((64-128))+rax]
+
+ vpaddd ymm2,ymm2,ymm6
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm1,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vmovdqu YMMWORD[(288-256-128)+rbx],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm13,31
+ vpand ymm5,ymm5,ymm4
+ vpaddd ymm13,ymm13,ymm13
+
+ vpslld ymm6,ymm4,30
+ vpaddd ymm2,ymm2,ymm5
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((416-256-128))+rbx]
+
+ vpaddd ymm1,ymm1,ymm15
+ vpslld ymm7,ymm2,5
+ vpand ymm6,ymm0,ymm4
+ vpxor ymm14,ymm14,YMMWORD[((96-128))+rax]
+
+ vpaddd ymm1,ymm1,ymm6
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm0,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vmovdqu YMMWORD[(320-256-128)+rbx],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm14,31
+ vpand ymm5,ymm5,ymm3
+ vpaddd ymm14,ymm14,ymm14
+
+ vpslld ymm6,ymm3,30
+ vpaddd ymm1,ymm1,ymm5
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((448-256-128))+rbx]
+
+ vpaddd ymm0,ymm0,ymm15
+ vpslld ymm7,ymm1,5
+ vpand ymm6,ymm4,ymm3
+ vpxor ymm10,ymm10,YMMWORD[((128-128))+rax]
+
+ vpaddd ymm0,ymm0,ymm6
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm4,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vmovdqu YMMWORD[(352-256-128)+rbx],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpor ymm7,ymm7,ymm8
+ vpsrld ymm9,ymm10,31
+ vpand ymm5,ymm5,ymm2
+ vpaddd ymm10,ymm10,ymm10
+
+ vpslld ymm6,ymm2,30
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vmovdqa ymm15,YMMWORD[64+rbp]
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((480-256-128))+rbx]
+
+ vpslld ymm7,ymm0,5
+ vpaddd ymm4,ymm4,ymm15
+ vpxor ymm5,ymm3,ymm1
+ vmovdqa YMMWORD[(384-256-128)+rbx],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((160-128))+rax]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((0-128))+rax]
+
+ vpslld ymm7,ymm4,5
+ vpaddd ymm3,ymm3,ymm15
+ vpxor ymm5,ymm2,ymm0
+ vmovdqa YMMWORD[(416-256-128)+rbx],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((192-128))+rax]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((32-128))+rax]
+
+ vpslld ymm7,ymm3,5
+ vpaddd ymm2,ymm2,ymm15
+ vpxor ymm5,ymm1,ymm4
+ vmovdqa YMMWORD[(448-256-128)+rbx],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((224-128))+rax]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((64-128))+rax]
+
+ vpslld ymm7,ymm2,5
+ vpaddd ymm1,ymm1,ymm15
+ vpxor ymm5,ymm0,ymm3
+ vmovdqa YMMWORD[(480-256-128)+rbx],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((256-256-128))+rbx]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((96-128))+rax]
+
+ vpslld ymm7,ymm1,5
+ vpaddd ymm0,ymm0,ymm15
+ vpxor ymm5,ymm4,ymm2
+ vmovdqa YMMWORD[(0-128)+rax],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm10,ymm10,YMMWORD[((288-256-128))+rbx]
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+ vpsrld ymm9,ymm10,31
+ vpaddd ymm10,ymm10,ymm10
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((128-128))+rax]
+
+ vpslld ymm7,ymm0,5
+ vpaddd ymm4,ymm4,ymm15
+ vpxor ymm5,ymm3,ymm1
+ vmovdqa YMMWORD[(32-128)+rax],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((320-256-128))+rbx]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((160-128))+rax]
+
+ vpslld ymm7,ymm4,5
+ vpaddd ymm3,ymm3,ymm15
+ vpxor ymm5,ymm2,ymm0
+ vmovdqa YMMWORD[(64-128)+rax],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((352-256-128))+rbx]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((192-128))+rax]
+
+ vpslld ymm7,ymm3,5
+ vpaddd ymm2,ymm2,ymm15
+ vpxor ymm5,ymm1,ymm4
+ vmovdqa YMMWORD[(96-128)+rax],ymm12
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((384-256-128))+rbx]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((224-128))+rax]
+
+ vpslld ymm7,ymm2,5
+ vpaddd ymm1,ymm1,ymm15
+ vpxor ymm5,ymm0,ymm3
+ vmovdqa YMMWORD[(128-128)+rax],ymm13
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((416-256-128))+rbx]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((256-256-128))+rbx]
+
+ vpslld ymm7,ymm1,5
+ vpaddd ymm0,ymm0,ymm15
+ vpxor ymm5,ymm4,ymm2
+ vmovdqa YMMWORD[(160-128)+rax],ymm14
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm10,ymm10,YMMWORD[((448-256-128))+rbx]
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+ vpsrld ymm9,ymm10,31
+ vpaddd ymm10,ymm10,ymm10
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((288-256-128))+rbx]
+
+ vpslld ymm7,ymm0,5
+ vpaddd ymm4,ymm4,ymm15
+ vpxor ymm5,ymm3,ymm1
+ vmovdqa YMMWORD[(192-128)+rax],ymm10
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((480-256-128))+rbx]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((320-256-128))+rbx]
+
+ vpslld ymm7,ymm4,5
+ vpaddd ymm3,ymm3,ymm15
+ vpxor ymm5,ymm2,ymm0
+ vmovdqa YMMWORD[(224-128)+rax],ymm11
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((0-128))+rax]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((352-256-128))+rbx]
+
+ vpslld ymm7,ymm3,5
+ vpaddd ymm2,ymm2,ymm15
+ vpxor ymm5,ymm1,ymm4
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((32-128))+rax]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((384-256-128))+rbx]
+
+ vpslld ymm7,ymm2,5
+ vpaddd ymm1,ymm1,ymm15
+ vpxor ymm5,ymm0,ymm3
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((64-128))+rax]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpxor ymm10,ymm10,ymm12
+ vmovdqa ymm12,YMMWORD[((416-256-128))+rbx]
+
+ vpslld ymm7,ymm1,5
+ vpaddd ymm0,ymm0,ymm15
+ vpxor ymm5,ymm4,ymm2
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm10,ymm10,YMMWORD[((96-128))+rax]
+ vpsrld ymm8,ymm1,27
+ vpxor ymm5,ymm5,ymm3
+ vpxor ymm10,ymm10,ymm12
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+ vpsrld ymm9,ymm10,31
+ vpaddd ymm10,ymm10,ymm10
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm10,ymm10,ymm9
+ vpor ymm2,ymm2,ymm6
+ vpxor ymm11,ymm11,ymm13
+ vmovdqa ymm13,YMMWORD[((448-256-128))+rbx]
+
+ vpslld ymm7,ymm0,5
+ vpaddd ymm4,ymm4,ymm15
+ vpxor ymm5,ymm3,ymm1
+ vpaddd ymm4,ymm4,ymm10
+ vpxor ymm11,ymm11,YMMWORD[((128-128))+rax]
+ vpsrld ymm8,ymm0,27
+ vpxor ymm5,ymm5,ymm2
+ vpxor ymm11,ymm11,ymm13
+
+ vpslld ymm6,ymm1,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm4,ymm4,ymm5
+ vpsrld ymm9,ymm11,31
+ vpaddd ymm11,ymm11,ymm11
+
+ vpsrld ymm1,ymm1,2
+ vpaddd ymm4,ymm4,ymm7
+ vpor ymm11,ymm11,ymm9
+ vpor ymm1,ymm1,ymm6
+ vpxor ymm12,ymm12,ymm14
+ vmovdqa ymm14,YMMWORD[((480-256-128))+rbx]
+
+ vpslld ymm7,ymm4,5
+ vpaddd ymm3,ymm3,ymm15
+ vpxor ymm5,ymm2,ymm0
+ vpaddd ymm3,ymm3,ymm11
+ vpxor ymm12,ymm12,YMMWORD[((160-128))+rax]
+ vpsrld ymm8,ymm4,27
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm12,ymm12,ymm14
+
+ vpslld ymm6,ymm0,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm3,ymm3,ymm5
+ vpsrld ymm9,ymm12,31
+ vpaddd ymm12,ymm12,ymm12
+
+ vpsrld ymm0,ymm0,2
+ vpaddd ymm3,ymm3,ymm7
+ vpor ymm12,ymm12,ymm9
+ vpor ymm0,ymm0,ymm6
+ vpxor ymm13,ymm13,ymm10
+ vmovdqa ymm10,YMMWORD[((0-128))+rax]
+
+ vpslld ymm7,ymm3,5
+ vpaddd ymm2,ymm2,ymm15
+ vpxor ymm5,ymm1,ymm4
+ vpaddd ymm2,ymm2,ymm12
+ vpxor ymm13,ymm13,YMMWORD[((192-128))+rax]
+ vpsrld ymm8,ymm3,27
+ vpxor ymm5,ymm5,ymm0
+ vpxor ymm13,ymm13,ymm10
+
+ vpslld ymm6,ymm4,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm2,ymm2,ymm5
+ vpsrld ymm9,ymm13,31
+ vpaddd ymm13,ymm13,ymm13
+
+ vpsrld ymm4,ymm4,2
+ vpaddd ymm2,ymm2,ymm7
+ vpor ymm13,ymm13,ymm9
+ vpor ymm4,ymm4,ymm6
+ vpxor ymm14,ymm14,ymm11
+ vmovdqa ymm11,YMMWORD[((32-128))+rax]
+
+ vpslld ymm7,ymm2,5
+ vpaddd ymm1,ymm1,ymm15
+ vpxor ymm5,ymm0,ymm3
+ vpaddd ymm1,ymm1,ymm13
+ vpxor ymm14,ymm14,YMMWORD[((224-128))+rax]
+ vpsrld ymm8,ymm2,27
+ vpxor ymm5,ymm5,ymm4
+ vpxor ymm14,ymm14,ymm11
+
+ vpslld ymm6,ymm3,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm1,ymm1,ymm5
+ vpsrld ymm9,ymm14,31
+ vpaddd ymm14,ymm14,ymm14
+
+ vpsrld ymm3,ymm3,2
+ vpaddd ymm1,ymm1,ymm7
+ vpor ymm14,ymm14,ymm9
+ vpor ymm3,ymm3,ymm6
+ vpslld ymm7,ymm1,5
+ vpaddd ymm0,ymm0,ymm15
+ vpxor ymm5,ymm4,ymm2
+
+ vpsrld ymm8,ymm1,27
+ vpaddd ymm0,ymm0,ymm14
+ vpxor ymm5,ymm5,ymm3
+
+ vpslld ymm6,ymm2,30
+ vpor ymm7,ymm7,ymm8
+ vpaddd ymm0,ymm0,ymm5
+
+ vpsrld ymm2,ymm2,2
+ vpaddd ymm0,ymm0,ymm7
+ vpor ymm2,ymm2,ymm6
+ mov ecx,1
+ lea rbx,[512+rsp]
+ cmp ecx,DWORD[rbx]
+ cmovge r12,rbp
+ cmp ecx,DWORD[4+rbx]
+ cmovge r13,rbp
+ cmp ecx,DWORD[8+rbx]
+ cmovge r14,rbp
+ cmp ecx,DWORD[12+rbx]
+ cmovge r15,rbp
+ cmp ecx,DWORD[16+rbx]
+ cmovge r8,rbp
+ cmp ecx,DWORD[20+rbx]
+ cmovge r9,rbp
+ cmp ecx,DWORD[24+rbx]
+ cmovge r10,rbp
+ cmp ecx,DWORD[28+rbx]
+ cmovge r11,rbp
+ vmovdqu ymm5,YMMWORD[rbx]
+ vpxor ymm7,ymm7,ymm7
+ vmovdqa ymm6,ymm5
+ vpcmpgtd ymm6,ymm6,ymm7
+ vpaddd ymm5,ymm5,ymm6
+
+ vpand ymm0,ymm0,ymm6
+ vpand ymm1,ymm1,ymm6
+ vpaddd ymm0,ymm0,YMMWORD[rdi]
+ vpand ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,YMMWORD[32+rdi]
+ vpand ymm3,ymm3,ymm6
+ vpaddd ymm2,ymm2,YMMWORD[64+rdi]
+ vpand ymm4,ymm4,ymm6
+ vpaddd ymm3,ymm3,YMMWORD[96+rdi]
+ vpaddd ymm4,ymm4,YMMWORD[128+rdi]
+ vmovdqu YMMWORD[rdi],ymm0
+ vmovdqu YMMWORD[32+rdi],ymm1
+ vmovdqu YMMWORD[64+rdi],ymm2
+ vmovdqu YMMWORD[96+rdi],ymm3
+ vmovdqu YMMWORD[128+rdi],ymm4
+
+ vmovdqu YMMWORD[rbx],ymm5
+ lea rbx,[((256+128))+rsp]
+ vmovdqu ymm9,YMMWORD[96+rbp]
+ dec edx
+ jnz NEAR $L$oop_avx2
+
+
+
+
+
+
+
+$L$done_avx2:
+ mov rax,QWORD[544+rsp]
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((-216))+rax]
+ movaps xmm7,XMMWORD[((-200))+rax]
+ movaps xmm8,XMMWORD[((-184))+rax]
+ movaps xmm9,XMMWORD[((-168))+rax]
+ movaps xmm10,XMMWORD[((-152))+rax]
+ movaps xmm11,XMMWORD[((-136))+rax]
+ movaps xmm12,XMMWORD[((-120))+rax]
+ movaps xmm13,XMMWORD[((-104))+rax]
+ movaps xmm14,XMMWORD[((-88))+rax]
+ movaps xmm15,XMMWORD[((-72))+rax]
+ mov r15,QWORD[((-48))+rax]
+
+ mov r14,QWORD[((-40))+rax]
+
+ mov r13,QWORD[((-32))+rax]
+
+ mov r12,QWORD[((-24))+rax]
+
+ mov rbp,QWORD[((-16))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+
+ lea rsp,[rax]
+
+$L$epilogue_avx2:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_multi_block_avx2:
+
+ALIGN 256 ; constant table, 256-byte aligned
+ DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 ; SHA-1 round constant for rounds 0-19, one copy per 32-bit lane
+ DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 ; (each constant occupies 32 bytes; this first row lies at K_XX_XX-32)
+K_XX_XX: ; the label points at the second row, so rows are at K_XX_XX-32, +0, +32, +64, ...
+ DD 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 ; K_XX_XX+0: SHA-1 round constant for rounds 20-39
+ DD 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+ DD 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc ; K_XX_XX+32: SHA-1 round constant for rounds 40-59
+ DD 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+ DD 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 ; K_XX_XX+64: SHA-1 round constant for rounds 60-79
+ DD 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f ; K_XX_XX+96: bytes 3,2,1,0,7,6,5,4,... in memory order; presumably a byte-shuffle mask that byte-swaps each dword - confirm at the use sites
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+DB 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 ; K_XX_XX+128: bytes 15..0; presumably a full 16-byte reversal shuffle mask - confirm at the use sites
+DB 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107 ; ASCII "SHA1 multi-block"
+DB 32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120 ; ASCII " transform for x"
+DB 56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77 ; ASCII "86_64, CRYPTOGAM"
+DB 83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110 ; ASCII "S by <appro@open"
+DB 115,115,108,46,111,114,103,62,0 ; ASCII "ssl.org>" followed by a NUL terminator
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler: ; Win64 exception handler named by the .xdata records of sha1_multi_block, _shaext and _avx
+ push rsi ; preserve all non-volatile GPRs and the flags; popped in reverse order before returning
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ; 32 bytes of shadow space + four stack-argument slots for the RtlVirtualUnwind call below
+ ; Win64 handler arguments used here: r8 = CONTEXT record of the faulting code, r9 = DISPATCHER_CONTEXT.
+ mov rax,QWORD[120+r8] ; context->Rax
+ mov rbx,QWORD[248+r8] ; context->Rip
+
+ mov rsi,QWORD[8+r9] ; disp->ImageBase
+ mov r11,QWORD[56+r9] ; disp->HandlerData: the two RVAs stored after the handler RVA in .xdata
+
+ mov r10d,DWORD[r11] ; HandlerData[0] = RVA of the function's body label
+ lea r10,[r10*1+rsi] ; RVA -> absolute address
+ cmp rbx,r10
+ jb NEAR $L$in_prologue ; Rip below the body label: context->Rax is used as the frame base
+
+ mov rax,QWORD[152+r8] ; context->Rsp
+
+ mov r10d,DWORD[4+r11] ; HandlerData[1] = RVA of the function's epilogue label
+ lea r10,[r10*1+rsi] ; RVA -> absolute address
+ cmp rbx,r10
+ jae NEAR $L$in_prologue ; Rip at or past the epilogue label: context->Rsp is used as the frame base
+
+ mov rax,QWORD[272+rax] ; inside the body: load the frame base kept at [context->Rsp+272]
+
+ mov rbx,QWORD[((-8))+rax] ; saved rbx and rbp sit directly below the frame base
+ mov rbp,QWORD[((-16))+rax]
+ mov QWORD[144+r8],rbx ; context->Rbx
+ mov QWORD[160+r8],rbp ; context->Rbp
+
+ lea rsi,[((-24-160))+rax] ; source: the 160 bytes below the saved GPR area
+ lea rdi,[512+r8] ; destination: context->Xmm6
+ mov ecx,20 ; 20 qwords = ten 16-byte XMM registers (xmm6..xmm15)
+ DD 0xa548f3fc ; raw bytes FC F3 48 A5 = cld followed by rep movsq
+
+$L$in_prologue: ; shared tail (avx2_handler jumps here too); rax = frame base, i.e. rsp as it was at function entry
+ mov rdi,QWORD[8+rax] ; same slots the WIN64 epilogues reload rdi/rsi from ([8+rsp], [16+rsp])
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax ; context->Rsp
+ mov QWORD[168+r8],rsi ; context->Rsi
+ mov QWORD[176+r8],rdi ; context->Rdi
+
+ mov rdi,QWORD[40+r9] ; disp->ContextRecord
+ mov rsi,r8 ; the context patched above
+ mov ecx,154 ; 154 qwords = 1232 bytes, the size of CONTEXT
+ DD 0xa548f3fc ; cld + rep movsq: copy the patched context into disp->ContextRecord
+
+ mov rsi,r9 ; rsi = disp (r8/r9 are about to be reused as call arguments)
+ xor rcx,rcx ; arg1 HandlerType = 0 (UNW_FLAG_NHANDLER)
+ mov rdx,QWORD[8+rsi] ; arg2 = disp->ImageBase
+ mov r8,QWORD[rsi] ; arg3 = disp->ControlPc
+ mov r9,QWORD[16+rsi] ; arg4 = disp->FunctionEntry
+ mov r10,QWORD[40+rsi] ; disp->ContextRecord
+ lea r11,[56+rsi] ; &disp->HandlerData
+ lea r12,[24+rsi] ; &disp->EstablisherFrame
+ mov QWORD[32+rsp],r10 ; arg5 ContextRecord
+ mov QWORD[40+rsp],r11 ; arg6 HandlerData
+ mov QWORD[48+rsp],r12 ; arg7 EstablisherFrame
+ mov QWORD[56+rsp],rcx ; arg8 ContextPointers = NULL
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1 ; return value 1 = ExceptionContinueSearch
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ DB 0F3h,0C3h ;repret
+
+
+ALIGN 16
+avx2_handler: ; Win64 exception handler named by the .xdata record of sha1_multi_block_avx2
+ push rsi ; preserve all non-volatile GPRs and the flags; they are popped in the shared tail
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ; shadow space + stack arguments for the RtlVirtualUnwind call made in the shared tail
+ ; Win64 handler arguments used here: r8 = CONTEXT record of the faulting code, r9 = DISPATCHER_CONTEXT.
+ mov rax,QWORD[120+r8] ; context->Rax
+ mov rbx,QWORD[248+r8] ; context->Rip
+
+ mov rsi,QWORD[8+r9] ; disp->ImageBase
+ mov r11,QWORD[56+r9] ; disp->HandlerData: the two RVAs stored after the handler RVA in .xdata
+
+ mov r10d,DWORD[r11] ; HandlerData[0] = RVA of $L$body_avx2
+ lea r10,[r10*1+rsi] ; RVA -> absolute address
+ cmp rbx,r10
+ jb NEAR $L$in_prologue ; Rip below the body label: context->Rax is used as the frame base
+
+ mov rax,QWORD[152+r8] ; context->Rsp
+
+ mov r10d,DWORD[4+r11] ; HandlerData[1] = RVA of $L$epilogue_avx2
+ lea r10,[r10*1+rsi] ; RVA -> absolute address
+ cmp rbx,r10
+ jae NEAR $L$in_prologue ; Rip at or past the epilogue label: context->Rsp is used as the frame base
+
+ mov rax,QWORD[544+rax] ; frame base kept at [context->Rsp+544], the slot $L$done_avx2 reloads; was [544+r8], which indexes the CONTEXT record instead of the stack
+
+ mov rbx,QWORD[((-8))+rax] ; the six saved GPRs sit directly below the frame base
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx ; context->Rbx
+ mov QWORD[160+r8],rbp ; context->Rbp
+ mov QWORD[216+r8],r12 ; context->R12
+ mov QWORD[224+r8],r13 ; context->R13
+ mov QWORD[232+r8],r14 ; context->R14
+ mov QWORD[240+r8],r15 ; context->R15
+
+ lea rsi,[((-56-160))+rax] ; source: xmm6 save slot at frame base - 216, as in the $L$done_avx2 epilogue
+ lea rdi,[512+r8] ; destination: context->Xmm6
+ mov ecx,20 ; 20 qwords = ten 16-byte XMM registers (xmm6..xmm15)
+ DD 0xa548f3fc ; raw bytes FC F3 48 A5 = cld followed by rep movsq
+
+ jmp NEAR $L$in_prologue ; finish in the tail shared with se_handler (rax = frame base)
+
+section .pdata rdata align=4 ; function table: one (begin RVA, end RVA, unwind-info RVA) triple per function
+ALIGN 4
+ DD $L$SEH_begin_sha1_multi_block wrt ..imagebase ; entry for sha1_multi_block
+ DD $L$SEH_end_sha1_multi_block wrt ..imagebase
+ DD $L$SEH_info_sha1_multi_block wrt ..imagebase
+ DD $L$SEH_begin_sha1_multi_block_shaext wrt ..imagebase ; entry for sha1_multi_block_shaext
+ DD $L$SEH_end_sha1_multi_block_shaext wrt ..imagebase
+ DD $L$SEH_info_sha1_multi_block_shaext wrt ..imagebase
+ DD $L$SEH_begin_sha1_multi_block_avx wrt ..imagebase ; entry for sha1_multi_block_avx
+ DD $L$SEH_end_sha1_multi_block_avx wrt ..imagebase
+ DD $L$SEH_info_sha1_multi_block_avx wrt ..imagebase
+ DD $L$SEH_begin_sha1_multi_block_avx2 wrt ..imagebase ; entry for sha1_multi_block_avx2
+ DD $L$SEH_end_sha1_multi_block_avx2 wrt ..imagebase
+ DD $L$SEH_info_sha1_multi_block_avx2 wrt ..imagebase
+section .xdata rdata align=8 ; unwind-info records referenced by the third field of each .pdata triple
+ALIGN 8
+$L$SEH_info_sha1_multi_block:
+DB 9,0,0,0 ; UNWIND_INFO header: byte 0 = version 1 with UNW_FLAG_EHANDLER; prologue size, code count and frame register are all 0
+ DD se_handler wrt ..imagebase ; RVA of the exception handler
+ DD $L$body wrt ..imagebase,$L$epilogue wrt ..imagebase ; HandlerData[0..1]: the two labels the handler compares Rip against
+$L$SEH_info_sha1_multi_block_shaext:
+DB 9,0,0,0 ; same header as above
+ DD se_handler wrt ..imagebase
+ DD $L$body_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase ; HandlerData[0..1]
+$L$SEH_info_sha1_multi_block_avx:
+DB 9,0,0,0 ; same header as above
+ DD se_handler wrt ..imagebase
+ DD $L$body_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase ; HandlerData[0..1]
+$L$SEH_info_sha1_multi_block_avx2:
+DB 9,0,0,0 ; same header as above
+ DD avx2_handler wrt ..imagebase ; the AVX2 variant has its own handler
+ DD $L$body_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase ; HandlerData[0..1]
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm
new file mode 100644
index 0000000..9d1f10e
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm
@@ -0,0 +1,5766 @@
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section .text code align=64
+
+EXTERN OPENSSL_ia32cap_P
+
+global sha1_block_data_order
+
+ALIGN 16
+sha1_block_data_order: ; Win64 entry: rcx = five-dword hash state, rdx = input data, r8 = number of 64-byte blocks
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp ; rax = rsp at entry; _shaext_shortcut addresses its XMM save area relative to it
+$L$SEH_begin_sha1_block_data_order:
+ mov rdi,rcx ; move the Win64 arguments into rdi/rsi/rdx, the registers the bodies use
+ mov rsi,rdx
+ mov rdx,r8
+
+ ; Select an implementation from the three OPENSSL_ia32cap_P capability words loaded below.
+
+ mov r9d,DWORD[((OPENSSL_ia32cap_P+0))] ; capability word 0
+ mov r8d,DWORD[((OPENSSL_ia32cap_P+4))] ; capability word 1
+ mov r10d,DWORD[((OPENSSL_ia32cap_P+8))] ; capability word 2
+ test r8d,512 ; word 1 bit 9 (SSSE3 per the OPENSSL_ia32cap documentation)
+ jz NEAR $L$ialu ; clear: use the integer-only code that follows
+ test r10d,536870912 ; word 2 bit 29 (SHA extensions)
+ jnz NEAR _shaext_shortcut
+ and r10d,296 ; word 2 bits 3, 5 and 8 (BMI1, AVX2, BMI2); all three must be set
+ cmp r10d,296
+ je NEAR _avx2_shortcut
+ and r8d,268435456 ; word 1 bit 28 (AVX)
+ and r9d,1073741824 ; word 0 bit 30 (documented by OpenSSL as its Intel-CPU marker)
+ or r8d,r9d
+ cmp r8d,1342177280 ; 0x50000000: both of the bits above set
+ je NEAR _avx_shortcut
+ jmp NEAR _ssse3_shortcut ; otherwise the SSSE3 code
+
+ALIGN 16
+$L$ialu: ; integer-only implementation: set up the frame and load the state
+ mov rax,rsp ; remember rsp; it is stored at [64+rsp] below and reloaded after the block loop
+
+ push rbx ; save the five non-volatile registers the round code overwrites
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ mov r8,rdi ; r8 = state pointer
+ sub rsp,72 ; 64-byte message-word buffer + 8 bytes for the saved rsp
+ mov r9,rsi ; r9 = input pointer
+ and rsp,-64 ; align the buffer down to 64 bytes
+ mov r10,rdx ; r10 = number of blocks left
+ mov QWORD[64+rsp],rax ; saved rsp sits directly above the 64-byte buffer
+
+$L$prologue:
+
+ mov esi,DWORD[r8] ; working variables a,b,c,d,e = state[0..4] in esi,edi,r11d,r12d,r13d
+ mov edi,DWORD[4+r8]
+ mov r11d,DWORD[8+r8]
+ mov r12d,DWORD[12+r8]
+ mov r13d,DWORD[16+r8]
+ jmp NEAR $L$loop
+
+ALIGN 16
+$L$loop: ; one pass = one 64-byte block; rounds 0-14 follow, the register holding each of a..e rotates every round
+ mov edx,DWORD[r9] ; round 0: load message word W[0]
+ bswap edx ; byte-swap the loaded word (SHA-1 message words are big-endian)
+ mov ebp,DWORD[4+r9] ; preload W[1] for the next round
+ mov eax,r12d ; eax = d
+ mov DWORD[rsp],edx ; keep W[0] in the 16-word buffer at [rsp]
+ mov ecx,esi ; ecx = a
+ bswap ebp
+ xor eax,r11d ; c ^ d
+ rol ecx,5 ; rol(a,5)
+ and eax,edi ; (c ^ d) & b
+ lea r13d,[1518500249+r13*1+rdx] ; e += 0x5a827999 + W[0]
+ add r13d,ecx ; e += rol(a,5)
+ xor eax,r12d ; f = ((c ^ d) & b) ^ d
+ rol edi,30 ; b = rol(b,30)
+ add r13d,eax ; e += f
+ mov r14d,DWORD[8+r9] ; round 1 (same pattern; loads W[2] ahead, adds W[1])
+ mov eax,r11d
+ mov DWORD[4+rsp],ebp
+ mov ecx,r13d
+ bswap r14d
+ xor eax,edi
+ rol ecx,5
+ and eax,esi
+ lea r12d,[1518500249+r12*1+rbp]
+ add r12d,ecx
+ xor eax,r11d
+ rol esi,30
+ add r12d,eax
+ mov edx,DWORD[12+r9] ; round 2
+ mov eax,edi
+ mov DWORD[8+rsp],r14d
+ mov ecx,r12d
+ bswap edx
+ xor eax,esi
+ rol ecx,5
+ and eax,r13d
+ lea r11d,[1518500249+r11*1+r14]
+ add r11d,ecx
+ xor eax,edi
+ rol r13d,30
+ add r11d,eax
+ mov ebp,DWORD[16+r9] ; round 3
+ mov eax,esi
+ mov DWORD[12+rsp],edx
+ mov ecx,r11d
+ bswap ebp
+ xor eax,r13d
+ rol ecx,5
+ and eax,r12d
+ lea edi,[1518500249+rdi*1+rdx]
+ add edi,ecx
+ xor eax,esi
+ rol r12d,30
+ add edi,eax
+ mov r14d,DWORD[20+r9] ; round 4
+ mov eax,r13d
+ mov DWORD[16+rsp],ebp
+ mov ecx,edi
+ bswap r14d
+ xor eax,r12d
+ rol ecx,5
+ and eax,r11d
+ lea esi,[1518500249+rsi*1+rbp]
+ add esi,ecx
+ xor eax,r13d
+ rol r11d,30
+ add esi,eax
+ mov edx,DWORD[24+r9] ; round 5
+ mov eax,r12d
+ mov DWORD[20+rsp],r14d
+ mov ecx,esi
+ bswap edx
+ xor eax,r11d
+ rol ecx,5
+ and eax,edi
+ lea r13d,[1518500249+r13*1+r14]
+ add r13d,ecx
+ xor eax,r12d
+ rol edi,30
+ add r13d,eax
+ mov ebp,DWORD[28+r9] ; round 6
+ mov eax,r11d
+ mov DWORD[24+rsp],edx
+ mov ecx,r13d
+ bswap ebp
+ xor eax,edi
+ rol ecx,5
+ and eax,esi
+ lea r12d,[1518500249+r12*1+rdx]
+ add r12d,ecx
+ xor eax,r11d
+ rol esi,30
+ add r12d,eax
+ mov r14d,DWORD[32+r9] ; round 7
+ mov eax,edi
+ mov DWORD[28+rsp],ebp
+ mov ecx,r12d
+ bswap r14d
+ xor eax,esi
+ rol ecx,5
+ and eax,r13d
+ lea r11d,[1518500249+r11*1+rbp]
+ add r11d,ecx
+ xor eax,edi
+ rol r13d,30
+ add r11d,eax
+ mov edx,DWORD[36+r9] ; round 8
+ mov eax,esi
+ mov DWORD[32+rsp],r14d
+ mov ecx,r11d
+ bswap edx
+ xor eax,r13d
+ rol ecx,5
+ and eax,r12d
+ lea edi,[1518500249+rdi*1+r14]
+ add edi,ecx
+ xor eax,esi
+ rol r12d,30
+ add edi,eax
+ mov ebp,DWORD[40+r9] ; round 9
+ mov eax,r13d
+ mov DWORD[36+rsp],edx
+ mov ecx,edi
+ bswap ebp
+ xor eax,r12d
+ rol ecx,5
+ and eax,r11d
+ lea esi,[1518500249+rsi*1+rdx]
+ add esi,ecx
+ xor eax,r13d
+ rol r11d,30
+ add esi,eax
+ mov r14d,DWORD[44+r9] ; round 10
+ mov eax,r12d
+ mov DWORD[40+rsp],ebp
+ mov ecx,esi
+ bswap r14d
+ xor eax,r11d
+ rol ecx,5
+ and eax,edi
+ lea r13d,[1518500249+r13*1+rbp]
+ add r13d,ecx
+ xor eax,r12d
+ rol edi,30
+ add r13d,eax
+ mov edx,DWORD[48+r9] ; round 11
+ mov eax,r11d
+ mov DWORD[44+rsp],r14d
+ mov ecx,r13d
+ bswap edx
+ xor eax,edi
+ rol ecx,5
+ and eax,esi
+ lea r12d,[1518500249+r12*1+r14]
+ add r12d,ecx
+ xor eax,r11d
+ rol esi,30
+ add r12d,eax
+ mov ebp,DWORD[52+r9] ; round 12
+ mov eax,edi
+ mov DWORD[48+rsp],edx
+ mov ecx,r12d
+ bswap ebp
+ xor eax,esi
+ rol ecx,5
+ and eax,r13d
+ lea r11d,[1518500249+r11*1+rdx]
+ add r11d,ecx
+ xor eax,edi
+ rol r13d,30
+ add r11d,eax
+ mov r14d,DWORD[56+r9] ; round 13
+ mov eax,esi
+ mov DWORD[52+rsp],ebp
+ mov ecx,r11d
+ bswap r14d
+ xor eax,r13d
+ rol ecx,5
+ and eax,r12d
+ lea edi,[1518500249+rdi*1+rbp]
+ add edi,ecx
+ xor eax,esi
+ rol r12d,30
+ add edi,eax
+ mov edx,DWORD[60+r9] ; round 14 (loads the last input word, W[15])
+ mov eax,r13d
+ mov DWORD[56+rsp],r14d
+ mov ecx,edi
+ bswap edx
+ xor eax,r12d
+ rol ecx,5
+ and eax,r11d
+ lea esi,[1518500249+rsi*1+r14]
+ add esi,ecx
+ xor eax,r13d
+ rol r11d,30
+ add esi,eax
+ xor ebp,DWORD[rsp] ; round 15: start of the message schedule; ebp (W[13]) ^= W[0]
+ mov eax,r12d ; eax = d
+ mov DWORD[60+rsp],edx ; store W[15] in the last buffer slot
+ mov ecx,esi ; ecx = a
+ xor ebp,DWORD[8+rsp] ; ^= W[2]
+ xor eax,r11d ; c ^ d
+ rol ecx,5 ; rol(a,5)
+ xor ebp,DWORD[32+rsp] ; ^= W[8]
+ and eax,edi ; (c ^ d) & b
+ lea r13d,[1518500249+r13*1+rdx] ; e += 0x5a827999 + W[15]
+ rol edi,30 ; b = rol(b,30)
+ xor eax,r12d ; f = ((c ^ d) & b) ^ d
+ add r13d,ecx ; e += rol(a,5)
+ rol ebp,1 ; ebp = W[16] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1)
+ add r13d,eax ; e += f
+ xor r14d,DWORD[4+rsp] ; round 16 (same pattern; the new word overwrites the oldest buffer slot)
+ mov eax,r11d
+ mov DWORD[rsp],ebp
+ mov ecx,r13d
+ xor r14d,DWORD[12+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor r14d,DWORD[36+rsp]
+ and eax,esi
+ lea r12d,[1518500249+r12*1+rbp]
+ rol esi,30
+ xor eax,r11d
+ add r12d,ecx
+ rol r14d,1
+ add r12d,eax
+ xor edx,DWORD[8+rsp] ; round 17
+ mov eax,edi
+ mov DWORD[4+rsp],r14d
+ mov ecx,r12d
+ xor edx,DWORD[16+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor edx,DWORD[40+rsp]
+ and eax,r13d
+ lea r11d,[1518500249+r11*1+r14]
+ rol r13d,30
+ xor eax,edi
+ add r11d,ecx
+ rol edx,1
+ add r11d,eax
+ xor ebp,DWORD[12+rsp] ; round 18
+ mov eax,esi
+ mov DWORD[8+rsp],edx
+ mov ecx,r11d
+ xor ebp,DWORD[20+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor ebp,DWORD[44+rsp]
+ and eax,r12d
+ lea edi,[1518500249+rdi*1+rdx]
+ rol r12d,30
+ xor eax,esi
+ add edi,ecx
+ rol ebp,1
+ add edi,eax
+ xor r14d,DWORD[16+rsp] ; round 19 (last round using constant 0x5a827999)
+ mov eax,r13d
+ mov DWORD[12+rsp],ebp
+ mov ecx,edi
+ xor r14d,DWORD[24+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor r14d,DWORD[48+rsp]
+ and eax,r11d
+ lea esi,[1518500249+rsi*1+rbp]
+ rol r11d,30
+ xor eax,r13d
+ add esi,ecx
+ rol r14d,1
+ add esi,eax
+ xor edx,DWORD[20+rsp] ; round 20: first of twenty rounds with constant 0x6ed9eba1 and f = b ^ c ^ d
+ mov eax,edi ; eax = b
+ mov DWORD[16+rsp],r14d ; store the schedule word computed in the previous round
+ mov ecx,esi ; ecx = a
+ xor edx,DWORD[28+rsp]
+ xor eax,r12d ; b ^ d
+ rol ecx,5 ; rol(a,5)
+ xor edx,DWORD[52+rsp]
+ lea r13d,[1859775393+r13*1+r14] ; e += 0x6ed9eba1 + W
+ xor eax,r11d ; f = b ^ d ^ c
+ add r13d,ecx ; e += rol(a,5)
+ rol edi,30 ; b = rol(b,30)
+ add r13d,eax ; e += f
+ rol edx,1 ; finish the next schedule word
+ xor ebp,DWORD[24+rsp] ; round 21
+ mov eax,esi
+ mov DWORD[20+rsp],edx
+ mov ecx,r13d
+ xor ebp,DWORD[32+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor ebp,DWORD[56+rsp]
+ lea r12d,[1859775393+r12*1+rdx]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol ebp,1
+ xor r14d,DWORD[28+rsp] ; round 22
+ mov eax,r13d
+ mov DWORD[24+rsp],ebp
+ mov ecx,r12d
+ xor r14d,DWORD[36+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor r14d,DWORD[60+rsp]
+ lea r11d,[1859775393+r11*1+rbp]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol r14d,1
+ xor edx,DWORD[32+rsp] ; round 23
+ mov eax,r12d
+ mov DWORD[28+rsp],r14d
+ mov ecx,r11d
+ xor edx,DWORD[40+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor edx,DWORD[rsp]
+ lea edi,[1859775393+rdi*1+r14]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol edx,1
+ xor ebp,DWORD[36+rsp] ; round 24
+ mov eax,r11d
+ mov DWORD[32+rsp],edx
+ mov ecx,edi
+ xor ebp,DWORD[44+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor ebp,DWORD[4+rsp]
+ lea esi,[1859775393+rsi*1+rdx]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol ebp,1
+ xor r14d,DWORD[40+rsp] ; round 25
+ mov eax,edi
+ mov DWORD[36+rsp],ebp
+ mov ecx,esi
+ xor r14d,DWORD[48+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor r14d,DWORD[8+rsp]
+ lea r13d,[1859775393+r13*1+rbp]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol r14d,1
+ xor edx,DWORD[44+rsp] ; round 26
+ mov eax,esi
+ mov DWORD[40+rsp],r14d
+ mov ecx,r13d
+ xor edx,DWORD[52+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor edx,DWORD[12+rsp]
+ lea r12d,[1859775393+r12*1+r14]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol edx,1
+ xor ebp,DWORD[48+rsp] ; round 27
+ mov eax,r13d
+ mov DWORD[44+rsp],edx
+ mov ecx,r12d
+ xor ebp,DWORD[56+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor ebp,DWORD[16+rsp]
+ lea r11d,[1859775393+r11*1+rdx]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol ebp,1
+ xor r14d,DWORD[52+rsp] ; round 28
+ mov eax,r12d
+ mov DWORD[48+rsp],ebp
+ mov ecx,r11d
+ xor r14d,DWORD[60+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor r14d,DWORD[20+rsp]
+ lea edi,[1859775393+rdi*1+rbp]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol r14d,1
+ xor edx,DWORD[56+rsp] ; round 29
+ mov eax,r11d
+ mov DWORD[52+rsp],r14d
+ mov ecx,edi
+ xor edx,DWORD[rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor edx,DWORD[24+rsp]
+ lea esi,[1859775393+rsi*1+r14]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol edx,1
+ xor ebp,DWORD[60+rsp] ; round 30
+ mov eax,edi
+ mov DWORD[56+rsp],edx
+ mov ecx,esi
+ xor ebp,DWORD[4+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor ebp,DWORD[28+rsp]
+ lea r13d,[1859775393+r13*1+rdx]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol ebp,1
+ xor r14d,DWORD[rsp] ; round 31
+ mov eax,esi
+ mov DWORD[60+rsp],ebp
+ mov ecx,r13d
+ xor r14d,DWORD[8+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor r14d,DWORD[32+rsp]
+ lea r12d,[1859775393+r12*1+rbp]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol r14d,1
+ xor edx,DWORD[4+rsp] ; round 32
+ mov eax,r13d
+ mov DWORD[rsp],r14d
+ mov ecx,r12d
+ xor edx,DWORD[12+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor edx,DWORD[36+rsp]
+ lea r11d,[1859775393+r11*1+r14]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol edx,1
+ xor ebp,DWORD[8+rsp] ; round 33
+ mov eax,r12d
+ mov DWORD[4+rsp],edx
+ mov ecx,r11d
+ xor ebp,DWORD[16+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor ebp,DWORD[40+rsp]
+ lea edi,[1859775393+rdi*1+rdx]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol ebp,1
+ xor r14d,DWORD[12+rsp] ; round 34
+ mov eax,r11d
+ mov DWORD[8+rsp],ebp
+ mov ecx,edi
+ xor r14d,DWORD[20+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor r14d,DWORD[44+rsp]
+ lea esi,[1859775393+rsi*1+rbp]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol r14d,1
+ xor edx,DWORD[16+rsp] ; round 35
+ mov eax,edi
+ mov DWORD[12+rsp],r14d
+ mov ecx,esi
+ xor edx,DWORD[24+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor edx,DWORD[48+rsp]
+ lea r13d,[1859775393+r13*1+r14]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol edx,1
+ xor ebp,DWORD[20+rsp] ; round 36
+ mov eax,esi
+ mov DWORD[16+rsp],edx
+ mov ecx,r13d
+ xor ebp,DWORD[28+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor ebp,DWORD[52+rsp]
+ lea r12d,[1859775393+r12*1+rdx]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol ebp,1
+ xor r14d,DWORD[24+rsp] ; round 37
+ mov eax,r13d
+ mov DWORD[20+rsp],ebp
+ mov ecx,r12d
+ xor r14d,DWORD[32+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor r14d,DWORD[56+rsp]
+ lea r11d,[1859775393+r11*1+rbp]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol r14d,1
+ xor edx,DWORD[28+rsp] ; round 38
+ mov eax,r12d
+ mov DWORD[24+rsp],r14d
+ mov ecx,r11d
+ xor edx,DWORD[36+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor edx,DWORD[60+rsp]
+ lea edi,[1859775393+rdi*1+r14]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol edx,1
+ xor ebp,DWORD[32+rsp] ; round 39 (last round using constant 0x6ed9eba1)
+ mov eax,r11d
+ mov DWORD[28+rsp],edx
+ mov ecx,edi
+ xor ebp,DWORD[40+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor ebp,DWORD[rsp]
+ lea esi,[1859775393+rsi*1+rdx]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol ebp,1
+ xor r14d,DWORD[36+rsp] ; round 40: first of twenty rounds with constant 0x8f1bbcdc and f = (c & d) + ((c ^ d) & b)
+ mov eax,r12d ; eax = d
+ mov DWORD[32+rsp],ebp ; store the schedule word computed in the previous round
+ mov ebx,r12d ; ebx = d
+ xor r14d,DWORD[44+rsp]
+ and eax,r11d ; eax = c & d
+ mov ecx,esi ; ecx = a
+ xor r14d,DWORD[4+rsp]
+ lea r13d,[((-1894007588))+r13*1+rbp] ; e += 0x8f1bbcdc + W (constant written as a signed 32-bit value)
+ xor ebx,r11d ; ebx = c ^ d
+ rol ecx,5 ; rol(a,5)
+ add r13d,eax ; e += c & d
+ rol r14d,1 ; finish the next schedule word
+ and ebx,edi ; ebx = (c ^ d) & b
+ add r13d,ecx ; e += rol(a,5)
+ rol edi,30 ; b = rol(b,30)
+ add r13d,ebx ; e += (c ^ d) & b
+ xor edx,DWORD[40+rsp] ; round 41
+ mov eax,r11d
+ mov DWORD[36+rsp],r14d
+ mov ebx,r11d
+ xor edx,DWORD[48+rsp]
+ and eax,edi
+ mov ecx,r13d
+ xor edx,DWORD[8+rsp]
+ lea r12d,[((-1894007588))+r12*1+r14]
+ xor ebx,edi
+ rol ecx,5
+ add r12d,eax
+ rol edx,1
+ and ebx,esi
+ add r12d,ecx
+ rol esi,30
+ add r12d,ebx
+ xor ebp,DWORD[44+rsp] ; round 42
+ mov eax,edi
+ mov DWORD[40+rsp],edx
+ mov ebx,edi
+ xor ebp,DWORD[52+rsp]
+ and eax,esi
+ mov ecx,r12d
+ xor ebp,DWORD[12+rsp]
+ lea r11d,[((-1894007588))+r11*1+rdx]
+ xor ebx,esi
+ rol ecx,5
+ add r11d,eax
+ rol ebp,1
+ and ebx,r13d
+ add r11d,ecx
+ rol r13d,30
+ add r11d,ebx
+ xor r14d,DWORD[48+rsp] ; round 43
+ mov eax,esi
+ mov DWORD[44+rsp],ebp
+ mov ebx,esi
+ xor r14d,DWORD[56+rsp]
+ and eax,r13d
+ mov ecx,r11d
+ xor r14d,DWORD[16+rsp]
+ lea edi,[((-1894007588))+rdi*1+rbp]
+ xor ebx,r13d
+ rol ecx,5
+ add edi,eax
+ rol r14d,1
+ and ebx,r12d
+ add edi,ecx
+ rol r12d,30
+ add edi,ebx
+ xor edx,DWORD[52+rsp] ; round 44
+ mov eax,r13d
+ mov DWORD[48+rsp],r14d
+ mov ebx,r13d
+ xor edx,DWORD[60+rsp]
+ and eax,r12d
+ mov ecx,edi
+ xor edx,DWORD[20+rsp]
+ lea esi,[((-1894007588))+rsi*1+r14]
+ xor ebx,r12d
+ rol ecx,5
+ add esi,eax
+ rol edx,1
+ and ebx,r11d
+ add esi,ecx
+ rol r11d,30
+ add esi,ebx
+ xor ebp,DWORD[56+rsp] ; round 45
+ mov eax,r12d
+ mov DWORD[52+rsp],edx
+ mov ebx,r12d
+ xor ebp,DWORD[rsp]
+ and eax,r11d
+ mov ecx,esi
+ xor ebp,DWORD[24+rsp]
+ lea r13d,[((-1894007588))+r13*1+rdx]
+ xor ebx,r11d
+ rol ecx,5
+ add r13d,eax
+ rol ebp,1
+ and ebx,edi
+ add r13d,ecx
+ rol edi,30
+ add r13d,ebx
+ xor r14d,DWORD[60+rsp] ; round 46
+ mov eax,r11d
+ mov DWORD[56+rsp],ebp
+ mov ebx,r11d
+ xor r14d,DWORD[4+rsp]
+ and eax,edi
+ mov ecx,r13d
+ xor r14d,DWORD[28+rsp]
+ lea r12d,[((-1894007588))+r12*1+rbp]
+ xor ebx,edi
+ rol ecx,5
+ add r12d,eax
+ rol r14d,1
+ and ebx,esi
+ add r12d,ecx
+ rol esi,30
+ add r12d,ebx
+ xor edx,DWORD[rsp] ; round 47
+ mov eax,edi
+ mov DWORD[60+rsp],r14d
+ mov ebx,edi
+ xor edx,DWORD[8+rsp]
+ and eax,esi
+ mov ecx,r12d
+ xor edx,DWORD[32+rsp]
+ lea r11d,[((-1894007588))+r11*1+r14]
+ xor ebx,esi
+ rol ecx,5
+ add r11d,eax
+ rol edx,1
+ and ebx,r13d
+ add r11d,ecx
+ rol r13d,30
+ add r11d,ebx
+ xor ebp,DWORD[4+rsp] ; round 48
+ mov eax,esi
+ mov DWORD[rsp],edx
+ mov ebx,esi
+ xor ebp,DWORD[12+rsp]
+ and eax,r13d
+ mov ecx,r11d
+ xor ebp,DWORD[36+rsp]
+ lea edi,[((-1894007588))+rdi*1+rdx]
+ xor ebx,r13d
+ rol ecx,5
+ add edi,eax
+ rol ebp,1
+ and ebx,r12d
+ add edi,ecx
+ rol r12d,30
+ add edi,ebx
+ xor r14d,DWORD[8+rsp] ; round 49
+ mov eax,r13d
+ mov DWORD[4+rsp],ebp
+ mov ebx,r13d
+ xor r14d,DWORD[16+rsp]
+ and eax,r12d
+ mov ecx,edi
+ xor r14d,DWORD[40+rsp]
+ lea esi,[((-1894007588))+rsi*1+rbp]
+ xor ebx,r12d
+ rol ecx,5
+ add esi,eax
+ rol r14d,1
+ and ebx,r11d
+ add esi,ecx
+ rol r11d,30
+ add esi,ebx
+ xor edx,DWORD[12+rsp] ; round 50
+ mov eax,r12d
+ mov DWORD[8+rsp],r14d
+ mov ebx,r12d
+ xor edx,DWORD[20+rsp]
+ and eax,r11d
+ mov ecx,esi
+ xor edx,DWORD[44+rsp]
+ lea r13d,[((-1894007588))+r13*1+r14]
+ xor ebx,r11d
+ rol ecx,5
+ add r13d,eax
+ rol edx,1
+ and ebx,edi
+ add r13d,ecx
+ rol edi,30
+ add r13d,ebx
+ xor ebp,DWORD[16+rsp] ; round 51
+ mov eax,r11d
+ mov DWORD[12+rsp],edx
+ mov ebx,r11d
+ xor ebp,DWORD[24+rsp]
+ and eax,edi
+ mov ecx,r13d
+ xor ebp,DWORD[48+rsp]
+ lea r12d,[((-1894007588))+r12*1+rdx]
+ xor ebx,edi
+ rol ecx,5
+ add r12d,eax
+ rol ebp,1
+ and ebx,esi
+ add r12d,ecx
+ rol esi,30
+ add r12d,ebx
+ xor r14d,DWORD[20+rsp] ; round 52
+ mov eax,edi
+ mov DWORD[16+rsp],ebp
+ mov ebx,edi
+ xor r14d,DWORD[28+rsp]
+ and eax,esi
+ mov ecx,r12d
+ xor r14d,DWORD[52+rsp]
+ lea r11d,[((-1894007588))+r11*1+rbp]
+ xor ebx,esi
+ rol ecx,5
+ add r11d,eax
+ rol r14d,1
+ and ebx,r13d
+ add r11d,ecx
+ rol r13d,30
+ add r11d,ebx
+ xor edx,DWORD[24+rsp] ; round 53
+ mov eax,esi
+ mov DWORD[20+rsp],r14d
+ mov ebx,esi
+ xor edx,DWORD[32+rsp]
+ and eax,r13d
+ mov ecx,r11d
+ xor edx,DWORD[56+rsp]
+ lea edi,[((-1894007588))+rdi*1+r14]
+ xor ebx,r13d
+ rol ecx,5
+ add edi,eax
+ rol edx,1
+ and ebx,r12d
+ add edi,ecx
+ rol r12d,30
+ add edi,ebx
+ xor ebp,DWORD[28+rsp] ; round 54
+ mov eax,r13d
+ mov DWORD[24+rsp],edx
+ mov ebx,r13d
+ xor ebp,DWORD[36+rsp]
+ and eax,r12d
+ mov ecx,edi
+ xor ebp,DWORD[60+rsp]
+ lea esi,[((-1894007588))+rsi*1+rdx]
+ xor ebx,r12d
+ rol ecx,5
+ add esi,eax
+ rol ebp,1
+ and ebx,r11d
+ add esi,ecx
+ rol r11d,30
+ add esi,ebx
+ xor r14d,DWORD[32+rsp] ; round 55
+ mov eax,r12d
+ mov DWORD[28+rsp],ebp
+ mov ebx,r12d
+ xor r14d,DWORD[40+rsp]
+ and eax,r11d
+ mov ecx,esi
+ xor r14d,DWORD[rsp]
+ lea r13d,[((-1894007588))+r13*1+rbp]
+ xor ebx,r11d
+ rol ecx,5
+ add r13d,eax
+ rol r14d,1
+ and ebx,edi
+ add r13d,ecx
+ rol edi,30
+ add r13d,ebx
+ xor edx,DWORD[36+rsp] ; round 56
+ mov eax,r11d
+ mov DWORD[32+rsp],r14d
+ mov ebx,r11d
+ xor edx,DWORD[44+rsp]
+ and eax,edi
+ mov ecx,r13d
+ xor edx,DWORD[4+rsp]
+ lea r12d,[((-1894007588))+r12*1+r14]
+ xor ebx,edi
+ rol ecx,5
+ add r12d,eax
+ rol edx,1
+ and ebx,esi
+ add r12d,ecx
+ rol esi,30
+ add r12d,ebx
+ xor ebp,DWORD[40+rsp] ; round 57
+ mov eax,edi
+ mov DWORD[36+rsp],edx
+ mov ebx,edi
+ xor ebp,DWORD[48+rsp]
+ and eax,esi
+ mov ecx,r12d
+ xor ebp,DWORD[8+rsp]
+ lea r11d,[((-1894007588))+r11*1+rdx]
+ xor ebx,esi
+ rol ecx,5
+ add r11d,eax
+ rol ebp,1
+ and ebx,r13d
+ add r11d,ecx
+ rol r13d,30
+ add r11d,ebx
+ xor r14d,DWORD[44+rsp] ; round 58
+ mov eax,esi
+ mov DWORD[40+rsp],ebp
+ mov ebx,esi
+ xor r14d,DWORD[52+rsp]
+ and eax,r13d
+ mov ecx,r11d
+ xor r14d,DWORD[12+rsp]
+ lea edi,[((-1894007588))+rdi*1+rbp]
+ xor ebx,r13d
+ rol ecx,5
+ add edi,eax
+ rol r14d,1
+ and ebx,r12d
+ add edi,ecx
+ rol r12d,30
+ add edi,ebx
+ xor edx,DWORD[48+rsp] ; round 59 (last round using constant 0x8f1bbcdc)
+ mov eax,r13d
+ mov DWORD[44+rsp],r14d
+ mov ebx,r13d
+ xor edx,DWORD[56+rsp]
+ and eax,r12d
+ mov ecx,edi
+ xor edx,DWORD[16+rsp]
+ lea esi,[((-1894007588))+rsi*1+r14]
+ xor ebx,r12d
+ rol ecx,5
+ add esi,eax
+ rol edx,1
+ and ebx,r11d
+ add esi,ecx
+ rol r11d,30
+ add esi,ebx
+ xor ebp,DWORD[52+rsp] ; round 60: first of twenty rounds with constant 0xca62c1d6 and f = b ^ c ^ d
+ mov eax,edi ; eax = b
+ mov DWORD[48+rsp],edx ; store the schedule word computed in the previous round
+ mov ecx,esi ; ecx = a
+ xor ebp,DWORD[60+rsp]
+ xor eax,r12d ; b ^ d
+ rol ecx,5 ; rol(a,5)
+ xor ebp,DWORD[20+rsp]
+ lea r13d,[((-899497514))+r13*1+rdx] ; e += 0xca62c1d6 + W (constant written as a signed 32-bit value)
+ xor eax,r11d ; f = b ^ d ^ c
+ add r13d,ecx ; e += rol(a,5)
+ rol edi,30 ; b = rol(b,30)
+ add r13d,eax ; e += f
+ rol ebp,1 ; finish the next schedule word
+ xor r14d,DWORD[56+rsp] ; round 61
+ mov eax,esi
+ mov DWORD[52+rsp],ebp
+ mov ecx,r13d
+ xor r14d,DWORD[rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor r14d,DWORD[24+rsp]
+ lea r12d,[((-899497514))+r12*1+rbp]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol r14d,1
+ xor edx,DWORD[60+rsp] ; round 62
+ mov eax,r13d
+ mov DWORD[56+rsp],r14d
+ mov ecx,r12d
+ xor edx,DWORD[4+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor edx,DWORD[28+rsp]
+ lea r11d,[((-899497514))+r11*1+r14]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol edx,1
+ xor ebp,DWORD[rsp] ; round 63
+ mov eax,r12d
+ mov DWORD[60+rsp],edx
+ mov ecx,r11d
+ xor ebp,DWORD[8+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor ebp,DWORD[32+rsp]
+ lea edi,[((-899497514))+rdi*1+rdx]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol ebp,1
+ xor r14d,DWORD[4+rsp] ; round 64
+ mov eax,r11d
+ mov DWORD[rsp],ebp
+ mov ecx,edi
+ xor r14d,DWORD[12+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor r14d,DWORD[36+rsp]
+ lea esi,[((-899497514))+rsi*1+rbp]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol r14d,1
+ xor edx,DWORD[8+rsp] ; round 65
+ mov eax,edi
+ mov DWORD[4+rsp],r14d
+ mov ecx,esi
+ xor edx,DWORD[16+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor edx,DWORD[40+rsp]
+ lea r13d,[((-899497514))+r13*1+r14]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol edx,1
+ xor ebp,DWORD[12+rsp] ; round 66
+ mov eax,esi
+ mov DWORD[8+rsp],edx
+ mov ecx,r13d
+ xor ebp,DWORD[20+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor ebp,DWORD[44+rsp]
+ lea r12d,[((-899497514))+r12*1+rdx]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol ebp,1
+ xor r14d,DWORD[16+rsp] ; round 67
+ mov eax,r13d
+ mov DWORD[12+rsp],ebp
+ mov ecx,r12d
+ xor r14d,DWORD[24+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor r14d,DWORD[48+rsp]
+ lea r11d,[((-899497514))+r11*1+rbp]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol r14d,1
+ xor edx,DWORD[20+rsp] ; round 68
+ mov eax,r12d
+ mov DWORD[16+rsp],r14d
+ mov ecx,r11d
+ xor edx,DWORD[28+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor edx,DWORD[52+rsp]
+ lea edi,[((-899497514))+rdi*1+r14]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol edx,1
+ xor ebp,DWORD[24+rsp] ; round 69
+ mov eax,r11d
+ mov DWORD[20+rsp],edx
+ mov ecx,edi
+ xor ebp,DWORD[32+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor ebp,DWORD[56+rsp]
+ lea esi,[((-899497514))+rsi*1+rdx]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol ebp,1
+ xor r14d,DWORD[28+rsp] ; round 70
+ mov eax,edi
+ mov DWORD[24+rsp],ebp
+ mov ecx,esi
+ xor r14d,DWORD[36+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor r14d,DWORD[60+rsp]
+ lea r13d,[((-899497514))+r13*1+rbp]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol r14d,1
+ xor edx,DWORD[32+rsp] ; round 71 (last round that stores a schedule word)
+ mov eax,esi
+ mov DWORD[28+rsp],r14d
+ mov ecx,r13d
+ xor edx,DWORD[40+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor edx,DWORD[rsp]
+ lea r12d,[((-899497514))+r12*1+r14]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol edx,1
+ xor ebp,DWORD[36+rsp] ; round 72
+ mov eax,r13d
+ ; rounds 72-78 omit the schedule-word store that earlier rounds perform at this position
+ mov ecx,r12d
+ xor ebp,DWORD[44+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor ebp,DWORD[4+rsp]
+ lea r11d,[((-899497514))+r11*1+rdx]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol ebp,1
+ xor r14d,DWORD[40+rsp] ; round 73
+ mov eax,r12d
+
+ mov ecx,r11d
+ xor r14d,DWORD[48+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor r14d,DWORD[8+rsp]
+ lea edi,[((-899497514))+rdi*1+rbp]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol r14d,1
+ xor edx,DWORD[44+rsp] ; round 74
+ mov eax,r11d
+
+ mov ecx,edi
+ xor edx,DWORD[52+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor edx,DWORD[12+rsp]
+ lea esi,[((-899497514))+rsi*1+r14]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol edx,1
+ xor ebp,DWORD[48+rsp] ; round 75
+ mov eax,edi
+
+ mov ecx,esi
+ xor ebp,DWORD[56+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor ebp,DWORD[16+rsp]
+ lea r13d,[((-899497514))+r13*1+rdx]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol ebp,1
+ xor r14d,DWORD[52+rsp] ; round 76
+ mov eax,esi
+
+ mov ecx,r13d
+ xor r14d,DWORD[60+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor r14d,DWORD[20+rsp]
+ lea r12d,[((-899497514))+r12*1+rbp]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol r14d,1
+ xor edx,DWORD[56+rsp] ; round 77
+ mov eax,r13d
+
+ mov ecx,r12d
+ xor edx,DWORD[rsp]
+ xor eax,edi
+ rol ecx,5
+ xor edx,DWORD[24+rsp]
+ lea r11d,[((-899497514))+r11*1+r14]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol edx,1
+ xor ebp,DWORD[60+rsp] ; round 78 (computes the final schedule word)
+ mov eax,r12d
+
+ mov ecx,r11d
+ xor ebp,DWORD[4+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor ebp,DWORD[28+rsp]
+ lea edi,[((-899497514))+rdi*1+rdx]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol ebp,1
+ mov eax,r11d ; round 79: no further schedule word is computed
+ mov ecx,edi
+ xor eax,r13d
+ lea esi,[((-899497514))+rsi*1+rbp]
+ rol ecx,5
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ add esi,DWORD[r8] ; add the previous state words to the working variables...
+ add edi,DWORD[4+r8]
+ add r11d,DWORD[8+r8]
+ add r12d,DWORD[12+r8]
+ add r13d,DWORD[16+r8]
+ mov DWORD[r8],esi ; ...and write the sums back as the new state
+ mov DWORD[4+r8],edi
+ mov DWORD[8+r8],r11d
+ mov DWORD[12+r8],r12d
+ mov DWORD[16+r8],r13d
+ ; the working registers already hold the new state, so the next pass needs no reload
+ sub r10,1 ; one block done; sets ZF when none remain
+ lea r9,[64+r9] ; advance the input pointer by one block (lea leaves the flags from sub intact)
+ jnz NEAR $L$loop
+
+ mov rsi,QWORD[64+rsp] ; rsi = the rsp value stored in the frame at $L$ialu
+
+ mov r14,QWORD[((-40))+rsi] ; reload the five registers pushed at $L$ialu
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi] ; drop the frame: rsp is back to its value at function entry
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_block_data_order:
+
+ALIGN 32
+sha1_block_data_order_shaext: ; SHA-extension variant; Win64 args rcx = state, rdx = input, r8 = number of 64-byte blocks
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp ; rax = rsp at entry; the XMM save slots below are addressed relative to it
+$L$SEH_begin_sha1_block_data_order_shaext:
+ mov rdi,rcx ; rdi = state pointer
+ mov rsi,rdx ; rsi = input pointer
+ mov rdx,r8 ; rdx = block count
+
+
+_shaext_shortcut: ; sha1_block_data_order jumps here after performing the same register setup
+
+ lea rsp,[((-72))+rsp] ; reserve 72 bytes for the four non-volatile XMM registers used below
+ movaps XMMWORD[(-8-64)+rax],xmm6
+ movaps XMMWORD[(-8-48)+rax],xmm7
+ movaps XMMWORD[(-8-32)+rax],xmm8
+ movaps XMMWORD[(-8-16)+rax],xmm9
+$L$prologue_shaext:
+ movdqu xmm0,XMMWORD[rdi] ; xmm0 = state words 0..3
+ movd xmm1,DWORD[16+rdi] ; xmm1 = state word 4 in the low dword
+ movdqa xmm3,XMMWORD[((K_XX_XX+160))] ; shuffle mask for the pshufb instructions encoded below (table is defined elsewhere in this file)
+
+ movdqu xmm4,XMMWORD[rsi] ; xmm4..xmm7 = the 64 bytes of the first block
+ pshufd xmm0,xmm0,27 ; immediate 27 reverses the order of the four dwords
+ movdqu xmm5,XMMWORD[16+rsi]
+ pshufd xmm1,xmm1,27 ; moves state word 4 into the top dword
+ movdqu xmm6,XMMWORD[32+rsi]
+DB 102,15,56,0,227 ; pshufb xmm4,xmm3
+ movdqu xmm7,XMMWORD[48+rsi]
+DB 102,15,56,0,235 ; pshufb xmm5,xmm3
+DB 102,15,56,0,243 ; pshufb xmm6,xmm3
+ movdqa xmm9,xmm1 ; xmm9 = copy of the fifth state word for the end-of-block addition
+DB 102,15,56,0,251 ; pshufb xmm7,xmm3
+ jmp NEAR $L$oop_shaext
+
+ALIGN 16
+$L$oop_shaext: ; one pass = one block: twenty sha1rnds4 instructions of four rounds each
+ dec rdx ; one block fewer; ZF is used by cmovne below and by jnz at the loop bottom
+ lea r8,[64+rsi] ; address of the following block
+ paddd xmm1,xmm4
+ cmovne rsi,r8 ; advance only if blocks remain, so the reloads at the loop bottom never read past the last block
+ movdqa xmm8,xmm0 ; xmm8 = copy of state words 0..3 for the end-of-block addition
+DB 15,56,201,229 ; sha1msg1 xmm4,xmm5
+ movdqa xmm2,xmm0
+DB 15,58,204,193,0 ; sha1rnds4 xmm0,xmm1,0 (immediate 0 for the first five groups = rounds 0-19)
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+ pxor xmm4,xmm6
+DB 15,56,201,238 ; sha1msg1 xmm5,xmm6
+DB 15,56,202,231 ; sha1msg2 xmm4,xmm7
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,0 ; sha1rnds4 xmm0,xmm2,0
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+ pxor xmm5,xmm7
+DB 15,56,202,236 ; sha1msg2 xmm5,xmm4
+DB 15,56,201,247 ; sha1msg1 xmm6,xmm7
+ movdqa xmm2,xmm0
+DB 15,58,204,193,0 ; sha1rnds4 xmm0,xmm1,0
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+ pxor xmm6,xmm4
+DB 15,56,201,252 ; sha1msg1 xmm7,xmm4
+DB 15,56,202,245 ; sha1msg2 xmm6,xmm5
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,0 ; sha1rnds4 xmm0,xmm2,0
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4
+ pxor xmm7,xmm5
+DB 15,56,202,254 ; sha1msg2 xmm7,xmm6
+DB 15,56,201,229 ; sha1msg1 xmm4,xmm5
+ movdqa xmm2,xmm0
+DB 15,58,204,193,0 ; sha1rnds4 xmm0,xmm1,0
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+ pxor xmm4,xmm6
+DB 15,56,201,238 ; sha1msg1 xmm5,xmm6
+DB 15,56,202,231 ; sha1msg2 xmm4,xmm7
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,1 ; sha1rnds4 xmm0,xmm2,1 (immediate 1 = rounds 20-39)
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+ pxor xmm5,xmm7
+DB 15,56,202,236 ; sha1msg2 xmm5,xmm4
+DB 15,56,201,247 ; sha1msg1 xmm6,xmm7
+ movdqa xmm2,xmm0
+DB 15,58,204,193,1 ; sha1rnds4 xmm0,xmm1,1
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+ pxor xmm6,xmm4
+DB 15,56,201,252 ; sha1msg1 xmm7,xmm4
+DB 15,56,202,245 ; sha1msg2 xmm6,xmm5
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,1 ; sha1rnds4 xmm0,xmm2,1
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4
+ pxor xmm7,xmm5
+DB 15,56,202,254 ; sha1msg2 xmm7,xmm6
+DB 15,56,201,229 ; sha1msg1 xmm4,xmm5
+ movdqa xmm2,xmm0
+DB 15,58,204,193,1 ; sha1rnds4 xmm0,xmm1,1
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+ pxor xmm4,xmm6
+DB 15,56,201,238 ; sha1msg1 xmm5,xmm6
+DB 15,56,202,231 ; sha1msg2 xmm4,xmm7
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,1 ; sha1rnds4 xmm0,xmm2,1
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+ pxor xmm5,xmm7
+DB 15,56,202,236 ; sha1msg2 xmm5,xmm4
+DB 15,56,201,247 ; sha1msg1 xmm6,xmm7
+ movdqa xmm2,xmm0
+DB 15,58,204,193,2 ; sha1rnds4 xmm0,xmm1,2 (immediate 2 = rounds 40-59)
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+ pxor xmm6,xmm4
+DB 15,56,201,252 ; sha1msg1 xmm7,xmm4
+DB 15,56,202,245 ; sha1msg2 xmm6,xmm5
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,2 ; sha1rnds4 xmm0,xmm2,2
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4
+ pxor xmm7,xmm5
+DB 15,56,202,254 ; sha1msg2 xmm7,xmm6
+DB 15,56,201,229 ; sha1msg1 xmm4,xmm5
+ movdqa xmm2,xmm0
+DB 15,58,204,193,2 ; sha1rnds4 xmm0,xmm1,2
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+ pxor xmm4,xmm6
+DB 15,56,201,238 ; sha1msg1 xmm5,xmm6
+DB 15,56,202,231 ; sha1msg2 xmm4,xmm7
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,2 ; sha1rnds4 xmm0,xmm2,2
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+ pxor xmm5,xmm7
+DB 15,56,202,236 ; sha1msg2 xmm5,xmm4
+DB 15,56,201,247 ; sha1msg1 xmm6,xmm7
+ movdqa xmm2,xmm0
+DB 15,58,204,193,2 ; sha1rnds4 xmm0,xmm1,2
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+ pxor xmm6,xmm4
+DB 15,56,201,252 ; sha1msg1 xmm7,xmm4
+DB 15,56,202,245 ; sha1msg2 xmm6,xmm5
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,3 ; sha1rnds4 xmm0,xmm2,3 (immediate 3 = rounds 60-79)
+DB 15,56,200,204 ; sha1nexte xmm1,xmm4
+ pxor xmm7,xmm5
+DB 15,56,202,254 ; sha1msg2 xmm7,xmm6
+ movdqu xmm4,XMMWORD[rsi] ; start loading the next block (rsi is unchanged on the final pass, so this rereads the last block)
+ movdqa xmm2,xmm0
+DB 15,58,204,193,3 ; sha1rnds4 xmm0,xmm1,3
+DB 15,56,200,213 ; sha1nexte xmm2,xmm5
+ movdqu xmm5,XMMWORD[16+rsi]
+DB 102,15,56,0,227 ; pshufb xmm4,xmm3
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,3 ; sha1rnds4 xmm0,xmm2,3
+DB 15,56,200,206 ; sha1nexte xmm1,xmm6
+ movdqu xmm6,XMMWORD[32+rsi]
+DB 102,15,56,0,235 ; pshufb xmm5,xmm3
+
+ movdqa xmm2,xmm0
+DB 15,58,204,193,3 ; sha1rnds4 xmm0,xmm1,3
+DB 15,56,200,215 ; sha1nexte xmm2,xmm7
+ movdqu xmm7,XMMWORD[48+rsi]
+DB 102,15,56,0,243 ; pshufb xmm6,xmm3
+
+ movdqa xmm1,xmm0
+DB 15,58,204,194,3 ; sha1rnds4 xmm0,xmm2,3
+DB 65,15,56,200,201 ; sha1nexte xmm1,xmm9 (REX prefix byte 65 selects xmm9): combines with the fifth state word saved in xmm9
+DB 102,15,56,0,251 ; pshufb xmm7,xmm3
+
+ paddd xmm0,xmm8 ; add the state words saved at the top of the pass
+ movdqa xmm9,xmm1 ; keep the fifth state word for the next block
+
+ jnz NEAR $L$oop_shaext ; ZF still comes from `dec rdx` at the top of the pass
+
+ pshufd xmm0,xmm0,27 ; undo the dword reversal applied when the state was loaded
+ pshufd xmm1,xmm1,27
+ movdqu XMMWORD[rdi],xmm0 ; store state words 0..3
+ movd DWORD[16+rdi],xmm1 ; store state word 4
+ movaps xmm6,XMMWORD[((-8-64))+rax] ; restore the XMM registers saved in the prologue
+ movaps xmm7,XMMWORD[((-8-48))+rax]
+ movaps xmm8,XMMWORD[((-8-32))+rax]
+ movaps xmm9,XMMWORD[((-8-16))+rax]
+ mov rsp,rax ; release the 72-byte save area
+$L$epilogue_shaext:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_block_data_order_shaext:
+
+ALIGN 16
+sha1_block_data_order_ssse3: ; SHA-1 block function, SSSE3 code path; Win64 entry (arguments arrive in rcx, rdx, r8)
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi ; rdi/rsi are parked in the two slots just above the return address and reloaded in the epilogue
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_ssse3:
+ mov rdi,rcx ; rdi = 1st argument: pointer to the five-dword state
+ mov rsi,rdx ; rsi = 2nd argument: input pointer
+ mov rdx,r8 ; rdx = 3rd argument: number of 64-byte blocks
+; NOTE(review): the first block is loaded and processed before any end-of-input test, so a block
+; count of 0 is not handled here; presumably callers never pass 0 - confirm.
+_ssse3_shortcut: ; NOTE(review): presumably an alternate entry used by the dispatcher outside this view - confirm
+
+ mov r11,rsp ; r11 = entry rsp: base for the save-slot addressing and the value rsp is restored from
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ lea rsp,[((-160))+rsp] ; reserve 160 bytes: six 16-byte XMM save slots plus room for the 64-byte-aligned scratch area
+ movaps XMMWORD[(-40-96)+r11],xmm6 ; xmm6-xmm11 are callee-saved in the Microsoft x64 ABI; slots sit just below the five pushed GPRs
+ movaps XMMWORD[(-40-80)+r11],xmm7
+ movaps XMMWORD[(-40-64)+r11],xmm8
+ movaps XMMWORD[(-40-48)+r11],xmm9
+ movaps XMMWORD[(-40-32)+r11],xmm10
+ movaps XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_ssse3:
+ and rsp,-64 ; align the scratch area [rsp..rsp+63] down to a 64-byte boundary
+ mov r8,rdi ; r8 = state pointer
+ mov r9,rsi ; r9 = current input pointer
+ mov r10,rdx
+
+ shl r10,6
+ add r10,r9 ; r10 = input pointer + 64*blocks = end of input
+ lea r14,[((K_XX_XX+64))] ; r14 = biased base into K_XX_XX (defined outside this view); read below at -64, -32, 0, +32 and +64
+
+ mov eax,DWORD[r8] ; load the five state dwords into eax, ebx, ecx, edx, ebp
+ mov ebx,DWORD[4+r8]
+ mov ecx,DWORD[8+r8]
+ mov edx,DWORD[12+r8]
+ mov esi,ebx
+ mov ebp,DWORD[16+r8]
+ mov edi,ecx
+ xor edi,edx
+ and esi,edi ; esi = ebx & (ecx ^ edx); the loop's first "xor esi,edx" completes (ebx & (ecx ^ edx)) ^ edx
+
+ movdqa xmm6,XMMWORD[64+r14] ; xmm6 = pshufb control mask (presumably a byte-order swap - confirm against K_XX_XX)
+ movdqa xmm9,XMMWORD[((-64))+r14] ; xmm9 = first 16-byte constant vector
+ movdqu xmm0,XMMWORD[r9] ; xmm0..xmm3 = the 64 input bytes of the first block
+ movdqu xmm1,XMMWORD[16+r9]
+ movdqu xmm2,XMMWORD[32+r9]
+ movdqu xmm3,XMMWORD[48+r9]
+DB 102,15,56,0,198 ; pshufb xmm0,xmm6
+DB 102,15,56,0,206 ; pshufb xmm1,xmm6
+DB 102,15,56,0,214 ; pshufb xmm2,xmm6
+ add r9,64 ; input pointer now addresses the following block
+ paddd xmm0,xmm9
+DB 102,15,56,0,222 ; pshufb xmm3,xmm6
+ paddd xmm1,xmm9
+ paddd xmm2,xmm9
+ movdqa XMMWORD[rsp],xmm0 ; scratch holds (input dwords + constant), consumed by the scalar rounds via add reg,[n+rsp]
+ psubd xmm0,xmm9 ; subtract the constant again so xmm0 holds the plain input dwords
+ movdqa XMMWORD[16+rsp],xmm1
+ psubd xmm1,xmm9
+ movdqa XMMWORD[32+rsp],xmm2
+ psubd xmm2,xmm9
+ jmp NEAR $L$oop_ssse3
+ALIGN 16
+$L$oop_ssse3: ; one pass per 64-byte block: scalar rounds interleaved with the vector work that fills the scratch area
+ ror ebx,2
+ pshufd xmm4,xmm0,238 ; 238 (0xEE) copies the high qword into both halves; with punpcklqdq below xmm4 = {xmm0 high qword, xmm1 low qword}
+ xor esi,edx ; esi = (ebx & (ecx ^ edx)) ^ edx, using the value prepared before the loop / at the end of the previous pass
+ movdqa xmm8,xmm3
+ paddd xmm9,xmm3 ; xmm9 = constant + xmm3; stored to [48+rsp] a few lines below
+ mov edi,eax
+ add ebp,DWORD[rsp] ; add the precomputed (dword + constant) from the scratch area
+ punpcklqdq xmm4,xmm1
+ xor ebx,ecx
+ rol eax,5 ; eax is rotated left 5 for the add below ...
+ add ebp,esi
+ psrldq xmm8,4
+ and edi,ebx
+ xor ebx,ecx
+ pxor xmm4,xmm0
+ add ebp,eax
+ ror eax,7 ; ... then right 7: net effect is a rotate right by 2 of the original value
+ pxor xmm8,xmm2
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[4+rsp]
+ pxor xmm4,xmm8
+ xor eax,ebx
+ rol ebp,5
+ movdqa XMMWORD[48+rsp],xmm9
+ add edx,edi
+ and esi,eax
+ movdqa xmm10,xmm4
+ xor eax,ebx
+ add edx,ebp
+ ror ebp,7
+ movdqa xmm8,xmm4
+ xor esi,ebx
+ pslldq xmm10,12
+ paddd xmm4,xmm4
+ mov edi,edx
+ add ecx,DWORD[8+rsp]
+ psrld xmm8,31
+ xor ebp,eax
+ rol edx,5
+ add ecx,esi
+ movdqa xmm9,xmm10
+ and edi,ebp
+ xor ebp,eax
+ psrld xmm10,30
+ add ecx,edx
+ ror edx,7
+ por xmm4,xmm8
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[12+rsp]
+ pslld xmm9,2
+ pxor xmm4,xmm10
+ xor edx,ebp
+ movdqa xmm10,XMMWORD[((-64))+r14] ; reload the first constant vector
+ rol ecx,5
+ add ebx,edi
+ and esi,edx
+ pxor xmm4,xmm9
+ xor edx,ebp
+ add ebx,ecx
+ ror ecx,7
+ pshufd xmm5,xmm1,238
+ xor esi,ebp
+ movdqa xmm9,xmm4
+ paddd xmm10,xmm4
+ mov edi,ebx
+ add eax,DWORD[16+rsp]
+ punpcklqdq xmm5,xmm2
+ xor ecx,edx
+ rol ebx,5
+ add eax,esi
+ psrldq xmm9,4
+ and edi,ecx
+ xor ecx,edx
+ pxor xmm5,xmm1
+ add eax,ebx
+ ror ebx,7
+ pxor xmm9,xmm3
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[20+rsp]
+ pxor xmm5,xmm9
+ xor ebx,ecx
+ rol eax,5
+ movdqa XMMWORD[rsp],xmm10
+ add ebp,edi
+ and esi,ebx
+ movdqa xmm8,xmm5
+ xor ebx,ecx
+ add ebp,eax
+ ror eax,7
+ movdqa xmm9,xmm5
+ xor esi,ecx
+ pslldq xmm8,12
+ paddd xmm5,xmm5
+ mov edi,ebp
+ add edx,DWORD[24+rsp]
+ psrld xmm9,31
+ xor eax,ebx
+ rol ebp,5
+ add edx,esi
+ movdqa xmm10,xmm8
+ and edi,eax
+ xor eax,ebx
+ psrld xmm8,30
+ add edx,ebp
+ ror ebp,7
+ por xmm5,xmm9
+ xor edi,ebx
+ mov esi,edx
+ add ecx,DWORD[28+rsp]
+ pslld xmm10,2
+ pxor xmm5,xmm8
+ xor ebp,eax
+ movdqa xmm8,XMMWORD[((-32))+r14] ; switch to the second constant vector
+ rol edx,5
+ add ecx,edi
+ and esi,ebp
+ pxor xmm5,xmm10
+ xor ebp,eax
+ add ecx,edx
+ ror edx,7
+ pshufd xmm6,xmm2,238
+ xor esi,eax
+ movdqa xmm10,xmm5
+ paddd xmm8,xmm5
+ mov edi,ecx
+ add ebx,DWORD[32+rsp]
+ punpcklqdq xmm6,xmm3
+ xor edx,ebp
+ rol ecx,5
+ add ebx,esi
+ psrldq xmm10,4
+ and edi,edx
+ xor edx,ebp
+ pxor xmm6,xmm2
+ add ebx,ecx
+ ror ecx,7
+ pxor xmm10,xmm4
+ xor edi,ebp
+ mov esi,ebx
+ add eax,DWORD[36+rsp]
+ pxor xmm6,xmm10
+ xor ecx,edx
+ rol ebx,5
+ movdqa XMMWORD[16+rsp],xmm8
+ add eax,edi
+ and esi,ecx
+ movdqa xmm9,xmm6
+ xor ecx,edx
+ add eax,ebx
+ ror ebx,7
+ movdqa xmm10,xmm6
+ xor esi,edx
+ pslldq xmm9,12
+ paddd xmm6,xmm6
+ mov edi,eax
+ add ebp,DWORD[40+rsp]
+ psrld xmm10,31
+ xor ebx,ecx
+ rol eax,5
+ add ebp,esi
+ movdqa xmm8,xmm9
+ and edi,ebx
+ xor ebx,ecx
+ psrld xmm9,30
+ add ebp,eax
+ ror eax,7
+ por xmm6,xmm10
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[44+rsp]
+ pslld xmm8,2
+ pxor xmm6,xmm9
+ xor eax,ebx
+ movdqa xmm9,XMMWORD[((-32))+r14]
+ rol ebp,5
+ add edx,edi
+ and esi,eax
+ pxor xmm6,xmm8
+ xor eax,ebx
+ add edx,ebp
+ ror ebp,7
+ pshufd xmm7,xmm3,238
+ xor esi,ebx
+ movdqa xmm8,xmm6
+ paddd xmm9,xmm6
+ mov edi,edx
+ add ecx,DWORD[48+rsp]
+ punpcklqdq xmm7,xmm4
+ xor ebp,eax
+ rol edx,5
+ add ecx,esi
+ psrldq xmm8,4
+ and edi,ebp
+ xor ebp,eax
+ pxor xmm7,xmm3
+ add ecx,edx
+ ror edx,7
+ pxor xmm8,xmm5
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[52+rsp]
+ pxor xmm7,xmm8
+ xor edx,ebp
+ rol ecx,5
+ movdqa XMMWORD[32+rsp],xmm9
+ add ebx,edi
+ and esi,edx
+ movdqa xmm10,xmm7
+ xor edx,ebp
+ add ebx,ecx
+ ror ecx,7
+ movdqa xmm8,xmm7
+ xor esi,ebp
+ pslldq xmm10,12
+ paddd xmm7,xmm7
+ mov edi,ebx
+ add eax,DWORD[56+rsp]
+ psrld xmm8,31
+ xor ecx,edx
+ rol ebx,5
+ add eax,esi
+ movdqa xmm9,xmm10
+ and edi,ecx
+ xor ecx,edx
+ psrld xmm10,30
+ add eax,ebx
+ ror ebx,7
+ por xmm7,xmm8
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[60+rsp]
+ pslld xmm9,2
+ pxor xmm7,xmm10
+ xor ebx,ecx
+ movdqa xmm10,XMMWORD[((-32))+r14]
+ rol eax,5
+ add ebp,edi
+ and esi,ebx
+ pxor xmm7,xmm9
+ pshufd xmm9,xmm6,238
+ xor ebx,ecx
+ add ebp,eax
+ ror eax,7
+ pxor xmm0,xmm4
+ xor esi,ecx
+ mov edi,ebp
+ add edx,DWORD[rsp]
+ punpcklqdq xmm9,xmm7
+ xor eax,ebx
+ rol ebp,5
+ pxor xmm0,xmm1
+ add edx,esi
+ and edi,eax
+ movdqa xmm8,xmm10
+ xor eax,ebx
+ paddd xmm10,xmm7
+ add edx,ebp
+ pxor xmm0,xmm9
+ ror ebp,7
+ xor edi,ebx
+ mov esi,edx
+ add ecx,DWORD[4+rsp]
+ movdqa xmm9,xmm0
+ xor ebp,eax
+ rol edx,5
+ movdqa XMMWORD[48+rsp],xmm10
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ pslld xmm0,2
+ add ecx,edx
+ ror edx,7
+ psrld xmm9,30
+ xor esi,eax
+ mov edi,ecx
+ add ebx,DWORD[8+rsp]
+ por xmm0,xmm9
+ xor edx,ebp
+ rol ecx,5
+ pshufd xmm10,xmm7,238
+ add ebx,esi
+ and edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[12+rsp]
+ xor edi,ebp
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ pxor xmm1,xmm5
+ add ebp,DWORD[16+rsp]
+ xor esi,ecx
+ punpcklqdq xmm10,xmm0
+ mov edi,eax
+ rol eax,5
+ pxor xmm1,xmm2
+ add ebp,esi
+ xor edi,ecx
+ movdqa xmm9,xmm8
+ ror ebx,7
+ paddd xmm8,xmm0
+ add ebp,eax
+ pxor xmm1,xmm10
+ add edx,DWORD[20+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ movdqa xmm10,xmm1
+ add edx,edi
+ xor esi,ebx
+ movdqa XMMWORD[rsp],xmm8
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[24+rsp]
+ pslld xmm1,2
+ xor esi,eax
+ mov edi,edx
+ psrld xmm10,30
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ por xmm1,xmm10
+ add ecx,edx
+ add ebx,DWORD[28+rsp]
+ pshufd xmm8,xmm0,238
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ pxor xmm2,xmm6
+ add eax,DWORD[32+rsp]
+ xor esi,edx
+ punpcklqdq xmm8,xmm1
+ mov edi,ebx
+ rol ebx,5
+ pxor xmm2,xmm3
+ add eax,esi
+ xor edi,edx
+ movdqa xmm10,XMMWORD[r14] ; switch to the third constant vector
+ ror ecx,7
+ paddd xmm9,xmm1
+ add eax,ebx
+ pxor xmm2,xmm8
+ add ebp,DWORD[36+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ movdqa xmm8,xmm2
+ add ebp,edi
+ xor esi,ecx
+ movdqa XMMWORD[16+rsp],xmm9
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[40+rsp]
+ pslld xmm2,2
+ xor esi,ebx
+ mov edi,ebp
+ psrld xmm8,30
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ por xmm2,xmm8
+ add edx,ebp
+ add ecx,DWORD[44+rsp]
+ pshufd xmm9,xmm1,238
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ pxor xmm3,xmm7
+ add ebx,DWORD[48+rsp]
+ xor esi,ebp
+ punpcklqdq xmm9,xmm2
+ mov edi,ecx
+ rol ecx,5
+ pxor xmm3,xmm4
+ add ebx,esi
+ xor edi,ebp
+ movdqa xmm8,xmm10
+ ror edx,7
+ paddd xmm10,xmm2
+ add ebx,ecx
+ pxor xmm3,xmm9
+ add eax,DWORD[52+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ movdqa xmm9,xmm3
+ add eax,edi
+ xor esi,edx
+ movdqa XMMWORD[32+rsp],xmm10
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[56+rsp]
+ pslld xmm3,2
+ xor esi,ecx
+ mov edi,eax
+ psrld xmm9,30
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ por xmm3,xmm9
+ add ebp,eax
+ add edx,DWORD[60+rsp]
+ pshufd xmm10,xmm2,238
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ pxor xmm4,xmm0
+ add ecx,DWORD[rsp]
+ xor esi,eax
+ punpcklqdq xmm10,xmm3
+ mov edi,edx
+ rol edx,5
+ pxor xmm4,xmm5
+ add ecx,esi
+ xor edi,eax
+ movdqa xmm9,xmm8
+ ror ebp,7
+ paddd xmm8,xmm3
+ add ecx,edx
+ pxor xmm4,xmm10
+ add ebx,DWORD[4+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ movdqa xmm10,xmm4
+ add ebx,edi
+ xor esi,ebp
+ movdqa XMMWORD[48+rsp],xmm8
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[8+rsp]
+ pslld xmm4,2
+ xor esi,edx
+ mov edi,ebx
+ psrld xmm10,30
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ por xmm4,xmm10
+ add eax,ebx
+ add ebp,DWORD[12+rsp]
+ pshufd xmm8,xmm3,238
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ pxor xmm5,xmm1
+ add edx,DWORD[16+rsp]
+ xor esi,ebx
+ punpcklqdq xmm8,xmm4
+ mov edi,ebp
+ rol ebp,5
+ pxor xmm5,xmm6
+ add edx,esi
+ xor edi,ebx
+ movdqa xmm10,xmm9
+ ror eax,7
+ paddd xmm9,xmm4
+ add edx,ebp
+ pxor xmm5,xmm8
+ add ecx,DWORD[20+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ movdqa xmm8,xmm5
+ add ecx,edi
+ xor esi,eax
+ movdqa XMMWORD[rsp],xmm9
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[24+rsp]
+ pslld xmm5,2
+ xor esi,ebp
+ mov edi,ecx
+ psrld xmm8,30
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ por xmm5,xmm8
+ add ebx,ecx
+ add eax,DWORD[28+rsp]
+ pshufd xmm9,xmm4,238
+ ror ecx,7
+ mov esi,ebx
+ xor edi,edx
+ rol ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ pxor xmm6,xmm2
+ add ebp,DWORD[32+rsp]
+ and esi,ecx
+ xor ecx,edx
+ ror ebx,7
+ punpcklqdq xmm9,xmm5
+ mov edi,eax
+ xor esi,ecx
+ pxor xmm6,xmm7
+ rol eax,5
+ add ebp,esi
+ movdqa xmm8,xmm10
+ xor edi,ebx
+ paddd xmm10,xmm5
+ xor ebx,ecx
+ pxor xmm6,xmm9
+ add ebp,eax
+ add edx,DWORD[36+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ ror eax,7
+ movdqa xmm9,xmm6
+ mov esi,ebp
+ xor edi,ebx
+ movdqa XMMWORD[16+rsp],xmm10
+ rol ebp,5
+ add edx,edi
+ xor esi,eax
+ pslld xmm6,2
+ xor eax,ebx
+ add edx,ebp
+ psrld xmm9,30
+ add ecx,DWORD[40+rsp]
+ and esi,eax
+ xor eax,ebx
+ por xmm6,xmm9
+ ror ebp,7
+ mov edi,edx
+ xor esi,eax
+ rol edx,5
+ pshufd xmm10,xmm5,238
+ add ecx,esi
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[44+rsp]
+ and edi,ebp
+ xor ebp,eax
+ ror edx,7
+ mov esi,ecx
+ xor edi,ebp
+ rol ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ pxor xmm7,xmm3
+ add eax,DWORD[48+rsp]
+ and esi,edx
+ xor edx,ebp
+ ror ecx,7
+ punpcklqdq xmm10,xmm6
+ mov edi,ebx
+ xor esi,edx
+ pxor xmm7,xmm0
+ rol ebx,5
+ add eax,esi
+ movdqa xmm9,XMMWORD[32+r14] ; switch to the fourth constant vector
+ xor edi,ecx
+ paddd xmm8,xmm6
+ xor ecx,edx
+ pxor xmm7,xmm10
+ add eax,ebx
+ add ebp,DWORD[52+rsp]
+ and edi,ecx
+ xor ecx,edx
+ ror ebx,7
+ movdqa xmm10,xmm7
+ mov esi,eax
+ xor edi,ecx
+ movdqa XMMWORD[32+rsp],xmm8
+ rol eax,5
+ add ebp,edi
+ xor esi,ebx
+ pslld xmm7,2
+ xor ebx,ecx
+ add ebp,eax
+ psrld xmm10,30
+ add edx,DWORD[56+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ por xmm7,xmm10
+ ror eax,7
+ mov edi,ebp
+ xor esi,ebx
+ rol ebp,5
+ pshufd xmm8,xmm6,238
+ add edx,esi
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[60+rsp]
+ and edi,eax
+ xor eax,ebx
+ ror ebp,7
+ mov esi,edx
+ xor edi,eax
+ rol edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ pxor xmm0,xmm4
+ add ebx,DWORD[rsp]
+ and esi,ebp
+ xor ebp,eax
+ ror edx,7
+ punpcklqdq xmm8,xmm7
+ mov edi,ecx
+ xor esi,ebp
+ pxor xmm0,xmm1
+ rol ecx,5
+ add ebx,esi
+ movdqa xmm10,xmm9
+ xor edi,edx
+ paddd xmm9,xmm7
+ xor edx,ebp
+ pxor xmm0,xmm8
+ add ebx,ecx
+ add eax,DWORD[4+rsp]
+ and edi,edx
+ xor edx,ebp
+ ror ecx,7
+ movdqa xmm8,xmm0
+ mov esi,ebx
+ xor edi,edx
+ movdqa XMMWORD[48+rsp],xmm9
+ rol ebx,5
+ add eax,edi
+ xor esi,ecx
+ pslld xmm0,2
+ xor ecx,edx
+ add eax,ebx
+ psrld xmm8,30
+ add ebp,DWORD[8+rsp]
+ and esi,ecx
+ xor ecx,edx
+ por xmm0,xmm8
+ ror ebx,7
+ mov edi,eax
+ xor esi,ecx
+ rol eax,5
+ pshufd xmm9,xmm7,238
+ add ebp,esi
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[12+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ ror eax,7
+ mov esi,ebp
+ xor edi,ebx
+ rol ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ pxor xmm1,xmm5
+ add ecx,DWORD[16+rsp]
+ and esi,eax
+ xor eax,ebx
+ ror ebp,7
+ punpcklqdq xmm9,xmm0
+ mov edi,edx
+ xor esi,eax
+ pxor xmm1,xmm2
+ rol edx,5
+ add ecx,esi
+ movdqa xmm8,xmm10
+ xor edi,ebp
+ paddd xmm10,xmm0
+ xor ebp,eax
+ pxor xmm1,xmm9
+ add ecx,edx
+ add ebx,DWORD[20+rsp]
+ and edi,ebp
+ xor ebp,eax
+ ror edx,7
+ movdqa xmm9,xmm1
+ mov esi,ecx
+ xor edi,ebp
+ movdqa XMMWORD[rsp],xmm10
+ rol ecx,5
+ add ebx,edi
+ xor esi,edx
+ pslld xmm1,2
+ xor edx,ebp
+ add ebx,ecx
+ psrld xmm9,30
+ add eax,DWORD[24+rsp]
+ and esi,edx
+ xor edx,ebp
+ por xmm1,xmm9
+ ror ecx,7
+ mov edi,ebx
+ xor esi,edx
+ rol ebx,5
+ pshufd xmm10,xmm0,238
+ add eax,esi
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[28+rsp]
+ and edi,ecx
+ xor ecx,edx
+ ror ebx,7
+ mov esi,eax
+ xor edi,ecx
+ rol eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ pxor xmm2,xmm6
+ add edx,DWORD[32+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ ror eax,7
+ punpcklqdq xmm10,xmm1
+ mov edi,ebp
+ xor esi,ebx
+ pxor xmm2,xmm3
+ rol ebp,5
+ add edx,esi
+ movdqa xmm9,xmm8
+ xor edi,eax
+ paddd xmm8,xmm1
+ xor eax,ebx
+ pxor xmm2,xmm10
+ add edx,ebp
+ add ecx,DWORD[36+rsp]
+ and edi,eax
+ xor eax,ebx
+ ror ebp,7
+ movdqa xmm10,xmm2
+ mov esi,edx
+ xor edi,eax
+ movdqa XMMWORD[16+rsp],xmm8
+ rol edx,5
+ add ecx,edi
+ xor esi,ebp
+ pslld xmm2,2
+ xor ebp,eax
+ add ecx,edx
+ psrld xmm10,30
+ add ebx,DWORD[40+rsp]
+ and esi,ebp
+ xor ebp,eax
+ por xmm2,xmm10
+ ror edx,7
+ mov edi,ecx
+ xor esi,ebp
+ rol ecx,5
+ pshufd xmm8,xmm1,238
+ add ebx,esi
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[44+rsp]
+ and edi,edx
+ xor edx,ebp
+ ror ecx,7
+ mov esi,ebx
+ xor edi,edx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ add eax,ebx
+ pxor xmm3,xmm7
+ add ebp,DWORD[48+rsp]
+ xor esi,ecx
+ punpcklqdq xmm8,xmm2
+ mov edi,eax
+ rol eax,5
+ pxor xmm3,xmm4
+ add ebp,esi
+ xor edi,ecx
+ movdqa xmm10,xmm9
+ ror ebx,7
+ paddd xmm9,xmm2
+ add ebp,eax
+ pxor xmm3,xmm8
+ add edx,DWORD[52+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ movdqa xmm8,xmm3
+ add edx,edi
+ xor esi,ebx
+ movdqa XMMWORD[32+rsp],xmm9
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[56+rsp]
+ pslld xmm3,2
+ xor esi,eax
+ mov edi,edx
+ psrld xmm8,30
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ por xmm3,xmm8
+ add ecx,edx
+ add ebx,DWORD[60+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ paddd xmm10,xmm3
+ add eax,esi
+ xor edi,edx
+ movdqa XMMWORD[48+rsp],xmm10
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[4+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[8+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[12+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ cmp r9,r10 ; input pointer reached the end computed in the prologue?
+ je NEAR $L$done_ssse3 ; yes: finish the remaining rounds without loading another block
+ movdqa xmm6,XMMWORD[64+r14] ; reload the pshufb mask (xmm6 was reused inside the loop)
+ movdqa xmm9,XMMWORD[((-64))+r14] ; and the first constant vector
+ movdqu xmm0,XMMWORD[r9] ; fetch the next 64-byte block while the current block's last rounds run
+ movdqu xmm1,XMMWORD[16+r9]
+ movdqu xmm2,XMMWORD[32+r9]
+ movdqu xmm3,XMMWORD[48+r9]
+DB 102,15,56,0,198 ; pshufb xmm0,xmm6
+ add r9,64
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+DB 102,15,56,0,206 ; pshufb xmm1,xmm6
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ paddd xmm0,xmm9
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ movdqa XMMWORD[rsp],xmm0 ; same (dword + constant) store / psubd pattern as in the prologue, for the next block
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ psubd xmm0,xmm9
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+DB 102,15,56,0,214 ; pshufb xmm2,xmm6
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ paddd xmm1,xmm9
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ movdqa XMMWORD[16+rsp],xmm1
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ psubd xmm1,xmm9
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+DB 102,15,56,0,222 ; pshufb xmm3,xmm6
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ paddd xmm2,xmm9
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ movdqa XMMWORD[32+rsp],xmm2
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ psubd xmm2,xmm9
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx ; esi = copy of ebx taken before the rotate below; it is the value folded into [4+r8]
+ rol ebx,5
+ add eax,edi
+ ror ecx,7
+ add eax,ebx
+ add eax,DWORD[r8] ; fold the working registers into the state at [r8] and store it
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ add edx,DWORD[12+r8]
+ mov DWORD[r8],eax
+ add ebp,DWORD[16+r8]
+ mov DWORD[4+r8],esi
+ mov ebx,esi ; re-establish ebx and esi = ebx & (ecx ^ edx), exactly as the prologue did before the first pass
+ mov DWORD[8+r8],ecx
+ mov edi,ecx
+ mov DWORD[12+r8],edx
+ xor edi,edx
+ mov DWORD[16+r8],ebp
+ and esi,edi
+ jmp NEAR $L$oop_ssse3
+
+ALIGN 16
+$L$done_ssse3: ; last block: same closing scalar rounds as above, without the next-block loads
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx ; esi = copy of ebx taken before the rotate below; it is the value folded into [4+r8]
+ rol ebx,5
+ add eax,edi
+ ror ecx,7
+ add eax,ebx
+ add eax,DWORD[r8] ; final fold of the working registers into the state at [r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ mov DWORD[r8],eax
+ add edx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ add ebp,DWORD[16+r8]
+ mov DWORD[8+r8],ecx
+ mov DWORD[12+r8],edx
+ mov DWORD[16+r8],ebp
+ movaps xmm6,XMMWORD[((-40-96))+r11] ; restore the callee-saved XMM registers from the r11-relative slots
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11] ; reload the pushed GPRs by offset from r11 (rsp was realigned, so pop cannot be used)
+
+ mov r13,QWORD[((-32))+r11]
+
+ mov r12,QWORD[((-24))+r11]
+
+ mov rbp,QWORD[((-16))+r11]
+
+ mov rbx,QWORD[((-8))+r11]
+
+ lea rsp,[r11] ; back to the entry rsp
+
+$L$epilogue_ssse3:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_block_data_order_ssse3:
+ pxor xmm6,xmm2
+ add ebx,ecx
+ ror ecx,7
+ pxor xmm10,xmm4
+ xor edi,ebp
+ mov esi,ebx
+ add eax,DWORD[36+rsp]
+ pxor xmm6,xmm10
+ xor ecx,edx
+ rol ebx,5
+ movdqa XMMWORD[16+rsp],xmm8
+ add eax,edi
+ and esi,ecx
+ movdqa xmm9,xmm6
+ xor ecx,edx
+ add eax,ebx
+ ror ebx,7
+ movdqa xmm10,xmm6
+ xor esi,edx
+ pslldq xmm9,12
+ paddd xmm6,xmm6
+ mov edi,eax
+ add ebp,DWORD[40+rsp]
+ psrld xmm10,31
+ xor ebx,ecx
+ rol eax,5
+ add ebp,esi
+ movdqa xmm8,xmm9
+ and edi,ebx
+ xor ebx,ecx
+ psrld xmm9,30
+ add ebp,eax
+ ror eax,7
+ por xmm6,xmm10
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[44+rsp]
+ pslld xmm8,2
+ pxor xmm6,xmm9
+ xor eax,ebx
+ movdqa xmm9,XMMWORD[((-32))+r14]
+ rol ebp,5
+ add edx,edi
+ and esi,eax
+ pxor xmm6,xmm8
+ xor eax,ebx
+ add edx,ebp
+ ror ebp,7
+ pshufd xmm7,xmm3,238
+ xor esi,ebx
+ movdqa xmm8,xmm6
+ paddd xmm9,xmm6
+ mov edi,edx
+ add ecx,DWORD[48+rsp]
+ punpcklqdq xmm7,xmm4
+ xor ebp,eax
+ rol edx,5
+ add ecx,esi
+ psrldq xmm8,4
+ and edi,ebp
+ xor ebp,eax
+ pxor xmm7,xmm3
+ add ecx,edx
+ ror edx,7
+ pxor xmm8,xmm5
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[52+rsp]
+ pxor xmm7,xmm8
+ xor edx,ebp
+ rol ecx,5
+ movdqa XMMWORD[32+rsp],xmm9
+ add ebx,edi
+ and esi,edx
+ movdqa xmm10,xmm7
+ xor edx,ebp
+ add ebx,ecx
+ ror ecx,7
+ movdqa xmm8,xmm7
+ xor esi,ebp
+ pslldq xmm10,12
+ paddd xmm7,xmm7
+ mov edi,ebx
+ add eax,DWORD[56+rsp]
+ psrld xmm8,31
+ xor ecx,edx
+ rol ebx,5
+ add eax,esi
+ movdqa xmm9,xmm10
+ and edi,ecx
+ xor ecx,edx
+ psrld xmm10,30
+ add eax,ebx
+ ror ebx,7
+ por xmm7,xmm8
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[60+rsp]
+ pslld xmm9,2
+ pxor xmm7,xmm10
+ xor ebx,ecx
+ movdqa xmm10,XMMWORD[((-32))+r14]
+ rol eax,5
+ add ebp,edi
+ and esi,ebx
+ pxor xmm7,xmm9
+ pshufd xmm9,xmm6,238
+ xor ebx,ecx
+ add ebp,eax
+ ror eax,7
+ pxor xmm0,xmm4
+ xor esi,ecx
+ mov edi,ebp
+ add edx,DWORD[rsp]
+ punpcklqdq xmm9,xmm7
+ xor eax,ebx
+ rol ebp,5
+ pxor xmm0,xmm1
+ add edx,esi
+ and edi,eax
+ movdqa xmm8,xmm10
+ xor eax,ebx
+ paddd xmm10,xmm7
+ add edx,ebp
+ pxor xmm0,xmm9
+ ror ebp,7
+ xor edi,ebx
+ mov esi,edx
+ add ecx,DWORD[4+rsp]
+ movdqa xmm9,xmm0
+ xor ebp,eax
+ rol edx,5
+ movdqa XMMWORD[48+rsp],xmm10
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ pslld xmm0,2
+ add ecx,edx
+ ror edx,7
+ psrld xmm9,30
+ xor esi,eax
+ mov edi,ecx
+ add ebx,DWORD[8+rsp]
+ por xmm0,xmm9
+ xor edx,ebp
+ rol ecx,5
+ pshufd xmm10,xmm7,238
+ add ebx,esi
+ and edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[12+rsp]
+ xor edi,ebp
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ pxor xmm1,xmm5
+ add ebp,DWORD[16+rsp]
+ xor esi,ecx
+ punpcklqdq xmm10,xmm0
+ mov edi,eax
+ rol eax,5
+ pxor xmm1,xmm2
+ add ebp,esi
+ xor edi,ecx
+ movdqa xmm9,xmm8
+ ror ebx,7
+ paddd xmm8,xmm0
+ add ebp,eax
+ pxor xmm1,xmm10
+ add edx,DWORD[20+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ movdqa xmm10,xmm1
+ add edx,edi
+ xor esi,ebx
+ movdqa XMMWORD[rsp],xmm8
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[24+rsp]
+ pslld xmm1,2
+ xor esi,eax
+ mov edi,edx
+ psrld xmm10,30
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ por xmm1,xmm10
+ add ecx,edx
+ add ebx,DWORD[28+rsp]
+ pshufd xmm8,xmm0,238
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ pxor xmm2,xmm6
+ add eax,DWORD[32+rsp]
+ xor esi,edx
+ punpcklqdq xmm8,xmm1
+ mov edi,ebx
+ rol ebx,5
+ pxor xmm2,xmm3
+ add eax,esi
+ xor edi,edx
+ movdqa xmm10,XMMWORD[r14]
+ ror ecx,7
+ paddd xmm9,xmm1
+ add eax,ebx
+ pxor xmm2,xmm8
+ add ebp,DWORD[36+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ movdqa xmm8,xmm2
+ add ebp,edi
+ xor esi,ecx
+ movdqa XMMWORD[16+rsp],xmm9
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[40+rsp]
+ pslld xmm2,2
+ xor esi,ebx
+ mov edi,ebp
+ psrld xmm8,30
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ por xmm2,xmm8
+ add edx,ebp
+ add ecx,DWORD[44+rsp]
+ pshufd xmm9,xmm1,238
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ pxor xmm3,xmm7
+ add ebx,DWORD[48+rsp]
+ xor esi,ebp
+ punpcklqdq xmm9,xmm2
+ mov edi,ecx
+ rol ecx,5
+ pxor xmm3,xmm4
+ add ebx,esi
+ xor edi,ebp
+ movdqa xmm8,xmm10
+ ror edx,7
+ paddd xmm10,xmm2
+ add ebx,ecx
+ pxor xmm3,xmm9
+ add eax,DWORD[52+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ movdqa xmm9,xmm3
+ add eax,edi
+ xor esi,edx
+ movdqa XMMWORD[32+rsp],xmm10
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[56+rsp]
+ pslld xmm3,2
+ xor esi,ecx
+ mov edi,eax
+ psrld xmm9,30
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ por xmm3,xmm9
+ add ebp,eax
+ add edx,DWORD[60+rsp]
+ pshufd xmm10,xmm2,238
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ pxor xmm4,xmm0
+ add ecx,DWORD[rsp]
+ xor esi,eax
+ punpcklqdq xmm10,xmm3
+ mov edi,edx
+ rol edx,5
+ pxor xmm4,xmm5
+ add ecx,esi
+ xor edi,eax
+ movdqa xmm9,xmm8
+ ror ebp,7
+ paddd xmm8,xmm3
+ add ecx,edx
+ pxor xmm4,xmm10
+ add ebx,DWORD[4+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ movdqa xmm10,xmm4
+ add ebx,edi
+ xor esi,ebp
+ movdqa XMMWORD[48+rsp],xmm8
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[8+rsp]
+ pslld xmm4,2
+ xor esi,edx
+ mov edi,ebx
+ psrld xmm10,30
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ por xmm4,xmm10
+ add eax,ebx
+ add ebp,DWORD[12+rsp]
+ pshufd xmm8,xmm3,238
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ pxor xmm5,xmm1
+ add edx,DWORD[16+rsp]
+ xor esi,ebx
+ punpcklqdq xmm8,xmm4
+ mov edi,ebp
+ rol ebp,5
+ pxor xmm5,xmm6
+ add edx,esi
+ xor edi,ebx
+ movdqa xmm10,xmm9
+ ror eax,7
+ paddd xmm9,xmm4
+ add edx,ebp
+ pxor xmm5,xmm8
+ add ecx,DWORD[20+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ movdqa xmm8,xmm5
+ add ecx,edi
+ xor esi,eax
+ movdqa XMMWORD[rsp],xmm9
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[24+rsp]
+ pslld xmm5,2
+ xor esi,ebp
+ mov edi,ecx
+ psrld xmm8,30
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ por xmm5,xmm8
+ add ebx,ecx
+ add eax,DWORD[28+rsp]
+ pshufd xmm9,xmm4,238
+ ror ecx,7
+ mov esi,ebx
+ xor edi,edx
+ rol ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ pxor xmm6,xmm2
+ add ebp,DWORD[32+rsp]
+ and esi,ecx
+ xor ecx,edx
+ ror ebx,7
+ punpcklqdq xmm9,xmm5
+ mov edi,eax
+ xor esi,ecx
+ pxor xmm6,xmm7
+ rol eax,5
+ add ebp,esi
+ movdqa xmm8,xmm10
+ xor edi,ebx
+ paddd xmm10,xmm5
+ xor ebx,ecx
+ pxor xmm6,xmm9
+ add ebp,eax
+ add edx,DWORD[36+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ ror eax,7
+ movdqa xmm9,xmm6
+ mov esi,ebp
+ xor edi,ebx
+ movdqa XMMWORD[16+rsp],xmm10
+ rol ebp,5
+ add edx,edi
+ xor esi,eax
+ pslld xmm6,2
+ xor eax,ebx
+ add edx,ebp
+ psrld xmm9,30
+ add ecx,DWORD[40+rsp]
+ and esi,eax
+ xor eax,ebx
+ por xmm6,xmm9
+ ror ebp,7
+ mov edi,edx
+ xor esi,eax
+ rol edx,5
+ pshufd xmm10,xmm5,238
+ add ecx,esi
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[44+rsp]
+ and edi,ebp
+ xor ebp,eax
+ ror edx,7
+ mov esi,ecx
+ xor edi,ebp
+ rol ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ pxor xmm7,xmm3
+ add eax,DWORD[48+rsp]
+ and esi,edx
+ xor edx,ebp
+ ror ecx,7
+ punpcklqdq xmm10,xmm6
+ mov edi,ebx
+ xor esi,edx
+ pxor xmm7,xmm0
+ rol ebx,5
+ add eax,esi
+ movdqa xmm9,XMMWORD[32+r14]
+ xor edi,ecx
+ paddd xmm8,xmm6
+ xor ecx,edx
+ pxor xmm7,xmm10
+ add eax,ebx
+ add ebp,DWORD[52+rsp]
+ and edi,ecx
+ xor ecx,edx
+ ror ebx,7
+ movdqa xmm10,xmm7
+ mov esi,eax
+ xor edi,ecx
+ movdqa XMMWORD[32+rsp],xmm8
+ rol eax,5
+ add ebp,edi
+ xor esi,ebx
+ pslld xmm7,2
+ xor ebx,ecx
+ add ebp,eax
+ psrld xmm10,30
+ add edx,DWORD[56+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ por xmm7,xmm10
+ ror eax,7
+ mov edi,ebp
+ xor esi,ebx
+ rol ebp,5
+ pshufd xmm8,xmm6,238
+ add edx,esi
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[60+rsp]
+ and edi,eax
+ xor eax,ebx
+ ror ebp,7
+ mov esi,edx
+ xor edi,eax
+ rol edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ pxor xmm0,xmm4
+ add ebx,DWORD[rsp]
+ and esi,ebp
+ xor ebp,eax
+ ror edx,7
+ punpcklqdq xmm8,xmm7
+ mov edi,ecx
+ xor esi,ebp
+ pxor xmm0,xmm1
+ rol ecx,5
+ add ebx,esi
+ movdqa xmm10,xmm9
+ xor edi,edx
+ paddd xmm9,xmm7
+ xor edx,ebp
+ pxor xmm0,xmm8
+ add ebx,ecx
+ add eax,DWORD[4+rsp]
+ and edi,edx
+ xor edx,ebp
+ ror ecx,7
+ movdqa xmm8,xmm0
+ mov esi,ebx
+ xor edi,edx
+ movdqa XMMWORD[48+rsp],xmm9
+ rol ebx,5
+ add eax,edi
+ xor esi,ecx
+ pslld xmm0,2
+ xor ecx,edx
+ add eax,ebx
+ psrld xmm8,30
+ add ebp,DWORD[8+rsp]
+ and esi,ecx
+ xor ecx,edx
+ por xmm0,xmm8
+ ror ebx,7
+ mov edi,eax
+ xor esi,ecx
+ rol eax,5
+ pshufd xmm9,xmm7,238
+ add ebp,esi
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[12+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ ror eax,7
+ mov esi,ebp
+ xor edi,ebx
+ rol ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ pxor xmm1,xmm5
+ add ecx,DWORD[16+rsp]
+ and esi,eax
+ xor eax,ebx
+ ror ebp,7
+ punpcklqdq xmm9,xmm0
+ mov edi,edx
+ xor esi,eax
+ pxor xmm1,xmm2
+ rol edx,5
+ add ecx,esi
+ movdqa xmm8,xmm10
+ xor edi,ebp
+ paddd xmm10,xmm0
+ xor ebp,eax
+ pxor xmm1,xmm9
+ add ecx,edx
+ add ebx,DWORD[20+rsp]
+ and edi,ebp
+ xor ebp,eax
+ ror edx,7
+ movdqa xmm9,xmm1
+ mov esi,ecx
+ xor edi,ebp
+ movdqa XMMWORD[rsp],xmm10
+ rol ecx,5
+ add ebx,edi
+ xor esi,edx
+ pslld xmm1,2
+ xor edx,ebp
+ add ebx,ecx
+ psrld xmm9,30
+ add eax,DWORD[24+rsp]
+ and esi,edx
+ xor edx,ebp
+ por xmm1,xmm9
+ ror ecx,7
+ mov edi,ebx
+ xor esi,edx
+ rol ebx,5
+ pshufd xmm10,xmm0,238
+ add eax,esi
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[28+rsp]
+ and edi,ecx
+ xor ecx,edx
+ ror ebx,7
+ mov esi,eax
+ xor edi,ecx
+ rol eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ pxor xmm2,xmm6
+ add edx,DWORD[32+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ ror eax,7
+ punpcklqdq xmm10,xmm1
+ mov edi,ebp
+ xor esi,ebx
+ pxor xmm2,xmm3
+ rol ebp,5
+ add edx,esi
+ movdqa xmm9,xmm8
+ xor edi,eax
+ paddd xmm8,xmm1
+ xor eax,ebx
+ pxor xmm2,xmm10
+ add edx,ebp
+ add ecx,DWORD[36+rsp]
+ and edi,eax
+ xor eax,ebx
+ ror ebp,7
+ movdqa xmm10,xmm2
+ mov esi,edx
+ xor edi,eax
+ movdqa XMMWORD[16+rsp],xmm8
+ rol edx,5
+ add ecx,edi
+ xor esi,ebp
+ pslld xmm2,2
+ xor ebp,eax
+ add ecx,edx
+ psrld xmm10,30
+ add ebx,DWORD[40+rsp]
+ and esi,ebp
+ xor ebp,eax
+ por xmm2,xmm10
+ ror edx,7
+ mov edi,ecx
+ xor esi,ebp
+ rol ecx,5
+ pshufd xmm8,xmm1,238
+ add ebx,esi
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[44+rsp]
+ and edi,edx
+ xor edx,ebp
+ ror ecx,7
+ mov esi,ebx
+ xor edi,edx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ add eax,ebx
+ pxor xmm3,xmm7
+ add ebp,DWORD[48+rsp]
+ xor esi,ecx
+ punpcklqdq xmm8,xmm2
+ mov edi,eax
+ rol eax,5
+ pxor xmm3,xmm4
+ add ebp,esi
+ xor edi,ecx
+ movdqa xmm10,xmm9
+ ror ebx,7
+ paddd xmm9,xmm2
+ add ebp,eax
+ pxor xmm3,xmm8
+ add edx,DWORD[52+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ movdqa xmm8,xmm3
+ add edx,edi
+ xor esi,ebx
+ movdqa XMMWORD[32+rsp],xmm9
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[56+rsp]
+ pslld xmm3,2
+ xor esi,eax
+ mov edi,edx
+ psrld xmm8,30
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ por xmm3,xmm8
+ add ecx,edx
+ add ebx,DWORD[60+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ paddd xmm10,xmm3
+ add eax,esi
+ xor edi,edx
+ movdqa XMMWORD[48+rsp],xmm10
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[4+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[8+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[12+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ cmp r9,r10
+ je NEAR $L$done_ssse3
+ movdqa xmm6,XMMWORD[64+r14]
+ movdqa xmm9,XMMWORD[((-64))+r14]
+ movdqu xmm0,XMMWORD[r9]
+ movdqu xmm1,XMMWORD[16+r9]
+ movdqu xmm2,XMMWORD[32+r9]
+ movdqu xmm3,XMMWORD[48+r9]
+DB 102,15,56,0,198
+ add r9,64
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+DB 102,15,56,0,206
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ paddd xmm0,xmm9
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ movdqa XMMWORD[rsp],xmm0
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ psubd xmm0,xmm9
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+DB 102,15,56,0,214
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ paddd xmm1,xmm9
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ movdqa XMMWORD[16+rsp],xmm1
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ psubd xmm1,xmm9
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+DB 102,15,56,0,222
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ paddd xmm2,xmm9
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ movdqa XMMWORD[32+rsp],xmm2
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ psubd xmm2,xmm9
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ ror ecx,7
+ add eax,ebx
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ add edx,DWORD[12+r8]
+ mov DWORD[r8],eax
+ add ebp,DWORD[16+r8]
+ mov DWORD[4+r8],esi
+ mov ebx,esi
+ mov DWORD[8+r8],ecx
+ mov edi,ecx
+ mov DWORD[12+r8],edx
+ xor edi,edx
+ mov DWORD[16+r8],ebp
+ and esi,edi
+ jmp NEAR $L$oop_ssse3
+
+ALIGN 16
+$L$done_ssse3:
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ ror ecx,7
+ add eax,ebx
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ mov DWORD[r8],eax
+ add edx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ add ebp,DWORD[16+r8]
+ mov DWORD[8+r8],ecx
+ mov DWORD[12+r8],edx
+ mov DWORD[16+r8],ebp
+ movaps xmm6,XMMWORD[((-40-96))+r11]
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11]
+
+ mov r13,QWORD[((-32))+r11]
+
+ mov r12,QWORD[((-24))+r11]
+
+ mov rbp,QWORD[((-16))+r11]
+
+ mov rbx,QWORD[((-8))+r11]
+
+ lea rsp,[r11]
+
+$L$epilogue_ssse3:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_block_data_order_ssse3:
+
+ALIGN 16
+sha1_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+_avx_shortcut:
+
+ mov r11,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ lea rsp,[((-160))+rsp]
+ vzeroupper
+ vmovaps XMMWORD[(-40-96)+r11],xmm6
+ vmovaps XMMWORD[(-40-80)+r11],xmm7
+ vmovaps XMMWORD[(-40-64)+r11],xmm8
+ vmovaps XMMWORD[(-40-48)+r11],xmm9
+ vmovaps XMMWORD[(-40-32)+r11],xmm10
+ vmovaps XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_avx:
+ and rsp,-64
+ mov r8,rdi
+ mov r9,rsi
+ mov r10,rdx
+
+ shl r10,6
+ add r10,r9
+ lea r14,[((K_XX_XX+64))]
+
+ mov eax,DWORD[r8]
+ mov ebx,DWORD[4+r8]
+ mov ecx,DWORD[8+r8]
+ mov edx,DWORD[12+r8]
+ mov esi,ebx
+ mov ebp,DWORD[16+r8]
+ mov edi,ecx
+ xor edi,edx
+ and esi,edi
+
+ vmovdqa xmm6,XMMWORD[64+r14]
+ vmovdqa xmm11,XMMWORD[((-64))+r14]
+ vmovdqu xmm0,XMMWORD[r9]
+ vmovdqu xmm1,XMMWORD[16+r9]
+ vmovdqu xmm2,XMMWORD[32+r9]
+ vmovdqu xmm3,XMMWORD[48+r9]
+ vpshufb xmm0,xmm0,xmm6
+ add r9,64
+ vpshufb xmm1,xmm1,xmm6
+ vpshufb xmm2,xmm2,xmm6
+ vpshufb xmm3,xmm3,xmm6
+ vpaddd xmm4,xmm0,xmm11
+ vpaddd xmm5,xmm1,xmm11
+ vpaddd xmm6,xmm2,xmm11
+ vmovdqa XMMWORD[rsp],xmm4
+ vmovdqa XMMWORD[16+rsp],xmm5
+ vmovdqa XMMWORD[32+rsp],xmm6
+ jmp NEAR $L$oop_avx
+ALIGN 16
+$L$oop_avx:
+ shrd ebx,ebx,2
+ xor esi,edx
+ vpalignr xmm4,xmm1,xmm0,8
+ mov edi,eax
+ add ebp,DWORD[rsp]
+ vpaddd xmm9,xmm11,xmm3
+ xor ebx,ecx
+ shld eax,eax,5
+ vpsrldq xmm8,xmm3,4
+ add ebp,esi
+ and edi,ebx
+ vpxor xmm4,xmm4,xmm0
+ xor ebx,ecx
+ add ebp,eax
+ vpxor xmm8,xmm8,xmm2
+ shrd eax,eax,7
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[4+rsp]
+ vpxor xmm4,xmm4,xmm8
+ xor eax,ebx
+ shld ebp,ebp,5
+ vmovdqa XMMWORD[48+rsp],xmm9
+ add edx,edi
+ and esi,eax
+ vpsrld xmm8,xmm4,31
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor esi,ebx
+ vpslldq xmm10,xmm4,12
+ vpaddd xmm4,xmm4,xmm4
+ mov edi,edx
+ add ecx,DWORD[8+rsp]
+ xor ebp,eax
+ shld edx,edx,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm4,xmm4,xmm8
+ add ecx,esi
+ and edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ vpslld xmm10,xmm10,2
+ vpxor xmm4,xmm4,xmm9
+ shrd edx,edx,7
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[12+rsp]
+ vpxor xmm4,xmm4,xmm10
+ xor edx,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ and esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ shrd ecx,ecx,7
+ xor esi,ebp
+ vpalignr xmm5,xmm2,xmm1,8
+ mov edi,ebx
+ add eax,DWORD[16+rsp]
+ vpaddd xmm9,xmm11,xmm4
+ xor ecx,edx
+ shld ebx,ebx,5
+ vpsrldq xmm8,xmm4,4
+ add eax,esi
+ and edi,ecx
+ vpxor xmm5,xmm5,xmm1
+ xor ecx,edx
+ add eax,ebx
+ vpxor xmm8,xmm8,xmm3
+ shrd ebx,ebx,7
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[20+rsp]
+ vpxor xmm5,xmm5,xmm8
+ xor ebx,ecx
+ shld eax,eax,5
+ vmovdqa XMMWORD[rsp],xmm9
+ add ebp,edi
+ and esi,ebx
+ vpsrld xmm8,xmm5,31
+ xor ebx,ecx
+ add ebp,eax
+ shrd eax,eax,7
+ xor esi,ecx
+ vpslldq xmm10,xmm5,12
+ vpaddd xmm5,xmm5,xmm5
+ mov edi,ebp
+ add edx,DWORD[24+rsp]
+ xor eax,ebx
+ shld ebp,ebp,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm5,xmm5,xmm8
+ add edx,esi
+ and edi,eax
+ xor eax,ebx
+ add edx,ebp
+ vpslld xmm10,xmm10,2
+ vpxor xmm5,xmm5,xmm9
+ shrd ebp,ebp,7
+ xor edi,ebx
+ mov esi,edx
+ add ecx,DWORD[28+rsp]
+ vpxor xmm5,xmm5,xmm10
+ xor ebp,eax
+ shld edx,edx,5
+ vmovdqa xmm11,XMMWORD[((-32))+r14]
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ shrd edx,edx,7
+ xor esi,eax
+ vpalignr xmm6,xmm3,xmm2,8
+ mov edi,ecx
+ add ebx,DWORD[32+rsp]
+ vpaddd xmm9,xmm11,xmm5
+ xor edx,ebp
+ shld ecx,ecx,5
+ vpsrldq xmm8,xmm5,4
+ add ebx,esi
+ and edi,edx
+ vpxor xmm6,xmm6,xmm2
+ xor edx,ebp
+ add ebx,ecx
+ vpxor xmm8,xmm8,xmm4
+ shrd ecx,ecx,7
+ xor edi,ebp
+ mov esi,ebx
+ add eax,DWORD[36+rsp]
+ vpxor xmm6,xmm6,xmm8
+ xor ecx,edx
+ shld ebx,ebx,5
+ vmovdqa XMMWORD[16+rsp],xmm9
+ add eax,edi
+ and esi,ecx
+ vpsrld xmm8,xmm6,31
+ xor ecx,edx
+ add eax,ebx
+ shrd ebx,ebx,7
+ xor esi,edx
+ vpslldq xmm10,xmm6,12
+ vpaddd xmm6,xmm6,xmm6
+ mov edi,eax
+ add ebp,DWORD[40+rsp]
+ xor ebx,ecx
+ shld eax,eax,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm6,xmm6,xmm8
+ add ebp,esi
+ and edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpslld xmm10,xmm10,2
+ vpxor xmm6,xmm6,xmm9
+ shrd eax,eax,7
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[44+rsp]
+ vpxor xmm6,xmm6,xmm10
+ xor eax,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ and esi,eax
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor esi,ebx
+ vpalignr xmm7,xmm4,xmm3,8
+ mov edi,edx
+ add ecx,DWORD[48+rsp]
+ vpaddd xmm9,xmm11,xmm6
+ xor ebp,eax
+ shld edx,edx,5
+ vpsrldq xmm8,xmm6,4
+ add ecx,esi
+ and edi,ebp
+ vpxor xmm7,xmm7,xmm3
+ xor ebp,eax
+ add ecx,edx
+ vpxor xmm8,xmm8,xmm5
+ shrd edx,edx,7
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[52+rsp]
+ vpxor xmm7,xmm7,xmm8
+ xor edx,ebp
+ shld ecx,ecx,5
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add ebx,edi
+ and esi,edx
+ vpsrld xmm8,xmm7,31
+ xor edx,ebp
+ add ebx,ecx
+ shrd ecx,ecx,7
+ xor esi,ebp
+ vpslldq xmm10,xmm7,12
+ vpaddd xmm7,xmm7,xmm7
+ mov edi,ebx
+ add eax,DWORD[56+rsp]
+ xor ecx,edx
+ shld ebx,ebx,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm7,xmm7,xmm8
+ add eax,esi
+ and edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ vpslld xmm10,xmm10,2
+ vpxor xmm7,xmm7,xmm9
+ shrd ebx,ebx,7
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[60+rsp]
+ vpxor xmm7,xmm7,xmm10
+ xor ebx,ecx
+ shld eax,eax,5
+ add ebp,edi
+ and esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpalignr xmm8,xmm7,xmm6,8
+ vpxor xmm0,xmm0,xmm4
+ shrd eax,eax,7
+ xor esi,ecx
+ mov edi,ebp
+ add edx,DWORD[rsp]
+ vpxor xmm0,xmm0,xmm1
+ xor eax,ebx
+ shld ebp,ebp,5
+ vpaddd xmm9,xmm11,xmm7
+ add edx,esi
+ and edi,eax
+ vpxor xmm0,xmm0,xmm8
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor edi,ebx
+ vpsrld xmm8,xmm0,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ mov esi,edx
+ add ecx,DWORD[4+rsp]
+ xor ebp,eax
+ shld edx,edx,5
+ vpslld xmm0,xmm0,2
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ shrd edx,edx,7
+ xor esi,eax
+ mov edi,ecx
+ add ebx,DWORD[8+rsp]
+ vpor xmm0,xmm0,xmm8
+ xor edx,ebp
+ shld ecx,ecx,5
+ add ebx,esi
+ and edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[12+rsp]
+ xor edi,ebp
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpalignr xmm8,xmm0,xmm7,8
+ vpxor xmm1,xmm1,xmm5
+ add ebp,DWORD[16+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ vpxor xmm1,xmm1,xmm2
+ add ebp,esi
+ xor edi,ecx
+ vpaddd xmm9,xmm11,xmm0
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpxor xmm1,xmm1,xmm8
+ add edx,DWORD[20+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ vpsrld xmm8,xmm1,30
+ vmovdqa XMMWORD[rsp],xmm9
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpslld xmm1,xmm1,2
+ add ecx,DWORD[24+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpor xmm1,xmm1,xmm8
+ add ebx,DWORD[28+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpalignr xmm8,xmm1,xmm0,8
+ vpxor xmm2,xmm2,xmm6
+ add eax,DWORD[32+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ vpxor xmm2,xmm2,xmm3
+ add eax,esi
+ xor edi,edx
+ vpaddd xmm9,xmm11,xmm1
+ vmovdqa xmm11,XMMWORD[r14]
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpxor xmm2,xmm2,xmm8
+ add ebp,DWORD[36+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ vpsrld xmm8,xmm2,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpslld xmm2,xmm2,2
+ add edx,DWORD[40+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpor xmm2,xmm2,xmm8
+ add ecx,DWORD[44+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpalignr xmm8,xmm2,xmm1,8
+ vpxor xmm3,xmm3,xmm7
+ add ebx,DWORD[48+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ vpxor xmm3,xmm3,xmm4
+ add ebx,esi
+ xor edi,ebp
+ vpaddd xmm9,xmm11,xmm2
+ shrd edx,edx,7
+ add ebx,ecx
+ vpxor xmm3,xmm3,xmm8
+ add eax,DWORD[52+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ vpsrld xmm8,xmm3,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpslld xmm3,xmm3,2
+ add ebp,DWORD[56+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpor xmm3,xmm3,xmm8
+ add edx,DWORD[60+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpalignr xmm8,xmm3,xmm2,8
+ vpxor xmm4,xmm4,xmm0
+ add ecx,DWORD[rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ vpxor xmm4,xmm4,xmm5
+ add ecx,esi
+ xor edi,eax
+ vpaddd xmm9,xmm11,xmm3
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpxor xmm4,xmm4,xmm8
+ add ebx,DWORD[4+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ vpsrld xmm8,xmm4,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpslld xmm4,xmm4,2
+ add eax,DWORD[8+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpor xmm4,xmm4,xmm8
+ add ebp,DWORD[12+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpalignr xmm8,xmm4,xmm3,8
+ vpxor xmm5,xmm5,xmm1
+ add edx,DWORD[16+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ vpxor xmm5,xmm5,xmm6
+ add edx,esi
+ xor edi,ebx
+ vpaddd xmm9,xmm11,xmm4
+ shrd eax,eax,7
+ add edx,ebp
+ vpxor xmm5,xmm5,xmm8
+ add ecx,DWORD[20+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ vpsrld xmm8,xmm5,30
+ vmovdqa XMMWORD[rsp],xmm9
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpslld xmm5,xmm5,2
+ add ebx,DWORD[24+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpor xmm5,xmm5,xmm8
+ add eax,DWORD[28+rsp]
+ shrd ecx,ecx,7
+ mov esi,ebx
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ vpalignr xmm8,xmm5,xmm4,8
+ vpxor xmm6,xmm6,xmm2
+ add ebp,DWORD[32+rsp]
+ and esi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ vpxor xmm6,xmm6,xmm7
+ mov edi,eax
+ xor esi,ecx
+ vpaddd xmm9,xmm11,xmm5
+ shld eax,eax,5
+ add ebp,esi
+ vpxor xmm6,xmm6,xmm8
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[36+rsp]
+ vpsrld xmm8,xmm6,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ and edi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov esi,ebp
+ vpslld xmm6,xmm6,2
+ xor edi,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[40+rsp]
+ and esi,eax
+ vpor xmm6,xmm6,xmm8
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov edi,edx
+ xor esi,eax
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[44+rsp]
+ and edi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ mov esi,ecx
+ xor edi,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ vpalignr xmm8,xmm6,xmm5,8
+ vpxor xmm7,xmm7,xmm3
+ add eax,DWORD[48+rsp]
+ and esi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ vpxor xmm7,xmm7,xmm0
+ mov edi,ebx
+ xor esi,edx
+ vpaddd xmm9,xmm11,xmm6
+ vmovdqa xmm11,XMMWORD[32+r14]
+ shld ebx,ebx,5
+ add eax,esi
+ vpxor xmm7,xmm7,xmm8
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[52+rsp]
+ vpsrld xmm8,xmm7,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ and edi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov esi,eax
+ vpslld xmm7,xmm7,2
+ xor edi,ecx
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[56+rsp]
+ and esi,ebx
+ vpor xmm7,xmm7,xmm8
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov edi,ebp
+ xor esi,ebx
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[60+rsp]
+ and edi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov esi,edx
+ xor edi,eax
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ vpalignr xmm8,xmm7,xmm6,8
+ vpxor xmm0,xmm0,xmm4
+ add ebx,DWORD[rsp]
+ and esi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ vpxor xmm0,xmm0,xmm1
+ mov edi,ecx
+ xor esi,ebp
+ vpaddd xmm9,xmm11,xmm7
+ shld ecx,ecx,5
+ add ebx,esi
+ vpxor xmm0,xmm0,xmm8
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[4+rsp]
+ vpsrld xmm8,xmm0,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ and edi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov esi,ebx
+ vpslld xmm0,xmm0,2
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[8+rsp]
+ and esi,ecx
+ vpor xmm0,xmm0,xmm8
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov edi,eax
+ xor esi,ecx
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[12+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov esi,ebp
+ xor edi,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ vpalignr xmm8,xmm0,xmm7,8
+ vpxor xmm1,xmm1,xmm5
+ add ecx,DWORD[16+rsp]
+ and esi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ vpxor xmm1,xmm1,xmm2
+ mov edi,edx
+ xor esi,eax
+ vpaddd xmm9,xmm11,xmm0
+ shld edx,edx,5
+ add ecx,esi
+ vpxor xmm1,xmm1,xmm8
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[20+rsp]
+ vpsrld xmm8,xmm1,30
+ vmovdqa XMMWORD[rsp],xmm9
+ and edi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ mov esi,ecx
+ vpslld xmm1,xmm1,2
+ xor edi,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[24+rsp]
+ and esi,edx
+ vpor xmm1,xmm1,xmm8
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov edi,ebx
+ xor esi,edx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[28+rsp]
+ and edi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov esi,eax
+ xor edi,ecx
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpalignr xmm8,xmm1,xmm0,8
+ vpxor xmm2,xmm2,xmm6
+ add edx,DWORD[32+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ vpxor xmm2,xmm2,xmm3
+ mov edi,ebp
+ xor esi,ebx
+ vpaddd xmm9,xmm11,xmm1
+ shld ebp,ebp,5
+ add edx,esi
+ vpxor xmm2,xmm2,xmm8
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[36+rsp]
+ vpsrld xmm8,xmm2,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ and edi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov esi,edx
+ vpslld xmm2,xmm2,2
+ xor edi,eax
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[40+rsp]
+ and esi,ebp
+ vpor xmm2,xmm2,xmm8
+ xor ebp,eax
+ shrd edx,edx,7
+ mov edi,ecx
+ xor esi,ebp
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[44+rsp]
+ and edi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov esi,ebx
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ add eax,ebx
+ vpalignr xmm8,xmm2,xmm1,8
+ vpxor xmm3,xmm3,xmm7
+ add ebp,DWORD[48+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ vpxor xmm3,xmm3,xmm4
+ add ebp,esi
+ xor edi,ecx
+ vpaddd xmm9,xmm11,xmm2
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpxor xmm3,xmm3,xmm8
+ add edx,DWORD[52+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ vpsrld xmm8,xmm3,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpslld xmm3,xmm3,2
+ add ecx,DWORD[56+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpor xmm3,xmm3,xmm8
+ add ebx,DWORD[60+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[rsp]
+ vpaddd xmm9,xmm11,xmm3
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ vmovdqa XMMWORD[48+rsp],xmm9
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[4+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[8+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[12+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ cmp r9,r10
+ je NEAR $L$done_avx
+ vmovdqa xmm6,XMMWORD[64+r14]
+ vmovdqa xmm11,XMMWORD[((-64))+r14]
+ vmovdqu xmm0,XMMWORD[r9]
+ vmovdqu xmm1,XMMWORD[16+r9]
+ vmovdqu xmm2,XMMWORD[32+r9]
+ vmovdqu xmm3,XMMWORD[48+r9]
+ vpshufb xmm0,xmm0,xmm6
+ add r9,64
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ vpshufb xmm1,xmm1,xmm6
+ mov edi,ecx
+ shld ecx,ecx,5
+ vpaddd xmm4,xmm0,xmm11
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vmovdqa XMMWORD[rsp],xmm4
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ vpshufb xmm2,xmm2,xmm6
+ mov edi,edx
+ shld edx,edx,5
+ vpaddd xmm5,xmm1,xmm11
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vmovdqa XMMWORD[16+rsp],xmm5
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ vpshufb xmm3,xmm3,xmm6
+ mov edi,ebp
+ shld ebp,ebp,5
+ vpaddd xmm6,xmm2,xmm11
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vmovdqa XMMWORD[32+rsp],xmm6
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ shrd ecx,ecx,7
+ add eax,ebx
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ add edx,DWORD[12+r8]
+ mov DWORD[r8],eax
+ add ebp,DWORD[16+r8]
+ mov DWORD[4+r8],esi
+ mov ebx,esi
+ mov DWORD[8+r8],ecx
+ mov edi,ecx
+ mov DWORD[12+r8],edx
+ xor edi,edx
+ mov DWORD[16+r8],ebp
+ and esi,edi
+ jmp NEAR $L$oop_avx
+
+ALIGN 16
+$L$done_avx:
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ shrd ecx,ecx,7
+ add eax,ebx
+ vzeroupper
+
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ mov DWORD[r8],eax
+ add edx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ add ebp,DWORD[16+r8]
+ mov DWORD[8+r8],ecx
+ mov DWORD[12+r8],edx
+ mov DWORD[16+r8],ebp
+ movaps xmm6,XMMWORD[((-40-96))+r11]
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11]
+
+ mov r13,QWORD[((-32))+r11]
+
+ mov r12,QWORD[((-24))+r11]
+
+ mov rbp,QWORD[((-16))+r11]
+
+ mov rbx,QWORD[((-8))+r11]
+
+ lea rsp,[r11]
+
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_block_data_order_avx:
+
+ALIGN 16
+sha1_block_data_order_avx2:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_avx2:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+_avx2_shortcut:
+
+ mov r11,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ vzeroupper
+ lea rsp,[((-96))+rsp]
+ vmovaps XMMWORD[(-40-96)+r11],xmm6
+ vmovaps XMMWORD[(-40-80)+r11],xmm7
+ vmovaps XMMWORD[(-40-64)+r11],xmm8
+ vmovaps XMMWORD[(-40-48)+r11],xmm9
+ vmovaps XMMWORD[(-40-32)+r11],xmm10
+ vmovaps XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_avx2:
+ mov r8,rdi
+ mov r9,rsi
+ mov r10,rdx
+
+ lea rsp,[((-640))+rsp]
+ shl r10,6
+ lea r13,[64+r9]
+ and rsp,-128
+ add r10,r9
+ lea r14,[((K_XX_XX+64))]
+
+ mov eax,DWORD[r8]
+ cmp r13,r10
+ cmovae r13,r9
+ mov ebp,DWORD[4+r8]
+ mov ecx,DWORD[8+r8]
+ mov edx,DWORD[12+r8]
+ mov esi,DWORD[16+r8]
+ vmovdqu ymm6,YMMWORD[64+r14]
+
+ vmovdqu xmm0,XMMWORD[r9]
+ vmovdqu xmm1,XMMWORD[16+r9]
+ vmovdqu xmm2,XMMWORD[32+r9]
+ vmovdqu xmm3,XMMWORD[48+r9]
+ lea r9,[64+r9]
+ vinserti128 ymm0,ymm0,XMMWORD[r13],1
+ vinserti128 ymm1,ymm1,XMMWORD[16+r13],1
+ vpshufb ymm0,ymm0,ymm6
+ vinserti128 ymm2,ymm2,XMMWORD[32+r13],1
+ vpshufb ymm1,ymm1,ymm6
+ vinserti128 ymm3,ymm3,XMMWORD[48+r13],1
+ vpshufb ymm2,ymm2,ymm6
+ vmovdqu ymm11,YMMWORD[((-64))+r14]
+ vpshufb ymm3,ymm3,ymm6
+
+ vpaddd ymm4,ymm0,ymm11
+ vpaddd ymm5,ymm1,ymm11
+ vmovdqu YMMWORD[rsp],ymm4
+ vpaddd ymm6,ymm2,ymm11
+ vmovdqu YMMWORD[32+rsp],ymm5
+ vpaddd ymm7,ymm3,ymm11
+ vmovdqu YMMWORD[64+rsp],ymm6
+ vmovdqu YMMWORD[96+rsp],ymm7
+ vpalignr ymm4,ymm1,ymm0,8
+ vpsrldq ymm8,ymm3,4
+ vpxor ymm4,ymm4,ymm0
+ vpxor ymm8,ymm8,ymm2
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm8,ymm4,31
+ vpslldq ymm10,ymm4,12
+ vpaddd ymm4,ymm4,ymm4
+ vpsrld ymm9,ymm10,30
+ vpor ymm4,ymm4,ymm8
+ vpslld ymm10,ymm10,2
+ vpxor ymm4,ymm4,ymm9
+ vpxor ymm4,ymm4,ymm10
+ vpaddd ymm9,ymm4,ymm11
+ vmovdqu YMMWORD[128+rsp],ymm9
+ vpalignr ymm5,ymm2,ymm1,8
+ vpsrldq ymm8,ymm4,4
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm8,ymm8,ymm3
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm5,31
+ vmovdqu ymm11,YMMWORD[((-32))+r14]
+ vpslldq ymm10,ymm5,12
+ vpaddd ymm5,ymm5,ymm5
+ vpsrld ymm9,ymm10,30
+ vpor ymm5,ymm5,ymm8
+ vpslld ymm10,ymm10,2
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm5,ymm5,ymm10
+ vpaddd ymm9,ymm5,ymm11
+ vmovdqu YMMWORD[160+rsp],ymm9
+ vpalignr ymm6,ymm3,ymm2,8
+ vpsrldq ymm8,ymm5,4
+ vpxor ymm6,ymm6,ymm2
+ vpxor ymm8,ymm8,ymm4
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm6,31
+ vpslldq ymm10,ymm6,12
+ vpaddd ymm6,ymm6,ymm6
+ vpsrld ymm9,ymm10,30
+ vpor ymm6,ymm6,ymm8
+ vpslld ymm10,ymm10,2
+ vpxor ymm6,ymm6,ymm9
+ vpxor ymm6,ymm6,ymm10
+ vpaddd ymm9,ymm6,ymm11
+ vmovdqu YMMWORD[192+rsp],ymm9
+ vpalignr ymm7,ymm4,ymm3,8
+ vpsrldq ymm8,ymm6,4
+ vpxor ymm7,ymm7,ymm3
+ vpxor ymm8,ymm8,ymm5
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm7,31
+ vpslldq ymm10,ymm7,12
+ vpaddd ymm7,ymm7,ymm7
+ vpsrld ymm9,ymm10,30
+ vpor ymm7,ymm7,ymm8
+ vpslld ymm10,ymm10,2
+ vpxor ymm7,ymm7,ymm9
+ vpxor ymm7,ymm7,ymm10
+ vpaddd ymm9,ymm7,ymm11
+ vmovdqu YMMWORD[224+rsp],ymm9
+ lea r13,[128+rsp]
+ jmp NEAR $L$oop_avx2
+ALIGN 32
+$L$oop_avx2:
+ rorx ebx,ebp,2
+ andn edi,ebp,edx
+ and ebp,ecx
+ xor ebp,edi
+ jmp NEAR $L$align32_1
+ALIGN 32
+$L$align32_1:
+ vpalignr ymm8,ymm7,ymm6,8
+ vpxor ymm0,ymm0,ymm4
+ add esi,DWORD[((-128))+r13]
+ andn edi,eax,ecx
+ vpxor ymm0,ymm0,ymm1
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ vpxor ymm0,ymm0,ymm8
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ vpsrld ymm8,ymm0,30
+ vpslld ymm0,ymm0,2
+ add edx,DWORD[((-124))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ vpor ymm0,ymm0,ymm8
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-120))+r13]
+ andn edi,edx,ebp
+ vpaddd ymm9,ymm0,ymm11
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ vmovdqu YMMWORD[256+rsp],ymm9
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-116))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ add ebp,DWORD[((-96))+r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ vpalignr ymm8,ymm0,ymm7,8
+ vpxor ymm1,ymm1,ymm5
+ add eax,DWORD[((-92))+r13]
+ andn edi,ebp,edx
+ vpxor ymm1,ymm1,ymm2
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ vpxor ymm1,ymm1,ymm8
+ and ebp,ecx
+ add eax,r12d
+ xor ebp,edi
+ vpsrld ymm8,ymm1,30
+ vpslld ymm1,ymm1,2
+ add esi,DWORD[((-88))+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ vpor ymm1,ymm1,ymm8
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-84))+r13]
+ andn edi,esi,ebx
+ vpaddd ymm9,ymm1,ymm11
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ vmovdqu YMMWORD[288+rsp],ymm9
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-64))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-60))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ vpalignr ymm8,ymm1,ymm0,8
+ vpxor ymm2,ymm2,ymm6
+ add ebp,DWORD[((-56))+r13]
+ andn edi,ebx,esi
+ vpxor ymm2,ymm2,ymm3
+ vmovdqu ymm11,YMMWORD[r14]
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ vpxor ymm2,ymm2,ymm8
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ vpsrld ymm8,ymm2,30
+ vpslld ymm2,ymm2,2
+ add eax,DWORD[((-52))+r13]
+ andn edi,ebp,edx
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ vpor ymm2,ymm2,ymm8
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[((-32))+r13]
+ andn edi,eax,ecx
+ vpaddd ymm9,ymm2,ymm11
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ vmovdqu YMMWORD[320+rsp],ymm9
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-28))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-24))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ vpalignr ymm8,ymm2,ymm1,8
+ vpxor ymm3,ymm3,ymm7
+ add ebx,DWORD[((-20))+r13]
+ andn edi,ecx,eax
+ vpxor ymm3,ymm3,ymm4
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ vpxor ymm3,ymm3,ymm8
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ vpsrld ymm8,ymm3,30
+ vpslld ymm3,ymm3,2
+ add ebp,DWORD[r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ vpor ymm3,ymm3,ymm8
+ add ebp,r12d
+ xor ebx,edi
+ add eax,DWORD[4+r13]
+ andn edi,ebp,edx
+ vpaddd ymm9,ymm3,ymm11
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ vmovdqu YMMWORD[352+rsp],ymm9
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[8+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[12+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ vpalignr ymm8,ymm3,ymm2,8
+ vpxor ymm4,ymm4,ymm0
+ add ecx,DWORD[32+r13]
+ lea ecx,[rsi*1+rcx]
+ vpxor ymm4,ymm4,ymm5
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ vpxor ymm4,ymm4,ymm8
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[36+r13]
+ vpsrld ymm8,ymm4,30
+ vpslld ymm4,ymm4,2
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ vpor ymm4,ymm4,ymm8
+ add ebp,DWORD[40+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ vpaddd ymm9,ymm4,ymm11
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[44+r13]
+ vmovdqu YMMWORD[384+rsp],ymm9
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[64+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ vpalignr ymm8,ymm4,ymm3,8
+ vpxor ymm5,ymm5,ymm1
+ add edx,DWORD[68+r13]
+ lea edx,[rax*1+rdx]
+ vpxor ymm5,ymm5,ymm6
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ vpxor ymm5,ymm5,ymm8
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[72+r13]
+ vpsrld ymm8,ymm5,30
+ vpslld ymm5,ymm5,2
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ vpor ymm5,ymm5,ymm8
+ add ebx,DWORD[76+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ vpaddd ymm9,ymm5,ymm11
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[96+r13]
+ vmovdqu YMMWORD[416+rsp],ymm9
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[100+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ vpalignr ymm8,ymm5,ymm4,8
+ vpxor ymm6,ymm6,ymm2
+ add esi,DWORD[104+r13]
+ lea esi,[rbp*1+rsi]
+ vpxor ymm6,ymm6,ymm7
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ vpxor ymm6,ymm6,ymm8
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[108+r13]
+ lea r13,[256+r13]
+ vpsrld ymm8,ymm6,30
+ vpslld ymm6,ymm6,2
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ vpor ymm6,ymm6,ymm8
+ add ecx,DWORD[((-128))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ vpaddd ymm9,ymm6,ymm11
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-124))+r13]
+ vmovdqu YMMWORD[448+rsp],ymm9
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-120))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ vpalignr ymm8,ymm6,ymm5,8
+ vpxor ymm7,ymm7,ymm3
+ add eax,DWORD[((-116))+r13]
+ lea eax,[rbx*1+rax]
+ vpxor ymm7,ymm7,ymm0
+ vmovdqu ymm11,YMMWORD[32+r14]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ vpxor ymm7,ymm7,ymm8
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-96))+r13]
+ vpsrld ymm8,ymm7,30
+ vpslld ymm7,ymm7,2
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ vpor ymm7,ymm7,ymm8
+ add edx,DWORD[((-92))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ vpaddd ymm9,ymm7,ymm11
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-88))+r13]
+ vmovdqu YMMWORD[480+rsp],ymm9
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-84))+r13]
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ jmp NEAR $L$align32_2
+ALIGN 32
+$L$align32_2:
+ vpalignr ymm8,ymm7,ymm6,8
+ vpxor ymm0,ymm0,ymm4
+ add ebp,DWORD[((-64))+r13]
+ xor ecx,esi
+ vpxor ymm0,ymm0,ymm1
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ vpxor ymm0,ymm0,ymm8
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ vpsrld ymm8,ymm0,30
+ vpslld ymm0,ymm0,2
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[((-60))+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ vpor ymm0,ymm0,ymm8
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ vpaddd ymm9,ymm0,ymm11
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[((-56))+r13]
+ xor ebp,ecx
+ vmovdqu YMMWORD[512+rsp],ymm9
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ add edx,DWORD[((-52))+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ and esi,edi
+ add ecx,DWORD[((-32))+r13]
+ xor esi,ebp
+ mov edi,eax
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ and edx,edi
+ vpalignr ymm8,ymm0,ymm7,8
+ vpxor ymm1,ymm1,ymm5
+ add ebx,DWORD[((-28))+r13]
+ xor edx,eax
+ vpxor ymm1,ymm1,ymm2
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ vpxor ymm1,ymm1,ymm8
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ vpsrld ymm8,ymm1,30
+ vpslld ymm1,ymm1,2
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[((-24))+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ vpor ymm1,ymm1,ymm8
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ vpaddd ymm9,ymm1,ymm11
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[((-20))+r13]
+ xor ebx,edx
+ vmovdqu YMMWORD[544+rsp],ymm9
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ add edx,DWORD[4+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ and esi,edi
+ vpalignr ymm8,ymm1,ymm0,8
+ vpxor ymm2,ymm2,ymm6
+ add ecx,DWORD[8+r13]
+ xor esi,ebp
+ vpxor ymm2,ymm2,ymm3
+ mov edi,eax
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ vpxor ymm2,ymm2,ymm8
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ vpsrld ymm8,ymm2,30
+ vpslld ymm2,ymm2,2
+ add ecx,r12d
+ and edx,edi
+ add ebx,DWORD[12+r13]
+ xor edx,eax
+ mov edi,esi
+ xor edi,eax
+ vpor ymm2,ymm2,ymm8
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ vpaddd ymm9,ymm2,ymm11
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[32+r13]
+ xor ecx,esi
+ vmovdqu YMMWORD[576+rsp],ymm9
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[36+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[40+r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ vpalignr ymm8,ymm2,ymm1,8
+ vpxor ymm3,ymm3,ymm7
+ add edx,DWORD[44+r13]
+ xor eax,ebx
+ vpxor ymm3,ymm3,ymm4
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ vpxor ymm3,ymm3,ymm8
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ vpsrld ymm8,ymm3,30
+ vpslld ymm3,ymm3,2
+ add edx,r12d
+ and esi,edi
+ add ecx,DWORD[64+r13]
+ xor esi,ebp
+ mov edi,eax
+ xor edi,ebp
+ vpor ymm3,ymm3,ymm8
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ vpaddd ymm9,ymm3,ymm11
+ add ecx,r12d
+ and edx,edi
+ add ebx,DWORD[68+r13]
+ xor edx,eax
+ vmovdqu YMMWORD[608+rsp],ymm9
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[72+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[76+r13]
+ xor ebx,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[96+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[100+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[104+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[108+r13]
+ lea r13,[256+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-128))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[((-124))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-120))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-116))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-96))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-92))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-88))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[((-84))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-64))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-60))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-56))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-52))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-32))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[((-28))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-24))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-20))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ add edx,r12d
+ lea r13,[128+r9]
+ lea rdi,[128+r9]
+ cmp r13,r10
+ cmovae r13,r9
+
+
+ add edx,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ebp,DWORD[8+r8]
+ mov DWORD[r8],edx
+ add ebx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ mov eax,edx
+ add ecx,DWORD[16+r8]
+ mov r12d,ebp
+ mov DWORD[8+r8],ebp
+ mov edx,ebx
+
+ mov DWORD[12+r8],ebx
+ mov ebp,esi
+ mov DWORD[16+r8],ecx
+
+ mov esi,ecx
+ mov ecx,r12d
+
+
+ cmp r9,r10
+ je NEAR $L$done_avx2
+ vmovdqu ymm6,YMMWORD[64+r14]
+ cmp rdi,r10
+ ja NEAR $L$ast_avx2
+
+ vmovdqu xmm0,XMMWORD[((-64))+rdi]
+ vmovdqu xmm1,XMMWORD[((-48))+rdi]
+ vmovdqu xmm2,XMMWORD[((-32))+rdi]
+ vmovdqu xmm3,XMMWORD[((-16))+rdi]
+ vinserti128 ymm0,ymm0,XMMWORD[r13],1
+ vinserti128 ymm1,ymm1,XMMWORD[16+r13],1
+ vinserti128 ymm2,ymm2,XMMWORD[32+r13],1
+ vinserti128 ymm3,ymm3,XMMWORD[48+r13],1
+ jmp NEAR $L$ast_avx2
+
+ALIGN 32
+$L$ast_avx2:
+ lea r13,[((128+16))+rsp]
+ rorx ebx,ebp,2
+ andn edi,ebp,edx
+ and ebp,ecx
+ xor ebp,edi
+ sub r9,-128
+ add esi,DWORD[((-128))+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-124))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-120))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-116))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ add ebp,DWORD[((-96))+r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ add eax,DWORD[((-92))+r13]
+ andn edi,ebp,edx
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[((-88))+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-84))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-64))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-60))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ add ebp,DWORD[((-56))+r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ add eax,DWORD[((-52))+r13]
+ andn edi,ebp,edx
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[((-32))+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-28))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-24))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-20))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ add ebp,DWORD[r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ add eax,DWORD[4+r13]
+ andn edi,ebp,edx
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[8+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[12+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[32+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[36+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[40+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[44+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[64+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ vmovdqu ymm11,YMMWORD[((-64))+r14]
+ vpshufb ymm0,ymm0,ymm6
+ add edx,DWORD[68+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[72+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[76+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[96+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[100+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ vpshufb ymm1,ymm1,ymm6
+ vpaddd ymm8,ymm0,ymm11
+ add esi,DWORD[104+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[108+r13]
+ lea r13,[256+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-128))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-124))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-120))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ vmovdqu YMMWORD[rsp],ymm8
+ vpshufb ymm2,ymm2,ymm6
+ vpaddd ymm9,ymm1,ymm11
+ add eax,DWORD[((-116))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-96))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-92))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-88))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-84))+r13]
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ vmovdqu YMMWORD[32+rsp],ymm9
+ vpshufb ymm3,ymm3,ymm6
+ vpaddd ymm6,ymm2,ymm11
+ add ebp,DWORD[((-64))+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[((-60))+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[((-56))+r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ add edx,DWORD[((-52))+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ and esi,edi
+ add ecx,DWORD[((-32))+r13]
+ xor esi,ebp
+ mov edi,eax
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ and edx,edi
+ jmp NEAR $L$align32_3
+ALIGN 32
+$L$align32_3:
+ vmovdqu YMMWORD[64+rsp],ymm6
+ vpaddd ymm7,ymm3,ymm11
+ add ebx,DWORD[((-28))+r13]
+ xor edx,eax
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[((-24))+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[((-20))+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ add edx,DWORD[4+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ and esi,edi
+ vmovdqu YMMWORD[96+rsp],ymm7
+ add ecx,DWORD[8+r13]
+ xor esi,ebp
+ mov edi,eax
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ and edx,edi
+ add ebx,DWORD[12+r13]
+ xor edx,eax
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[32+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[36+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[40+r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ vpalignr ymm4,ymm1,ymm0,8
+ add edx,DWORD[44+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ vpsrldq ymm8,ymm3,4
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ vpxor ymm4,ymm4,ymm0
+ vpxor ymm8,ymm8,ymm2
+ xor esi,ebp
+ add edx,r12d
+ vpxor ymm4,ymm4,ymm8
+ and esi,edi
+ add ecx,DWORD[64+r13]
+ xor esi,ebp
+ mov edi,eax
+ vpsrld ymm8,ymm4,31
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ vpslldq ymm10,ymm4,12
+ vpaddd ymm4,ymm4,ymm4
+ rorx esi,edx,2
+ xor edx,eax
+ vpsrld ymm9,ymm10,30
+ vpor ymm4,ymm4,ymm8
+ add ecx,r12d
+ and edx,edi
+ vpslld ymm10,ymm10,2
+ vpxor ymm4,ymm4,ymm9
+ add ebx,DWORD[68+r13]
+ xor edx,eax
+ vpxor ymm4,ymm4,ymm10
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ vpaddd ymm9,ymm4,ymm11
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ vmovdqu YMMWORD[128+rsp],ymm9
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[72+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[76+r13]
+ xor ebx,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ vpalignr ymm5,ymm2,ymm1,8
+ add esi,DWORD[96+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ vpsrldq ymm8,ymm4,4
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm8,ymm8,ymm3
+ add edx,DWORD[100+r13]
+ lea edx,[rax*1+rdx]
+ vpxor ymm5,ymm5,ymm8
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ vpsrld ymm8,ymm5,31
+ vmovdqu ymm11,YMMWORD[((-32))+r14]
+ xor esi,ebx
+ add ecx,DWORD[104+r13]
+ lea ecx,[rsi*1+rcx]
+ vpslldq ymm10,ymm5,12
+ vpaddd ymm5,ymm5,ymm5
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ vpsrld ymm9,ymm10,30
+ vpor ymm5,ymm5,ymm8
+ xor edx,eax
+ add ecx,r12d
+ vpslld ymm10,ymm10,2
+ vpxor ymm5,ymm5,ymm9
+ xor edx,ebp
+ add ebx,DWORD[108+r13]
+ lea r13,[256+r13]
+ vpxor ymm5,ymm5,ymm10
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ vpaddd ymm9,ymm5,ymm11
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ vmovdqu YMMWORD[160+rsp],ymm9
+ add ebp,DWORD[((-128))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ vpalignr ymm6,ymm3,ymm2,8
+ add eax,DWORD[((-124))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ vpsrldq ymm8,ymm5,4
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ vpxor ymm6,ymm6,ymm2
+ vpxor ymm8,ymm8,ymm4
+ add esi,DWORD[((-120))+r13]
+ lea esi,[rbp*1+rsi]
+ vpxor ymm6,ymm6,ymm8
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ vpsrld ymm8,ymm6,31
+ xor eax,ecx
+ add edx,DWORD[((-116))+r13]
+ lea edx,[rax*1+rdx]
+ vpslldq ymm10,ymm6,12
+ vpaddd ymm6,ymm6,ymm6
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ vpsrld ymm9,ymm10,30
+ vpor ymm6,ymm6,ymm8
+ xor esi,ebp
+ add edx,r12d
+ vpslld ymm10,ymm10,2
+ vpxor ymm6,ymm6,ymm9
+ xor esi,ebx
+ add ecx,DWORD[((-96))+r13]
+ vpxor ymm6,ymm6,ymm10
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ vpaddd ymm9,ymm6,ymm11
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ vmovdqu YMMWORD[192+rsp],ymm9
+ add ebx,DWORD[((-92))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ vpalignr ymm7,ymm4,ymm3,8
+ add ebp,DWORD[((-88))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ vpsrldq ymm8,ymm6,4
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ vpxor ymm7,ymm7,ymm3
+ vpxor ymm8,ymm8,ymm5
+ add eax,DWORD[((-84))+r13]
+ lea eax,[rbx*1+rax]
+ vpxor ymm7,ymm7,ymm8
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ vpsrld ymm8,ymm7,31
+ xor ebp,edx
+ add esi,DWORD[((-64))+r13]
+ lea esi,[rbp*1+rsi]
+ vpslldq ymm10,ymm7,12
+ vpaddd ymm7,ymm7,ymm7
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ vpsrld ymm9,ymm10,30
+ vpor ymm7,ymm7,ymm8
+ xor eax,ebx
+ add esi,r12d
+ vpslld ymm10,ymm10,2
+ vpxor ymm7,ymm7,ymm9
+ xor eax,ecx
+ add edx,DWORD[((-60))+r13]
+ vpxor ymm7,ymm7,ymm10
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ vpaddd ymm9,ymm7,ymm11
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ vmovdqu YMMWORD[224+rsp],ymm9
+ add ecx,DWORD[((-56))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-52))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-32))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[((-28))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-24))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-20))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ add edx,r12d
+ lea r13,[128+rsp]
+
+
+ add edx,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ebp,DWORD[8+r8]
+ mov DWORD[r8],edx
+ add ebx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ mov eax,edx
+ add ecx,DWORD[16+r8]
+ mov r12d,ebp
+ mov DWORD[8+r8],ebp
+ mov edx,ebx
+
+ mov DWORD[12+r8],ebx
+ mov ebp,esi
+ mov DWORD[16+r8],ecx
+
+ mov esi,ecx
+ mov ecx,r12d
+
+
+ cmp r9,r10
+ jbe NEAR $L$oop_avx2
+
+$L$done_avx2:
+ vzeroupper
+ movaps xmm6,XMMWORD[((-40-96))+r11]
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11]
+
+ mov r13,QWORD[((-32))+r11]
+
+ mov r12,QWORD[((-24))+r11]
+
+ mov rbp,QWORD[((-16))+r11]
+
+ mov rbx,QWORD[((-8))+r11]
+
+ lea rsp,[r11]
+
+$L$epilogue_avx2:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha1_block_data_order_avx2:
+ALIGN 64
+K_XX_XX: ; constant pool: four SHA-1 round constants (each replicated over 8 dwords = 32 bytes), then byte-shuffle patterns and an ID string
+ DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 ; SHA-1 K for rounds 0-19 (FIPS 180-4)
+ DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+ DD 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 ; SHA-1 K for rounds 20-39
+ DD 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+ DD 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc ; SHA-1 K for rounds 40-59
+ DD 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+ DD 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 ; SHA-1 K for rounds 60-79
+ DD 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f ; in memory: bytes 3,2,1,0,7,6,5,4,... = reverse bytes within each dword; presumably the (v)pshufb endian-swap mask - confirm at the load site
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f ; same pattern repeated for the upper 128-bit lane
+DB 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 ; byte indices 15..0 = full 16-byte reversal pattern; NOTE(review): consumer not visible here, presumably the SHA-extension path
+DB 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 ; ASCII: "SHA1 block trans"
+DB 102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44 ; ASCII: "form for x86_64,"
+DB 32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60 ; ASCII: " CRYPTOGAMS by <"
+DB 97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114 ; ASCII: "appro@openssl.or"
+DB 103,62,0 ; ASCII: "g>" followed by the NUL terminator
+ALIGN 64
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler: ; Win64 language-specific exception handler registered for sha1_block_data_order; uses r8 as CONTEXT* and r9 as DISPATCHER_CONTEXT*
+ push rsi ; save every non-volatile GPR plus flags; popped in reverse order at the end of $L$common_seh_tail
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ; 32-byte shadow space + four stack argument slots for the RtlVirtualUnwind call in the shared tail
+
+ mov rax,QWORD[120+r8] ; rax = context->Rax (offset 120 in the Win64 CONTEXT layout)
+ mov rbx,QWORD[248+r8] ; rbx = context->Rip, the faulting instruction address
+
+ lea r10,[$L$prologue]
+ cmp rbx,r10 ; Rip < $L$prologue: frame not set up yet ...
+ jb NEAR $L$common_seh_tail ; ... so go to the tail with rax = context->Rax as the frame base
+
+ mov rax,QWORD[152+r8] ; rax = context->Rsp
+
+ lea r10,[$L$epilogue]
+ cmp rbx,r10 ; Rip >= $L$epilogue: frame already torn down ...
+ jae NEAR $L$common_seh_tail ; ... so context->Rsp is used directly as the frame base
+
+ mov rax,QWORD[64+rax] ; inside the body: load the qword at [Rsp+64]; presumably the original rsp stashed by the prologue - confirm against the function body
+
+ mov rbx,QWORD[((-8))+rax] ; reload the five GPR values stored just below that frame base
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov QWORD[144+r8],rbx ; context->Rbx
+ mov QWORD[160+r8],rbp ; context->Rbp
+ mov QWORD[216+r8],r12 ; context->R12
+ mov QWORD[224+r8],r13 ; context->R13
+ mov QWORD[232+r8],r14 ; context->R14
+
+ jmp NEAR $L$common_seh_tail ; shared tail fixes Rsp/Rsi/Rdi and continues the unwind
+
+
+ALIGN 16
+shaext_handler: ; Win64 language-specific exception handler registered for sha1_block_data_order_shaext; uses r8 as CONTEXT* and r9 as DISPATCHER_CONTEXT*
+ push rsi ; same save sequence as the other handlers; undone at the end of $L$common_seh_tail
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ; shadow space + stack argument slots for the RtlVirtualUnwind call in the shared tail
+
+ mov rax,QWORD[120+r8] ; rax = context->Rax, used below as the frame base
+ mov rbx,QWORD[248+r8] ; rbx = context->Rip
+
+ lea r10,[$L$prologue_shaext]
+ cmp rbx,r10 ; Rip before the end of the prologue: nothing to restore
+ jb NEAR $L$common_seh_tail
+
+ lea r10,[$L$epilogue_shaext]
+ cmp rbx,r10 ; Rip at or past the epilogue label: nothing to restore
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[((-8-64))+rax] ; copy source = rax-72; presumably where the shaext prologue spilled xmm6-xmm9 - confirm against that function
+ lea rdi,[512+r8] ; copy destination = &context->Xmm6 (offset 512 in the Win64 CONTEXT layout)
+ mov ecx,8 ; 8 qwords = 64 bytes = four XMM registers
+ DD 0xa548f3fc ; raw opcode bytes FC F3 48 A5 = cld; rep movsq
+
+ jmp NEAR $L$common_seh_tail ; shared tail fixes Rsp/Rsi/Rdi and continues the unwind
+
+
+ALIGN 16
+ssse3_handler: ; Win64 language-specific exception handler shared by the ssse3/avx/avx2 entries in .xdata; label bounds come from the per-function handler data
+ push rsi ; save every non-volatile GPR plus flags; popped in reverse order before the final return
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ; 32-byte shadow space + four stack argument slots for RtlVirtualUnwind
+
+ mov rax,QWORD[120+r8] ; rax = context->Rax (frame base while still in the prologue)
+ mov rbx,QWORD[248+r8] ; rbx = context->Rip
+
+ mov rsi,QWORD[8+r9] ; rsi = disp->ImageBase
+ mov r11,QWORD[56+r9] ; r11 = disp->HandlerData (the two RVAs that follow the handler RVA in .xdata)
+
+ mov r10d,DWORD[r11] ; HandlerData[0] = prologue label RVA
+ lea r10,[r10*1+rsi] ; convert the RVA to an absolute address
+ cmp rbx,r10 ; Rip before the prologue label: frame not established
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[208+r8] ; rax = context->R11; the visible avx2 epilogue also restores everything relative to r11
+
+ mov r10d,DWORD[4+r11] ; HandlerData[1] = epilogue label RVA
+ lea r10,[r10*1+rsi] ; convert the RVA to an absolute address
+ cmp rbx,r10 ; Rip at or past the epilogue label: registers already restored
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[((-40-96))+rax] ; copy source = XMM save area, same offset the epilogue reads xmm6 from
+ lea rdi,[512+r8] ; copy destination = &context->Xmm6
+ mov ecx,12 ; 12 qwords = 96 bytes = xmm6..xmm11
+ DD 0xa548f3fc ; raw opcode bytes FC F3 48 A5 = cld; rep movsq
+
+ mov rbx,QWORD[((-8))+rax] ; reload the saved GPRs from the same slots the epilogue uses
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov QWORD[144+r8],rbx ; context->Rbx
+ mov QWORD[160+r8],rbp ; context->Rbp
+ mov QWORD[216+r8],r12 ; context->R12
+ mov QWORD[224+r8],r13 ; context->R13
+ mov QWORD[232+r8],r14 ; context->R14
+
+$L$common_seh_tail: ; shared by se_handler, shaext_handler and ssse3_handler; expects rax = frame base, r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT*
+ mov rdi,QWORD[8+rax] ; rdi/rsi values that the WIN64 prologue stored at [8+rsp] and [16+rsp]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax ; context->Rsp = frame base
+ mov QWORD[168+r8],rsi ; context->Rsi
+ mov QWORD[176+r8],rdi ; context->Rdi
+
+ mov rdi,QWORD[40+r9] ; copy destination = disp->ContextRecord
+ mov rsi,r8 ; copy source = the CONTEXT patched above
+ mov ecx,154 ; 154 qwords = 1232 bytes = sizeof(CONTEXT)
+ DD 0xa548f3fc ; raw opcode bytes FC F3 48 A5 = cld; rep movsq
+
+ mov rsi,r9 ; keep the DISPATCHER_CONTEXT pointer while r8/r9 are reloaded as call arguments
+ xor rcx,rcx ; arg1 HandlerType = 0 (UNW_FLAG_NHANDLER)
+ mov rdx,QWORD[8+rsi] ; arg2 = disp->ImageBase
+ mov r8,QWORD[rsi] ; arg3 = disp->ControlPc
+ mov r9,QWORD[16+rsi] ; arg4 = disp->FunctionEntry
+ mov r10,QWORD[40+rsi] ; disp->ContextRecord
+ lea r11,[56+rsi] ; &disp->HandlerData
+ lea r12,[24+rsi] ; &disp->EstablisherFrame
+ mov QWORD[32+rsp],r10 ; arg5 = ContextRecord (first slot above the 32-byte shadow space)
+ mov QWORD[40+rsp],r11 ; arg6 = &HandlerData
+ mov QWORD[48+rsp],r12 ; arg7 = &EstablisherFrame
+ mov QWORD[56+rsp],rcx ; arg8 = NULL (no ContextPointers)
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1 ; return 1 = ExceptionContinueSearch
+ add rsp,64 ; drop the outgoing-argument area
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ DB 0F3h,0C3h ;repret
+
+
+section .pdata rdata align=4 ; function table: one three-dword record (begin RVA, end RVA, unwind-info RVA) per SHA-1 entry point
+ALIGN 4
+ DD $L$SEH_begin_sha1_block_data_order wrt ..imagebase ; record 1: generic sha1_block_data_order
+ DD $L$SEH_end_sha1_block_data_order wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_shaext wrt ..imagebase ; record 2: shaext variant
+ DD $L$SEH_end_sha1_block_data_order_shaext wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_shaext wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase ; record 3: ssse3 variant
+ DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase ; record 4: avx variant
+ DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_avx2 wrt ..imagebase ; record 5: avx2 variant
+ DD $L$SEH_end_sha1_block_data_order_avx2 wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_avx2 wrt ..imagebase
+section .xdata rdata align=8 ; unwind-info records referenced by the third dword of each .pdata record
+ALIGN 8
+$L$SEH_info_sha1_block_data_order:
+DB 9,0,0,0 ; header byte 9 = version 1 | (UNW_FLAG_EHANDLER << 3); prologue size, unwind-code count and frame register all zero
+ DD se_handler wrt ..imagebase ; RVA of the language-specific handler
+$L$SEH_info_sha1_block_data_order_shaext:
+DB 9,0,0,0 ; same header: handler present, no unwind codes
+ DD shaext_handler wrt ..imagebase ; RVA of the language-specific handler
+$L$SEH_info_sha1_block_data_order_ssse3:
+DB 9,0,0,0 ; same header: handler present, no unwind codes
+ DD ssse3_handler wrt ..imagebase ; RVA of the language-specific handler
+ DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase ; handler data: [0] prologue label RVA, [4] epilogue label RVA, read by ssse3_handler through disp->HandlerData
+$L$SEH_info_sha1_block_data_order_avx:
+DB 9,0,0,0 ; same header: handler present, no unwind codes
+ DD ssse3_handler wrt ..imagebase ; avx variant reuses ssse3_handler
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase ; handler data: prologue/epilogue label RVAs for the avx variant
+$L$SEH_info_sha1_block_data_order_avx2:
+DB 9,0,0,0 ; same header: handler present, no unwind codes
+ DD ssse3_handler wrt ..imagebase ; avx2 variant reuses ssse3_handler
+ DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase ; handler data: prologue/epilogue label RVAs for the avx2 variant
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm
new file mode 100644
index 0000000..58c00d6
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm
@@ -0,0 +1,8291 @@
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section .text code align=64
+
+
+EXTERN OPENSSL_ia32cap_P
+
+global sha256_multi_block
+
+ALIGN 32
+sha256_multi_block:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_multi_block:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+ mov rcx,QWORD[((OPENSSL_ia32cap_P+4))]
+ bt rcx,61
+ jc NEAR _shaext_shortcut
+ test ecx,268435456
+ jnz NEAR _avx_shortcut
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ lea rsp,[((-168))+rsp]
+ movaps XMMWORD[rsp],xmm6
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[(-120)+rax],xmm10
+ movaps XMMWORD[(-104)+rax],xmm11
+ movaps XMMWORD[(-88)+rax],xmm12
+ movaps XMMWORD[(-72)+rax],xmm13
+ movaps XMMWORD[(-56)+rax],xmm14
+ movaps XMMWORD[(-40)+rax],xmm15
+ sub rsp,288
+ and rsp,-256
+ mov QWORD[272+rsp],rax
+
+$L$body:
+ lea rbp,[((K256+128))]
+ lea rbx,[256+rsp]
+ lea rdi,[128+rdi]
+
+$L$oop_grande:
+ mov DWORD[280+rsp],edx
+ xor edx,edx
+
+ mov r8,QWORD[rsi]
+
+ mov ecx,DWORD[8+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[rbx],ecx
+ cmovle r8,rbp
+
+ mov r9,QWORD[16+rsi]
+
+ mov ecx,DWORD[24+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[4+rbx],ecx
+ cmovle r9,rbp
+
+ mov r10,QWORD[32+rsi]
+
+ mov ecx,DWORD[40+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[8+rbx],ecx
+ cmovle r10,rbp
+
+ mov r11,QWORD[48+rsi]
+
+ mov ecx,DWORD[56+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[12+rbx],ecx
+ cmovle r11,rbp
+ test edx,edx
+ jz NEAR $L$done
+
+ movdqu xmm8,XMMWORD[((0-128))+rdi]
+ lea rax,[128+rsp]
+ movdqu xmm9,XMMWORD[((32-128))+rdi]
+ movdqu xmm10,XMMWORD[((64-128))+rdi]
+ movdqu xmm11,XMMWORD[((96-128))+rdi]
+ movdqu xmm12,XMMWORD[((128-128))+rdi]
+ movdqu xmm13,XMMWORD[((160-128))+rdi]
+ movdqu xmm14,XMMWORD[((192-128))+rdi]
+ movdqu xmm15,XMMWORD[((224-128))+rdi]
+ movdqu xmm6,XMMWORD[$L$pbswap]
+ jmp NEAR $L$oop
+
+ALIGN 32
+$L$oop:
+ movdqa xmm4,xmm10
+ pxor xmm4,xmm9
+ movd xmm5,DWORD[r8]
+ movd xmm0,DWORD[r9]
+ movd xmm1,DWORD[r10]
+ movd xmm2,DWORD[r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm12
+DB 102,15,56,0,238
+ movdqa xmm2,xmm12
+
+ psrld xmm7,6
+ movdqa xmm1,xmm12
+ pslld xmm2,7
+ movdqa XMMWORD[(0-128)+rax],xmm5
+ paddd xmm5,xmm15
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-128))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm12
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm12
+ pslld xmm2,26-21
+ pandn xmm0,xmm14
+ pand xmm3,xmm13
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm8
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm8
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm9
+ movdqa xmm7,xmm8
+ pslld xmm2,10
+ pxor xmm3,xmm8
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm15,xmm9
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm15,xmm4
+ paddd xmm11,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm15,xmm5
+ paddd xmm15,xmm7
+ movd xmm5,DWORD[4+r8]
+ movd xmm0,DWORD[4+r9]
+ movd xmm1,DWORD[4+r10]
+ movd xmm2,DWORD[4+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm11
+
+ movdqa xmm2,xmm11
+DB 102,15,56,0,238
+ psrld xmm7,6
+ movdqa xmm1,xmm11
+ pslld xmm2,7
+ movdqa XMMWORD[(16-128)+rax],xmm5
+ paddd xmm5,xmm14
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-96))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm11
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm11
+ pslld xmm2,26-21
+ pandn xmm0,xmm13
+ pand xmm4,xmm12
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm15
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm15
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm8
+ movdqa xmm7,xmm15
+ pslld xmm2,10
+ pxor xmm4,xmm15
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm14,xmm8
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm14,xmm3
+ paddd xmm10,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm14,xmm5
+ paddd xmm14,xmm7
+ movd xmm5,DWORD[8+r8]
+ movd xmm0,DWORD[8+r9]
+ movd xmm1,DWORD[8+r10]
+ movd xmm2,DWORD[8+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm10
+DB 102,15,56,0,238
+ movdqa xmm2,xmm10
+
+ psrld xmm7,6
+ movdqa xmm1,xmm10
+ pslld xmm2,7
+ movdqa XMMWORD[(32-128)+rax],xmm5
+ paddd xmm5,xmm13
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-64))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm10
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm10
+ pslld xmm2,26-21
+ pandn xmm0,xmm12
+ pand xmm3,xmm11
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm14
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm14
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm15
+ movdqa xmm7,xmm14
+ pslld xmm2,10
+ pxor xmm3,xmm14
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm13,xmm15
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm13,xmm4
+ paddd xmm9,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm13,xmm5
+ paddd xmm13,xmm7
+ movd xmm5,DWORD[12+r8]
+ movd xmm0,DWORD[12+r9]
+ movd xmm1,DWORD[12+r10]
+ movd xmm2,DWORD[12+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm9
+
+ movdqa xmm2,xmm9
+DB 102,15,56,0,238
+ psrld xmm7,6
+ movdqa xmm1,xmm9
+ pslld xmm2,7
+ movdqa XMMWORD[(48-128)+rax],xmm5
+ paddd xmm5,xmm12
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-32))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm9
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm9
+ pslld xmm2,26-21
+ pandn xmm0,xmm11
+ pand xmm4,xmm10
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm13
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm13
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm14
+ movdqa xmm7,xmm13
+ pslld xmm2,10
+ pxor xmm4,xmm13
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm12,xmm14
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm12,xmm3
+ paddd xmm8,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm12,xmm5
+ paddd xmm12,xmm7
+ movd xmm5,DWORD[16+r8]
+ movd xmm0,DWORD[16+r9]
+ movd xmm1,DWORD[16+r10]
+ movd xmm2,DWORD[16+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm8
+DB 102,15,56,0,238
+ movdqa xmm2,xmm8
+
+ psrld xmm7,6
+ movdqa xmm1,xmm8
+ pslld xmm2,7
+ movdqa XMMWORD[(64-128)+rax],xmm5
+ paddd xmm5,xmm11
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm8
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm8
+ pslld xmm2,26-21
+ pandn xmm0,xmm10
+ pand xmm3,xmm9
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm12
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm12
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm13
+ movdqa xmm7,xmm12
+ pslld xmm2,10
+ pxor xmm3,xmm12
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm11,xmm13
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm11,xmm4
+ paddd xmm15,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm11,xmm5
+ paddd xmm11,xmm7
+ movd xmm5,DWORD[20+r8]
+ movd xmm0,DWORD[20+r9]
+ movd xmm1,DWORD[20+r10]
+ movd xmm2,DWORD[20+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm15
+
+ movdqa xmm2,xmm15
+DB 102,15,56,0,238
+ psrld xmm7,6
+ movdqa xmm1,xmm15
+ pslld xmm2,7
+ movdqa XMMWORD[(80-128)+rax],xmm5
+ paddd xmm5,xmm10
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[32+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm15
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm15
+ pslld xmm2,26-21
+ pandn xmm0,xmm9
+ pand xmm4,xmm8
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm11
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm11
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm12
+ movdqa xmm7,xmm11
+ pslld xmm2,10
+ pxor xmm4,xmm11
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm10,xmm12
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm10,xmm3
+ paddd xmm14,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm10,xmm5
+ paddd xmm10,xmm7
+ movd xmm5,DWORD[24+r8]
+ movd xmm0,DWORD[24+r9]
+ movd xmm1,DWORD[24+r10]
+ movd xmm2,DWORD[24+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm14
+DB 102,15,56,0,238
+ movdqa xmm2,xmm14
+
+ psrld xmm7,6
+ movdqa xmm1,xmm14
+ pslld xmm2,7
+ movdqa XMMWORD[(96-128)+rax],xmm5
+ paddd xmm5,xmm9
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[64+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm14
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm14
+ pslld xmm2,26-21
+ pandn xmm0,xmm8
+ pand xmm3,xmm15
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm10
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm10
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm11
+ movdqa xmm7,xmm10
+ pslld xmm2,10
+ pxor xmm3,xmm10
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm9,xmm11
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm9,xmm4
+ paddd xmm13,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm9,xmm5
+ paddd xmm9,xmm7
+ movd xmm5,DWORD[28+r8]
+ movd xmm0,DWORD[28+r9]
+ movd xmm1,DWORD[28+r10]
+ movd xmm2,DWORD[28+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm13
+
+ movdqa xmm2,xmm13
+DB 102,15,56,0,238
+ psrld xmm7,6
+ movdqa xmm1,xmm13
+ pslld xmm2,7
+ movdqa XMMWORD[(112-128)+rax],xmm5
+ paddd xmm5,xmm8
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[96+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm13
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm13
+ pslld xmm2,26-21
+ pandn xmm0,xmm15
+ pand xmm4,xmm14
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm9
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm9
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm10
+ movdqa xmm7,xmm9
+ pslld xmm2,10
+ pxor xmm4,xmm9
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm8,xmm10
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm8,xmm3
+ paddd xmm12,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm8,xmm5
+ paddd xmm8,xmm7
+ lea rbp,[256+rbp]
+ movd xmm5,DWORD[32+r8]
+ movd xmm0,DWORD[32+r9]
+ movd xmm1,DWORD[32+r10]
+ movd xmm2,DWORD[32+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm12
+DB 102,15,56,0,238
+ movdqa xmm2,xmm12
+
+ psrld xmm7,6
+ movdqa xmm1,xmm12
+ pslld xmm2,7
+ movdqa XMMWORD[(128-128)+rax],xmm5
+ paddd xmm5,xmm15
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-128))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm12
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm12
+ pslld xmm2,26-21
+ pandn xmm0,xmm14
+ pand xmm3,xmm13
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm8
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm8
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm9
+ movdqa xmm7,xmm8
+ pslld xmm2,10
+ pxor xmm3,xmm8
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm15,xmm9
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm15,xmm4
+ paddd xmm11,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm15,xmm5
+ paddd xmm15,xmm7
+ movd xmm5,DWORD[36+r8]
+ movd xmm0,DWORD[36+r9]
+ movd xmm1,DWORD[36+r10]
+ movd xmm2,DWORD[36+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm11
+
+ movdqa xmm2,xmm11
+DB 102,15,56,0,238
+ psrld xmm7,6
+ movdqa xmm1,xmm11
+ pslld xmm2,7
+ movdqa XMMWORD[(144-128)+rax],xmm5
+ paddd xmm5,xmm14
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-96))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm11
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm11
+ pslld xmm2,26-21
+ pandn xmm0,xmm13
+ pand xmm4,xmm12
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm15
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm15
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm8
+ movdqa xmm7,xmm15
+ pslld xmm2,10
+ pxor xmm4,xmm15
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm14,xmm8
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm14,xmm3
+ paddd xmm10,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm14,xmm5
+ paddd xmm14,xmm7
+ movd xmm5,DWORD[40+r8]
+ movd xmm0,DWORD[40+r9]
+ movd xmm1,DWORD[40+r10]
+ movd xmm2,DWORD[40+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm10
+DB 102,15,56,0,238
+ movdqa xmm2,xmm10
+
+ psrld xmm7,6
+ movdqa xmm1,xmm10
+ pslld xmm2,7
+ movdqa XMMWORD[(160-128)+rax],xmm5
+ paddd xmm5,xmm13
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-64))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm10
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm10
+ pslld xmm2,26-21
+ pandn xmm0,xmm12
+ pand xmm3,xmm11
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm14
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm14
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm15
+ movdqa xmm7,xmm14
+ pslld xmm2,10
+ pxor xmm3,xmm14
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm13,xmm15
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm13,xmm4
+ paddd xmm9,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm13,xmm5
+ paddd xmm13,xmm7
+ movd xmm5,DWORD[44+r8]
+ movd xmm0,DWORD[44+r9]
+ movd xmm1,DWORD[44+r10]
+ movd xmm2,DWORD[44+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm9
+
+ movdqa xmm2,xmm9
+DB 102,15,56,0,238
+ psrld xmm7,6
+ movdqa xmm1,xmm9
+ pslld xmm2,7
+ movdqa XMMWORD[(176-128)+rax],xmm5
+ paddd xmm5,xmm12
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-32))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm9
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm9
+ pslld xmm2,26-21
+ pandn xmm0,xmm11
+ pand xmm4,xmm10
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm13
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm13
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm14
+ movdqa xmm7,xmm13
+ pslld xmm2,10
+ pxor xmm4,xmm13
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm12,xmm14
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm12,xmm3
+ paddd xmm8,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm12,xmm5
+ paddd xmm12,xmm7
+ movd xmm5,DWORD[48+r8]
+ movd xmm0,DWORD[48+r9]
+ movd xmm1,DWORD[48+r10]
+ movd xmm2,DWORD[48+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm8
+DB 102,15,56,0,238
+ movdqa xmm2,xmm8
+
+ psrld xmm7,6
+ movdqa xmm1,xmm8
+ pslld xmm2,7
+ movdqa XMMWORD[(192-128)+rax],xmm5
+ paddd xmm5,xmm11
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm8
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm8
+ pslld xmm2,26-21
+ pandn xmm0,xmm10
+ pand xmm3,xmm9
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm12
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm12
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm13
+ movdqa xmm7,xmm12
+ pslld xmm2,10
+ pxor xmm3,xmm12
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm11,xmm13
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm11,xmm4
+ paddd xmm15,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm11,xmm5
+ paddd xmm11,xmm7
+ movd xmm5,DWORD[52+r8]
+ movd xmm0,DWORD[52+r9]
+ movd xmm1,DWORD[52+r10]
+ movd xmm2,DWORD[52+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm15
+
+ movdqa xmm2,xmm15
+DB 102,15,56,0,238
+ psrld xmm7,6
+ movdqa xmm1,xmm15
+ pslld xmm2,7
+ movdqa XMMWORD[(208-128)+rax],xmm5
+ paddd xmm5,xmm10
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[32+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm15
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm15
+ pslld xmm2,26-21
+ pandn xmm0,xmm9
+ pand xmm4,xmm8
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm11
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm11
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm12
+ movdqa xmm7,xmm11
+ pslld xmm2,10
+ pxor xmm4,xmm11
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm10,xmm12
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm10,xmm3
+ paddd xmm14,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm10,xmm5
+ paddd xmm10,xmm7
+ movd xmm5,DWORD[56+r8]
+ movd xmm0,DWORD[56+r9]
+ movd xmm1,DWORD[56+r10]
+ movd xmm2,DWORD[56+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm14
+DB 102,15,56,0,238
+ movdqa xmm2,xmm14
+
+ psrld xmm7,6
+ movdqa xmm1,xmm14
+ pslld xmm2,7
+ movdqa XMMWORD[(224-128)+rax],xmm5
+ paddd xmm5,xmm9
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[64+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm14
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm14
+ pslld xmm2,26-21
+ pandn xmm0,xmm8
+ pand xmm3,xmm15
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm10
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm10
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm11
+ movdqa xmm7,xmm10
+ pslld xmm2,10
+ pxor xmm3,xmm10
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm9,xmm11
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm9,xmm4
+ paddd xmm13,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm9,xmm5
+ paddd xmm9,xmm7
+ movd xmm5,DWORD[60+r8]
+ lea r8,[64+r8]
+ movd xmm0,DWORD[60+r9]
+ lea r9,[64+r9]
+ movd xmm1,DWORD[60+r10]
+ lea r10,[64+r10]
+ movd xmm2,DWORD[60+r11]
+ lea r11,[64+r11]
+ punpckldq xmm5,xmm1
+ punpckldq xmm0,xmm2
+ punpckldq xmm5,xmm0
+ movdqa xmm7,xmm13
+
+ movdqa xmm2,xmm13
+DB 102,15,56,0,238
+ psrld xmm7,6
+ movdqa xmm1,xmm13
+ pslld xmm2,7
+ movdqa XMMWORD[(240-128)+rax],xmm5
+ paddd xmm5,xmm8
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[96+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm13
+ prefetcht0 [63+r8]
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm13
+ pslld xmm2,26-21
+ pandn xmm0,xmm15
+ pand xmm4,xmm14
+ pxor xmm7,xmm1
+
+ prefetcht0 [63+r9]
+ movdqa xmm1,xmm9
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm9
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm10
+ movdqa xmm7,xmm9
+ pslld xmm2,10
+ pxor xmm4,xmm9
+
+ prefetcht0 [63+r10]
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+ prefetcht0 [63+r11]
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm8,xmm10
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm8,xmm3
+ paddd xmm12,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm8,xmm5
+ paddd xmm8,xmm7
+ lea rbp,[256+rbp]
+ movdqu xmm5,XMMWORD[((0-128))+rax]
+ mov ecx,3
+ jmp NEAR $L$oop_16_xx
+ALIGN 32
+$L$oop_16_xx:
+ movdqa xmm6,XMMWORD[((16-128))+rax]
+ paddd xmm5,XMMWORD[((144-128))+rax]
+
+ movdqa xmm7,xmm6
+ movdqa xmm1,xmm6
+ psrld xmm7,3
+ movdqa xmm2,xmm6
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((224-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm3,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm3
+
+ psrld xmm3,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ psrld xmm3,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm3
+ pxor xmm0,xmm1
+ paddd xmm5,xmm0
+ movdqa xmm7,xmm12
+
+ movdqa xmm2,xmm12
+
+ psrld xmm7,6
+ movdqa xmm1,xmm12
+ pslld xmm2,7
+ movdqa XMMWORD[(0-128)+rax],xmm5
+ paddd xmm5,xmm15
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-128))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm12
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm12
+ pslld xmm2,26-21
+ pandn xmm0,xmm14
+ pand xmm3,xmm13
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm8
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm8
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm9
+ movdqa xmm7,xmm8
+ pslld xmm2,10
+ pxor xmm3,xmm8
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm15,xmm9
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm15,xmm4
+ paddd xmm11,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm15,xmm5
+ paddd xmm15,xmm7
+ movdqa xmm5,XMMWORD[((32-128))+rax]
+ paddd xmm6,XMMWORD[((160-128))+rax]
+
+ movdqa xmm7,xmm5
+ movdqa xmm1,xmm5
+ psrld xmm7,3
+ movdqa xmm2,xmm5
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((240-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm4,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm4
+
+ psrld xmm4,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ psrld xmm4,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm4
+ pxor xmm0,xmm1
+ paddd xmm6,xmm0
+ movdqa xmm7,xmm11
+
+ movdqa xmm2,xmm11
+
+ psrld xmm7,6
+ movdqa xmm1,xmm11
+ pslld xmm2,7
+ movdqa XMMWORD[(16-128)+rax],xmm6
+ paddd xmm6,xmm14
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm6,XMMWORD[((-96))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm11
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm11
+ pslld xmm2,26-21
+ pandn xmm0,xmm13
+ pand xmm4,xmm12
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm15
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm15
+ psrld xmm1,2
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm8
+ movdqa xmm7,xmm15
+ pslld xmm2,10
+ pxor xmm4,xmm15
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm6,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm14,xmm8
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm14,xmm3
+ paddd xmm10,xmm6
+ pxor xmm7,xmm2
+
+ paddd xmm14,xmm6
+ paddd xmm14,xmm7
+ movdqa xmm6,XMMWORD[((48-128))+rax]
+ paddd xmm5,XMMWORD[((176-128))+rax]
+
+ movdqa xmm7,xmm6
+ movdqa xmm1,xmm6
+ psrld xmm7,3
+ movdqa xmm2,xmm6
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((0-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm3,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm3
+
+ psrld xmm3,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ psrld xmm3,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm3
+ pxor xmm0,xmm1
+ paddd xmm5,xmm0
+ movdqa xmm7,xmm10
+
+ movdqa xmm2,xmm10
+
+ psrld xmm7,6
+ movdqa xmm1,xmm10
+ pslld xmm2,7
+ movdqa XMMWORD[(32-128)+rax],xmm5
+ paddd xmm5,xmm13
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-64))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm10
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm10
+ pslld xmm2,26-21
+ pandn xmm0,xmm12
+ pand xmm3,xmm11
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm14
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm14
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm15
+ movdqa xmm7,xmm14
+ pslld xmm2,10
+ pxor xmm3,xmm14
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm13,xmm15
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm13,xmm4
+ paddd xmm9,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm13,xmm5
+ paddd xmm13,xmm7
+ movdqa xmm5,XMMWORD[((64-128))+rax]
+ paddd xmm6,XMMWORD[((192-128))+rax]
+
+ movdqa xmm7,xmm5
+ movdqa xmm1,xmm5
+ psrld xmm7,3
+ movdqa xmm2,xmm5
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((16-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm4,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm4
+
+ psrld xmm4,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ psrld xmm4,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm4
+ pxor xmm0,xmm1
+ paddd xmm6,xmm0
+ movdqa xmm7,xmm9
+
+ movdqa xmm2,xmm9
+
+ psrld xmm7,6
+ movdqa xmm1,xmm9
+ pslld xmm2,7
+ movdqa XMMWORD[(48-128)+rax],xmm6
+ paddd xmm6,xmm12
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm6,XMMWORD[((-32))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm9
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm9
+ pslld xmm2,26-21
+ pandn xmm0,xmm11
+ pand xmm4,xmm10
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm13
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm13
+ psrld xmm1,2
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm14
+ movdqa xmm7,xmm13
+ pslld xmm2,10
+ pxor xmm4,xmm13
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm6,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm12,xmm14
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm12,xmm3
+ paddd xmm8,xmm6
+ pxor xmm7,xmm2
+
+ paddd xmm12,xmm6
+ paddd xmm12,xmm7
+ movdqa xmm6,XMMWORD[((80-128))+rax]
+ paddd xmm5,XMMWORD[((208-128))+rax]
+
+ movdqa xmm7,xmm6
+ movdqa xmm1,xmm6
+ psrld xmm7,3
+ movdqa xmm2,xmm6
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((32-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm3,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm3
+
+ psrld xmm3,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ psrld xmm3,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm3
+ pxor xmm0,xmm1
+ paddd xmm5,xmm0
+ movdqa xmm7,xmm8
+
+ movdqa xmm2,xmm8
+
+ psrld xmm7,6
+ movdqa xmm1,xmm8
+ pslld xmm2,7
+ movdqa XMMWORD[(64-128)+rax],xmm5
+ paddd xmm5,xmm11
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm8
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm8
+ pslld xmm2,26-21
+ pandn xmm0,xmm10
+ pand xmm3,xmm9
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm12
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm12
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm13
+ movdqa xmm7,xmm12
+ pslld xmm2,10
+ pxor xmm3,xmm12
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm11,xmm13
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm11,xmm4
+ paddd xmm15,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm11,xmm5
+ paddd xmm11,xmm7
+ movdqa xmm5,XMMWORD[((96-128))+rax]
+ paddd xmm6,XMMWORD[((224-128))+rax]
+
+ movdqa xmm7,xmm5
+ movdqa xmm1,xmm5
+ psrld xmm7,3
+ movdqa xmm2,xmm5
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((48-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm4,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm4
+
+ psrld xmm4,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ psrld xmm4,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm4
+ pxor xmm0,xmm1
+ paddd xmm6,xmm0
+ movdqa xmm7,xmm15
+
+ movdqa xmm2,xmm15
+
+ psrld xmm7,6
+ movdqa xmm1,xmm15
+ pslld xmm2,7
+ movdqa XMMWORD[(80-128)+rax],xmm6
+ paddd xmm6,xmm10
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm6,XMMWORD[32+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm15
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm15
+ pslld xmm2,26-21
+ pandn xmm0,xmm9
+ pand xmm4,xmm8
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm11
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm11
+ psrld xmm1,2
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm12
+ movdqa xmm7,xmm11
+ pslld xmm2,10
+ pxor xmm4,xmm11
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm6,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm10,xmm12
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm10,xmm3
+ paddd xmm14,xmm6
+ pxor xmm7,xmm2
+
+ paddd xmm10,xmm6
+ paddd xmm10,xmm7
+ movdqa xmm6,XMMWORD[((112-128))+rax]
+ paddd xmm5,XMMWORD[((240-128))+rax]
+
+ movdqa xmm7,xmm6
+ movdqa xmm1,xmm6
+ psrld xmm7,3
+ movdqa xmm2,xmm6
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((64-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm3,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm3
+
+ psrld xmm3,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ psrld xmm3,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm3
+ pxor xmm0,xmm1
+ paddd xmm5,xmm0
+ movdqa xmm7,xmm14
+
+ movdqa xmm2,xmm14
+
+ psrld xmm7,6
+ movdqa xmm1,xmm14
+ pslld xmm2,7
+ movdqa XMMWORD[(96-128)+rax],xmm5
+ paddd xmm5,xmm9
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[64+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm14
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm14
+ pslld xmm2,26-21
+ pandn xmm0,xmm8
+ pand xmm3,xmm15
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm10
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm10
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm11
+ movdqa xmm7,xmm10
+ pslld xmm2,10
+ pxor xmm3,xmm10
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm9,xmm11
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm9,xmm4
+ paddd xmm13,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm9,xmm5
+ paddd xmm9,xmm7
+ movdqa xmm5,XMMWORD[((128-128))+rax]
+ paddd xmm6,XMMWORD[((0-128))+rax]
+
+ movdqa xmm7,xmm5
+ movdqa xmm1,xmm5
+ psrld xmm7,3
+ movdqa xmm2,xmm5
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((80-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm4,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm4
+
+ psrld xmm4,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ psrld xmm4,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm4
+ pxor xmm0,xmm1
+ paddd xmm6,xmm0
+ movdqa xmm7,xmm13
+
+ movdqa xmm2,xmm13
+
+ psrld xmm7,6
+ movdqa xmm1,xmm13
+ pslld xmm2,7
+ movdqa XMMWORD[(112-128)+rax],xmm6
+ paddd xmm6,xmm8
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm6,XMMWORD[96+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm13
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm13
+ pslld xmm2,26-21
+ pandn xmm0,xmm15
+ pand xmm4,xmm14
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm9
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm9
+ psrld xmm1,2
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm10
+ movdqa xmm7,xmm9
+ pslld xmm2,10
+ pxor xmm4,xmm9
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm6,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm8,xmm10
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm8,xmm3
+ paddd xmm12,xmm6
+ pxor xmm7,xmm2
+
+ paddd xmm8,xmm6
+ paddd xmm8,xmm7
+ lea rbp,[256+rbp]
+ movdqa xmm6,XMMWORD[((144-128))+rax]
+ paddd xmm5,XMMWORD[((16-128))+rax]
+
+ movdqa xmm7,xmm6
+ movdqa xmm1,xmm6
+ psrld xmm7,3
+ movdqa xmm2,xmm6
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((96-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm3,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm3
+
+ psrld xmm3,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ psrld xmm3,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm3
+ pxor xmm0,xmm1
+ paddd xmm5,xmm0
+ movdqa xmm7,xmm12
+
+ movdqa xmm2,xmm12
+
+ psrld xmm7,6
+ movdqa xmm1,xmm12
+ pslld xmm2,7
+ movdqa XMMWORD[(128-128)+rax],xmm5
+ paddd xmm5,xmm15
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-128))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm12
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm12
+ pslld xmm2,26-21
+ pandn xmm0,xmm14
+ pand xmm3,xmm13
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm8
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm8
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm9
+ movdqa xmm7,xmm8
+ pslld xmm2,10
+ pxor xmm3,xmm8
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm15,xmm9
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm15,xmm4
+ paddd xmm11,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm15,xmm5
+ paddd xmm15,xmm7
+ movdqa xmm5,XMMWORD[((160-128))+rax]
+ paddd xmm6,XMMWORD[((32-128))+rax]
+
+ movdqa xmm7,xmm5
+ movdqa xmm1,xmm5
+ psrld xmm7,3
+ movdqa xmm2,xmm5
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((112-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm4,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm4
+
+ psrld xmm4,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ psrld xmm4,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm4
+ pxor xmm0,xmm1
+ paddd xmm6,xmm0
+ movdqa xmm7,xmm11
+
+ movdqa xmm2,xmm11
+
+ psrld xmm7,6
+ movdqa xmm1,xmm11
+ pslld xmm2,7
+ movdqa XMMWORD[(144-128)+rax],xmm6
+ paddd xmm6,xmm14
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm6,XMMWORD[((-96))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm11
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm11
+ pslld xmm2,26-21
+ pandn xmm0,xmm13
+ pand xmm4,xmm12
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm15
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm15
+ psrld xmm1,2
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm8
+ movdqa xmm7,xmm15
+ pslld xmm2,10
+ pxor xmm4,xmm15
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm6,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm14,xmm8
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm14,xmm3
+ paddd xmm10,xmm6
+ pxor xmm7,xmm2
+
+ paddd xmm14,xmm6
+ paddd xmm14,xmm7
+ movdqa xmm6,XMMWORD[((176-128))+rax]
+ paddd xmm5,XMMWORD[((48-128))+rax]
+
+ movdqa xmm7,xmm6
+ movdqa xmm1,xmm6
+ psrld xmm7,3
+ movdqa xmm2,xmm6
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((128-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm3,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm3
+
+ psrld xmm3,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ psrld xmm3,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm3
+ pxor xmm0,xmm1
+ paddd xmm5,xmm0
+ movdqa xmm7,xmm10
+
+ movdqa xmm2,xmm10
+
+ psrld xmm7,6
+ movdqa xmm1,xmm10
+ pslld xmm2,7
+ movdqa XMMWORD[(160-128)+rax],xmm5
+ paddd xmm5,xmm13
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[((-64))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm10
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm10
+ pslld xmm2,26-21
+ pandn xmm0,xmm12
+ pand xmm3,xmm11
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm14
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm14
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm15
+ movdqa xmm7,xmm14
+ pslld xmm2,10
+ pxor xmm3,xmm14
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm13,xmm15
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm13,xmm4
+ paddd xmm9,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm13,xmm5
+ paddd xmm13,xmm7
+ movdqa xmm5,XMMWORD[((192-128))+rax]
+ paddd xmm6,XMMWORD[((64-128))+rax]
+
+ movdqa xmm7,xmm5
+ movdqa xmm1,xmm5
+ psrld xmm7,3
+ movdqa xmm2,xmm5
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((144-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm4,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm4
+
+ psrld xmm4,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ psrld xmm4,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm4
+ pxor xmm0,xmm1
+ paddd xmm6,xmm0
+ movdqa xmm7,xmm9
+
+ movdqa xmm2,xmm9
+
+ psrld xmm7,6
+ movdqa xmm1,xmm9
+ pslld xmm2,7
+ movdqa XMMWORD[(176-128)+rax],xmm6
+ paddd xmm6,xmm12
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm6,XMMWORD[((-32))+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm9
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm9
+ pslld xmm2,26-21
+ pandn xmm0,xmm11
+ pand xmm4,xmm10
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm13
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm13
+ psrld xmm1,2
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm14
+ movdqa xmm7,xmm13
+ pslld xmm2,10
+ pxor xmm4,xmm13
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm6,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm12,xmm14
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm12,xmm3
+ paddd xmm8,xmm6
+ pxor xmm7,xmm2
+
+ paddd xmm12,xmm6
+ paddd xmm12,xmm7
+ movdqa xmm6,XMMWORD[((208-128))+rax]
+ paddd xmm5,XMMWORD[((80-128))+rax]
+
+ movdqa xmm7,xmm6
+ movdqa xmm1,xmm6
+ psrld xmm7,3
+ movdqa xmm2,xmm6
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((160-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm3,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm3
+
+ psrld xmm3,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ psrld xmm3,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm3
+ pxor xmm0,xmm1
+ paddd xmm5,xmm0
+ movdqa xmm7,xmm8
+
+ movdqa xmm2,xmm8
+
+ psrld xmm7,6
+ movdqa xmm1,xmm8
+ pslld xmm2,7
+ movdqa XMMWORD[(192-128)+rax],xmm5
+ paddd xmm5,xmm11
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm8
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm8
+ pslld xmm2,26-21
+ pandn xmm0,xmm10
+ pand xmm3,xmm9
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm12
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm12
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm13
+ movdqa xmm7,xmm12
+ pslld xmm2,10
+ pxor xmm3,xmm12
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm11,xmm13
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm11,xmm4
+ paddd xmm15,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm11,xmm5
+ paddd xmm11,xmm7
+ movdqa xmm5,XMMWORD[((224-128))+rax]
+ paddd xmm6,XMMWORD[((96-128))+rax]
+
+ movdqa xmm7,xmm5
+ movdqa xmm1,xmm5
+ psrld xmm7,3
+ movdqa xmm2,xmm5
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((176-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm4,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm4
+
+ psrld xmm4,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ psrld xmm4,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm4
+ pxor xmm0,xmm1
+ paddd xmm6,xmm0
+ movdqa xmm7,xmm15
+
+ movdqa xmm2,xmm15
+
+ psrld xmm7,6
+ movdqa xmm1,xmm15
+ pslld xmm2,7
+ movdqa XMMWORD[(208-128)+rax],xmm6
+ paddd xmm6,xmm10
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm6,XMMWORD[32+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm15
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm15
+ pslld xmm2,26-21
+ pandn xmm0,xmm9
+ pand xmm4,xmm8
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm11
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm11
+ psrld xmm1,2
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm12
+ movdqa xmm7,xmm11
+ pslld xmm2,10
+ pxor xmm4,xmm11
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm6,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm10,xmm12
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm10,xmm3
+ paddd xmm14,xmm6
+ pxor xmm7,xmm2
+
+ paddd xmm10,xmm6
+ paddd xmm10,xmm7
+ movdqa xmm6,XMMWORD[((240-128))+rax]
+ paddd xmm5,XMMWORD[((112-128))+rax]
+
+ movdqa xmm7,xmm6
+ movdqa xmm1,xmm6
+ psrld xmm7,3
+ movdqa xmm2,xmm6
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((192-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm3,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm3
+
+ psrld xmm3,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ psrld xmm3,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm3
+ pxor xmm0,xmm1
+ paddd xmm5,xmm0
+ movdqa xmm7,xmm14
+
+ movdqa xmm2,xmm14
+
+ psrld xmm7,6
+ movdqa xmm1,xmm14
+ pslld xmm2,7
+ movdqa XMMWORD[(224-128)+rax],xmm5
+ paddd xmm5,xmm9
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm5,XMMWORD[64+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm14
+
+ pxor xmm7,xmm2
+ movdqa xmm3,xmm14
+ pslld xmm2,26-21
+ pandn xmm0,xmm8
+ pand xmm3,xmm15
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm10
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm10
+ psrld xmm1,2
+ paddd xmm5,xmm7
+ pxor xmm0,xmm3
+ movdqa xmm3,xmm11
+ movdqa xmm7,xmm10
+ pslld xmm2,10
+ pxor xmm3,xmm10
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm5,xmm0
+ pslld xmm2,19-10
+ pand xmm4,xmm3
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm9,xmm11
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm9,xmm4
+ paddd xmm13,xmm5
+ pxor xmm7,xmm2
+
+ paddd xmm9,xmm5
+ paddd xmm9,xmm7
+ movdqa xmm5,XMMWORD[((0-128))+rax]
+ paddd xmm6,XMMWORD[((128-128))+rax]
+
+ movdqa xmm7,xmm5
+ movdqa xmm1,xmm5
+ psrld xmm7,3
+ movdqa xmm2,xmm5
+
+ psrld xmm1,7
+ movdqa xmm0,XMMWORD[((208-128))+rax]
+ pslld xmm2,14
+ pxor xmm7,xmm1
+ psrld xmm1,18-7
+ movdqa xmm4,xmm0
+ pxor xmm7,xmm2
+ pslld xmm2,25-14
+ pxor xmm7,xmm1
+ psrld xmm0,10
+ movdqa xmm1,xmm4
+
+ psrld xmm4,17
+ pxor xmm7,xmm2
+ pslld xmm1,13
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ psrld xmm4,19-17
+ pxor xmm0,xmm1
+ pslld xmm1,15-13
+ pxor xmm0,xmm4
+ pxor xmm0,xmm1
+ paddd xmm6,xmm0
+ movdqa xmm7,xmm13
+
+ movdqa xmm2,xmm13
+
+ psrld xmm7,6
+ movdqa xmm1,xmm13
+ pslld xmm2,7
+ movdqa XMMWORD[(240-128)+rax],xmm6
+ paddd xmm6,xmm8
+
+ psrld xmm1,11
+ pxor xmm7,xmm2
+ pslld xmm2,21-7
+ paddd xmm6,XMMWORD[96+rbp]
+ pxor xmm7,xmm1
+
+ psrld xmm1,25-11
+ movdqa xmm0,xmm13
+
+ pxor xmm7,xmm2
+ movdqa xmm4,xmm13
+ pslld xmm2,26-21
+ pandn xmm0,xmm15
+ pand xmm4,xmm14
+ pxor xmm7,xmm1
+
+
+ movdqa xmm1,xmm9
+ pxor xmm7,xmm2
+ movdqa xmm2,xmm9
+ psrld xmm1,2
+ paddd xmm6,xmm7
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm10
+ movdqa xmm7,xmm9
+ pslld xmm2,10
+ pxor xmm4,xmm9
+
+
+ psrld xmm7,13
+ pxor xmm1,xmm2
+ paddd xmm6,xmm0
+ pslld xmm2,19-10
+ pand xmm3,xmm4
+ pxor xmm1,xmm7
+
+
+ psrld xmm7,22-13
+ pxor xmm1,xmm2
+ movdqa xmm8,xmm10
+ pslld xmm2,30-19
+ pxor xmm7,xmm1
+ pxor xmm8,xmm3
+ paddd xmm12,xmm6
+ pxor xmm7,xmm2
+
+ paddd xmm8,xmm6
+ paddd xmm8,xmm7
+ lea rbp,[256+rbp]
+ dec ecx
+ jnz NEAR $L$oop_16_xx
+
+ mov ecx,1 ; End-of-block bookkeeping: update the four dword lane counters at [rbx] and fold xmm8..xmm15 into the rows at rdi. ecx = 1 is the compare threshold
+ lea rbp,[((K256+128))] ; rewind rbp to K256+128 (the round code above advanced it in 256-byte steps)
+
+ movdqa xmm7,XMMWORD[rbx] ; xmm7 = the four dword counters stored at [rbx]
+ cmp ecx,DWORD[rbx] ; signed compare 1 vs counter 0
+ pxor xmm0,xmm0 ; xmm0 = 0, reference for the pcmpgtd below
+ cmovge r8,rbp ; counter 0 <= 1: replace r8 with rbp (presumably so a lane with no further data reads the table, not its input)
+ cmp ecx,DWORD[4+rbx] ; same test for counter 1
+ movdqa xmm6,xmm7 ; copy the counters to build a mask from them
+ cmovge r9,rbp ; counter 1 <= 1: replace r9 with rbp
+ cmp ecx,DWORD[8+rbx] ; same test for counter 2
+ pcmpgtd xmm6,xmm0 ; xmm6 = all-ones in each dword whose counter is > 0 (signed), else 0
+ cmovge r10,rbp ; counter 2 <= 1: replace r10 with rbp
+ cmp ecx,DWORD[12+rbx] ; same test for counter 3
+ paddd xmm7,xmm6 ; adding the mask (-1 or 0) decrements only the counters that were > 0
+ cmovge r11,rbp ; counter 3 <= 1: replace r11 with rbp
+
+ movdqu xmm0,XMMWORD[((0-128))+rdi] ; load the eight 16-byte rows kept at 32-byte stride from rdi-128 (presumably the a..h state words, one dword per lane)
+ pand xmm8,xmm6 ; zero the dwords of lanes whose counter was not > 0, so the add below leaves those lanes' stored values unchanged
+ movdqu xmm1,XMMWORD[((32-128))+rdi]
+ pand xmm9,xmm6
+ movdqu xmm2,XMMWORD[((64-128))+rdi]
+ pand xmm10,xmm6
+ movdqu xmm5,XMMWORD[((96-128))+rdi]
+ pand xmm11,xmm6
+ paddd xmm8,xmm0 ; row 0: masked working value + stored value
+ movdqu xmm0,XMMWORD[((128-128))+rdi] ; xmm0/xmm1/xmm2/xmm5 are reused as load temporaries for rows 4..7
+ pand xmm12,xmm6
+ paddd xmm9,xmm1
+ movdqu xmm1,XMMWORD[((160-128))+rdi]
+ pand xmm13,xmm6
+ paddd xmm10,xmm2
+ movdqu xmm2,XMMWORD[((192-128))+rdi]
+ pand xmm14,xmm6
+ paddd xmm11,xmm5
+ movdqu xmm5,XMMWORD[((224-128))+rdi]
+ pand xmm15,xmm6
+ paddd xmm12,xmm0
+ paddd xmm13,xmm1
+ movdqu XMMWORD[(0-128)+rdi],xmm8 ; write the eight updated rows back to the same slots
+ paddd xmm14,xmm2
+ movdqu XMMWORD[(32-128)+rdi],xmm9
+ paddd xmm15,xmm5
+ movdqu XMMWORD[(64-128)+rdi],xmm10
+ movdqu XMMWORD[(96-128)+rdi],xmm11
+ movdqu XMMWORD[(128-128)+rdi],xmm12
+ movdqu XMMWORD[(160-128)+rdi],xmm13
+ movdqu XMMWORD[(192-128)+rdi],xmm14
+ movdqu XMMWORD[(224-128)+rdi],xmm15
+
+ movdqa XMMWORD[rbx],xmm7 ; store the decremented counters
+ movdqa xmm6,XMMWORD[$L$pbswap] ; xmm6 was used as the mask above; reload it from $L$pbswap (it is the control operand of the DB 102,15,56,0,238 = pshufb xmm5,xmm6 in the round code)
+ dec edx ; edx counts the remaining passes of the inner loop
+ jnz NEAR $L$oop ; $L$oop is defined above this chunk
+
+ mov edx,DWORD[280+rsp] ; reload the outer count from the stack slot at 280+rsp (its store is outside this chunk)
+ lea rdi,[16+rdi] ; advance rdi by 16 bytes (presumably to the next group of four lanes in each row -- confirm against the loop head)
+ lea rsi,[64+rsi] ; advance rsi by 64 bytes (presumably four 16-byte input descriptors -- confirm against the loop head)
+ dec edx
+ jnz NEAR $L$oop_grande ; $L$oop_grande is defined above this chunk
+
+$L$done: ; Common exit: restore the Win64 callee-saved registers, unwind the frame and return
+ mov rax,QWORD[272+rsp] ; rax = pointer saved at 272+rsp (the store is outside this chunk; the shaext prologue below saves its entry rsp in the same slot)
+
+ movaps xmm6,XMMWORD[((-184))+rax] ; xmm6..xmm15 are callee-saved in the Microsoft x64 ABI; reload them from ten consecutive 16-byte slots at rax-184 .. rax-40
+ movaps xmm7,XMMWORD[((-168))+rax]
+ movaps xmm8,XMMWORD[((-152))+rax]
+ movaps xmm9,XMMWORD[((-136))+rax]
+ movaps xmm10,XMMWORD[((-120))+rax]
+ movaps xmm11,XMMWORD[((-104))+rax]
+ movaps xmm12,XMMWORD[((-88))+rax]
+ movaps xmm13,XMMWORD[((-72))+rax]
+ movaps xmm14,XMMWORD[((-56))+rax]
+ movaps xmm15,XMMWORD[((-40))+rax]
+ mov rbp,QWORD[((-16))+rax] ; restore rbp from rax-16
+
+ mov rbx,QWORD[((-8))+rax] ; restore rbx from rax-8
+
+ lea rsp,[rax] ; rsp = rax, discarding the whole frame in one step
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue: rdi and rsi are callee-saved on Win64, reload them from the slots at rsp+8 / rsp+16
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret (bytes F3 C3 = "rep ret", emitted as raw bytes)
+
+$L$SEH_end_sha256_multi_block:
+
+ALIGN 32
+sha256_multi_block_shaext: ; Entry and prologue of the SHA-extension variant: save callee-saved registers, build an aligned scratch frame, set up rbx/rbp
+ mov QWORD[8+rsp],rdi ;WIN64 prologue: rdi/rsi are callee-saved on Win64, park them in the slots at rsp+8 / rsp+16
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_multi_block_shaext:
+ mov rdi,rcx ; move the three Win64 register arguments (rcx, rdx, r8) into rdi, rsi, rdx, which the body uses
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_shaext_shortcut: ; secondary label; its users are not visible in this chunk (presumably a dispatch jump from the non-shaext entry)
+ mov rax,rsp ; rax = rsp at entry; every save slot below is addressed relative to it
+
+ push rbx ; lands at rax-8
+
+ push rbp ; lands at rax-16
+
+ lea rsp,[((-168))+rsp] ; reserve 168 bytes; rsp is now rax-184
+ movaps XMMWORD[rsp],xmm6 ; save callee-saved xmm6..xmm15 in ten 16-byte slots, rax-184 .. rax-40
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[(-120)+rax],xmm10 ; same save area, addressed from rax instead of rsp
+ movaps XMMWORD[(-104)+rax],xmm11
+ movaps XMMWORD[(-88)+rax],xmm12
+ movaps XMMWORD[(-72)+rax],xmm13
+ movaps XMMWORD[(-56)+rax],xmm14
+ movaps XMMWORD[(-40)+rax],xmm15
+ sub rsp,288 ; reserve 288 bytes of scratch space
+ shl edx,1 ; double the count argument (the loop head below reads two descriptors per pass -- presumably why)
+ and rsp,-256 ; round rsp down to a 256-byte boundary
+ lea rdi,[128+rdi] ; bias rdi by +128; the code below addresses it as (N-128)+rdi
+ mov QWORD[272+rsp],rax ; remember the entry rsp at 272+rsp so the frame can be undone later
+$L$body_shaext:
+ lea rbx,[256+rsp] ; rbx -> scratch at rsp+256, where the loop head stores the per-lane counts
+ lea rbp,[((K256_shaext+128))] ; rbp -> K256_shaext biased by +128; the loop reads it as (N-128)+rbp
+
+$L$oop_grande_shaext: ; Outer-loop head: fetch two (pointer,count) descriptors from rsi, then gather two lanes of stored state into xmm12..xmm15
+ mov DWORD[280+rsp],edx ; park the outer count at 280+rsp; edx is reused below
+ xor edx,edx ; edx = 0, becomes the running maximum of the two counts
+
+ mov r8,QWORD[rsi] ; descriptor 0: r8 = pointer at [rsi]
+
+ mov ecx,DWORD[8+rsi] ; descriptor 0: ecx = dword count at [rsi+8]
+ cmp ecx,edx
+ cmovg edx,ecx ; edx = max(edx, count), signed
+ test ecx,ecx
+ mov DWORD[rbx],ecx ; record count 0 at [rbx]
+ cmovle r8,rsp ; count <= 0: point r8 at the stack instead (presumably so the loop's loads stay on valid memory)
+
+ mov r9,QWORD[16+rsi] ; descriptor 1: r9 = pointer at [rsi+16]
+
+ mov ecx,DWORD[24+rsi] ; descriptor 1: ecx = dword count at [rsi+24]
+ cmp ecx,edx
+ cmovg edx,ecx ; edx = max of both counts
+ test ecx,ecx
+ mov DWORD[4+rbx],ecx ; record count 1 at [rbx+4]
+ cmovle r9,rsp ; count <= 0: point r9 at the stack instead
+ test edx,edx
+ jz NEAR $L$done_shaext ; both counts <= 0: nothing to do for this pair ($L$done_shaext is past the end of this chunk)
+
+ movq xmm12,QWORD[((0-128))+rdi] ; low 8 bytes (dwords 0 and 1) of each of the eight rows at 32-byte stride from rdi-128; call them row0..row7 (presumably state words a..h)
+ movq xmm4,QWORD[((32-128))+rdi]
+ movq xmm13,QWORD[((64-128))+rdi]
+ movq xmm5,QWORD[((96-128))+rdi]
+ movq xmm8,QWORD[((128-128))+rdi]
+ movq xmm9,QWORD[((160-128))+rdi]
+ movq xmm10,QWORD[((192-128))+rdi]
+ movq xmm11,QWORD[((224-128))+rdi]
+
+ punpckldq xmm12,xmm4 ; interleave dwords: xmm12 = row0[0],row1[0],row0[1],row1[1]
+ punpckldq xmm13,xmm5 ; xmm13 = row2[0],row3[0],row2[1],row3[1]
+ punpckldq xmm8,xmm9 ; xmm8 = row4[0],row5[0],row4[1],row5[1]
+ punpckldq xmm10,xmm11 ; xmm10 = row6[0],row7[0],row6[1],row7[1]
+ movdqa xmm3,XMMWORD[((K256_shaext-16))] ; xmm3 = the 16 bytes just below K256_shaext; the loop uses it as pshufb control (DB 102,15,56,0,227 = pshufb xmm4,xmm3)
+
+ movdqa xmm14,xmm12 ; keep copies, the high halves are needed for lane 1
+ movdqa xmm15,xmm13
+ punpcklqdq xmm12,xmm8 ; lane 0: xmm12 = row0,row1,row4,row5
+ punpcklqdq xmm13,xmm10 ; lane 0: xmm13 = row2,row3,row6,row7
+ punpckhqdq xmm14,xmm8 ; lane 1: xmm14 = row0,row1,row4,row5
+ punpckhqdq xmm15,xmm10 ; lane 1: xmm15 = row2,row3,row6,row7
+
+ pshufd xmm12,xmm12,27 ; imm 27 = 0x1B reverses the four dwords: row5,row4,row1,row0 (low to high)
+ pshufd xmm13,xmm13,27 ; row7,row6,row3,row2
+ pshufd xmm14,xmm14,27
+ pshufd xmm15,xmm15,27
+ jmp NEAR $L$oop_shaext
+
+ALIGN 32
+$L$oop_shaext:
+ movdqu xmm4,XMMWORD[r8]
+ movdqu xmm8,XMMWORD[r9]
+ movdqu xmm5,XMMWORD[16+r8]
+ movdqu xmm9,XMMWORD[16+r9]
+ movdqu xmm6,XMMWORD[32+r8]
+DB 102,15,56,0,227
+ movdqu xmm10,XMMWORD[32+r9]
+DB 102,68,15,56,0,195
+ movdqu xmm7,XMMWORD[48+r8]
+ lea r8,[64+r8]
+ movdqu xmm11,XMMWORD[48+r9]
+ lea r9,[64+r9]
+
+ movdqa xmm0,XMMWORD[((0-128))+rbp]
+DB 102,15,56,0,235
+ paddd xmm0,xmm4
+ pxor xmm4,xmm12
+ movdqa xmm1,xmm0
+ movdqa xmm2,XMMWORD[((0-128))+rbp]
+DB 102,68,15,56,0,203
+ paddd xmm2,xmm8
+ movdqa XMMWORD[80+rsp],xmm13
+DB 69,15,56,203,236
+ pxor xmm8,xmm14
+ movdqa xmm0,xmm2
+ movdqa XMMWORD[112+rsp],xmm15
+DB 69,15,56,203,254
+ pshufd xmm0,xmm1,0x0e
+ pxor xmm4,xmm12
+ movdqa XMMWORD[64+rsp],xmm12
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ pxor xmm8,xmm14
+ movdqa XMMWORD[96+rsp],xmm14
+ movdqa xmm1,XMMWORD[((16-128))+rbp]
+ paddd xmm1,xmm5
+DB 102,15,56,0,243
+DB 69,15,56,203,247
+
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((16-128))+rbp]
+ paddd xmm2,xmm9
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ prefetcht0 [127+r8]
+DB 102,15,56,0,251
+DB 102,68,15,56,0,211
+ prefetcht0 [127+r9]
+DB 69,15,56,203,254
+ pshufd xmm0,xmm1,0x0e
+DB 102,68,15,56,0,219
+DB 15,56,204,229
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((32-128))+rbp]
+ paddd xmm1,xmm6
+DB 69,15,56,203,247
+
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((32-128))+rbp]
+ paddd xmm2,xmm10
+DB 69,15,56,203,236
+DB 69,15,56,204,193
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm7
+DB 69,15,56,203,254
+ pshufd xmm0,xmm1,0x0e
+DB 102,15,58,15,222,4
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm11
+DB 102,65,15,58,15,218,4
+DB 15,56,204,238
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((48-128))+rbp]
+ paddd xmm1,xmm7
+DB 69,15,56,203,247
+DB 69,15,56,204,202
+
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((48-128))+rbp]
+ paddd xmm8,xmm3
+ paddd xmm2,xmm11
+DB 15,56,205,231
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm4
+DB 102,15,58,15,223,4
+DB 69,15,56,203,254
+DB 69,15,56,205,195
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm5,xmm3
+ movdqa xmm3,xmm8
+DB 102,65,15,58,15,219,4
+DB 15,56,204,247
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((64-128))+rbp]
+ paddd xmm1,xmm4
+DB 69,15,56,203,247
+DB 69,15,56,204,211
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((64-128))+rbp]
+ paddd xmm9,xmm3
+ paddd xmm2,xmm8
+DB 15,56,205,236
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm5
+DB 102,15,58,15,220,4
+DB 69,15,56,203,254
+DB 69,15,56,205,200
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm6,xmm3
+ movdqa xmm3,xmm9
+DB 102,65,15,58,15,216,4
+DB 15,56,204,252
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((80-128))+rbp]
+ paddd xmm1,xmm5
+DB 69,15,56,203,247
+DB 69,15,56,204,216
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((80-128))+rbp]
+ paddd xmm10,xmm3
+ paddd xmm2,xmm9
+DB 15,56,205,245
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm6
+DB 102,15,58,15,221,4
+DB 69,15,56,203,254
+DB 69,15,56,205,209
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm7,xmm3
+ movdqa xmm3,xmm10
+DB 102,65,15,58,15,217,4
+DB 15,56,204,229
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((96-128))+rbp]
+ paddd xmm1,xmm6
+DB 69,15,56,203,247
+DB 69,15,56,204,193
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((96-128))+rbp]
+ paddd xmm11,xmm3
+ paddd xmm2,xmm10
+DB 15,56,205,254
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm7
+DB 102,15,58,15,222,4
+DB 69,15,56,203,254
+DB 69,15,56,205,218
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm11
+DB 102,65,15,58,15,218,4
+DB 15,56,204,238
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((112-128))+rbp]
+ paddd xmm1,xmm7
+DB 69,15,56,203,247
+DB 69,15,56,204,202
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((112-128))+rbp]
+ paddd xmm8,xmm3
+ paddd xmm2,xmm11
+DB 15,56,205,231
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm4
+DB 102,15,58,15,223,4
+DB 69,15,56,203,254
+DB 69,15,56,205,195
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm5,xmm3
+ movdqa xmm3,xmm8
+DB 102,65,15,58,15,219,4
+DB 15,56,204,247
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((128-128))+rbp]
+ paddd xmm1,xmm4
+DB 69,15,56,203,247
+DB 69,15,56,204,211
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((128-128))+rbp]
+ paddd xmm9,xmm3
+ paddd xmm2,xmm8
+DB 15,56,205,236
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm5
+DB 102,15,58,15,220,4
+DB 69,15,56,203,254
+DB 69,15,56,205,200
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm6,xmm3
+ movdqa xmm3,xmm9
+DB 102,65,15,58,15,216,4
+DB 15,56,204,252
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((144-128))+rbp]
+ paddd xmm1,xmm5
+DB 69,15,56,203,247
+DB 69,15,56,204,216
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((144-128))+rbp]
+ paddd xmm10,xmm3
+ paddd xmm2,xmm9
+DB 15,56,205,245
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm6
+DB 102,15,58,15,221,4
+DB 69,15,56,203,254
+DB 69,15,56,205,209
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm7,xmm3
+ movdqa xmm3,xmm10
+DB 102,65,15,58,15,217,4
+DB 15,56,204,229
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((160-128))+rbp]
+ paddd xmm1,xmm6
+DB 69,15,56,203,247
+DB 69,15,56,204,193
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((160-128))+rbp]
+ paddd xmm11,xmm3
+ paddd xmm2,xmm10
+DB 15,56,205,254
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm7
+DB 102,15,58,15,222,4
+DB 69,15,56,203,254
+DB 69,15,56,205,218
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm11
+DB 102,65,15,58,15,218,4
+DB 15,56,204,238
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((176-128))+rbp]
+ paddd xmm1,xmm7
+DB 69,15,56,203,247
+DB 69,15,56,204,202
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((176-128))+rbp]
+ paddd xmm8,xmm3
+ paddd xmm2,xmm11
+DB 15,56,205,231
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm4
+DB 102,15,58,15,223,4
+DB 69,15,56,203,254
+DB 69,15,56,205,195
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm5,xmm3
+ movdqa xmm3,xmm8
+DB 102,65,15,58,15,219,4
+DB 15,56,204,247
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((192-128))+rbp]
+ paddd xmm1,xmm4
+DB 69,15,56,203,247
+DB 69,15,56,204,211
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((192-128))+rbp]
+ paddd xmm9,xmm3
+ paddd xmm2,xmm8
+DB 15,56,205,236
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm5
+DB 102,15,58,15,220,4
+DB 69,15,56,203,254
+DB 69,15,56,205,200
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm6,xmm3
+ movdqa xmm3,xmm9
+DB 102,65,15,58,15,216,4
+DB 15,56,204,252
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((208-128))+rbp]
+ paddd xmm1,xmm5
+DB 69,15,56,203,247
+DB 69,15,56,204,216
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((208-128))+rbp]
+ paddd xmm10,xmm3
+ paddd xmm2,xmm9
+DB 15,56,205,245
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ movdqa xmm3,xmm6
+DB 102,15,58,15,221,4
+DB 69,15,56,203,254
+DB 69,15,56,205,209
+ pshufd xmm0,xmm1,0x0e
+ paddd xmm7,xmm3
+ movdqa xmm3,xmm10
+DB 102,65,15,58,15,217,4
+ nop
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm1,XMMWORD[((224-128))+rbp]
+ paddd xmm1,xmm6
+DB 69,15,56,203,247
+
+ movdqa xmm0,xmm1
+ movdqa xmm2,XMMWORD[((224-128))+rbp]
+ paddd xmm11,xmm3
+ paddd xmm2,xmm10
+DB 15,56,205,254
+ nop
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ mov ecx,1
+ pxor xmm6,xmm6
+DB 69,15,56,203,254
+DB 69,15,56,205,218
+ pshufd xmm0,xmm1,0x0e
+ movdqa xmm1,XMMWORD[((240-128))+rbp]
+ paddd xmm1,xmm7
+ movq xmm7,QWORD[rbx]
+ nop
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ movdqa xmm2,XMMWORD[((240-128))+rbp]
+ paddd xmm2,xmm11
+DB 69,15,56,203,247
+
+ movdqa xmm0,xmm1
+ cmp ecx,DWORD[rbx]
+ cmovge r8,rsp
+ cmp ecx,DWORD[4+rbx]
+ cmovge r9,rsp
+ pshufd xmm9,xmm7,0x00
+DB 69,15,56,203,236
+ movdqa xmm0,xmm2
+ pshufd xmm10,xmm7,0x55
+ movdqa xmm11,xmm7
+DB 69,15,56,203,254
+ pshufd xmm0,xmm1,0x0e
+ pcmpgtd xmm9,xmm6
+ pcmpgtd xmm10,xmm6
+DB 69,15,56,203,229
+ pshufd xmm0,xmm2,0x0e
+ pcmpgtd xmm11,xmm6
+ movdqa xmm3,XMMWORD[((K256_shaext-16))]
+DB 69,15,56,203,247
+
+ pand xmm13,xmm9
+ pand xmm15,xmm10
+ pand xmm12,xmm9
+ pand xmm14,xmm10
+ paddd xmm11,xmm7
+
+ paddd xmm13,XMMWORD[80+rsp]
+ paddd xmm15,XMMWORD[112+rsp]
+ paddd xmm12,XMMWORD[64+rsp]
+ paddd xmm14,XMMWORD[96+rsp]
+
+ movq QWORD[rbx],xmm11
+ dec edx
+ jnz NEAR $L$oop_shaext
+
+ mov edx,DWORD[280+rsp]
+
+ pshufd xmm12,xmm12,27
+ pshufd xmm13,xmm13,27
+ pshufd xmm14,xmm14,27
+ pshufd xmm15,xmm15,27
+
+ movdqa xmm5,xmm12
+ movdqa xmm6,xmm13
+ punpckldq xmm12,xmm14
+ punpckhdq xmm5,xmm14
+ punpckldq xmm13,xmm15
+ punpckhdq xmm6,xmm15
+
+ movq QWORD[(0-128)+rdi],xmm12
+ psrldq xmm12,8
+ movq QWORD[(128-128)+rdi],xmm5
+ psrldq xmm5,8
+ movq QWORD[(32-128)+rdi],xmm12
+ movq QWORD[(160-128)+rdi],xmm5
+
+ movq QWORD[(64-128)+rdi],xmm13
+ psrldq xmm13,8
+ movq QWORD[(192-128)+rdi],xmm6
+ psrldq xmm6,8
+ movq QWORD[(96-128)+rdi],xmm13
+ movq QWORD[(224-128)+rdi],xmm6
+
+ lea rdi,[8+rdi]
+ lea rsi,[32+rsi]
+ dec edx
+ jnz NEAR $L$oop_grande_shaext
+
+$L$done_shaext:                                ; exit path: restore saved registers relative to rax. NOTE(review): rax presumably still holds the frame base captured by the prologue (outside this chunk) - confirm
+
+ movaps xmm6,XMMWORD[((-184))+rax]             ; reload xmm6..xmm15 (callee-saved under the Microsoft x64 ABI)
+ movaps xmm7,XMMWORD[((-168))+rax]             ; ten 16-byte slots, rax-184 up to rax-40
+ movaps xmm8,XMMWORD[((-152))+rax]
+ movaps xmm9,XMMWORD[((-136))+rax]
+ movaps xmm10,XMMWORD[((-120))+rax]
+ movaps xmm11,XMMWORD[((-104))+rax]
+ movaps xmm12,XMMWORD[((-88))+rax]
+ movaps xmm13,XMMWORD[((-72))+rax]
+ movaps xmm14,XMMWORD[((-56))+rax]
+ movaps xmm15,XMMWORD[((-40))+rax]
+ mov rbp,QWORD[((-16))+rax]                    ; reload rbp from rax-16
+
+ mov rbx,QWORD[((-8))+rax]                     ; reload rbx from rax-8
+
+ lea rsp,[rax]                                 ; rsp = rax, releasing the whole frame in one step
+
+$L$epilogue_shaext:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue: reload rdi from rsp+8 (the avx entry below stores rdi at the same offset)
+ mov rsi,QWORD[16+rsp]                         ; reload rsi from rsp+16
+ DB 0F3h,0C3h ;repret: the bytes F3 C3 encode `rep ret`
+
+$L$SEH_end_sha256_multi_block_shaext:
+
+ALIGN 32
+sha256_multi_block_avx:                        ; AVX entry: moves the Win64 argument registers rcx/rdx/r8 into rdi/rsi/rdx, then picks a code path
+ mov QWORD[8+rsp],rdi ;WIN64 prologue: stash rdi (callee-saved on Win64) in the caller-provided slot above the return address
+ mov QWORD[16+rsp],rsi                         ; stash rsi (also callee-saved on Win64) in the next slot
+ mov rax,rsp                                   ; rax = rsp at entry (set again at $L$avx)
+$L$SEH_begin_sha256_multi_block_avx:
+ mov rdi,rcx                                   ; 1st Win64 argument -> rdi
+ mov rsi,rdx                                   ; 2nd Win64 argument -> rsi
+ mov rdx,r8                                    ; 3rd Win64 argument -> rdx
+
+
+
+_avx_shortcut:                                 ; secondary entry label. NOTE(review): presumably reached from a dispatcher outside this chunk with CPU capability bits in rcx - confirm
+ shr rcx,32                                    ; ecx = former upper 32 bits of rcx
+ cmp edx,2                                     ; compare edx (the 3rd argument on the fall-through path) with 2
+ jb NEAR $L$avx                                ; edx < 2 (unsigned): take the xmm path at $L$avx
+ test ecx,32                                   ; probe bit 5 of ecx
+ jnz NEAR _avx2_shortcut                       ; bit set: continue at _avx2_shortcut (defined outside this chunk)
+ jmp NEAR $L$avx                               ; bit clear: take the xmm path
+ALIGN 32
+$L$avx:                                        ; frame setup: save rbx, rbp and xmm6..xmm15, then carve out a 256-byte-aligned scratch area
+ mov rax,rsp                                   ; rax = frame base; every save below is addressable relative to it
+
+ push rbx                                      ; lands at rax-8
+
+ push rbp                                      ; lands at rax-16
+
+ lea rsp,[((-168))+rsp]                        ; rsp = rax-184: room for ten 16-byte xmm saves plus 8 bytes
+ movaps XMMWORD[rsp],xmm6                      ; rax-184
+ movaps XMMWORD[16+rsp],xmm7                   ; rax-168
+ movaps XMMWORD[32+rsp],xmm8                   ; rax-152
+ movaps XMMWORD[48+rsp],xmm9                   ; rax-136
+ movaps XMMWORD[(-120)+rax],xmm10              ; remaining saves are addressed from rax instead of rsp
+ movaps XMMWORD[(-104)+rax],xmm11
+ movaps XMMWORD[(-88)+rax],xmm12
+ movaps XMMWORD[(-72)+rax],xmm13
+ movaps XMMWORD[(-56)+rax],xmm14
+ movaps XMMWORD[(-40)+rax],xmm15
+ sub rsp,288                                   ; reserve 288 bytes of scratch below the saves
+ and rsp,-256                                  ; round rsp down to a 256-byte boundary
+ mov QWORD[272+rsp],rax                        ; keep the frame base at rsp+272, since rax is reused as a buffer pointer in $L$oop_grande_avx
+
+$L$body_avx:
+ lea rbp,[((K256+128))]                        ; rbp = K256 biased by +128; the rounds address constants at -128..+96 from rbp
+ lea rbx,[256+rsp]                             ; rbx -> four dword per-lane counters at rsp+256 (filled in by $L$oop_grande_avx)
+ lea rdi,[128+rdi]                             ; bias rdi by +128; state loads use (offset-128)+rdi
+
+$L$oop_grande_avx:                             ; outer loop head: read four {pointer,count} entries from rsi, find the largest count, load the hash state
+ mov DWORD[280+rsp],edx                        ; spill the incoming edx to rsp+280 before edx is reused
+ xor edx,edx                                   ; edx = running maximum of the per-lane counts, starting at 0
+
+ mov r8,QWORD[rsi]                             ; lane 0 input pointer
+
+ mov ecx,DWORD[8+rsi]                          ; lane 0 count
+ cmp ecx,edx
+ cmovg edx,ecx                                 ; edx = max(edx, count), signed compare
+ test ecx,ecx                                  ; sets flags for the cmovle below
+ mov DWORD[rbx],ecx                            ; record the lane 0 count; mov leaves the flags from test intact
+ cmovle r8,rbp                                 ; count <= 0: point the lane at rbp instead. NOTE(review): presumably so an idle lane still reads valid memory - confirm
+
+ mov r9,QWORD[16+rsi]                          ; lane 1 input pointer
+
+ mov ecx,DWORD[24+rsi]                         ; lane 1 count
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[4+rbx],ecx                          ; record the lane 1 count
+ cmovle r9,rbp
+
+ mov r10,QWORD[32+rsi]                         ; lane 2 input pointer
+
+ mov ecx,DWORD[40+rsi]                         ; lane 2 count
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[8+rbx],ecx                          ; record the lane 2 count
+ cmovle r10,rbp
+
+ mov r11,QWORD[48+rsi]                         ; lane 3 input pointer
+
+ mov ecx,DWORD[56+rsi]                         ; lane 3 count
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[12+rbx],ecx                         ; record the lane 3 count
+ cmovle r11,rbp
+ test edx,edx                                  ; is the largest count zero?
+ jz NEAR $L$done_avx                           ; yes: nothing to hash, leave via $L$done_avx
+
+ vmovdqu xmm8,XMMWORD[((0-128))+rdi]           ; load eight 16-byte state rows, 32 bytes apart, into xmm8..xmm15
+ lea rax,[128+rsp]                             ; rax = rsp+128: biased pointer to the message-word buffer used by the rounds
+ vmovdqu xmm9,XMMWORD[((32-128))+rdi]
+ vmovdqu xmm10,XMMWORD[((64-128))+rdi]
+ vmovdqu xmm11,XMMWORD[((96-128))+rdi]
+ vmovdqu xmm12,XMMWORD[((128-128))+rdi]
+ vmovdqu xmm13,XMMWORD[((160-128))+rdi]
+ vmovdqu xmm14,XMMWORD[((192-128))+rdi]
+ vmovdqu xmm15,XMMWORD[((224-128))+rdi]
+ vmovdqu xmm6,XMMWORD[$L$pbswap]               ; xmm6 = shuffle mask consumed by vpshufb in every round of $L$oop_avx
+ jmp NEAR $L$oop_avx
+
+ALIGN 32
+$L$oop_avx:                                    ; round 0 of 16 unrolled rounds; here a..h = xmm8..xmm15, and the roles rotate by one register each round
+ vpxor xmm4,xmm10,xmm9                         ; xmm4 = b ^ c (first half of Maj)
+ vmovd xmm5,DWORD[r8]                          ; gather one dword from each of the four input streams
+ vmovd xmm0,DWORD[r9]
+ vpinsrd xmm5,xmm5,DWORD[r10],1                ; xmm5 = {[r8],[r10]}
+ vpinsrd xmm0,xmm0,DWORD[r11],1                ; xmm0 = {[r9],[r11]}
+ vpunpckldq xmm5,xmm5,xmm0                     ; interleave: xmm5 = {[r8],[r9],[r10],[r11]}
+ vpshufb xmm5,xmm5,xmm6                        ; byte-shuffle with the $L$pbswap mask held in xmm6
+ vpsrld xmm7,xmm12,6                           ; shift pair 6/26 forms e ror 6
+ vpslld xmm2,xmm12,26
+ vmovdqu XMMWORD[(0-128)+rax],xmm5             ; store the message word in slot 0 of the buffer at rax-128
+ vpaddd xmm5,xmm5,xmm15                        ; T1 = W + h
+
+ vpsrld xmm1,xmm12,11                          ; shift pair 11/21 forms e ror 11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm12,21
+ vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]        ; T1 += round constant (constants are 32 bytes apart from rbp-128)
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm12,25                          ; shift pair 25/7 forms e ror 25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm12,7
+ vpandn xmm0,xmm12,xmm14                       ; xmm0 = ~e & g
+ vpand xmm3,xmm12,xmm13                        ; xmm3 = e & f
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm15,xmm8,2                           ; h is consumed, so xmm15 is reused; shift pair 2/30 forms a ror 2
+ vpxor xmm7,xmm7,xmm2                          ; xmm7 = (e ror 6) ^ (e ror 11) ^ (e ror 25) = Sigma1(e)
+
+ vpslld xmm1,xmm8,30
+ vpxor xmm0,xmm0,xmm3                          ; xmm0 = Ch(e,f,g)
+ vpxor xmm3,xmm9,xmm8                          ; xmm3 = a ^ b (the next round uses it as its b ^ c)
+
+ vpxor xmm15,xmm15,xmm1                        ; xmm15 = a ror 2
+ vpaddd xmm5,xmm5,xmm7                         ; T1 += Sigma1(e)
+
+ vpsrld xmm1,xmm8,13                           ; shift pair 13/19 forms a ror 13
+
+ vpslld xmm2,xmm8,19
+ vpaddd xmm5,xmm5,xmm0                         ; T1 += Ch(e,f,g)
+ vpand xmm4,xmm4,xmm3                          ; xmm4 = (b ^ c) & (a ^ b)
+
+ vpxor xmm7,xmm15,xmm1
+
+ vpsrld xmm1,xmm8,22                           ; shift pair 22/10 forms a ror 22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm8,10
+ vpxor xmm15,xmm9,xmm4                         ; xmm15 = b ^ ((b ^ c) & (a ^ b)) = Maj(a,b,c)
+ vpaddd xmm11,xmm11,xmm5                       ; d += T1 (becomes e of the next round)
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2                          ; xmm7 = (a ror 2) ^ (a ror 13) ^ (a ror 22) = Sigma0(a)
+
+ vpaddd xmm15,xmm15,xmm5                       ; xmm15 = Maj + T1
+ vpaddd xmm15,xmm15,xmm7                       ; xmm15 = T1 + Sigma0(a) + Maj = a of the next round
+ vmovd xmm5,DWORD[4+r8]
+ vmovd xmm0,DWORD[4+r9]
+ vpinsrd xmm5,xmm5,DWORD[4+r10],1
+ vpinsrd xmm0,xmm0,DWORD[4+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm11,6
+ vpslld xmm2,xmm11,26
+ vmovdqu XMMWORD[(16-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm14
+
+ vpsrld xmm1,xmm11,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm11,21
+ vpaddd xmm5,xmm5,XMMWORD[((-96))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm11,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm11,7
+ vpandn xmm0,xmm11,xmm13
+ vpand xmm4,xmm11,xmm12
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm14,xmm15,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm15,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm8,xmm15
+
+ vpxor xmm14,xmm14,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm15,13
+
+ vpslld xmm2,xmm15,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm14,xmm1
+
+ vpsrld xmm1,xmm15,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm15,10
+ vpxor xmm14,xmm8,xmm3
+ vpaddd xmm10,xmm10,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm14,xmm14,xmm5
+ vpaddd xmm14,xmm14,xmm7
+ vmovd xmm5,DWORD[8+r8]
+ vmovd xmm0,DWORD[8+r9]
+ vpinsrd xmm5,xmm5,DWORD[8+r10],1
+ vpinsrd xmm0,xmm0,DWORD[8+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm10,6
+ vpslld xmm2,xmm10,26
+ vmovdqu XMMWORD[(32-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm13
+
+ vpsrld xmm1,xmm10,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm10,21
+ vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm10,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm10,7
+ vpandn xmm0,xmm10,xmm12
+ vpand xmm3,xmm10,xmm11
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm13,xmm14,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm14,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm15,xmm14
+
+ vpxor xmm13,xmm13,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm14,13
+
+ vpslld xmm2,xmm14,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm13,xmm1
+
+ vpsrld xmm1,xmm14,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm14,10
+ vpxor xmm13,xmm15,xmm4
+ vpaddd xmm9,xmm9,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm13,xmm13,xmm5
+ vpaddd xmm13,xmm13,xmm7
+ vmovd xmm5,DWORD[12+r8]
+ vmovd xmm0,DWORD[12+r9]
+ vpinsrd xmm5,xmm5,DWORD[12+r10],1
+ vpinsrd xmm0,xmm0,DWORD[12+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm9,6
+ vpslld xmm2,xmm9,26
+ vmovdqu XMMWORD[(48-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm12
+
+ vpsrld xmm1,xmm9,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm9,21
+ vpaddd xmm5,xmm5,XMMWORD[((-32))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm9,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm9,7
+ vpandn xmm0,xmm9,xmm11
+ vpand xmm4,xmm9,xmm10
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm12,xmm13,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm13,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm14,xmm13
+
+ vpxor xmm12,xmm12,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm13,13
+
+ vpslld xmm2,xmm13,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm12,xmm1
+
+ vpsrld xmm1,xmm13,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm13,10
+ vpxor xmm12,xmm14,xmm3
+ vpaddd xmm8,xmm8,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm12,xmm12,xmm5
+ vpaddd xmm12,xmm12,xmm7
+ vmovd xmm5,DWORD[16+r8]
+ vmovd xmm0,DWORD[16+r9]
+ vpinsrd xmm5,xmm5,DWORD[16+r10],1
+ vpinsrd xmm0,xmm0,DWORD[16+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm8,6
+ vpslld xmm2,xmm8,26
+ vmovdqu XMMWORD[(64-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm11
+
+ vpsrld xmm1,xmm8,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm8,21
+ vpaddd xmm5,xmm5,XMMWORD[rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm8,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm8,7
+ vpandn xmm0,xmm8,xmm10
+ vpand xmm3,xmm8,xmm9
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm11,xmm12,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm12,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm13,xmm12
+
+ vpxor xmm11,xmm11,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm12,13
+
+ vpslld xmm2,xmm12,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm11,xmm1
+
+ vpsrld xmm1,xmm12,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm12,10
+ vpxor xmm11,xmm13,xmm4
+ vpaddd xmm15,xmm15,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm11,xmm11,xmm5
+ vpaddd xmm11,xmm11,xmm7
+ vmovd xmm5,DWORD[20+r8]
+ vmovd xmm0,DWORD[20+r9]
+ vpinsrd xmm5,xmm5,DWORD[20+r10],1
+ vpinsrd xmm0,xmm0,DWORD[20+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm15,6
+ vpslld xmm2,xmm15,26
+ vmovdqu XMMWORD[(80-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm10
+
+ vpsrld xmm1,xmm15,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm15,21
+ vpaddd xmm5,xmm5,XMMWORD[32+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm15,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm15,7
+ vpandn xmm0,xmm15,xmm9
+ vpand xmm4,xmm15,xmm8
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm10,xmm11,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm11,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm12,xmm11
+
+ vpxor xmm10,xmm10,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm11,13
+
+ vpslld xmm2,xmm11,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm10,xmm1
+
+ vpsrld xmm1,xmm11,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm11,10
+ vpxor xmm10,xmm12,xmm3
+ vpaddd xmm14,xmm14,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm10,xmm10,xmm5
+ vpaddd xmm10,xmm10,xmm7
+ vmovd xmm5,DWORD[24+r8]
+ vmovd xmm0,DWORD[24+r9]
+ vpinsrd xmm5,xmm5,DWORD[24+r10],1
+ vpinsrd xmm0,xmm0,DWORD[24+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm14,6
+ vpslld xmm2,xmm14,26
+ vmovdqu XMMWORD[(96-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm9
+
+ vpsrld xmm1,xmm14,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm14,21
+ vpaddd xmm5,xmm5,XMMWORD[64+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm14,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm14,7
+ vpandn xmm0,xmm14,xmm8
+ vpand xmm3,xmm14,xmm15
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm9,xmm10,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm10,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm11,xmm10
+
+ vpxor xmm9,xmm9,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm10,13
+
+ vpslld xmm2,xmm10,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm9,xmm1
+
+ vpsrld xmm1,xmm10,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm10,10
+ vpxor xmm9,xmm11,xmm4
+ vpaddd xmm13,xmm13,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm9,xmm9,xmm5
+ vpaddd xmm9,xmm9,xmm7
+ vmovd xmm5,DWORD[28+r8]
+ vmovd xmm0,DWORD[28+r9]
+ vpinsrd xmm5,xmm5,DWORD[28+r10],1
+ vpinsrd xmm0,xmm0,DWORD[28+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm13,6
+ vpslld xmm2,xmm13,26
+ vmovdqu XMMWORD[(112-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm8
+
+ vpsrld xmm1,xmm13,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm13,21
+ vpaddd xmm5,xmm5,XMMWORD[96+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm13,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm13,7
+ vpandn xmm0,xmm13,xmm15
+ vpand xmm4,xmm13,xmm14
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm8,xmm9,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm9,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm10,xmm9
+
+ vpxor xmm8,xmm8,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm9,13
+
+ vpslld xmm2,xmm9,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm8,xmm1
+
+ vpsrld xmm1,xmm9,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm9,10
+ vpxor xmm8,xmm10,xmm3
+ vpaddd xmm12,xmm12,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm8,xmm8,xmm5
+ vpaddd xmm8,xmm8,xmm7
+ add rbp,256
+ vmovd xmm5,DWORD[32+r8]
+ vmovd xmm0,DWORD[32+r9]
+ vpinsrd xmm5,xmm5,DWORD[32+r10],1
+ vpinsrd xmm0,xmm0,DWORD[32+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm12,6
+ vpslld xmm2,xmm12,26
+ vmovdqu XMMWORD[(128-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm15
+
+ vpsrld xmm1,xmm12,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm12,21
+ vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm12,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm12,7
+ vpandn xmm0,xmm12,xmm14
+ vpand xmm3,xmm12,xmm13
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm15,xmm8,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm8,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm9,xmm8
+
+ vpxor xmm15,xmm15,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm8,13
+
+ vpslld xmm2,xmm8,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm15,xmm1
+
+ vpsrld xmm1,xmm8,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm8,10
+ vpxor xmm15,xmm9,xmm4
+ vpaddd xmm11,xmm11,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm15,xmm15,xmm5
+ vpaddd xmm15,xmm15,xmm7
+ vmovd xmm5,DWORD[36+r8]
+ vmovd xmm0,DWORD[36+r9]
+ vpinsrd xmm5,xmm5,DWORD[36+r10],1
+ vpinsrd xmm0,xmm0,DWORD[36+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm11,6
+ vpslld xmm2,xmm11,26
+ vmovdqu XMMWORD[(144-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm14
+
+ vpsrld xmm1,xmm11,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm11,21
+ vpaddd xmm5,xmm5,XMMWORD[((-96))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm11,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm11,7
+ vpandn xmm0,xmm11,xmm13
+ vpand xmm4,xmm11,xmm12
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm14,xmm15,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm15,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm8,xmm15
+
+ vpxor xmm14,xmm14,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm15,13
+
+ vpslld xmm2,xmm15,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm14,xmm1
+
+ vpsrld xmm1,xmm15,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm15,10
+ vpxor xmm14,xmm8,xmm3
+ vpaddd xmm10,xmm10,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm14,xmm14,xmm5
+ vpaddd xmm14,xmm14,xmm7
+ vmovd xmm5,DWORD[40+r8]
+ vmovd xmm0,DWORD[40+r9]
+ vpinsrd xmm5,xmm5,DWORD[40+r10],1
+ vpinsrd xmm0,xmm0,DWORD[40+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm10,6
+ vpslld xmm2,xmm10,26
+ vmovdqu XMMWORD[(160-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm13
+
+ vpsrld xmm1,xmm10,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm10,21
+ vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm10,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm10,7
+ vpandn xmm0,xmm10,xmm12
+ vpand xmm3,xmm10,xmm11
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm13,xmm14,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm14,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm15,xmm14
+
+ vpxor xmm13,xmm13,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm14,13
+
+ vpslld xmm2,xmm14,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm13,xmm1
+
+ vpsrld xmm1,xmm14,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm14,10
+ vpxor xmm13,xmm15,xmm4
+ vpaddd xmm9,xmm9,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm13,xmm13,xmm5
+ vpaddd xmm13,xmm13,xmm7
+ vmovd xmm5,DWORD[44+r8]
+ vmovd xmm0,DWORD[44+r9]
+ vpinsrd xmm5,xmm5,DWORD[44+r10],1
+ vpinsrd xmm0,xmm0,DWORD[44+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm9,6
+ vpslld xmm2,xmm9,26
+ vmovdqu XMMWORD[(176-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm12
+
+ vpsrld xmm1,xmm9,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm9,21
+ vpaddd xmm5,xmm5,XMMWORD[((-32))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm9,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm9,7
+ vpandn xmm0,xmm9,xmm11
+ vpand xmm4,xmm9,xmm10
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm12,xmm13,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm13,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm14,xmm13
+
+ vpxor xmm12,xmm12,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm13,13
+
+ vpslld xmm2,xmm13,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm12,xmm1
+
+ vpsrld xmm1,xmm13,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm13,10
+ vpxor xmm12,xmm14,xmm3
+ vpaddd xmm8,xmm8,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm12,xmm12,xmm5
+ vpaddd xmm12,xmm12,xmm7
+ vmovd xmm5,DWORD[48+r8]
+ vmovd xmm0,DWORD[48+r9]
+ vpinsrd xmm5,xmm5,DWORD[48+r10],1
+ vpinsrd xmm0,xmm0,DWORD[48+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm8,6
+ vpslld xmm2,xmm8,26
+ vmovdqu XMMWORD[(192-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm11
+
+ vpsrld xmm1,xmm8,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm8,21
+ vpaddd xmm5,xmm5,XMMWORD[rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm8,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm8,7
+ vpandn xmm0,xmm8,xmm10
+ vpand xmm3,xmm8,xmm9
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm11,xmm12,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm12,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm13,xmm12
+
+ vpxor xmm11,xmm11,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm12,13
+
+ vpslld xmm2,xmm12,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm11,xmm1
+
+ vpsrld xmm1,xmm12,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm12,10
+ vpxor xmm11,xmm13,xmm4
+ vpaddd xmm15,xmm15,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm11,xmm11,xmm5
+ vpaddd xmm11,xmm11,xmm7
+ vmovd xmm5,DWORD[52+r8]
+ vmovd xmm0,DWORD[52+r9]
+ vpinsrd xmm5,xmm5,DWORD[52+r10],1
+ vpinsrd xmm0,xmm0,DWORD[52+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm15,6
+ vpslld xmm2,xmm15,26
+ vmovdqu XMMWORD[(208-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm10
+
+ vpsrld xmm1,xmm15,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm15,21
+ vpaddd xmm5,xmm5,XMMWORD[32+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm15,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm15,7
+ vpandn xmm0,xmm15,xmm9
+ vpand xmm4,xmm15,xmm8
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm10,xmm11,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm11,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm12,xmm11
+
+ vpxor xmm10,xmm10,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm11,13
+
+ vpslld xmm2,xmm11,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm10,xmm1
+
+ vpsrld xmm1,xmm11,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm11,10
+ vpxor xmm10,xmm12,xmm3
+ vpaddd xmm14,xmm14,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm10,xmm10,xmm5
+ vpaddd xmm10,xmm10,xmm7
+ vmovd xmm5,DWORD[56+r8]
+ vmovd xmm0,DWORD[56+r9]
+ vpinsrd xmm5,xmm5,DWORD[56+r10],1
+ vpinsrd xmm0,xmm0,DWORD[56+r11],1
+ vpunpckldq xmm5,xmm5,xmm0
+ vpshufb xmm5,xmm5,xmm6
+ vpsrld xmm7,xmm14,6
+ vpslld xmm2,xmm14,26
+ vmovdqu XMMWORD[(224-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm9
+
+ vpsrld xmm1,xmm14,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm14,21
+ vpaddd xmm5,xmm5,XMMWORD[64+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm14,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm14,7
+ vpandn xmm0,xmm14,xmm8
+ vpand xmm3,xmm14,xmm15
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm9,xmm10,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm10,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm11,xmm10
+
+ vpxor xmm9,xmm9,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm10,13
+
+ vpslld xmm2,xmm10,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm9,xmm1
+
+ vpsrld xmm1,xmm10,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm10,10
+ vpxor xmm9,xmm11,xmm4
+ vpaddd xmm13,xmm13,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm9,xmm9,xmm5
+ vpaddd xmm9,xmm9,xmm7
+ vmovd xmm5,DWORD[60+r8]                       ; final unrolled round: gather the dword at offset 60 of each stream; here a..h = xmm9..xmm15,xmm8
+ lea r8,[64+r8]                                ; advance each stream pointer by 64 bytes right after its last load
+ vmovd xmm0,DWORD[60+r9]
+ lea r9,[64+r9]
+ vpinsrd xmm5,xmm5,DWORD[60+r10],1
+ lea r10,[64+r10]
+ vpinsrd xmm0,xmm0,DWORD[60+r11],1
+ lea r11,[64+r11]
+ vpunpckldq xmm5,xmm5,xmm0                     ; xmm5 = one dword per stream, in r8,r9,r10,r11 order
+ vpshufb xmm5,xmm5,xmm6                        ; byte-shuffle with the mask held in xmm6
+ vpsrld xmm7,xmm13,6                           ; start Sigma1(e), e = xmm13
+ vpslld xmm2,xmm13,26
+ vmovdqu XMMWORD[(240-128)+rax],xmm5           ; store the message word in slot 15 of the buffer
+ vpaddd xmm5,xmm5,xmm8                         ; T1 = W + h, h = xmm8
+
+ vpsrld xmm1,xmm13,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm13,21
+ vpaddd xmm5,xmm5,XMMWORD[96+rbp]              ; T1 += round constant at rbp+96
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm13,25
+ vpxor xmm7,xmm7,xmm2
+ prefetcht0 [63+r8]                            ; prefetch at offset 63 from the already-advanced stream pointer
+ vpslld xmm2,xmm13,7
+ vpandn xmm0,xmm13,xmm15                       ; ~e & g
+ vpand xmm4,xmm13,xmm14                        ; e & f
+ prefetcht0 [63+r9]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm8,xmm9,2                            ; h is consumed, so xmm8 is reused to start Sigma0(a), a = xmm9
+ vpxor xmm7,xmm7,xmm2                          ; xmm7 = Sigma1(e)
+ prefetcht0 [63+r10]
+ vpslld xmm1,xmm9,30
+ vpxor xmm0,xmm0,xmm4                          ; xmm0 = Ch(e,f,g)
+ vpxor xmm4,xmm10,xmm9                         ; xmm4 = a ^ b
+ prefetcht0 [63+r11]
+ vpxor xmm8,xmm8,xmm1
+ vpaddd xmm5,xmm5,xmm7                         ; T1 += Sigma1(e)
+
+ vpsrld xmm1,xmm9,13
+
+ vpslld xmm2,xmm9,19
+ vpaddd xmm5,xmm5,xmm0                         ; T1 += Ch(e,f,g)
+ vpand xmm3,xmm3,xmm4                          ; (b ^ c) & (a ^ b); xmm3 carries b ^ c from the previous round
+
+ vpxor xmm7,xmm8,xmm1
+
+ vpsrld xmm1,xmm9,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm9,10
+ vpxor xmm8,xmm10,xmm3                         ; xmm8 = Maj(a,b,c)
+ vpaddd xmm12,xmm12,xmm5                       ; d += T1, d = xmm12
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2                          ; xmm7 = Sigma0(a)
+
+ vpaddd xmm8,xmm8,xmm5
+ vpaddd xmm8,xmm8,xmm7                         ; xmm8 = T1 + Sigma0(a) + Maj = new a
+ add rbp,256                                   ; step the constant pointer past eight more 32-byte slots
+ vmovdqu xmm5,XMMWORD[((0-128))+rax]           ; preload buffer slot 0 for the message-schedule rounds
+ mov ecx,3                                     ; NOTE(review): presumably the pass counter for $L$oop_16_xx_avx; its use lies outside this chunk - confirm
+ jmp NEAR $L$oop_16_xx_avx
+ALIGN 32
+$L$oop_16_xx_avx:
+ vmovdqu xmm6,XMMWORD[((16-128))+rax]
+ vpaddd xmm5,xmm5,XMMWORD[((144-128))+rax]
+
+ vpsrld xmm7,xmm6,3
+ vpsrld xmm1,xmm6,7
+ vpslld xmm2,xmm6,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm6,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm6,14
+ vmovdqu xmm0,XMMWORD[((224-128))+rax]
+ vpsrld xmm3,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm5,xmm5,xmm7
+ vpxor xmm7,xmm3,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm5,xmm5,xmm7
+ vpsrld xmm7,xmm12,6
+ vpslld xmm2,xmm12,26
+ vmovdqu XMMWORD[(0-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm15
+
+ vpsrld xmm1,xmm12,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm12,21
+ vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm12,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm12,7
+ vpandn xmm0,xmm12,xmm14
+ vpand xmm3,xmm12,xmm13
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm15,xmm8,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm8,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm9,xmm8
+
+ vpxor xmm15,xmm15,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm8,13
+
+ vpslld xmm2,xmm8,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm15,xmm1
+
+ vpsrld xmm1,xmm8,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm8,10
+ vpxor xmm15,xmm9,xmm4
+ vpaddd xmm11,xmm11,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm15,xmm15,xmm5
+ vpaddd xmm15,xmm15,xmm7
+ vmovdqu xmm5,XMMWORD[((32-128))+rax]
+ vpaddd xmm6,xmm6,XMMWORD[((160-128))+rax]
+
+ vpsrld xmm7,xmm5,3
+ vpsrld xmm1,xmm5,7
+ vpslld xmm2,xmm5,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm5,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm5,14
+ vmovdqu xmm0,XMMWORD[((240-128))+rax]
+ vpsrld xmm4,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm6,xmm6,xmm7
+ vpxor xmm7,xmm4,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm6,xmm6,xmm7
+ vpsrld xmm7,xmm11,6
+ vpslld xmm2,xmm11,26
+ vmovdqu XMMWORD[(16-128)+rax],xmm6
+ vpaddd xmm6,xmm6,xmm14
+
+ vpsrld xmm1,xmm11,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm11,21
+ vpaddd xmm6,xmm6,XMMWORD[((-96))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm11,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm11,7
+ vpandn xmm0,xmm11,xmm13
+ vpand xmm4,xmm11,xmm12
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm14,xmm15,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm15,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm8,xmm15
+
+ vpxor xmm14,xmm14,xmm1
+ vpaddd xmm6,xmm6,xmm7
+
+ vpsrld xmm1,xmm15,13
+
+ vpslld xmm2,xmm15,19
+ vpaddd xmm6,xmm6,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm14,xmm1
+
+ vpsrld xmm1,xmm15,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm15,10
+ vpxor xmm14,xmm8,xmm3
+ vpaddd xmm10,xmm10,xmm6
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm14,xmm14,xmm6
+ vpaddd xmm14,xmm14,xmm7
+ vmovdqu xmm6,XMMWORD[((48-128))+rax]
+ vpaddd xmm5,xmm5,XMMWORD[((176-128))+rax]
+
+ vpsrld xmm7,xmm6,3
+ vpsrld xmm1,xmm6,7
+ vpslld xmm2,xmm6,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm6,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm6,14
+ vmovdqu xmm0,XMMWORD[((0-128))+rax]
+ vpsrld xmm3,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm5,xmm5,xmm7
+ vpxor xmm7,xmm3,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm5,xmm5,xmm7
+ vpsrld xmm7,xmm10,6
+ vpslld xmm2,xmm10,26
+ vmovdqu XMMWORD[(32-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm13
+
+ vpsrld xmm1,xmm10,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm10,21
+ vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm10,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm10,7
+ vpandn xmm0,xmm10,xmm12
+ vpand xmm3,xmm10,xmm11
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm13,xmm14,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm14,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm15,xmm14
+
+ vpxor xmm13,xmm13,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm14,13
+
+ vpslld xmm2,xmm14,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm13,xmm1
+
+ vpsrld xmm1,xmm14,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm14,10
+ vpxor xmm13,xmm15,xmm4
+ vpaddd xmm9,xmm9,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm13,xmm13,xmm5
+ vpaddd xmm13,xmm13,xmm7
+ vmovdqu xmm5,XMMWORD[((64-128))+rax]
+ vpaddd xmm6,xmm6,XMMWORD[((192-128))+rax]
+
+ vpsrld xmm7,xmm5,3
+ vpsrld xmm1,xmm5,7
+ vpslld xmm2,xmm5,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm5,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm5,14
+ vmovdqu xmm0,XMMWORD[((16-128))+rax]
+ vpsrld xmm4,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm6,xmm6,xmm7
+ vpxor xmm7,xmm4,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm6,xmm6,xmm7
+ vpsrld xmm7,xmm9,6
+ vpslld xmm2,xmm9,26
+ vmovdqu XMMWORD[(48-128)+rax],xmm6
+ vpaddd xmm6,xmm6,xmm12
+
+ vpsrld xmm1,xmm9,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm9,21
+ vpaddd xmm6,xmm6,XMMWORD[((-32))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm9,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm9,7
+ vpandn xmm0,xmm9,xmm11
+ vpand xmm4,xmm9,xmm10
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm12,xmm13,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm13,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm14,xmm13
+
+ vpxor xmm12,xmm12,xmm1
+ vpaddd xmm6,xmm6,xmm7
+
+ vpsrld xmm1,xmm13,13
+
+ vpslld xmm2,xmm13,19
+ vpaddd xmm6,xmm6,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm12,xmm1
+
+ vpsrld xmm1,xmm13,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm13,10
+ vpxor xmm12,xmm14,xmm3
+ vpaddd xmm8,xmm8,xmm6
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm12,xmm12,xmm6
+ vpaddd xmm12,xmm12,xmm7
+ vmovdqu xmm6,XMMWORD[((80-128))+rax]
+ vpaddd xmm5,xmm5,XMMWORD[((208-128))+rax]
+
+ vpsrld xmm7,xmm6,3
+ vpsrld xmm1,xmm6,7
+ vpslld xmm2,xmm6,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm6,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm6,14
+ vmovdqu xmm0,XMMWORD[((32-128))+rax]
+ vpsrld xmm3,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm5,xmm5,xmm7
+ vpxor xmm7,xmm3,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm5,xmm5,xmm7
+ vpsrld xmm7,xmm8,6
+ vpslld xmm2,xmm8,26
+ vmovdqu XMMWORD[(64-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm11
+
+ vpsrld xmm1,xmm8,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm8,21
+ vpaddd xmm5,xmm5,XMMWORD[rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm8,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm8,7
+ vpandn xmm0,xmm8,xmm10
+ vpand xmm3,xmm8,xmm9
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm11,xmm12,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm12,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm13,xmm12
+
+ vpxor xmm11,xmm11,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm12,13
+
+ vpslld xmm2,xmm12,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm11,xmm1
+
+ vpsrld xmm1,xmm12,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm12,10
+ vpxor xmm11,xmm13,xmm4
+ vpaddd xmm15,xmm15,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm11,xmm11,xmm5
+ vpaddd xmm11,xmm11,xmm7
+ vmovdqu xmm5,XMMWORD[((96-128))+rax]
+ vpaddd xmm6,xmm6,XMMWORD[((224-128))+rax]
+
+ vpsrld xmm7,xmm5,3
+ vpsrld xmm1,xmm5,7
+ vpslld xmm2,xmm5,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm5,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm5,14
+ vmovdqu xmm0,XMMWORD[((48-128))+rax]
+ vpsrld xmm4,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm6,xmm6,xmm7
+ vpxor xmm7,xmm4,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm6,xmm6,xmm7
+ vpsrld xmm7,xmm15,6
+ vpslld xmm2,xmm15,26
+ vmovdqu XMMWORD[(80-128)+rax],xmm6
+ vpaddd xmm6,xmm6,xmm10
+
+ vpsrld xmm1,xmm15,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm15,21
+ vpaddd xmm6,xmm6,XMMWORD[32+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm15,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm15,7
+ vpandn xmm0,xmm15,xmm9
+ vpand xmm4,xmm15,xmm8
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm10,xmm11,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm11,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm12,xmm11
+
+ vpxor xmm10,xmm10,xmm1
+ vpaddd xmm6,xmm6,xmm7
+
+ vpsrld xmm1,xmm11,13
+
+ vpslld xmm2,xmm11,19
+ vpaddd xmm6,xmm6,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm10,xmm1
+
+ vpsrld xmm1,xmm11,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm11,10
+ vpxor xmm10,xmm12,xmm3
+ vpaddd xmm14,xmm14,xmm6
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm10,xmm10,xmm6
+ vpaddd xmm10,xmm10,xmm7
+ vmovdqu xmm6,XMMWORD[((112-128))+rax]
+ vpaddd xmm5,xmm5,XMMWORD[((240-128))+rax]
+
+ vpsrld xmm7,xmm6,3
+ vpsrld xmm1,xmm6,7
+ vpslld xmm2,xmm6,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm6,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm6,14
+ vmovdqu xmm0,XMMWORD[((64-128))+rax]
+ vpsrld xmm3,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm5,xmm5,xmm7
+ vpxor xmm7,xmm3,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm5,xmm5,xmm7
+ vpsrld xmm7,xmm14,6
+ vpslld xmm2,xmm14,26
+ vmovdqu XMMWORD[(96-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm9
+
+ vpsrld xmm1,xmm14,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm14,21
+ vpaddd xmm5,xmm5,XMMWORD[64+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm14,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm14,7
+ vpandn xmm0,xmm14,xmm8
+ vpand xmm3,xmm14,xmm15
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm9,xmm10,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm10,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm11,xmm10
+
+ vpxor xmm9,xmm9,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm10,13
+
+ vpslld xmm2,xmm10,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm9,xmm1
+
+ vpsrld xmm1,xmm10,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm10,10
+ vpxor xmm9,xmm11,xmm4
+ vpaddd xmm13,xmm13,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm9,xmm9,xmm5
+ vpaddd xmm9,xmm9,xmm7
+ vmovdqu xmm5,XMMWORD[((128-128))+rax]
+ vpaddd xmm6,xmm6,XMMWORD[((0-128))+rax]
+
+ vpsrld xmm7,xmm5,3
+ vpsrld xmm1,xmm5,7
+ vpslld xmm2,xmm5,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm5,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm5,14
+ vmovdqu xmm0,XMMWORD[((80-128))+rax]
+ vpsrld xmm4,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm6,xmm6,xmm7
+ vpxor xmm7,xmm4,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm6,xmm6,xmm7
+ vpsrld xmm7,xmm13,6
+ vpslld xmm2,xmm13,26
+ vmovdqu XMMWORD[(112-128)+rax],xmm6
+ vpaddd xmm6,xmm6,xmm8
+
+ vpsrld xmm1,xmm13,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm13,21
+ vpaddd xmm6,xmm6,XMMWORD[96+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm13,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm13,7
+ vpandn xmm0,xmm13,xmm15
+ vpand xmm4,xmm13,xmm14
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm8,xmm9,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm9,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm10,xmm9
+
+ vpxor xmm8,xmm8,xmm1
+ vpaddd xmm6,xmm6,xmm7
+
+ vpsrld xmm1,xmm9,13
+
+ vpslld xmm2,xmm9,19
+ vpaddd xmm6,xmm6,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm8,xmm1
+
+ vpsrld xmm1,xmm9,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm9,10
+ vpxor xmm8,xmm10,xmm3
+ vpaddd xmm12,xmm12,xmm6
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm8,xmm8,xmm6
+ vpaddd xmm8,xmm8,xmm7
+ add rbp,256
+ vmovdqu xmm6,XMMWORD[((144-128))+rax]
+ vpaddd xmm5,xmm5,XMMWORD[((16-128))+rax]
+
+ vpsrld xmm7,xmm6,3
+ vpsrld xmm1,xmm6,7
+ vpslld xmm2,xmm6,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm6,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm6,14
+ vmovdqu xmm0,XMMWORD[((96-128))+rax]
+ vpsrld xmm3,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm5,xmm5,xmm7
+ vpxor xmm7,xmm3,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm5,xmm5,xmm7
+ vpsrld xmm7,xmm12,6
+ vpslld xmm2,xmm12,26
+ vmovdqu XMMWORD[(128-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm15
+
+ vpsrld xmm1,xmm12,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm12,21
+ vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm12,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm12,7
+ vpandn xmm0,xmm12,xmm14
+ vpand xmm3,xmm12,xmm13
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm15,xmm8,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm8,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm9,xmm8
+
+ vpxor xmm15,xmm15,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm8,13
+
+ vpslld xmm2,xmm8,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm15,xmm1
+
+ vpsrld xmm1,xmm8,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm8,10
+ vpxor xmm15,xmm9,xmm4
+ vpaddd xmm11,xmm11,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm15,xmm15,xmm5
+ vpaddd xmm15,xmm15,xmm7
+ vmovdqu xmm5,XMMWORD[((160-128))+rax]
+ vpaddd xmm6,xmm6,XMMWORD[((32-128))+rax]
+
+ vpsrld xmm7,xmm5,3
+ vpsrld xmm1,xmm5,7
+ vpslld xmm2,xmm5,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm5,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm5,14
+ vmovdqu xmm0,XMMWORD[((112-128))+rax]
+ vpsrld xmm4,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm6,xmm6,xmm7
+ vpxor xmm7,xmm4,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm6,xmm6,xmm7
+ vpsrld xmm7,xmm11,6
+ vpslld xmm2,xmm11,26
+ vmovdqu XMMWORD[(144-128)+rax],xmm6
+ vpaddd xmm6,xmm6,xmm14
+
+ vpsrld xmm1,xmm11,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm11,21
+ vpaddd xmm6,xmm6,XMMWORD[((-96))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm11,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm11,7
+ vpandn xmm0,xmm11,xmm13
+ vpand xmm4,xmm11,xmm12
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm14,xmm15,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm15,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm8,xmm15
+
+ vpxor xmm14,xmm14,xmm1
+ vpaddd xmm6,xmm6,xmm7
+
+ vpsrld xmm1,xmm15,13
+
+ vpslld xmm2,xmm15,19
+ vpaddd xmm6,xmm6,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm14,xmm1
+
+ vpsrld xmm1,xmm15,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm15,10
+ vpxor xmm14,xmm8,xmm3
+ vpaddd xmm10,xmm10,xmm6
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm14,xmm14,xmm6
+ vpaddd xmm14,xmm14,xmm7
+ vmovdqu xmm6,XMMWORD[((176-128))+rax]
+ vpaddd xmm5,xmm5,XMMWORD[((48-128))+rax]
+
+ vpsrld xmm7,xmm6,3
+ vpsrld xmm1,xmm6,7
+ vpslld xmm2,xmm6,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm6,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm6,14
+ vmovdqu xmm0,XMMWORD[((128-128))+rax]
+ vpsrld xmm3,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm5,xmm5,xmm7
+ vpxor xmm7,xmm3,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm5,xmm5,xmm7
+ vpsrld xmm7,xmm10,6
+ vpslld xmm2,xmm10,26
+ vmovdqu XMMWORD[(160-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm13
+
+ vpsrld xmm1,xmm10,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm10,21
+ vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm10,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm10,7
+ vpandn xmm0,xmm10,xmm12
+ vpand xmm3,xmm10,xmm11
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm13,xmm14,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm14,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm15,xmm14
+
+ vpxor xmm13,xmm13,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm14,13
+
+ vpslld xmm2,xmm14,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm13,xmm1
+
+ vpsrld xmm1,xmm14,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm14,10
+ vpxor xmm13,xmm15,xmm4
+ vpaddd xmm9,xmm9,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm13,xmm13,xmm5
+ vpaddd xmm13,xmm13,xmm7
+ vmovdqu xmm5,XMMWORD[((192-128))+rax]
+ vpaddd xmm6,xmm6,XMMWORD[((64-128))+rax]
+
+ vpsrld xmm7,xmm5,3
+ vpsrld xmm1,xmm5,7
+ vpslld xmm2,xmm5,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm5,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm5,14
+ vmovdqu xmm0,XMMWORD[((144-128))+rax]
+ vpsrld xmm4,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm6,xmm6,xmm7
+ vpxor xmm7,xmm4,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm6,xmm6,xmm7
+ vpsrld xmm7,xmm9,6
+ vpslld xmm2,xmm9,26
+ vmovdqu XMMWORD[(176-128)+rax],xmm6
+ vpaddd xmm6,xmm6,xmm12
+
+ vpsrld xmm1,xmm9,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm9,21
+ vpaddd xmm6,xmm6,XMMWORD[((-32))+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm9,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm9,7
+ vpandn xmm0,xmm9,xmm11
+ vpand xmm4,xmm9,xmm10
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm12,xmm13,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm13,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm14,xmm13
+
+ vpxor xmm12,xmm12,xmm1
+ vpaddd xmm6,xmm6,xmm7
+
+ vpsrld xmm1,xmm13,13
+
+ vpslld xmm2,xmm13,19
+ vpaddd xmm6,xmm6,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm12,xmm1
+
+ vpsrld xmm1,xmm13,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm13,10
+ vpxor xmm12,xmm14,xmm3
+ vpaddd xmm8,xmm8,xmm6
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm12,xmm12,xmm6
+ vpaddd xmm12,xmm12,xmm7
+ vmovdqu xmm6,XMMWORD[((208-128))+rax]
+ vpaddd xmm5,xmm5,XMMWORD[((80-128))+rax]
+
+ vpsrld xmm7,xmm6,3
+ vpsrld xmm1,xmm6,7
+ vpslld xmm2,xmm6,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm6,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm6,14
+ vmovdqu xmm0,XMMWORD[((160-128))+rax]
+ vpsrld xmm3,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm5,xmm5,xmm7
+ vpxor xmm7,xmm3,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm5,xmm5,xmm7
+ vpsrld xmm7,xmm8,6
+ vpslld xmm2,xmm8,26
+ vmovdqu XMMWORD[(192-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm11
+
+ vpsrld xmm1,xmm8,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm8,21
+ vpaddd xmm5,xmm5,XMMWORD[rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm8,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm8,7
+ vpandn xmm0,xmm8,xmm10
+ vpand xmm3,xmm8,xmm9
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm11,xmm12,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm12,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm13,xmm12
+
+ vpxor xmm11,xmm11,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm12,13
+
+ vpslld xmm2,xmm12,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm11,xmm1
+
+ vpsrld xmm1,xmm12,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm12,10
+ vpxor xmm11,xmm13,xmm4
+ vpaddd xmm15,xmm15,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm11,xmm11,xmm5
+ vpaddd xmm11,xmm11,xmm7
+ vmovdqu xmm5,XMMWORD[((224-128))+rax]
+ vpaddd xmm6,xmm6,XMMWORD[((96-128))+rax]
+
+ vpsrld xmm7,xmm5,3
+ vpsrld xmm1,xmm5,7
+ vpslld xmm2,xmm5,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm5,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm5,14
+ vmovdqu xmm0,XMMWORD[((176-128))+rax]
+ vpsrld xmm4,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm6,xmm6,xmm7
+ vpxor xmm7,xmm4,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm6,xmm6,xmm7
+ vpsrld xmm7,xmm15,6
+ vpslld xmm2,xmm15,26
+ vmovdqu XMMWORD[(208-128)+rax],xmm6
+ vpaddd xmm6,xmm6,xmm10
+
+ vpsrld xmm1,xmm15,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm15,21
+ vpaddd xmm6,xmm6,XMMWORD[32+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm15,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm15,7
+ vpandn xmm0,xmm15,xmm9
+ vpand xmm4,xmm15,xmm8
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm10,xmm11,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm11,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm12,xmm11
+
+ vpxor xmm10,xmm10,xmm1
+ vpaddd xmm6,xmm6,xmm7
+
+ vpsrld xmm1,xmm11,13
+
+ vpslld xmm2,xmm11,19
+ vpaddd xmm6,xmm6,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm10,xmm1
+
+ vpsrld xmm1,xmm11,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm11,10
+ vpxor xmm10,xmm12,xmm3
+ vpaddd xmm14,xmm14,xmm6
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm10,xmm10,xmm6
+ vpaddd xmm10,xmm10,xmm7
+ vmovdqu xmm6,XMMWORD[((240-128))+rax]
+ vpaddd xmm5,xmm5,XMMWORD[((112-128))+rax]
+
+ vpsrld xmm7,xmm6,3
+ vpsrld xmm1,xmm6,7
+ vpslld xmm2,xmm6,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm6,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm6,14
+ vmovdqu xmm0,XMMWORD[((192-128))+rax]
+ vpsrld xmm3,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm5,xmm5,xmm7
+ vpxor xmm7,xmm3,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm5,xmm5,xmm7
+ vpsrld xmm7,xmm14,6
+ vpslld xmm2,xmm14,26
+ vmovdqu XMMWORD[(224-128)+rax],xmm5
+ vpaddd xmm5,xmm5,xmm9
+
+ vpsrld xmm1,xmm14,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm14,21
+ vpaddd xmm5,xmm5,XMMWORD[64+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm14,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm14,7
+ vpandn xmm0,xmm14,xmm8
+ vpand xmm3,xmm14,xmm15
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm9,xmm10,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm10,30
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm3,xmm11,xmm10
+
+ vpxor xmm9,xmm9,xmm1
+ vpaddd xmm5,xmm5,xmm7
+
+ vpsrld xmm1,xmm10,13
+
+ vpslld xmm2,xmm10,19
+ vpaddd xmm5,xmm5,xmm0
+ vpand xmm4,xmm4,xmm3
+
+ vpxor xmm7,xmm9,xmm1
+
+ vpsrld xmm1,xmm10,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm10,10
+ vpxor xmm9,xmm11,xmm4
+ vpaddd xmm13,xmm13,xmm5
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm9,xmm9,xmm5
+ vpaddd xmm9,xmm9,xmm7
+ vmovdqu xmm5,XMMWORD[((0-128))+rax]
+ vpaddd xmm6,xmm6,XMMWORD[((128-128))+rax]
+
+ vpsrld xmm7,xmm5,3
+ vpsrld xmm1,xmm5,7
+ vpslld xmm2,xmm5,25
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm5,18
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm5,14
+ vmovdqu xmm0,XMMWORD[((208-128))+rax]
+ vpsrld xmm4,xmm0,10
+
+ vpxor xmm7,xmm7,xmm1
+ vpsrld xmm1,xmm0,17
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,15
+ vpaddd xmm6,xmm6,xmm7
+ vpxor xmm7,xmm4,xmm1
+ vpsrld xmm1,xmm0,19
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm0,13
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+ vpaddd xmm6,xmm6,xmm7
+ vpsrld xmm7,xmm13,6
+ vpslld xmm2,xmm13,26
+ vmovdqu XMMWORD[(240-128)+rax],xmm6
+ vpaddd xmm6,xmm6,xmm8
+
+ vpsrld xmm1,xmm13,11
+ vpxor xmm7,xmm7,xmm2
+ vpslld xmm2,xmm13,21
+ vpaddd xmm6,xmm6,XMMWORD[96+rbp]
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm1,xmm13,25
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm13,7
+ vpandn xmm0,xmm13,xmm15
+ vpand xmm4,xmm13,xmm14
+
+ vpxor xmm7,xmm7,xmm1
+
+ vpsrld xmm8,xmm9,2
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm1,xmm9,30
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm4,xmm10,xmm9
+
+ vpxor xmm8,xmm8,xmm1
+ vpaddd xmm6,xmm6,xmm7
+
+ vpsrld xmm1,xmm9,13
+
+ vpslld xmm2,xmm9,19
+ vpaddd xmm6,xmm6,xmm0
+ vpand xmm3,xmm3,xmm4
+
+ vpxor xmm7,xmm8,xmm1
+
+ vpsrld xmm1,xmm9,22
+ vpxor xmm7,xmm7,xmm2
+
+ vpslld xmm2,xmm9,10
+ vpxor xmm8,xmm10,xmm3
+ vpaddd xmm12,xmm12,xmm6
+
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm7,xmm7,xmm2
+
+ vpaddd xmm8,xmm8,xmm6
+ vpaddd xmm8,xmm8,xmm7
+ add rbp,256
+ dec ecx
+ jnz NEAR $L$oop_16_xx_avx
+
+ mov ecx,1 ;ecx = 1, compared (signed) against each of the four counters at [rbx]
+ lea rbp,[((K256+128))] ;rewind rbp to K256+128 (the rounds above advanced it with add rbp,256)
+ cmp ecx,DWORD[rbx]
+ cmovge r8,rbp ;counter[0] <= 1: redirect r8 to the K256 table (presumably r8..r11 are the lanes' input pointers - confirm in the earlier part of this routine)
+ cmp ecx,DWORD[4+rbx]
+ cmovge r9,rbp ;same for counter[1] / r9
+ cmp ecx,DWORD[8+rbx]
+ cmovge r10,rbp ;same for counter[2] / r10
+ cmp ecx,DWORD[12+rbx]
+ cmovge r11,rbp ;same for counter[3] / r11
+ vmovdqa xmm7,XMMWORD[rbx] ;xmm7 = the four 32-bit counters
+ vpxor xmm0,xmm0,xmm0 ;xmm0 = 0
+ vmovdqa xmm6,xmm7
+ vpcmpgtd xmm6,xmm6,xmm0 ;xmm6 = per-lane mask, all-ones where counter > 0 (signed)
+ vpaddd xmm7,xmm7,xmm6 ;adding the mask (-1) decrements only the positive counters
+
+ vmovdqu xmm0,XMMWORD[((0-128))+rdi] ;loads below fetch eight 16-byte values at 32-byte stride from rdi-128
+ vpand xmm8,xmm8,xmm6 ;vpand with the mask zeroes xmm8..xmm15 in lanes whose counter was not positive
+ vmovdqu xmm1,XMMWORD[((32-128))+rdi]
+ vpand xmm9,xmm9,xmm6
+ vmovdqu xmm2,XMMWORD[((64-128))+rdi]
+ vpand xmm10,xmm10,xmm6
+ vmovdqu xmm5,XMMWORD[((96-128))+rdi]
+ vpand xmm11,xmm11,xmm6
+ vpaddd xmm8,xmm8,xmm0 ;register += loaded value; in masked-off lanes the sum equals the loaded value
+ vmovdqu xmm0,XMMWORD[((128-128))+rdi]
+ vpand xmm12,xmm12,xmm6
+ vpaddd xmm9,xmm9,xmm1
+ vmovdqu xmm1,XMMWORD[((160-128))+rdi]
+ vpand xmm13,xmm13,xmm6
+ vpaddd xmm10,xmm10,xmm2
+ vmovdqu xmm2,XMMWORD[((192-128))+rdi]
+ vpand xmm14,xmm14,xmm6
+ vpaddd xmm11,xmm11,xmm5
+ vmovdqu xmm5,XMMWORD[((224-128))+rdi]
+ vpand xmm15,xmm15,xmm6
+ vpaddd xmm12,xmm12,xmm0
+ vpaddd xmm13,xmm13,xmm1
+ vmovdqu XMMWORD[(0-128)+rdi],xmm8 ;stores below write each sum back to the slot it was loaded from
+ vpaddd xmm14,xmm14,xmm2
+ vmovdqu XMMWORD[(32-128)+rdi],xmm9
+ vpaddd xmm15,xmm15,xmm5
+ vmovdqu XMMWORD[(64-128)+rdi],xmm10
+ vmovdqu XMMWORD[(96-128)+rdi],xmm11
+ vmovdqu XMMWORD[(128-128)+rdi],xmm12
+ vmovdqu XMMWORD[(160-128)+rdi],xmm13
+ vmovdqu XMMWORD[(192-128)+rdi],xmm14
+ vmovdqu XMMWORD[(224-128)+rdi],xmm15
+
+ vmovdqu XMMWORD[rbx],xmm7 ;write the decremented counters back to [rbx]
+ vmovdqu xmm6,XMMWORD[$L$pbswap] ;xmm6 was consumed as the mask; reload it from $L$pbswap
+ dec edx
+ jnz NEAR $L$oop_avx ;inner loop: repeat while edx != 0
+
+ mov edx,DWORD[280+rsp] ;reload the outer-loop counter from stack slot 280+rsp
+ lea rdi,[16+rdi] ;rdi += 16
+ lea rsi,[64+rsi] ;rsi += 64
+ dec edx
+ jnz NEAR $L$oop_grande_avx ;outer loop: repeat while the reloaded counter, minus one, is non-zero
+
+$L$done_avx:
+ mov rax,QWORD[272+rsp] ;rax = pointer stored at 272+rsp; every restore below is addressed from it
+
+ vzeroupper ;zero the upper YMM halves before the legacy-SSE movaps restores that follow
+ movaps xmm6,XMMWORD[((-184))+rax] ;reload xmm6..xmm15 (callee-saved under the Microsoft x64 ABI), 16 bytes apart
+ movaps xmm7,XMMWORD[((-168))+rax]
+ movaps xmm8,XMMWORD[((-152))+rax]
+ movaps xmm9,XMMWORD[((-136))+rax]
+ movaps xmm10,XMMWORD[((-120))+rax]
+ movaps xmm11,XMMWORD[((-104))+rax]
+ movaps xmm12,XMMWORD[((-88))+rax]
+ movaps xmm13,XMMWORD[((-72))+rax]
+ movaps xmm14,XMMWORD[((-56))+rax]
+ movaps xmm15,XMMWORD[((-40))+rax]
+ mov rbp,QWORD[((-16))+rax] ;reload rbp from rax-16
+
+ mov rbx,QWORD[((-8))+rax] ;reload rbx from rax-8
+
+ lea rsp,[rax] ;rsp = rax, discarding the whole frame
+
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue: reload rdi from 8+rsp
+ mov rsi,QWORD[16+rsp] ;reload rsi from 16+rsp
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha256_multi_block_avx:
+
+ALIGN 32
+sha256_multi_block_avx2:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue: park rdi at 8+rsp
+ mov QWORD[16+rsp],rsi ;park rsi at 16+rsp (both are callee-saved under the Microsoft x64 ABI)
+ mov rax,rsp
+$L$SEH_begin_sha256_multi_block_avx2:
+ mov rdi,rcx ;rdi = 1st Win64 integer argument (rcx)
+ mov rsi,rdx ;rsi = 2nd Win64 integer argument (rdx)
+ mov rdx,r8 ;rdx = 3rd Win64 integer argument (r8)
+
+
+
+_avx2_shortcut:
+ mov rax,rsp ;rax = rsp before any push; base address for the rax-relative saves below
+
+ push rbx ;six pushes (rbx, rbp, r12..r15) take 48 bytes below rax
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ lea rsp,[((-168))+rsp] ;reserve 168 more bytes; rsp is now rax-216
+ movaps XMMWORD[rsp],xmm6 ;save xmm6..xmm15 in consecutive 16-byte slots starting at rsp
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[64+rsp],xmm10
+ movaps XMMWORD[80+rsp],xmm11
+ movaps XMMWORD[(-120)+rax],xmm12 ;same save area addressed from rax (rax-120 == rsp+96 here)
+ movaps XMMWORD[(-104)+rax],xmm13
+ movaps XMMWORD[(-88)+rax],xmm14
+ movaps XMMWORD[(-72)+rax],xmm15
+ sub rsp,576 ;reserve a further 576 bytes of scratch
+ and rsp,-256 ;round rsp down to a 256-byte boundary
+ mov QWORD[544+rsp],rax ;keep the entry rsp at 544+rsp
+
+$L$body_avx2:
+ lea rbp,[((K256+128))] ;rbp = K256+128
+ lea rdi,[128+rdi] ;bias rdi by +128; later accesses use (N-128)+rdi displacements (presumably so they fit a signed byte)
+
+$L$oop_grande_avx2:
+ mov DWORD[552+rsp],edx ;save the incoming edx at 552+rsp
+ xor edx,edx ;edx = 0; accumulates the signed maximum of the eight counts read below
+ lea rbx,[512+rsp] ;rbx = rsp+512, the array that receives the eight 32-bit counts
+
+ mov r12,QWORD[rsi] ;lane 0: r12 = qword at rsi+0 (dereferenced as a data pointer in $L$oop_avx2)
+
+ mov ecx,DWORD[8+rsi] ;lane 0: ecx = dword count at rsi+8
+ cmp ecx,edx
+ cmovg edx,ecx ;edx = max(edx, ecx)
+ test ecx,ecx
+ mov DWORD[rbx],ecx ;record the count in slot 0 (mov leaves the flags from test intact)
+ cmovle r12,rbp ;count <= 0: use rbp as the lane's pointer instead (presumably so the lane still reads valid memory)
+
+ mov r13,QWORD[16+rsi] ;lane 1: pointer at rsi+16, count at rsi+24, same sequence as lane 0
+
+ mov ecx,DWORD[24+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[4+rbx],ecx
+ cmovle r13,rbp
+
+ mov r14,QWORD[32+rsi] ;lane 2: pointer at rsi+32, count at rsi+40
+
+ mov ecx,DWORD[40+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[8+rbx],ecx
+ cmovle r14,rbp
+
+ mov r15,QWORD[48+rsi] ;lane 3: pointer at rsi+48, count at rsi+56
+
+ mov ecx,DWORD[56+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[12+rbx],ecx
+ cmovle r15,rbp
+
+ mov r8,QWORD[64+rsi] ;lane 4: pointer at rsi+64, count at rsi+72
+
+ mov ecx,DWORD[72+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[16+rbx],ecx
+ cmovle r8,rbp
+
+ mov r9,QWORD[80+rsi] ;lane 5: pointer at rsi+80, count at rsi+88
+
+ mov ecx,DWORD[88+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[20+rbx],ecx
+ cmovle r9,rbp
+
+ mov r10,QWORD[96+rsi] ;lane 6: pointer at rsi+96, count at rsi+104
+
+ mov ecx,DWORD[104+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[24+rbx],ecx
+ cmovle r10,rbp
+
+ mov r11,QWORD[112+rsi] ;lane 7: pointer at rsi+112, count at rsi+120
+
+ mov ecx,DWORD[120+rsi]
+ cmp ecx,edx
+ cmovg edx,ecx
+ test ecx,ecx
+ mov DWORD[28+rbx],ecx
+ cmovle r11,rbp
+ vmovdqu ymm8,YMMWORD[((0-128))+rdi] ;ymm8..ymm15 = eight consecutive 32-byte rows starting at rdi-128
+ lea rax,[128+rsp] ;rax = rsp+128
+ vmovdqu ymm9,YMMWORD[((32-128))+rdi]
+ lea rbx,[((256+128))+rsp] ;rbx = rsp+384, so the count array at rsp+512 is now rbx+128
+ vmovdqu ymm10,YMMWORD[((64-128))+rdi]
+ vmovdqu ymm11,YMMWORD[((96-128))+rdi]
+ vmovdqu ymm12,YMMWORD[((128-128))+rdi]
+ vmovdqu ymm13,YMMWORD[((160-128))+rdi]
+ vmovdqu ymm14,YMMWORD[((192-128))+rdi]
+ vmovdqu ymm15,YMMWORD[((224-128))+rdi]
+ vmovdqu ymm6,YMMWORD[$L$pbswap] ;ymm6 = constant at $L$pbswap, the vpshufb control used in $L$oop_avx2
+ jmp NEAR $L$oop_avx2
+
+ALIGN 32
+$L$oop_avx2:
+ vpxor ymm4,ymm10,ymm9
+ vmovd xmm5,DWORD[r12]
+ vmovd xmm0,DWORD[r8]
+ vmovd xmm1,DWORD[r13]
+ vmovd xmm2,DWORD[r9]
+ vpinsrd xmm5,xmm5,DWORD[r14],1
+ vpinsrd xmm0,xmm0,DWORD[r10],1
+ vpinsrd xmm1,xmm1,DWORD[r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm12,6
+ vpslld ymm2,ymm12,26
+ vmovdqu YMMWORD[(0-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm15
+
+ vpsrld ymm1,ymm12,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm12,21
+ vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm12,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm12,7
+ vpandn ymm0,ymm12,ymm14
+ vpand ymm3,ymm12,ymm13
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm15,ymm8,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm8,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm9,ymm8
+
+ vpxor ymm15,ymm15,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm8,13
+
+ vpslld ymm2,ymm8,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm15,ymm1
+
+ vpsrld ymm1,ymm8,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm8,10
+ vpxor ymm15,ymm9,ymm4
+ vpaddd ymm11,ymm11,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm15,ymm15,ymm5
+ vpaddd ymm15,ymm15,ymm7
+ vmovd xmm5,DWORD[4+r12]
+ vmovd xmm0,DWORD[4+r8]
+ vmovd xmm1,DWORD[4+r13]
+ vmovd xmm2,DWORD[4+r9]
+ vpinsrd xmm5,xmm5,DWORD[4+r14],1
+ vpinsrd xmm0,xmm0,DWORD[4+r10],1
+ vpinsrd xmm1,xmm1,DWORD[4+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[4+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm11,6
+ vpslld ymm2,ymm11,26
+ vmovdqu YMMWORD[(32-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm14
+
+ vpsrld ymm1,ymm11,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm11,21
+ vpaddd ymm5,ymm5,YMMWORD[((-96))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm11,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm11,7
+ vpandn ymm0,ymm11,ymm13
+ vpand ymm4,ymm11,ymm12
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm14,ymm15,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm15,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm8,ymm15
+
+ vpxor ymm14,ymm14,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm15,13
+
+ vpslld ymm2,ymm15,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm14,ymm1
+
+ vpsrld ymm1,ymm15,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm15,10
+ vpxor ymm14,ymm8,ymm3
+ vpaddd ymm10,ymm10,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm14,ymm14,ymm5
+ vpaddd ymm14,ymm14,ymm7
+ vmovd xmm5,DWORD[8+r12]
+ vmovd xmm0,DWORD[8+r8]
+ vmovd xmm1,DWORD[8+r13]
+ vmovd xmm2,DWORD[8+r9]
+ vpinsrd xmm5,xmm5,DWORD[8+r14],1
+ vpinsrd xmm0,xmm0,DWORD[8+r10],1
+ vpinsrd xmm1,xmm1,DWORD[8+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[8+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm10,6
+ vpslld ymm2,ymm10,26
+ vmovdqu YMMWORD[(64-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm13
+
+ vpsrld ymm1,ymm10,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm10,21
+ vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm10,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm10,7
+ vpandn ymm0,ymm10,ymm12
+ vpand ymm3,ymm10,ymm11
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm13,ymm14,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm14,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm15,ymm14
+
+ vpxor ymm13,ymm13,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm14,13
+
+ vpslld ymm2,ymm14,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm13,ymm1
+
+ vpsrld ymm1,ymm14,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm14,10
+ vpxor ymm13,ymm15,ymm4
+ vpaddd ymm9,ymm9,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm13,ymm13,ymm5
+ vpaddd ymm13,ymm13,ymm7
+ vmovd xmm5,DWORD[12+r12]
+ vmovd xmm0,DWORD[12+r8]
+ vmovd xmm1,DWORD[12+r13]
+ vmovd xmm2,DWORD[12+r9]
+ vpinsrd xmm5,xmm5,DWORD[12+r14],1
+ vpinsrd xmm0,xmm0,DWORD[12+r10],1
+ vpinsrd xmm1,xmm1,DWORD[12+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[12+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm9,6
+ vpslld ymm2,ymm9,26
+ vmovdqu YMMWORD[(96-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm12
+
+ vpsrld ymm1,ymm9,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm9,21
+ vpaddd ymm5,ymm5,YMMWORD[((-32))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm9,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm9,7
+ vpandn ymm0,ymm9,ymm11
+ vpand ymm4,ymm9,ymm10
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm12,ymm13,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm13,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm14,ymm13
+
+ vpxor ymm12,ymm12,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm13,13
+
+ vpslld ymm2,ymm13,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm12,ymm1
+
+ vpsrld ymm1,ymm13,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm13,10
+ vpxor ymm12,ymm14,ymm3
+ vpaddd ymm8,ymm8,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm12,ymm12,ymm5
+ vpaddd ymm12,ymm12,ymm7
+ vmovd xmm5,DWORD[16+r12]
+ vmovd xmm0,DWORD[16+r8]
+ vmovd xmm1,DWORD[16+r13]
+ vmovd xmm2,DWORD[16+r9]
+ vpinsrd xmm5,xmm5,DWORD[16+r14],1
+ vpinsrd xmm0,xmm0,DWORD[16+r10],1
+ vpinsrd xmm1,xmm1,DWORD[16+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[16+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm8,6
+ vpslld ymm2,ymm8,26
+ vmovdqu YMMWORD[(128-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm11
+
+ vpsrld ymm1,ymm8,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm8,21
+ vpaddd ymm5,ymm5,YMMWORD[rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm8,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm8,7
+ vpandn ymm0,ymm8,ymm10
+ vpand ymm3,ymm8,ymm9
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm11,ymm12,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm12,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm13,ymm12
+
+ vpxor ymm11,ymm11,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm12,13
+
+ vpslld ymm2,ymm12,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm11,ymm1
+
+ vpsrld ymm1,ymm12,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm12,10
+ vpxor ymm11,ymm13,ymm4
+ vpaddd ymm15,ymm15,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm11,ymm11,ymm5
+ vpaddd ymm11,ymm11,ymm7
+ vmovd xmm5,DWORD[20+r12]
+ vmovd xmm0,DWORD[20+r8]
+ vmovd xmm1,DWORD[20+r13]
+ vmovd xmm2,DWORD[20+r9]
+ vpinsrd xmm5,xmm5,DWORD[20+r14],1
+ vpinsrd xmm0,xmm0,DWORD[20+r10],1
+ vpinsrd xmm1,xmm1,DWORD[20+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[20+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm15,6
+ vpslld ymm2,ymm15,26
+ vmovdqu YMMWORD[(160-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm10
+
+ vpsrld ymm1,ymm15,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm15,21
+ vpaddd ymm5,ymm5,YMMWORD[32+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm15,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm15,7
+ vpandn ymm0,ymm15,ymm9
+ vpand ymm4,ymm15,ymm8
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm10,ymm11,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm11,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm12,ymm11
+
+ vpxor ymm10,ymm10,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm11,13
+
+ vpslld ymm2,ymm11,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm10,ymm1
+
+ vpsrld ymm1,ymm11,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm11,10
+ vpxor ymm10,ymm12,ymm3
+ vpaddd ymm14,ymm14,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm10,ymm10,ymm5
+ vpaddd ymm10,ymm10,ymm7
+ vmovd xmm5,DWORD[24+r12]
+ vmovd xmm0,DWORD[24+r8]
+ vmovd xmm1,DWORD[24+r13]
+ vmovd xmm2,DWORD[24+r9]
+ vpinsrd xmm5,xmm5,DWORD[24+r14],1
+ vpinsrd xmm0,xmm0,DWORD[24+r10],1
+ vpinsrd xmm1,xmm1,DWORD[24+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[24+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm14,6
+ vpslld ymm2,ymm14,26
+ vmovdqu YMMWORD[(192-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm9
+
+ vpsrld ymm1,ymm14,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm14,21
+ vpaddd ymm5,ymm5,YMMWORD[64+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm14,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm14,7
+ vpandn ymm0,ymm14,ymm8
+ vpand ymm3,ymm14,ymm15
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm9,ymm10,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm10,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm11,ymm10
+
+ vpxor ymm9,ymm9,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm10,13
+
+ vpslld ymm2,ymm10,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm9,ymm1
+
+ vpsrld ymm1,ymm10,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm10,10
+ vpxor ymm9,ymm11,ymm4
+ vpaddd ymm13,ymm13,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm9,ymm9,ymm5
+ vpaddd ymm9,ymm9,ymm7
+ vmovd xmm5,DWORD[28+r12]
+ vmovd xmm0,DWORD[28+r8]
+ vmovd xmm1,DWORD[28+r13]
+ vmovd xmm2,DWORD[28+r9]
+ vpinsrd xmm5,xmm5,DWORD[28+r14],1
+ vpinsrd xmm0,xmm0,DWORD[28+r10],1
+ vpinsrd xmm1,xmm1,DWORD[28+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[28+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm13,6
+ vpslld ymm2,ymm13,26
+ vmovdqu YMMWORD[(224-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm8
+
+ vpsrld ymm1,ymm13,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm13,21
+ vpaddd ymm5,ymm5,YMMWORD[96+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm13,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm13,7
+ vpandn ymm0,ymm13,ymm15
+ vpand ymm4,ymm13,ymm14
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm8,ymm9,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm9,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm10,ymm9
+
+ vpxor ymm8,ymm8,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm9,13
+
+ vpslld ymm2,ymm9,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm8,ymm1
+
+ vpsrld ymm1,ymm9,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm9,10
+ vpxor ymm8,ymm10,ymm3
+ vpaddd ymm12,ymm12,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm8,ymm8,ymm5
+ vpaddd ymm8,ymm8,ymm7
+ add rbp,256
+ vmovd xmm5,DWORD[32+r12]
+ vmovd xmm0,DWORD[32+r8]
+ vmovd xmm1,DWORD[32+r13]
+ vmovd xmm2,DWORD[32+r9]
+ vpinsrd xmm5,xmm5,DWORD[32+r14],1
+ vpinsrd xmm0,xmm0,DWORD[32+r10],1
+ vpinsrd xmm1,xmm1,DWORD[32+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[32+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm12,6
+ vpslld ymm2,ymm12,26
+ vmovdqu YMMWORD[(256-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm15
+
+ vpsrld ymm1,ymm12,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm12,21
+ vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm12,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm12,7
+ vpandn ymm0,ymm12,ymm14
+ vpand ymm3,ymm12,ymm13
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm15,ymm8,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm8,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm9,ymm8
+
+ vpxor ymm15,ymm15,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm8,13
+
+ vpslld ymm2,ymm8,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm15,ymm1
+
+ vpsrld ymm1,ymm8,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm8,10
+ vpxor ymm15,ymm9,ymm4
+ vpaddd ymm11,ymm11,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm15,ymm15,ymm5
+ vpaddd ymm15,ymm15,ymm7
+ vmovd xmm5,DWORD[36+r12]
+ vmovd xmm0,DWORD[36+r8]
+ vmovd xmm1,DWORD[36+r13]
+ vmovd xmm2,DWORD[36+r9]
+ vpinsrd xmm5,xmm5,DWORD[36+r14],1
+ vpinsrd xmm0,xmm0,DWORD[36+r10],1
+ vpinsrd xmm1,xmm1,DWORD[36+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[36+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm11,6
+ vpslld ymm2,ymm11,26
+ vmovdqu YMMWORD[(288-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm14
+
+ vpsrld ymm1,ymm11,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm11,21
+ vpaddd ymm5,ymm5,YMMWORD[((-96))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm11,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm11,7
+ vpandn ymm0,ymm11,ymm13
+ vpand ymm4,ymm11,ymm12
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm14,ymm15,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm15,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm8,ymm15
+
+ vpxor ymm14,ymm14,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm15,13
+
+ vpslld ymm2,ymm15,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm14,ymm1
+
+ vpsrld ymm1,ymm15,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm15,10
+ vpxor ymm14,ymm8,ymm3
+ vpaddd ymm10,ymm10,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm14,ymm14,ymm5
+ vpaddd ymm14,ymm14,ymm7
+ vmovd xmm5,DWORD[40+r12]
+ vmovd xmm0,DWORD[40+r8]
+ vmovd xmm1,DWORD[40+r13]
+ vmovd xmm2,DWORD[40+r9]
+ vpinsrd xmm5,xmm5,DWORD[40+r14],1
+ vpinsrd xmm0,xmm0,DWORD[40+r10],1
+ vpinsrd xmm1,xmm1,DWORD[40+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[40+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm10,6
+ vpslld ymm2,ymm10,26
+ vmovdqu YMMWORD[(320-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm13
+
+ vpsrld ymm1,ymm10,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm10,21
+ vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm10,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm10,7
+ vpandn ymm0,ymm10,ymm12
+ vpand ymm3,ymm10,ymm11
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm13,ymm14,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm14,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm15,ymm14
+
+ vpxor ymm13,ymm13,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm14,13
+
+ vpslld ymm2,ymm14,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm13,ymm1
+
+ vpsrld ymm1,ymm14,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm14,10
+ vpxor ymm13,ymm15,ymm4
+ vpaddd ymm9,ymm9,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm13,ymm13,ymm5
+ vpaddd ymm13,ymm13,ymm7
+ vmovd xmm5,DWORD[44+r12]
+ vmovd xmm0,DWORD[44+r8]
+ vmovd xmm1,DWORD[44+r13]
+ vmovd xmm2,DWORD[44+r9]
+ vpinsrd xmm5,xmm5,DWORD[44+r14],1
+ vpinsrd xmm0,xmm0,DWORD[44+r10],1
+ vpinsrd xmm1,xmm1,DWORD[44+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[44+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm9,6
+ vpslld ymm2,ymm9,26
+ vmovdqu YMMWORD[(352-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm12
+
+ vpsrld ymm1,ymm9,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm9,21
+ vpaddd ymm5,ymm5,YMMWORD[((-32))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm9,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm9,7
+ vpandn ymm0,ymm9,ymm11
+ vpand ymm4,ymm9,ymm10
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm12,ymm13,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm13,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm14,ymm13
+
+ vpxor ymm12,ymm12,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm13,13
+
+ vpslld ymm2,ymm13,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm12,ymm1
+
+ vpsrld ymm1,ymm13,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm13,10
+ vpxor ymm12,ymm14,ymm3
+ vpaddd ymm8,ymm8,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm12,ymm12,ymm5
+ vpaddd ymm12,ymm12,ymm7
+ vmovd xmm5,DWORD[48+r12]
+ vmovd xmm0,DWORD[48+r8]
+ vmovd xmm1,DWORD[48+r13]
+ vmovd xmm2,DWORD[48+r9]
+ vpinsrd xmm5,xmm5,DWORD[48+r14],1
+ vpinsrd xmm0,xmm0,DWORD[48+r10],1
+ vpinsrd xmm1,xmm1,DWORD[48+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[48+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm8,6
+ vpslld ymm2,ymm8,26
+ vmovdqu YMMWORD[(384-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm11
+
+ vpsrld ymm1,ymm8,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm8,21
+ vpaddd ymm5,ymm5,YMMWORD[rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm8,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm8,7
+ vpandn ymm0,ymm8,ymm10
+ vpand ymm3,ymm8,ymm9
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm11,ymm12,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm12,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm13,ymm12
+
+ vpxor ymm11,ymm11,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm12,13
+
+ vpslld ymm2,ymm12,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm11,ymm1
+
+ vpsrld ymm1,ymm12,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm12,10
+ vpxor ymm11,ymm13,ymm4
+ vpaddd ymm15,ymm15,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm11,ymm11,ymm5
+ vpaddd ymm11,ymm11,ymm7
+ vmovd xmm5,DWORD[52+r12]
+ vmovd xmm0,DWORD[52+r8]
+ vmovd xmm1,DWORD[52+r13]
+ vmovd xmm2,DWORD[52+r9]
+ vpinsrd xmm5,xmm5,DWORD[52+r14],1
+ vpinsrd xmm0,xmm0,DWORD[52+r10],1
+ vpinsrd xmm1,xmm1,DWORD[52+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[52+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm15,6
+ vpslld ymm2,ymm15,26
+ vmovdqu YMMWORD[(416-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm10
+
+ vpsrld ymm1,ymm15,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm15,21
+ vpaddd ymm5,ymm5,YMMWORD[32+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm15,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm15,7
+ vpandn ymm0,ymm15,ymm9
+ vpand ymm4,ymm15,ymm8
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm10,ymm11,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm11,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm12,ymm11
+
+ vpxor ymm10,ymm10,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm11,13
+
+ vpslld ymm2,ymm11,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm10,ymm1
+
+ vpsrld ymm1,ymm11,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm11,10
+ vpxor ymm10,ymm12,ymm3
+ vpaddd ymm14,ymm14,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm10,ymm10,ymm5
+ vpaddd ymm10,ymm10,ymm7
+ vmovd xmm5,DWORD[56+r12]
+ vmovd xmm0,DWORD[56+r8]
+ vmovd xmm1,DWORD[56+r13]
+ vmovd xmm2,DWORD[56+r9]
+ vpinsrd xmm5,xmm5,DWORD[56+r14],1
+ vpinsrd xmm0,xmm0,DWORD[56+r10],1
+ vpinsrd xmm1,xmm1,DWORD[56+r15],1
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[56+r11],1
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm14,6
+ vpslld ymm2,ymm14,26
+ vmovdqu YMMWORD[(448-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm9
+
+ vpsrld ymm1,ymm14,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm14,21
+ vpaddd ymm5,ymm5,YMMWORD[64+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm14,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm14,7
+ vpandn ymm0,ymm14,ymm8
+ vpand ymm3,ymm14,ymm15
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm9,ymm10,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm10,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm11,ymm10
+
+ vpxor ymm9,ymm9,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm10,13
+
+ vpslld ymm2,ymm10,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm9,ymm1
+
+ vpsrld ymm1,ymm10,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm10,10
+ vpxor ymm9,ymm11,ymm4
+ vpaddd ymm13,ymm13,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm9,ymm9,ymm5
+ vpaddd ymm9,ymm9,ymm7
+ vmovd xmm5,DWORD[60+r12]
+ lea r12,[64+r12]
+ vmovd xmm0,DWORD[60+r8]
+ lea r8,[64+r8]
+ vmovd xmm1,DWORD[60+r13]
+ lea r13,[64+r13]
+ vmovd xmm2,DWORD[60+r9]
+ lea r9,[64+r9]
+ vpinsrd xmm5,xmm5,DWORD[60+r14],1
+ lea r14,[64+r14]
+ vpinsrd xmm0,xmm0,DWORD[60+r10],1
+ lea r10,[64+r10]
+ vpinsrd xmm1,xmm1,DWORD[60+r15],1
+ lea r15,[64+r15]
+ vpunpckldq ymm5,ymm5,ymm1
+ vpinsrd xmm2,xmm2,DWORD[60+r11],1
+ lea r11,[64+r11]
+ vpunpckldq ymm0,ymm0,ymm2
+ vinserti128 ymm5,ymm5,xmm0,1
+ vpshufb ymm5,ymm5,ymm6
+ vpsrld ymm7,ymm13,6
+ vpslld ymm2,ymm13,26
+ vmovdqu YMMWORD[(480-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm8
+
+ vpsrld ymm1,ymm13,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm13,21
+ vpaddd ymm5,ymm5,YMMWORD[96+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm13,25
+ vpxor ymm7,ymm7,ymm2
+ prefetcht0 [63+r12]
+ vpslld ymm2,ymm13,7
+ vpandn ymm0,ymm13,ymm15
+ vpand ymm4,ymm13,ymm14
+ prefetcht0 [63+r13]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm8,ymm9,2
+ vpxor ymm7,ymm7,ymm2
+ prefetcht0 [63+r14]
+ vpslld ymm1,ymm9,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm10,ymm9
+ prefetcht0 [63+r15]
+ vpxor ymm8,ymm8,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm9,13
+ prefetcht0 [63+r8]
+ vpslld ymm2,ymm9,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm3,ymm3,ymm4
+ prefetcht0 [63+r9]
+ vpxor ymm7,ymm8,ymm1
+
+ vpsrld ymm1,ymm9,22
+ vpxor ymm7,ymm7,ymm2
+ prefetcht0 [63+r10]
+ vpslld ymm2,ymm9,10
+ vpxor ymm8,ymm10,ymm3
+ vpaddd ymm12,ymm12,ymm5
+ prefetcht0 [63+r11]
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm8,ymm8,ymm5
+ vpaddd ymm8,ymm8,ymm7
+ add rbp,256
+ vmovdqu ymm5,YMMWORD[((0-128))+rax]
+ mov ecx,3
+ jmp NEAR $L$oop_16_xx_avx2
+ALIGN 32
+$L$oop_16_xx_avx2:
+ vmovdqu ymm6,YMMWORD[((32-128))+rax]
+ vpaddd ymm5,ymm5,YMMWORD[((288-256-128))+rbx]
+
+ vpsrld ymm7,ymm6,3
+ vpsrld ymm1,ymm6,7
+ vpslld ymm2,ymm6,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm6,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm6,14
+ vmovdqu ymm0,YMMWORD[((448-256-128))+rbx]
+ vpsrld ymm3,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm5,ymm5,ymm7
+ vpxor ymm7,ymm3,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm5,ymm5,ymm7
+ vpsrld ymm7,ymm12,6
+ vpslld ymm2,ymm12,26
+ vmovdqu YMMWORD[(0-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm15
+
+ vpsrld ymm1,ymm12,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm12,21
+ vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm12,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm12,7
+ vpandn ymm0,ymm12,ymm14
+ vpand ymm3,ymm12,ymm13
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm15,ymm8,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm8,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm9,ymm8
+
+ vpxor ymm15,ymm15,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm8,13
+
+ vpslld ymm2,ymm8,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm15,ymm1
+
+ vpsrld ymm1,ymm8,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm8,10
+ vpxor ymm15,ymm9,ymm4
+ vpaddd ymm11,ymm11,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm15,ymm15,ymm5
+ vpaddd ymm15,ymm15,ymm7
+ vmovdqu ymm5,YMMWORD[((64-128))+rax]
+ vpaddd ymm6,ymm6,YMMWORD[((320-256-128))+rbx]
+
+ vpsrld ymm7,ymm5,3
+ vpsrld ymm1,ymm5,7
+ vpslld ymm2,ymm5,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm5,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm5,14
+ vmovdqu ymm0,YMMWORD[((480-256-128))+rbx]
+ vpsrld ymm4,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm6,ymm6,ymm7
+ vpxor ymm7,ymm4,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm6,ymm6,ymm7
+ vpsrld ymm7,ymm11,6
+ vpslld ymm2,ymm11,26
+ vmovdqu YMMWORD[(32-128)+rax],ymm6
+ vpaddd ymm6,ymm6,ymm14
+
+ vpsrld ymm1,ymm11,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm11,21
+ vpaddd ymm6,ymm6,YMMWORD[((-96))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm11,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm11,7
+ vpandn ymm0,ymm11,ymm13
+ vpand ymm4,ymm11,ymm12
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm14,ymm15,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm15,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm8,ymm15
+
+ vpxor ymm14,ymm14,ymm1
+ vpaddd ymm6,ymm6,ymm7
+
+ vpsrld ymm1,ymm15,13
+
+ vpslld ymm2,ymm15,19
+ vpaddd ymm6,ymm6,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm14,ymm1
+
+ vpsrld ymm1,ymm15,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm15,10
+ vpxor ymm14,ymm8,ymm3
+ vpaddd ymm10,ymm10,ymm6
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm14,ymm14,ymm6
+ vpaddd ymm14,ymm14,ymm7
+ vmovdqu ymm6,YMMWORD[((96-128))+rax]
+ vpaddd ymm5,ymm5,YMMWORD[((352-256-128))+rbx]
+
+ vpsrld ymm7,ymm6,3
+ vpsrld ymm1,ymm6,7
+ vpslld ymm2,ymm6,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm6,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm6,14
+ vmovdqu ymm0,YMMWORD[((0-128))+rax]
+ vpsrld ymm3,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm5,ymm5,ymm7
+ vpxor ymm7,ymm3,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm5,ymm5,ymm7
+ vpsrld ymm7,ymm10,6
+ vpslld ymm2,ymm10,26
+ vmovdqu YMMWORD[(64-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm13
+
+ vpsrld ymm1,ymm10,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm10,21
+ vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm10,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm10,7
+ vpandn ymm0,ymm10,ymm12
+ vpand ymm3,ymm10,ymm11
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm13,ymm14,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm14,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm15,ymm14
+
+ vpxor ymm13,ymm13,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm14,13
+
+ vpslld ymm2,ymm14,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm13,ymm1
+
+ vpsrld ymm1,ymm14,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm14,10
+ vpxor ymm13,ymm15,ymm4
+ vpaddd ymm9,ymm9,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm13,ymm13,ymm5
+ vpaddd ymm13,ymm13,ymm7
+ vmovdqu ymm5,YMMWORD[((128-128))+rax]
+ vpaddd ymm6,ymm6,YMMWORD[((384-256-128))+rbx]
+
+ vpsrld ymm7,ymm5,3
+ vpsrld ymm1,ymm5,7
+ vpslld ymm2,ymm5,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm5,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm5,14
+ vmovdqu ymm0,YMMWORD[((32-128))+rax]
+ vpsrld ymm4,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm6,ymm6,ymm7
+ vpxor ymm7,ymm4,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm6,ymm6,ymm7
+ vpsrld ymm7,ymm9,6
+ vpslld ymm2,ymm9,26
+ vmovdqu YMMWORD[(96-128)+rax],ymm6
+ vpaddd ymm6,ymm6,ymm12
+
+ vpsrld ymm1,ymm9,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm9,21
+ vpaddd ymm6,ymm6,YMMWORD[((-32))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm9,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm9,7
+ vpandn ymm0,ymm9,ymm11
+ vpand ymm4,ymm9,ymm10
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm12,ymm13,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm13,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm14,ymm13
+
+ vpxor ymm12,ymm12,ymm1
+ vpaddd ymm6,ymm6,ymm7
+
+ vpsrld ymm1,ymm13,13
+
+ vpslld ymm2,ymm13,19
+ vpaddd ymm6,ymm6,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm12,ymm1
+
+ vpsrld ymm1,ymm13,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm13,10
+ vpxor ymm12,ymm14,ymm3
+ vpaddd ymm8,ymm8,ymm6
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm12,ymm12,ymm6
+ vpaddd ymm12,ymm12,ymm7
+ vmovdqu ymm6,YMMWORD[((160-128))+rax]
+ vpaddd ymm5,ymm5,YMMWORD[((416-256-128))+rbx]
+
+ vpsrld ymm7,ymm6,3
+ vpsrld ymm1,ymm6,7
+ vpslld ymm2,ymm6,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm6,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm6,14
+ vmovdqu ymm0,YMMWORD[((64-128))+rax]
+ vpsrld ymm3,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm5,ymm5,ymm7
+ vpxor ymm7,ymm3,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm5,ymm5,ymm7
+ vpsrld ymm7,ymm8,6
+ vpslld ymm2,ymm8,26
+ vmovdqu YMMWORD[(128-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm11
+
+ vpsrld ymm1,ymm8,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm8,21
+ vpaddd ymm5,ymm5,YMMWORD[rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm8,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm8,7
+ vpandn ymm0,ymm8,ymm10
+ vpand ymm3,ymm8,ymm9
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm11,ymm12,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm12,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm13,ymm12
+
+ vpxor ymm11,ymm11,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm12,13
+
+ vpslld ymm2,ymm12,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm11,ymm1
+
+ vpsrld ymm1,ymm12,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm12,10
+ vpxor ymm11,ymm13,ymm4
+ vpaddd ymm15,ymm15,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm11,ymm11,ymm5
+ vpaddd ymm11,ymm11,ymm7
+ vmovdqu ymm5,YMMWORD[((192-128))+rax]
+ vpaddd ymm6,ymm6,YMMWORD[((448-256-128))+rbx]
+
+ vpsrld ymm7,ymm5,3
+ vpsrld ymm1,ymm5,7
+ vpslld ymm2,ymm5,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm5,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm5,14
+ vmovdqu ymm0,YMMWORD[((96-128))+rax]
+ vpsrld ymm4,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm6,ymm6,ymm7
+ vpxor ymm7,ymm4,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm6,ymm6,ymm7
+ vpsrld ymm7,ymm15,6
+ vpslld ymm2,ymm15,26
+ vmovdqu YMMWORD[(160-128)+rax],ymm6
+ vpaddd ymm6,ymm6,ymm10
+
+ vpsrld ymm1,ymm15,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm15,21
+ vpaddd ymm6,ymm6,YMMWORD[32+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm15,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm15,7
+ vpandn ymm0,ymm15,ymm9
+ vpand ymm4,ymm15,ymm8
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm10,ymm11,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm11,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm12,ymm11
+
+ vpxor ymm10,ymm10,ymm1
+ vpaddd ymm6,ymm6,ymm7
+
+ vpsrld ymm1,ymm11,13
+
+ vpslld ymm2,ymm11,19
+ vpaddd ymm6,ymm6,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm10,ymm1
+
+ vpsrld ymm1,ymm11,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm11,10
+ vpxor ymm10,ymm12,ymm3
+ vpaddd ymm14,ymm14,ymm6
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm10,ymm10,ymm6
+ vpaddd ymm10,ymm10,ymm7
+ vmovdqu ymm6,YMMWORD[((224-128))+rax]
+ vpaddd ymm5,ymm5,YMMWORD[((480-256-128))+rbx]
+
+ vpsrld ymm7,ymm6,3
+ vpsrld ymm1,ymm6,7
+ vpslld ymm2,ymm6,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm6,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm6,14
+ vmovdqu ymm0,YMMWORD[((128-128))+rax]
+ vpsrld ymm3,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm5,ymm5,ymm7
+ vpxor ymm7,ymm3,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm5,ymm5,ymm7
+ vpsrld ymm7,ymm14,6
+ vpslld ymm2,ymm14,26
+ vmovdqu YMMWORD[(192-128)+rax],ymm5
+ vpaddd ymm5,ymm5,ymm9
+
+ vpsrld ymm1,ymm14,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm14,21
+ vpaddd ymm5,ymm5,YMMWORD[64+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm14,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm14,7
+ vpandn ymm0,ymm14,ymm8
+ vpand ymm3,ymm14,ymm15
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm9,ymm10,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm10,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm11,ymm10
+
+ vpxor ymm9,ymm9,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm10,13
+
+ vpslld ymm2,ymm10,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm9,ymm1
+
+ vpsrld ymm1,ymm10,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm10,10
+ vpxor ymm9,ymm11,ymm4
+ vpaddd ymm13,ymm13,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm9,ymm9,ymm5
+ vpaddd ymm9,ymm9,ymm7
+ vmovdqu ymm5,YMMWORD[((256-256-128))+rbx]
+ vpaddd ymm6,ymm6,YMMWORD[((0-128))+rax]
+
+ vpsrld ymm7,ymm5,3
+ vpsrld ymm1,ymm5,7
+ vpslld ymm2,ymm5,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm5,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm5,14
+ vmovdqu ymm0,YMMWORD[((160-128))+rax]
+ vpsrld ymm4,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm6,ymm6,ymm7
+ vpxor ymm7,ymm4,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm6,ymm6,ymm7
+ vpsrld ymm7,ymm13,6
+ vpslld ymm2,ymm13,26
+ vmovdqu YMMWORD[(224-128)+rax],ymm6
+ vpaddd ymm6,ymm6,ymm8
+
+ vpsrld ymm1,ymm13,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm13,21
+ vpaddd ymm6,ymm6,YMMWORD[96+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm13,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm13,7
+ vpandn ymm0,ymm13,ymm15
+ vpand ymm4,ymm13,ymm14
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm8,ymm9,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm9,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm10,ymm9
+
+ vpxor ymm8,ymm8,ymm1
+ vpaddd ymm6,ymm6,ymm7
+
+ vpsrld ymm1,ymm9,13
+
+ vpslld ymm2,ymm9,19
+ vpaddd ymm6,ymm6,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm8,ymm1
+
+ vpsrld ymm1,ymm9,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm9,10
+ vpxor ymm8,ymm10,ymm3
+ vpaddd ymm12,ymm12,ymm6
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm8,ymm8,ymm6
+ vpaddd ymm8,ymm8,ymm7
+ add rbp,256
+ vmovdqu ymm6,YMMWORD[((288-256-128))+rbx]
+ vpaddd ymm5,ymm5,YMMWORD[((32-128))+rax]
+
+ vpsrld ymm7,ymm6,3
+ vpsrld ymm1,ymm6,7
+ vpslld ymm2,ymm6,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm6,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm6,14
+ vmovdqu ymm0,YMMWORD[((192-128))+rax]
+ vpsrld ymm3,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm5,ymm5,ymm7
+ vpxor ymm7,ymm3,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm5,ymm5,ymm7
+ vpsrld ymm7,ymm12,6
+ vpslld ymm2,ymm12,26
+ vmovdqu YMMWORD[(256-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm15
+
+ vpsrld ymm1,ymm12,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm12,21
+ vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm12,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm12,7
+ vpandn ymm0,ymm12,ymm14
+ vpand ymm3,ymm12,ymm13
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm15,ymm8,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm8,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm9,ymm8
+
+ vpxor ymm15,ymm15,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm8,13
+
+ vpslld ymm2,ymm8,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm15,ymm1
+
+ vpsrld ymm1,ymm8,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm8,10
+ vpxor ymm15,ymm9,ymm4
+ vpaddd ymm11,ymm11,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm15,ymm15,ymm5
+ vpaddd ymm15,ymm15,ymm7
+ vmovdqu ymm5,YMMWORD[((320-256-128))+rbx]
+ vpaddd ymm6,ymm6,YMMWORD[((64-128))+rax]
+
+ vpsrld ymm7,ymm5,3
+ vpsrld ymm1,ymm5,7
+ vpslld ymm2,ymm5,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm5,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm5,14
+ vmovdqu ymm0,YMMWORD[((224-128))+rax]
+ vpsrld ymm4,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm6,ymm6,ymm7
+ vpxor ymm7,ymm4,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm6,ymm6,ymm7
+ vpsrld ymm7,ymm11,6
+ vpslld ymm2,ymm11,26
+ vmovdqu YMMWORD[(288-256-128)+rbx],ymm6
+ vpaddd ymm6,ymm6,ymm14
+
+ vpsrld ymm1,ymm11,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm11,21
+ vpaddd ymm6,ymm6,YMMWORD[((-96))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm11,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm11,7
+ vpandn ymm0,ymm11,ymm13
+ vpand ymm4,ymm11,ymm12
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm14,ymm15,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm15,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm8,ymm15
+
+ vpxor ymm14,ymm14,ymm1
+ vpaddd ymm6,ymm6,ymm7
+
+ vpsrld ymm1,ymm15,13
+
+ vpslld ymm2,ymm15,19
+ vpaddd ymm6,ymm6,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm14,ymm1
+
+ vpsrld ymm1,ymm15,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm15,10
+ vpxor ymm14,ymm8,ymm3
+ vpaddd ymm10,ymm10,ymm6
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm14,ymm14,ymm6
+ vpaddd ymm14,ymm14,ymm7
+ vmovdqu ymm6,YMMWORD[((352-256-128))+rbx]
+ vpaddd ymm5,ymm5,YMMWORD[((96-128))+rax]
+
+ vpsrld ymm7,ymm6,3
+ vpsrld ymm1,ymm6,7
+ vpslld ymm2,ymm6,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm6,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm6,14
+ vmovdqu ymm0,YMMWORD[((256-256-128))+rbx]
+ vpsrld ymm3,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm5,ymm5,ymm7
+ vpxor ymm7,ymm3,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm5,ymm5,ymm7
+ vpsrld ymm7,ymm10,6
+ vpslld ymm2,ymm10,26
+ vmovdqu YMMWORD[(320-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm13
+
+ vpsrld ymm1,ymm10,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm10,21
+ vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm10,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm10,7
+ vpandn ymm0,ymm10,ymm12
+ vpand ymm3,ymm10,ymm11
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm13,ymm14,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm14,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm15,ymm14
+
+ vpxor ymm13,ymm13,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm14,13
+
+ vpslld ymm2,ymm14,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm13,ymm1
+
+ vpsrld ymm1,ymm14,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm14,10
+ vpxor ymm13,ymm15,ymm4
+ vpaddd ymm9,ymm9,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm13,ymm13,ymm5
+ vpaddd ymm13,ymm13,ymm7
+ vmovdqu ymm5,YMMWORD[((384-256-128))+rbx]
+ vpaddd ymm6,ymm6,YMMWORD[((128-128))+rax]
+
+ vpsrld ymm7,ymm5,3
+ vpsrld ymm1,ymm5,7
+ vpslld ymm2,ymm5,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm5,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm5,14
+ vmovdqu ymm0,YMMWORD[((288-256-128))+rbx]
+ vpsrld ymm4,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm6,ymm6,ymm7
+ vpxor ymm7,ymm4,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm6,ymm6,ymm7
+ vpsrld ymm7,ymm9,6
+ vpslld ymm2,ymm9,26
+ vmovdqu YMMWORD[(352-256-128)+rbx],ymm6
+ vpaddd ymm6,ymm6,ymm12
+
+ vpsrld ymm1,ymm9,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm9,21
+ vpaddd ymm6,ymm6,YMMWORD[((-32))+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm9,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm9,7
+ vpandn ymm0,ymm9,ymm11
+ vpand ymm4,ymm9,ymm10
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm12,ymm13,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm13,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm14,ymm13
+
+ vpxor ymm12,ymm12,ymm1
+ vpaddd ymm6,ymm6,ymm7
+
+ vpsrld ymm1,ymm13,13
+
+ vpslld ymm2,ymm13,19
+ vpaddd ymm6,ymm6,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm12,ymm1
+
+ vpsrld ymm1,ymm13,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm13,10
+ vpxor ymm12,ymm14,ymm3
+ vpaddd ymm8,ymm8,ymm6
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm12,ymm12,ymm6
+ vpaddd ymm12,ymm12,ymm7
+ vmovdqu ymm6,YMMWORD[((416-256-128))+rbx]
+ vpaddd ymm5,ymm5,YMMWORD[((160-128))+rax]
+
+ vpsrld ymm7,ymm6,3
+ vpsrld ymm1,ymm6,7
+ vpslld ymm2,ymm6,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm6,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm6,14
+ vmovdqu ymm0,YMMWORD[((320-256-128))+rbx]
+ vpsrld ymm3,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm5,ymm5,ymm7
+ vpxor ymm7,ymm3,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm5,ymm5,ymm7
+ vpsrld ymm7,ymm8,6
+ vpslld ymm2,ymm8,26
+ vmovdqu YMMWORD[(384-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm11
+
+ vpsrld ymm1,ymm8,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm8,21
+ vpaddd ymm5,ymm5,YMMWORD[rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm8,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm8,7
+ vpandn ymm0,ymm8,ymm10
+ vpand ymm3,ymm8,ymm9
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm11,ymm12,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm12,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm13,ymm12
+
+ vpxor ymm11,ymm11,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm12,13
+
+ vpslld ymm2,ymm12,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm11,ymm1
+
+ vpsrld ymm1,ymm12,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm12,10
+ vpxor ymm11,ymm13,ymm4
+ vpaddd ymm15,ymm15,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm11,ymm11,ymm5
+ vpaddd ymm11,ymm11,ymm7
+ vmovdqu ymm5,YMMWORD[((448-256-128))+rbx]
+ vpaddd ymm6,ymm6,YMMWORD[((192-128))+rax]
+
+ vpsrld ymm7,ymm5,3
+ vpsrld ymm1,ymm5,7
+ vpslld ymm2,ymm5,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm5,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm5,14
+ vmovdqu ymm0,YMMWORD[((352-256-128))+rbx]
+ vpsrld ymm4,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm6,ymm6,ymm7
+ vpxor ymm7,ymm4,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm6,ymm6,ymm7
+ vpsrld ymm7,ymm15,6
+ vpslld ymm2,ymm15,26
+ vmovdqu YMMWORD[(416-256-128)+rbx],ymm6
+ vpaddd ymm6,ymm6,ymm10
+
+ vpsrld ymm1,ymm15,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm15,21
+ vpaddd ymm6,ymm6,YMMWORD[32+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm15,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm15,7
+ vpandn ymm0,ymm15,ymm9
+ vpand ymm4,ymm15,ymm8
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm10,ymm11,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm11,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm12,ymm11
+
+ vpxor ymm10,ymm10,ymm1
+ vpaddd ymm6,ymm6,ymm7
+
+ vpsrld ymm1,ymm11,13
+
+ vpslld ymm2,ymm11,19
+ vpaddd ymm6,ymm6,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm10,ymm1
+
+ vpsrld ymm1,ymm11,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm11,10
+ vpxor ymm10,ymm12,ymm3
+ vpaddd ymm14,ymm14,ymm6
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm10,ymm10,ymm6
+ vpaddd ymm10,ymm10,ymm7
+ vmovdqu ymm6,YMMWORD[((480-256-128))+rbx]
+ vpaddd ymm5,ymm5,YMMWORD[((224-128))+rax]
+
+ vpsrld ymm7,ymm6,3
+ vpsrld ymm1,ymm6,7
+ vpslld ymm2,ymm6,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm6,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm6,14
+ vmovdqu ymm0,YMMWORD[((384-256-128))+rbx]
+ vpsrld ymm3,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm5,ymm5,ymm7
+ vpxor ymm7,ymm3,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm5,ymm5,ymm7
+ vpsrld ymm7,ymm14,6
+ vpslld ymm2,ymm14,26
+ vmovdqu YMMWORD[(448-256-128)+rbx],ymm5
+ vpaddd ymm5,ymm5,ymm9
+
+ vpsrld ymm1,ymm14,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm14,21
+ vpaddd ymm5,ymm5,YMMWORD[64+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm14,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm14,7
+ vpandn ymm0,ymm14,ymm8
+ vpand ymm3,ymm14,ymm15
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm9,ymm10,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm10,30
+ vpxor ymm0,ymm0,ymm3
+ vpxor ymm3,ymm11,ymm10
+
+ vpxor ymm9,ymm9,ymm1
+ vpaddd ymm5,ymm5,ymm7
+
+ vpsrld ymm1,ymm10,13
+
+ vpslld ymm2,ymm10,19
+ vpaddd ymm5,ymm5,ymm0
+ vpand ymm4,ymm4,ymm3
+
+ vpxor ymm7,ymm9,ymm1
+
+ vpsrld ymm1,ymm10,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm10,10
+ vpxor ymm9,ymm11,ymm4
+ vpaddd ymm13,ymm13,ymm5
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm9,ymm9,ymm5
+ vpaddd ymm9,ymm9,ymm7
+ vmovdqu ymm5,YMMWORD[((0-128))+rax]
+ vpaddd ymm6,ymm6,YMMWORD[((256-256-128))+rbx]
+
+ vpsrld ymm7,ymm5,3
+ vpsrld ymm1,ymm5,7
+ vpslld ymm2,ymm5,25
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm5,18
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm5,14
+ vmovdqu ymm0,YMMWORD[((416-256-128))+rbx]
+ vpsrld ymm4,ymm0,10
+
+ vpxor ymm7,ymm7,ymm1
+ vpsrld ymm1,ymm0,17
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,15
+ vpaddd ymm6,ymm6,ymm7
+ vpxor ymm7,ymm4,ymm1
+ vpsrld ymm1,ymm0,19
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm0,13
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+ vpaddd ymm6,ymm6,ymm7
+ vpsrld ymm7,ymm13,6
+ vpslld ymm2,ymm13,26
+ vmovdqu YMMWORD[(480-256-128)+rbx],ymm6
+ vpaddd ymm6,ymm6,ymm8
+
+ vpsrld ymm1,ymm13,11
+ vpxor ymm7,ymm7,ymm2
+ vpslld ymm2,ymm13,21
+ vpaddd ymm6,ymm6,YMMWORD[96+rbp]
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm1,ymm13,25
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm13,7
+ vpandn ymm0,ymm13,ymm15
+ vpand ymm4,ymm13,ymm14
+
+ vpxor ymm7,ymm7,ymm1
+
+ vpsrld ymm8,ymm9,2
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm1,ymm9,30
+ vpxor ymm0,ymm0,ymm4
+ vpxor ymm4,ymm10,ymm9
+
+ vpxor ymm8,ymm8,ymm1
+ vpaddd ymm6,ymm6,ymm7
+
+ vpsrld ymm1,ymm9,13
+
+ vpslld ymm2,ymm9,19
+ vpaddd ymm6,ymm6,ymm0
+ vpand ymm3,ymm3,ymm4
+
+ vpxor ymm7,ymm8,ymm1
+
+ vpsrld ymm1,ymm9,22
+ vpxor ymm7,ymm7,ymm2
+
+ vpslld ymm2,ymm9,10
+ vpxor ymm8,ymm10,ymm3
+ vpaddd ymm12,ymm12,ymm6
+
+ vpxor ymm7,ymm7,ymm1
+ vpxor ymm7,ymm7,ymm2
+
+ vpaddd ymm8,ymm8,ymm6
+ vpaddd ymm8,ymm8,ymm7
+ add rbp,256
+ dec ecx
+ jnz NEAR $L$oop_16_xx_avx2
+
+ mov ecx,1
+ lea rbx,[512+rsp]
+ lea rbp,[((K256+128))]
+ cmp ecx,DWORD[rbx]
+ cmovge r12,rbp
+ cmp ecx,DWORD[4+rbx]
+ cmovge r13,rbp
+ cmp ecx,DWORD[8+rbx]
+ cmovge r14,rbp
+ cmp ecx,DWORD[12+rbx]
+ cmovge r15,rbp
+ cmp ecx,DWORD[16+rbx]
+ cmovge r8,rbp
+ cmp ecx,DWORD[20+rbx]
+ cmovge r9,rbp
+ cmp ecx,DWORD[24+rbx]
+ cmovge r10,rbp
+ cmp ecx,DWORD[28+rbx]
+ cmovge r11,rbp
+ vmovdqa ymm7,YMMWORD[rbx]
+ vpxor ymm0,ymm0,ymm0
+ vmovdqa ymm6,ymm7
+ vpcmpgtd ymm6,ymm6,ymm0
+ vpaddd ymm7,ymm7,ymm6
+
+ vmovdqu ymm0,YMMWORD[((0-128))+rdi]
+ vpand ymm8,ymm8,ymm6
+ vmovdqu ymm1,YMMWORD[((32-128))+rdi]
+ vpand ymm9,ymm9,ymm6
+ vmovdqu ymm2,YMMWORD[((64-128))+rdi]
+ vpand ymm10,ymm10,ymm6
+ vmovdqu ymm5,YMMWORD[((96-128))+rdi]
+ vpand ymm11,ymm11,ymm6
+ vpaddd ymm8,ymm8,ymm0
+ vmovdqu ymm0,YMMWORD[((128-128))+rdi]
+ vpand ymm12,ymm12,ymm6
+ vpaddd ymm9,ymm9,ymm1
+ vmovdqu ymm1,YMMWORD[((160-128))+rdi]
+ vpand ymm13,ymm13,ymm6
+ vpaddd ymm10,ymm10,ymm2
+ vmovdqu ymm2,YMMWORD[((192-128))+rdi]
+ vpand ymm14,ymm14,ymm6
+ vpaddd ymm11,ymm11,ymm5
+ vmovdqu ymm5,YMMWORD[((224-128))+rdi]
+ vpand ymm15,ymm15,ymm6
+ vpaddd ymm12,ymm12,ymm0
+ vpaddd ymm13,ymm13,ymm1
+ vmovdqu YMMWORD[(0-128)+rdi],ymm8
+ vpaddd ymm14,ymm14,ymm2
+ vmovdqu YMMWORD[(32-128)+rdi],ymm9
+ vpaddd ymm15,ymm15,ymm5
+ vmovdqu YMMWORD[(64-128)+rdi],ymm10
+ vmovdqu YMMWORD[(96-128)+rdi],ymm11
+ vmovdqu YMMWORD[(128-128)+rdi],ymm12
+ vmovdqu YMMWORD[(160-128)+rdi],ymm13
+ vmovdqu YMMWORD[(192-128)+rdi],ymm14
+ vmovdqu YMMWORD[(224-128)+rdi],ymm15
+
+ vmovdqu YMMWORD[rbx],ymm7
+ lea rbx,[((256+128))+rsp]
+ vmovdqu ymm6,YMMWORD[$L$pbswap]
+ dec edx
+ jnz NEAR $L$oop_avx2
+
+
+
+
+
+
+
+$L$done_avx2:
+ mov rax,QWORD[544+rsp]
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((-216))+rax]
+ movaps xmm7,XMMWORD[((-200))+rax]
+ movaps xmm8,XMMWORD[((-184))+rax]
+ movaps xmm9,XMMWORD[((-168))+rax]
+ movaps xmm10,XMMWORD[((-152))+rax]
+ movaps xmm11,XMMWORD[((-136))+rax]
+ movaps xmm12,XMMWORD[((-120))+rax]
+ movaps xmm13,XMMWORD[((-104))+rax]
+ movaps xmm14,XMMWORD[((-88))+rax]
+ movaps xmm15,XMMWORD[((-72))+rax]
+ mov r15,QWORD[((-48))+rax]
+
+ mov r14,QWORD[((-40))+rax]
+
+ mov r13,QWORD[((-32))+rax]
+
+ mov r12,QWORD[((-24))+rax]
+
+ mov rbp,QWORD[((-16))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+
+ lea rsp,[rax]
+
+$L$epilogue_avx2:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha256_multi_block_avx2:
+ALIGN 256 ;table starts on a 256-byte boundary
+K256: ;64 SHA-256 round constants, each replicated into 8 consecutive dwords (two DD rows = 32 bytes per constant); the AVX2 code above addresses it as K256+128
+ DD 1116352408,1116352408,1116352408,1116352408
+ DD 1116352408,1116352408,1116352408,1116352408
+ DD 1899447441,1899447441,1899447441,1899447441
+ DD 1899447441,1899447441,1899447441,1899447441
+ DD 3049323471,3049323471,3049323471,3049323471
+ DD 3049323471,3049323471,3049323471,3049323471
+ DD 3921009573,3921009573,3921009573,3921009573
+ DD 3921009573,3921009573,3921009573,3921009573
+ DD 961987163,961987163,961987163,961987163
+ DD 961987163,961987163,961987163,961987163
+ DD 1508970993,1508970993,1508970993,1508970993
+ DD 1508970993,1508970993,1508970993,1508970993
+ DD 2453635748,2453635748,2453635748,2453635748
+ DD 2453635748,2453635748,2453635748,2453635748
+ DD 2870763221,2870763221,2870763221,2870763221
+ DD 2870763221,2870763221,2870763221,2870763221
+ DD 3624381080,3624381080,3624381080,3624381080
+ DD 3624381080,3624381080,3624381080,3624381080
+ DD 310598401,310598401,310598401,310598401
+ DD 310598401,310598401,310598401,310598401
+ DD 607225278,607225278,607225278,607225278
+ DD 607225278,607225278,607225278,607225278
+ DD 1426881987,1426881987,1426881987,1426881987
+ DD 1426881987,1426881987,1426881987,1426881987
+ DD 1925078388,1925078388,1925078388,1925078388
+ DD 1925078388,1925078388,1925078388,1925078388
+ DD 2162078206,2162078206,2162078206,2162078206
+ DD 2162078206,2162078206,2162078206,2162078206
+ DD 2614888103,2614888103,2614888103,2614888103
+ DD 2614888103,2614888103,2614888103,2614888103
+ DD 3248222580,3248222580,3248222580,3248222580
+ DD 3248222580,3248222580,3248222580,3248222580
+ DD 3835390401,3835390401,3835390401,3835390401
+ DD 3835390401,3835390401,3835390401,3835390401
+ DD 4022224774,4022224774,4022224774,4022224774
+ DD 4022224774,4022224774,4022224774,4022224774
+ DD 264347078,264347078,264347078,264347078
+ DD 264347078,264347078,264347078,264347078
+ DD 604807628,604807628,604807628,604807628
+ DD 604807628,604807628,604807628,604807628
+ DD 770255983,770255983,770255983,770255983
+ DD 770255983,770255983,770255983,770255983
+ DD 1249150122,1249150122,1249150122,1249150122
+ DD 1249150122,1249150122,1249150122,1249150122
+ DD 1555081692,1555081692,1555081692,1555081692
+ DD 1555081692,1555081692,1555081692,1555081692
+ DD 1996064986,1996064986,1996064986,1996064986
+ DD 1996064986,1996064986,1996064986,1996064986
+ DD 2554220882,2554220882,2554220882,2554220882
+ DD 2554220882,2554220882,2554220882,2554220882
+ DD 2821834349,2821834349,2821834349,2821834349
+ DD 2821834349,2821834349,2821834349,2821834349
+ DD 2952996808,2952996808,2952996808,2952996808
+ DD 2952996808,2952996808,2952996808,2952996808
+ DD 3210313671,3210313671,3210313671,3210313671
+ DD 3210313671,3210313671,3210313671,3210313671
+ DD 3336571891,3336571891,3336571891,3336571891
+ DD 3336571891,3336571891,3336571891,3336571891
+ DD 3584528711,3584528711,3584528711,3584528711
+ DD 3584528711,3584528711,3584528711,3584528711
+ DD 113926993,113926993,113926993,113926993
+ DD 113926993,113926993,113926993,113926993
+ DD 338241895,338241895,338241895,338241895
+ DD 338241895,338241895,338241895,338241895
+ DD 666307205,666307205,666307205,666307205
+ DD 666307205,666307205,666307205,666307205
+ DD 773529912,773529912,773529912,773529912
+ DD 773529912,773529912,773529912,773529912
+ DD 1294757372,1294757372,1294757372,1294757372
+ DD 1294757372,1294757372,1294757372,1294757372
+ DD 1396182291,1396182291,1396182291,1396182291
+ DD 1396182291,1396182291,1396182291,1396182291
+ DD 1695183700,1695183700,1695183700,1695183700
+ DD 1695183700,1695183700,1695183700,1695183700
+ DD 1986661051,1986661051,1986661051,1986661051
+ DD 1986661051,1986661051,1986661051,1986661051
+ DD 2177026350,2177026350,2177026350,2177026350
+ DD 2177026350,2177026350,2177026350,2177026350
+ DD 2456956037,2456956037,2456956037,2456956037
+ DD 2456956037,2456956037,2456956037,2456956037
+ DD 2730485921,2730485921,2730485921,2730485921
+ DD 2730485921,2730485921,2730485921,2730485921
+ DD 2820302411,2820302411,2820302411,2820302411
+ DD 2820302411,2820302411,2820302411,2820302411
+ DD 3259730800,3259730800,3259730800,3259730800
+ DD 3259730800,3259730800,3259730800,3259730800
+ DD 3345764771,3345764771,3345764771,3345764771
+ DD 3345764771,3345764771,3345764771,3345764771
+ DD 3516065817,3516065817,3516065817,3516065817
+ DD 3516065817,3516065817,3516065817,3516065817
+ DD 3600352804,3600352804,3600352804,3600352804
+ DD 3600352804,3600352804,3600352804,3600352804
+ DD 4094571909,4094571909,4094571909,4094571909
+ DD 4094571909,4094571909,4094571909,4094571909
+ DD 275423344,275423344,275423344,275423344
+ DD 275423344,275423344,275423344,275423344
+ DD 430227734,430227734,430227734,430227734
+ DD 430227734,430227734,430227734,430227734
+ DD 506948616,506948616,506948616,506948616
+ DD 506948616,506948616,506948616,506948616
+ DD 659060556,659060556,659060556,659060556
+ DD 659060556,659060556,659060556,659060556
+ DD 883997877,883997877,883997877,883997877
+ DD 883997877,883997877,883997877,883997877
+ DD 958139571,958139571,958139571,958139571
+ DD 958139571,958139571,958139571,958139571
+ DD 1322822218,1322822218,1322822218,1322822218
+ DD 1322822218,1322822218,1322822218,1322822218
+ DD 1537002063,1537002063,1537002063,1537002063
+ DD 1537002063,1537002063,1537002063,1537002063
+ DD 1747873779,1747873779,1747873779,1747873779
+ DD 1747873779,1747873779,1747873779,1747873779
+ DD 1955562222,1955562222,1955562222,1955562222
+ DD 1955562222,1955562222,1955562222,1955562222
+ DD 2024104815,2024104815,2024104815,2024104815
+ DD 2024104815,2024104815,2024104815,2024104815
+ DD 2227730452,2227730452,2227730452,2227730452
+ DD 2227730452,2227730452,2227730452,2227730452
+ DD 2361852424,2361852424,2361852424,2361852424
+ DD 2361852424,2361852424,2361852424,2361852424
+ DD 2428436474,2428436474,2428436474,2428436474
+ DD 2428436474,2428436474,2428436474,2428436474
+ DD 2756734187,2756734187,2756734187,2756734187
+ DD 2756734187,2756734187,2756734187,2756734187
+ DD 3204031479,3204031479,3204031479,3204031479
+ DD 3204031479,3204031479,3204031479,3204031479
+ DD 3329325298,3329325298,3329325298,3329325298
+ DD 3329325298,3329325298,3329325298,3329325298
+$L$pbswap: ;32-byte constant reloaded into ymm6 at the bottom of the AVX2 outer loop; NOTE(review): presumably a byte-shuffle control (name suggests byte swap) - its consumer is outside this view
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f ;in memory: bytes 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f ;same 16 bytes repeated for the upper 128-bit half
+K256_shaext: ;64 dwords, one copy per constant (first entry 0x428a2f98 = 1116352408, the first K256 value); NOTE(review): presumably the table for the SHA-extension path - its consumer is outside this view
+ DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+DB 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111 ;NUL-terminated ASCII banner: "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+DB 99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114
+DB 32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71
+DB 65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112
+DB 101,110,115,115,108,46,111,114,103,62,0
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler: ;Win64 language-specific exception handler named by the non-AVX2 .xdata records; reads r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT* (field names below follow the Windows x64 structure layouts)
+ push rsi ;preserve every non-volatile GPR and the flags: the handler overwrites them
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ;room for the 32-byte home area plus stack arguments 5..8 of the call below
+
+ mov rax,QWORD[120+r8] ;context->Rax
+ mov rbx,QWORD[248+r8] ;context->Rip
+
+ mov rsi,QWORD[8+r9] ;disp->ImageBase
+ mov r11,QWORD[56+r9] ;disp->HandlerData (the two RVAs stored after the handler RVA in .xdata)
+
+ mov r10d,DWORD[r11] ;HandlerData[0]: RVA of the body label
+ lea r10,[r10*1+rsi] ;RVA + ImageBase = absolute address
+ cmp rbx,r10
+ jb NEAR $L$in_prologue ;Rip below the body label: keep context->Rax as the frame base
+
+ mov rax,QWORD[152+r8] ;context->Rsp
+
+ mov r10d,DWORD[4+r11] ;HandlerData[1]: RVA of the epilogue label
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue ;Rip at/after the epilogue label: keep context->Rsp as the frame base
+
+ mov rax,QWORD[272+rax] ;inside the body: pull the saved stack pointer from frame offset 272
+
+ mov rbx,QWORD[((-8))+rax] ;saved rbx sits just below the saved stack pointer value
+ mov rbp,QWORD[((-16))+rax] ;saved rbp
+ mov QWORD[144+r8],rbx ;context->Rbx
+ mov QWORD[160+r8],rbp ;context->Rbp
+
+ lea rsi,[((-24-160))+rax] ;source: 160-byte save area below the saved GPRs
+ lea rdi,[512+r8] ;destination: &context->Xmm6
+ mov ecx,20 ;20 qwords = ten 16-byte XMM registers
+ DD 0xa548f3fc ;bytes FC F3 48 A5 = cld; rep movsq
+
+$L$in_prologue: ;shared tail (avx2_handler also jumps here); expects rax = frame base, r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT*
+ mov rdi,QWORD[8+rax] ;rdi/rsi copies parked at 8/16 above the frame base (the slots the WIN64 epilogue reloads from)
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax ;context->Rsp
+ mov QWORD[168+r8],rsi ;context->Rsi
+ mov QWORD[176+r8],rdi ;context->Rdi
+
+ mov rdi,QWORD[40+r9] ;disp->ContextRecord is the copy destination
+ mov rsi,r8 ;source: the patched context
+ mov ecx,154 ;154 qwords = 1232 bytes
+ DD 0xa548f3fc ;cld; rep movsq
+
+ mov rsi,r9 ;keep the dispatcher pointer while r8/r9 are reloaded as arguments
+ xor rcx,rcx ;arg1 = 0 (UNW_FLAG_NHANDLER)
+ mov rdx,QWORD[8+rsi] ;arg2 = disp->ImageBase
+ mov r8,QWORD[rsi] ;arg3 = disp->ControlPc
+ mov r9,QWORD[16+rsi] ;arg4 = disp->FunctionEntry
+ mov r10,QWORD[40+rsi] ;disp->ContextRecord
+ lea r11,[56+rsi] ;&disp->HandlerData
+ lea r12,[24+rsi] ;&disp->EstablisherFrame
+ mov QWORD[32+rsp],r10 ;arg5
+ mov QWORD[40+rsp],r11 ;arg6
+ mov QWORD[48+rsp],r12 ;arg7
+ mov QWORD[56+rsp],rcx ;arg8 = NULL
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1 ;return value 1 (ExceptionContinueSearch)
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ DB 0F3h,0C3h ;repret
+
+
+ALIGN 16
+avx2_handler: ;Win64 language-specific exception handler for the AVX2 entry point; reads r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT*, finishes in se_handler's $L$in_prologue tail
+ push rsi ;same register save sequence as se_handler: the shared tail pops these
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ;home area + stack arguments for the RtlVirtualUnwind call in the shared tail
+
+ mov rax,QWORD[120+r8] ;context->Rax
+ mov rbx,QWORD[248+r8] ;context->Rip
+
+ mov rsi,QWORD[8+r9] ;disp->ImageBase
+ mov r11,QWORD[56+r9] ;disp->HandlerData
+
+ mov r10d,DWORD[r11] ;HandlerData[0]: RVA of $L$body_avx2
+ lea r10,[r10*1+rsi] ;RVA + ImageBase = absolute address
+ cmp rbx,r10
+ jb NEAR $L$in_prologue ;Rip below the body label: keep context->Rax as the frame base
+
+ mov rax,QWORD[152+r8] ;context->Rsp
+
+ mov r10d,DWORD[4+r11] ;HandlerData[1]: RVA of $L$epilogue_avx2
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue ;Rip at/after the epilogue label: keep context->Rsp as the frame base
+
+ mov rax,QWORD[544+rax] ;FIX: saved stack pointer lives in the frame at [Rsp+544] (see $L$done_avx2), not at CONTEXT+544; was [544+r8]
+
+ mov rbx,QWORD[((-8))+rax] ;callee-saved GPRs sit just below the saved stack pointer value, same offsets as $L$done_avx2 uses
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx ;context->Rbx
+ mov QWORD[160+r8],rbp ;context->Rbp
+ mov QWORD[216+r8],r12 ;context->R12
+ mov QWORD[224+r8],r13 ;context->R13
+ mov QWORD[232+r8],r14 ;context->R14
+ mov QWORD[240+r8],r15 ;context->R15
+
+ lea rsi,[((-56-160))+rax] ;source: xmm6..xmm15 save area at -216 (matches the movaps reloads in $L$done_avx2)
+ lea rdi,[512+r8] ;destination: &context->Xmm6
+ mov ecx,20 ;20 qwords = ten 16-byte XMM registers
+ DD 0xa548f3fc ;bytes FC F3 48 A5 = cld; rep movsq
+
+ jmp NEAR $L$in_prologue ;shared tail restores Rsp/Rsi/Rdi, calls RtlVirtualUnwind and returns
+
+section .pdata rdata align=4 ;function table: one {begin, end, unwind-info} triple of image-relative addresses per entry point
+ALIGN 4
+ DD $L$SEH_begin_sha256_multi_block wrt ..imagebase ;generic entry point: start RVA
+ DD $L$SEH_end_sha256_multi_block wrt ..imagebase ;end RVA
+ DD $L$SEH_info_sha256_multi_block wrt ..imagebase ;RVA of its .xdata record
+ DD $L$SEH_begin_sha256_multi_block_shaext wrt ..imagebase ;shaext entry point
+ DD $L$SEH_end_sha256_multi_block_shaext wrt ..imagebase
+ DD $L$SEH_info_sha256_multi_block_shaext wrt ..imagebase
+ DD $L$SEH_begin_sha256_multi_block_avx wrt ..imagebase ;avx entry point
+ DD $L$SEH_end_sha256_multi_block_avx wrt ..imagebase
+ DD $L$SEH_info_sha256_multi_block_avx wrt ..imagebase
+ DD $L$SEH_begin_sha256_multi_block_avx2 wrt ..imagebase ;avx2 entry point
+ DD $L$SEH_end_sha256_multi_block_avx2 wrt ..imagebase
+ DD $L$SEH_info_sha256_multi_block_avx2 wrt ..imagebase
+section .xdata rdata align=8 ;unwind-info records referenced from .pdata
+ALIGN 8
+$L$SEH_info_sha256_multi_block:
+DB 9,0,0,0 ;UNWIND_INFO header: version 1 with UNW_FLAG_EHANDLER (1|1<<3 = 9), prolog size 0, no unwind codes, no frame register
+ DD se_handler wrt ..imagebase ;language-specific handler RVA
+ DD $L$body wrt ..imagebase,$L$epilogue wrt ..imagebase ;HandlerData[0], HandlerData[1]: the two labels the handler compares Rip against
+$L$SEH_info_sha256_multi_block_shaext:
+DB 9,0,0,0 ;same header as above
+ DD se_handler wrt ..imagebase
+ DD $L$body_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase ;HandlerData[0], HandlerData[1]
+$L$SEH_info_sha256_multi_block_avx:
+DB 9,0,0,0 ;same header as above
+ DD se_handler wrt ..imagebase
+ DD $L$body_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase ;HandlerData[0], HandlerData[1]
+$L$SEH_info_sha256_multi_block_avx2:
+DB 9,0,0,0 ;same header as above
+ DD avx2_handler wrt ..imagebase ;the AVX2 frame layout differs, so it gets its own handler
+ DD $L$body_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase ;HandlerData[0], HandlerData[1]
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm
new file mode 100644
index 0000000..8238c4e
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm
@@ -0,0 +1,5711 @@
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section .text code align=64
+
+
+EXTERN OPENSSL_ia32cap_P
+global sha256_block_data_order
+
+ALIGN 16
+sha256_block_data_order:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+ lea r11,[OPENSSL_ia32cap_P]
+ mov r9d,DWORD[r11]
+ mov r10d,DWORD[4+r11]
+ mov r11d,DWORD[8+r11]
+ test r11d,536870912
+ jnz NEAR _shaext_shortcut
+ and r11d,296
+ cmp r11d,296
+ je NEAR $L$avx2_shortcut
+ and r9d,1073741824
+ and r10d,268435968
+ or r10d,r9d
+ cmp r10d,1342177792
+ je NEAR $L$avx_shortcut
+ test r10d,512
+ jnz NEAR $L$ssse3_shortcut
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4
+ sub rsp,16*4+4*8
+ lea rdx,[rdx*4+rsi]
+ and rsp,-64
+ mov QWORD[((64+0))+rsp],rdi
+ mov QWORD[((64+8))+rsp],rsi
+ mov QWORD[((64+16))+rsp],rdx
+ mov QWORD[88+rsp],rax
+
+$L$prologue:
+
+ mov eax,DWORD[rdi]
+ mov ebx,DWORD[4+rdi]
+ mov ecx,DWORD[8+rdi]
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+ jmp NEAR $L$loop
+
+ALIGN 16
+$L$loop:
+ mov edi,ebx
+ lea rbp,[K256]
+ xor edi,ecx
+ mov r12d,DWORD[rsi]
+ mov r13d,r8d
+ mov r14d,eax
+ bswap r12d
+ ror r13d,14
+ mov r15d,r9d
+
+ xor r13d,r8d
+ ror r14d,9
+ xor r15d,r10d
+
+ mov DWORD[rsp],r12d
+ xor r14d,eax
+ and r15d,r8d
+
+ ror r13d,5
+ add r12d,r11d
+ xor r15d,r10d
+
+ ror r14d,11
+ xor r13d,r8d
+ add r12d,r15d
+
+ mov r15d,eax
+ add r12d,DWORD[rbp]
+ xor r14d,eax
+
+ xor r15d,ebx
+ ror r13d,6
+ mov r11d,ebx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r11d,edi
+ add edx,r12d
+ add r11d,r12d
+
+ lea rbp,[4+rbp]
+ add r11d,r14d
+ mov r12d,DWORD[4+rsi]
+ mov r13d,edx
+ mov r14d,r11d
+ bswap r12d
+ ror r13d,14
+ mov edi,r8d
+
+ xor r13d,edx
+ ror r14d,9
+ xor edi,r9d
+
+ mov DWORD[4+rsp],r12d
+ xor r14d,r11d
+ and edi,edx
+
+ ror r13d,5
+ add r12d,r10d
+ xor edi,r9d
+
+ ror r14d,11
+ xor r13d,edx
+ add r12d,edi
+
+ mov edi,r11d
+ add r12d,DWORD[rbp]
+ xor r14d,r11d
+
+ xor edi,eax
+ ror r13d,6
+ mov r10d,eax
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r10d,r15d
+ add ecx,r12d
+ add r10d,r12d
+
+ lea rbp,[4+rbp]
+ add r10d,r14d
+ mov r12d,DWORD[8+rsi]
+ mov r13d,ecx
+ mov r14d,r10d
+ bswap r12d
+ ror r13d,14
+ mov r15d,edx
+
+ xor r13d,ecx
+ ror r14d,9
+ xor r15d,r8d
+
+ mov DWORD[8+rsp],r12d
+ xor r14d,r10d
+ and r15d,ecx
+
+ ror r13d,5
+ add r12d,r9d
+ xor r15d,r8d
+
+ ror r14d,11
+ xor r13d,ecx
+ add r12d,r15d
+
+ mov r15d,r10d
+ add r12d,DWORD[rbp]
+ xor r14d,r10d
+
+ xor r15d,r11d
+ ror r13d,6
+ mov r9d,r11d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r9d,edi
+ add ebx,r12d
+ add r9d,r12d
+
+ lea rbp,[4+rbp]
+ add r9d,r14d
+ mov r12d,DWORD[12+rsi]
+ mov r13d,ebx
+ mov r14d,r9d
+ bswap r12d
+ ror r13d,14
+ mov edi,ecx
+
+ xor r13d,ebx
+ ror r14d,9
+ xor edi,edx
+
+ mov DWORD[12+rsp],r12d
+ xor r14d,r9d
+ and edi,ebx
+
+ ror r13d,5
+ add r12d,r8d
+ xor edi,edx
+
+ ror r14d,11
+ xor r13d,ebx
+ add r12d,edi
+
+ mov edi,r9d
+ add r12d,DWORD[rbp]
+ xor r14d,r9d
+
+ xor edi,r10d
+ ror r13d,6
+ mov r8d,r10d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r8d,r15d
+ add eax,r12d
+ add r8d,r12d
+
+ lea rbp,[20+rbp]
+ add r8d,r14d
+ mov r12d,DWORD[16+rsi]
+ mov r13d,eax
+ mov r14d,r8d
+ bswap r12d
+ ror r13d,14
+ mov r15d,ebx
+
+ xor r13d,eax
+ ror r14d,9
+ xor r15d,ecx
+
+ mov DWORD[16+rsp],r12d
+ xor r14d,r8d
+ and r15d,eax
+
+ ror r13d,5
+ add r12d,edx
+ xor r15d,ecx
+
+ ror r14d,11
+ xor r13d,eax
+ add r12d,r15d
+
+ mov r15d,r8d
+ add r12d,DWORD[rbp]
+ xor r14d,r8d
+
+ xor r15d,r9d
+ ror r13d,6
+ mov edx,r9d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor edx,edi
+ add r11d,r12d
+ add edx,r12d
+
+ lea rbp,[4+rbp]
+ add edx,r14d
+ mov r12d,DWORD[20+rsi]
+ mov r13d,r11d
+ mov r14d,edx
+ bswap r12d
+ ror r13d,14
+ mov edi,eax
+
+ xor r13d,r11d
+ ror r14d,9
+ xor edi,ebx
+
+ mov DWORD[20+rsp],r12d
+ xor r14d,edx
+ and edi,r11d
+
+ ror r13d,5
+ add r12d,ecx
+ xor edi,ebx
+
+ ror r14d,11
+ xor r13d,r11d
+ add r12d,edi
+
+ mov edi,edx
+ add r12d,DWORD[rbp]
+ xor r14d,edx
+
+ xor edi,r8d
+ ror r13d,6
+ mov ecx,r8d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor ecx,r15d
+ add r10d,r12d
+ add ecx,r12d
+
+ lea rbp,[4+rbp]
+ add ecx,r14d
+ mov r12d,DWORD[24+rsi]
+ mov r13d,r10d
+ mov r14d,ecx
+ bswap r12d
+ ror r13d,14
+ mov r15d,r11d
+
+ xor r13d,r10d
+ ror r14d,9
+ xor r15d,eax
+
+ mov DWORD[24+rsp],r12d
+ xor r14d,ecx
+ and r15d,r10d
+
+ ror r13d,5
+ add r12d,ebx
+ xor r15d,eax
+
+ ror r14d,11
+ xor r13d,r10d
+ add r12d,r15d
+
+ mov r15d,ecx
+ add r12d,DWORD[rbp]
+ xor r14d,ecx
+
+ xor r15d,edx
+ ror r13d,6
+ mov ebx,edx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor ebx,edi
+ add r9d,r12d
+ add ebx,r12d
+
+ lea rbp,[4+rbp]
+ add ebx,r14d
+ mov r12d,DWORD[28+rsi]
+ mov r13d,r9d
+ mov r14d,ebx
+ bswap r12d
+ ror r13d,14
+ mov edi,r10d
+
+ xor r13d,r9d
+ ror r14d,9
+ xor edi,r11d
+
+ mov DWORD[28+rsp],r12d
+ xor r14d,ebx
+ and edi,r9d
+
+ ror r13d,5
+ add r12d,eax
+ xor edi,r11d
+
+ ror r14d,11
+ xor r13d,r9d
+ add r12d,edi
+
+ mov edi,ebx
+ add r12d,DWORD[rbp]
+ xor r14d,ebx
+
+ xor edi,ecx
+ ror r13d,6
+ mov eax,ecx
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor eax,r15d
+ add r8d,r12d
+ add eax,r12d
+
+ lea rbp,[20+rbp]
+ add eax,r14d
+ mov r12d,DWORD[32+rsi]
+ mov r13d,r8d
+ mov r14d,eax
+ bswap r12d
+ ror r13d,14
+ mov r15d,r9d
+
+ xor r13d,r8d
+ ror r14d,9
+ xor r15d,r10d
+
+ mov DWORD[32+rsp],r12d
+ xor r14d,eax
+ and r15d,r8d
+
+ ror r13d,5
+ add r12d,r11d
+ xor r15d,r10d
+
+ ror r14d,11
+ xor r13d,r8d
+ add r12d,r15d
+
+ mov r15d,eax
+ add r12d,DWORD[rbp]
+ xor r14d,eax
+
+ xor r15d,ebx
+ ror r13d,6
+ mov r11d,ebx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r11d,edi
+ add edx,r12d
+ add r11d,r12d
+
+ lea rbp,[4+rbp]
+ add r11d,r14d
+ mov r12d,DWORD[36+rsi]
+ mov r13d,edx
+ mov r14d,r11d
+ bswap r12d
+ ror r13d,14
+ mov edi,r8d
+
+ xor r13d,edx
+ ror r14d,9
+ xor edi,r9d
+
+ mov DWORD[36+rsp],r12d
+ xor r14d,r11d
+ and edi,edx
+
+ ror r13d,5
+ add r12d,r10d
+ xor edi,r9d
+
+ ror r14d,11
+ xor r13d,edx
+ add r12d,edi
+
+ mov edi,r11d
+ add r12d,DWORD[rbp]
+ xor r14d,r11d
+
+ xor edi,eax
+ ror r13d,6
+ mov r10d,eax
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r10d,r15d
+ add ecx,r12d
+ add r10d,r12d
+
+ lea rbp,[4+rbp]
+ add r10d,r14d
+ mov r12d,DWORD[40+rsi]
+ mov r13d,ecx
+ mov r14d,r10d
+ bswap r12d
+ ror r13d,14
+ mov r15d,edx
+
+ xor r13d,ecx
+ ror r14d,9
+ xor r15d,r8d
+
+ mov DWORD[40+rsp],r12d
+ xor r14d,r10d
+ and r15d,ecx
+
+ ror r13d,5
+ add r12d,r9d
+ xor r15d,r8d
+
+ ror r14d,11
+ xor r13d,ecx
+ add r12d,r15d
+
+ mov r15d,r10d
+ add r12d,DWORD[rbp]
+ xor r14d,r10d
+
+ xor r15d,r11d
+ ror r13d,6
+ mov r9d,r11d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r9d,edi
+ add ebx,r12d
+ add r9d,r12d
+
+ lea rbp,[4+rbp]
+ add r9d,r14d
+ mov r12d,DWORD[44+rsi]
+ mov r13d,ebx
+ mov r14d,r9d
+ bswap r12d
+ ror r13d,14
+ mov edi,ecx
+
+ xor r13d,ebx
+ ror r14d,9
+ xor edi,edx
+
+ mov DWORD[44+rsp],r12d
+ xor r14d,r9d
+ and edi,ebx
+
+ ror r13d,5
+ add r12d,r8d
+ xor edi,edx
+
+ ror r14d,11
+ xor r13d,ebx
+ add r12d,edi
+
+ mov edi,r9d
+ add r12d,DWORD[rbp]
+ xor r14d,r9d
+
+ xor edi,r10d
+ ror r13d,6
+ mov r8d,r10d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r8d,r15d
+ add eax,r12d
+ add r8d,r12d
+
+ lea rbp,[20+rbp]
+ add r8d,r14d
+ mov r12d,DWORD[48+rsi]
+ mov r13d,eax
+ mov r14d,r8d
+ bswap r12d
+ ror r13d,14
+ mov r15d,ebx
+
+ xor r13d,eax
+ ror r14d,9
+ xor r15d,ecx
+
+ mov DWORD[48+rsp],r12d
+ xor r14d,r8d
+ and r15d,eax
+
+ ror r13d,5
+ add r12d,edx
+ xor r15d,ecx
+
+ ror r14d,11
+ xor r13d,eax
+ add r12d,r15d
+
+ mov r15d,r8d
+ add r12d,DWORD[rbp]
+ xor r14d,r8d
+
+ xor r15d,r9d
+ ror r13d,6
+ mov edx,r9d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor edx,edi
+ add r11d,r12d
+ add edx,r12d
+
+ lea rbp,[4+rbp]
+ add edx,r14d
+ mov r12d,DWORD[52+rsi]
+ mov r13d,r11d
+ mov r14d,edx
+ bswap r12d
+ ror r13d,14
+ mov edi,eax
+
+ xor r13d,r11d
+ ror r14d,9
+ xor edi,ebx
+
+ mov DWORD[52+rsp],r12d
+ xor r14d,edx
+ and edi,r11d
+
+ ror r13d,5
+ add r12d,ecx
+ xor edi,ebx
+
+ ror r14d,11
+ xor r13d,r11d
+ add r12d,edi
+
+ mov edi,edx
+ add r12d,DWORD[rbp]
+ xor r14d,edx
+
+ xor edi,r8d
+ ror r13d,6
+ mov ecx,r8d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor ecx,r15d
+ add r10d,r12d
+ add ecx,r12d
+
+ lea rbp,[4+rbp]
+ add ecx,r14d
+ mov r12d,DWORD[56+rsi]
+ mov r13d,r10d
+ mov r14d,ecx
+ bswap r12d
+ ror r13d,14
+ mov r15d,r11d
+
+ xor r13d,r10d
+ ror r14d,9
+ xor r15d,eax
+
+ mov DWORD[56+rsp],r12d
+ xor r14d,ecx
+ and r15d,r10d
+
+ ror r13d,5
+ add r12d,ebx
+ xor r15d,eax
+
+ ror r14d,11
+ xor r13d,r10d
+ add r12d,r15d
+
+ mov r15d,ecx
+ add r12d,DWORD[rbp]
+ xor r14d,ecx
+
+ xor r15d,edx
+ ror r13d,6
+ mov ebx,edx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor ebx,edi
+ add r9d,r12d
+ add ebx,r12d
+
+ lea rbp,[4+rbp]
+ add ebx,r14d
+ mov r12d,DWORD[60+rsi]
+ mov r13d,r9d
+ mov r14d,ebx
+ bswap r12d
+ ror r13d,14
+ mov edi,r10d
+
+ xor r13d,r9d
+ ror r14d,9
+ xor edi,r11d
+
+ mov DWORD[60+rsp],r12d
+ xor r14d,ebx
+ and edi,r9d
+
+ ror r13d,5
+ add r12d,eax
+ xor edi,r11d
+
+ ror r14d,11
+ xor r13d,r9d
+ add r12d,edi
+
+ mov edi,ebx
+ add r12d,DWORD[rbp]
+ xor r14d,ebx
+
+ xor edi,ecx
+ ror r13d,6
+ mov eax,ecx
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor eax,r15d
+ add r8d,r12d
+ add eax,r12d
+
+ lea rbp,[20+rbp]
+ jmp NEAR $L$rounds_16_xx
+ALIGN 16
+$L$rounds_16_xx:
+ mov r13d,DWORD[4+rsp]
+ mov r15d,DWORD[56+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add eax,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[36+rsp]
+
+ add r12d,DWORD[rsp]
+ mov r13d,r8d
+ add r12d,r15d
+ mov r14d,eax
+ ror r13d,14
+ mov r15d,r9d
+
+ xor r13d,r8d
+ ror r14d,9
+ xor r15d,r10d
+
+ mov DWORD[rsp],r12d
+ xor r14d,eax
+ and r15d,r8d
+
+ ror r13d,5
+ add r12d,r11d
+ xor r15d,r10d
+
+ ror r14d,11
+ xor r13d,r8d
+ add r12d,r15d
+
+ mov r15d,eax
+ add r12d,DWORD[rbp]
+ xor r14d,eax
+
+ xor r15d,ebx
+ ror r13d,6
+ mov r11d,ebx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r11d,edi
+ add edx,r12d
+ add r11d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[8+rsp]
+ mov edi,DWORD[60+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r11d,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[40+rsp]
+
+ add r12d,DWORD[4+rsp]
+ mov r13d,edx
+ add r12d,edi
+ mov r14d,r11d
+ ror r13d,14
+ mov edi,r8d
+
+ xor r13d,edx
+ ror r14d,9
+ xor edi,r9d
+
+ mov DWORD[4+rsp],r12d
+ xor r14d,r11d
+ and edi,edx
+
+ ror r13d,5
+ add r12d,r10d
+ xor edi,r9d
+
+ ror r14d,11
+ xor r13d,edx
+ add r12d,edi
+
+ mov edi,r11d
+ add r12d,DWORD[rbp]
+ xor r14d,r11d
+
+ xor edi,eax
+ ror r13d,6
+ mov r10d,eax
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r10d,r15d
+ add ecx,r12d
+ add r10d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[12+rsp]
+ mov r15d,DWORD[rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r10d,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[44+rsp]
+
+ add r12d,DWORD[8+rsp]
+ mov r13d,ecx
+ add r12d,r15d
+ mov r14d,r10d
+ ror r13d,14
+ mov r15d,edx
+
+ xor r13d,ecx
+ ror r14d,9
+ xor r15d,r8d
+
+ mov DWORD[8+rsp],r12d
+ xor r14d,r10d
+ and r15d,ecx
+
+ ror r13d,5
+ add r12d,r9d
+ xor r15d,r8d
+
+ ror r14d,11
+ xor r13d,ecx
+ add r12d,r15d
+
+ mov r15d,r10d
+ add r12d,DWORD[rbp]
+ xor r14d,r10d
+
+ xor r15d,r11d
+ ror r13d,6
+ mov r9d,r11d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r9d,edi
+ add ebx,r12d
+ add r9d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[16+rsp]
+ mov edi,DWORD[4+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r9d,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[48+rsp]
+
+ add r12d,DWORD[12+rsp]
+ mov r13d,ebx
+ add r12d,edi
+ mov r14d,r9d
+ ror r13d,14
+ mov edi,ecx
+
+ xor r13d,ebx
+ ror r14d,9
+ xor edi,edx
+
+ mov DWORD[12+rsp],r12d
+ xor r14d,r9d
+ and edi,ebx
+
+ ror r13d,5
+ add r12d,r8d
+ xor edi,edx
+
+ ror r14d,11
+ xor r13d,ebx
+ add r12d,edi
+
+ mov edi,r9d
+ add r12d,DWORD[rbp]
+ xor r14d,r9d
+
+ xor edi,r10d
+ ror r13d,6
+ mov r8d,r10d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r8d,r15d
+ add eax,r12d
+ add r8d,r12d
+
+ lea rbp,[20+rbp]
+ mov r13d,DWORD[20+rsp]
+ mov r15d,DWORD[8+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r8d,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[52+rsp]
+
+ add r12d,DWORD[16+rsp]
+ mov r13d,eax
+ add r12d,r15d
+ mov r14d,r8d
+ ror r13d,14
+ mov r15d,ebx
+
+ xor r13d,eax
+ ror r14d,9
+ xor r15d,ecx
+
+ mov DWORD[16+rsp],r12d
+ xor r14d,r8d
+ and r15d,eax
+
+ ror r13d,5
+ add r12d,edx
+ xor r15d,ecx
+
+ ror r14d,11
+ xor r13d,eax
+ add r12d,r15d
+
+ mov r15d,r8d
+ add r12d,DWORD[rbp]
+ xor r14d,r8d
+
+ xor r15d,r9d
+ ror r13d,6
+ mov edx,r9d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor edx,edi
+ add r11d,r12d
+ add edx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[24+rsp]
+ mov edi,DWORD[12+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add edx,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[56+rsp]
+
+ add r12d,DWORD[20+rsp]
+ mov r13d,r11d
+ add r12d,edi
+ mov r14d,edx
+ ror r13d,14
+ mov edi,eax
+
+ xor r13d,r11d
+ ror r14d,9
+ xor edi,ebx
+
+ mov DWORD[20+rsp],r12d
+ xor r14d,edx
+ and edi,r11d
+
+ ror r13d,5
+ add r12d,ecx
+ xor edi,ebx
+
+ ror r14d,11
+ xor r13d,r11d
+ add r12d,edi
+
+ mov edi,edx
+ add r12d,DWORD[rbp]
+ xor r14d,edx
+
+ xor edi,r8d
+ ror r13d,6
+ mov ecx,r8d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor ecx,r15d
+ add r10d,r12d
+ add ecx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[28+rsp]
+ mov r15d,DWORD[16+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add ecx,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[60+rsp]
+
+ add r12d,DWORD[24+rsp]
+ mov r13d,r10d
+ add r12d,r15d
+ mov r14d,ecx
+ ror r13d,14
+ mov r15d,r11d
+
+ xor r13d,r10d
+ ror r14d,9
+ xor r15d,eax
+
+ mov DWORD[24+rsp],r12d
+ xor r14d,ecx
+ and r15d,r10d
+
+ ror r13d,5
+ add r12d,ebx
+ xor r15d,eax
+
+ ror r14d,11
+ xor r13d,r10d
+ add r12d,r15d
+
+ mov r15d,ecx
+ add r12d,DWORD[rbp]
+ xor r14d,ecx
+
+ xor r15d,edx
+ ror r13d,6
+ mov ebx,edx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor ebx,edi
+ add r9d,r12d
+ add ebx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[32+rsp]
+ mov edi,DWORD[20+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add ebx,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[rsp]
+
+ add r12d,DWORD[28+rsp]
+ mov r13d,r9d
+ add r12d,edi
+ mov r14d,ebx
+ ror r13d,14
+ mov edi,r10d
+
+ xor r13d,r9d
+ ror r14d,9
+ xor edi,r11d
+
+ mov DWORD[28+rsp],r12d
+ xor r14d,ebx
+ and edi,r9d
+
+ ror r13d,5
+ add r12d,eax
+ xor edi,r11d
+
+ ror r14d,11
+ xor r13d,r9d
+ add r12d,edi
+
+ mov edi,ebx
+ add r12d,DWORD[rbp]
+ xor r14d,ebx
+
+ xor edi,ecx
+ ror r13d,6
+ mov eax,ecx
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor eax,r15d
+ add r8d,r12d
+ add eax,r12d
+
+ lea rbp,[20+rbp]
+ mov r13d,DWORD[36+rsp]
+ mov r15d,DWORD[24+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add eax,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[4+rsp]
+
+ add r12d,DWORD[32+rsp]
+ mov r13d,r8d
+ add r12d,r15d
+ mov r14d,eax
+ ror r13d,14
+ mov r15d,r9d
+
+ xor r13d,r8d
+ ror r14d,9
+ xor r15d,r10d
+
+ mov DWORD[32+rsp],r12d
+ xor r14d,eax
+ and r15d,r8d
+
+ ror r13d,5
+ add r12d,r11d
+ xor r15d,r10d
+
+ ror r14d,11
+ xor r13d,r8d
+ add r12d,r15d
+
+ mov r15d,eax
+ add r12d,DWORD[rbp]
+ xor r14d,eax
+
+ xor r15d,ebx
+ ror r13d,6
+ mov r11d,ebx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r11d,edi
+ add edx,r12d
+ add r11d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[40+rsp]
+ mov edi,DWORD[28+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r11d,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[8+rsp]
+
+ add r12d,DWORD[36+rsp]
+ mov r13d,edx
+ add r12d,edi
+ mov r14d,r11d
+ ror r13d,14
+ mov edi,r8d
+
+ xor r13d,edx
+ ror r14d,9
+ xor edi,r9d
+
+ mov DWORD[36+rsp],r12d
+ xor r14d,r11d
+ and edi,edx
+
+ ror r13d,5
+ add r12d,r10d
+ xor edi,r9d
+
+ ror r14d,11
+ xor r13d,edx
+ add r12d,edi
+
+ mov edi,r11d
+ add r12d,DWORD[rbp]
+ xor r14d,r11d
+
+ xor edi,eax
+ ror r13d,6
+ mov r10d,eax
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r10d,r15d
+ add ecx,r12d
+ add r10d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[44+rsp]
+ mov r15d,DWORD[32+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r10d,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[12+rsp]
+
+ add r12d,DWORD[40+rsp]
+ mov r13d,ecx
+ add r12d,r15d
+ mov r14d,r10d
+ ror r13d,14
+ mov r15d,edx
+
+ xor r13d,ecx
+ ror r14d,9
+ xor r15d,r8d
+
+ mov DWORD[40+rsp],r12d
+ xor r14d,r10d
+ and r15d,ecx
+
+ ror r13d,5
+ add r12d,r9d
+ xor r15d,r8d
+
+ ror r14d,11
+ xor r13d,ecx
+ add r12d,r15d
+
+ mov r15d,r10d
+ add r12d,DWORD[rbp]
+ xor r14d,r10d
+
+ xor r15d,r11d
+ ror r13d,6
+ mov r9d,r11d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r9d,edi
+ add ebx,r12d
+ add r9d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[48+rsp]
+ mov edi,DWORD[36+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r9d,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[16+rsp]
+
+ add r12d,DWORD[44+rsp]
+ mov r13d,ebx
+ add r12d,edi
+ mov r14d,r9d
+ ror r13d,14
+ mov edi,ecx
+
+ xor r13d,ebx
+ ror r14d,9
+ xor edi,edx
+
+ mov DWORD[44+rsp],r12d
+ xor r14d,r9d
+ and edi,ebx
+
+ ror r13d,5
+ add r12d,r8d
+ xor edi,edx
+
+ ror r14d,11
+ xor r13d,ebx
+ add r12d,edi
+
+ mov edi,r9d
+ add r12d,DWORD[rbp]
+ xor r14d,r9d
+
+ xor edi,r10d
+ ror r13d,6
+ mov r8d,r10d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r8d,r15d
+ add eax,r12d
+ add r8d,r12d
+
+ lea rbp,[20+rbp]
+ mov r13d,DWORD[52+rsp]
+ mov r15d,DWORD[40+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r8d,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[20+rsp]
+
+ add r12d,DWORD[48+rsp]
+ mov r13d,eax
+ add r12d,r15d
+ mov r14d,r8d
+ ror r13d,14
+ mov r15d,ebx
+
+ xor r13d,eax
+ ror r14d,9
+ xor r15d,ecx
+
+ mov DWORD[48+rsp],r12d
+ xor r14d,r8d
+ and r15d,eax
+
+ ror r13d,5
+ add r12d,edx
+ xor r15d,ecx
+
+ ror r14d,11
+ xor r13d,eax
+ add r12d,r15d
+
+ mov r15d,r8d
+ add r12d,DWORD[rbp]
+ xor r14d,r8d
+
+ xor r15d,r9d
+ ror r13d,6
+ mov edx,r9d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor edx,edi
+ add r11d,r12d
+ add edx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[56+rsp]
+ mov edi,DWORD[44+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add edx,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[24+rsp]
+
+ add r12d,DWORD[52+rsp]
+ mov r13d,r11d
+ add r12d,edi
+ mov r14d,edx
+ ror r13d,14
+ mov edi,eax
+
+ xor r13d,r11d
+ ror r14d,9
+ xor edi,ebx
+
+ mov DWORD[52+rsp],r12d
+ xor r14d,edx
+ and edi,r11d
+
+ ror r13d,5
+ add r12d,ecx
+ xor edi,ebx
+
+ ror r14d,11
+ xor r13d,r11d
+ add r12d,edi
+
+ mov edi,edx
+ add r12d,DWORD[rbp]
+ xor r14d,edx
+
+ xor edi,r8d
+ ror r13d,6
+ mov ecx,r8d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor ecx,r15d
+ add r10d,r12d
+ add ecx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[60+rsp]
+ mov r15d,DWORD[48+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add ecx,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[28+rsp]
+
+ add r12d,DWORD[56+rsp]
+ mov r13d,r10d
+ add r12d,r15d
+ mov r14d,ecx
+ ror r13d,14
+ mov r15d,r11d
+
+ xor r13d,r10d
+ ror r14d,9
+ xor r15d,eax
+
+ mov DWORD[56+rsp],r12d
+ xor r14d,ecx
+ and r15d,r10d
+
+ ror r13d,5
+ add r12d,ebx
+ xor r15d,eax
+
+ ror r14d,11
+ xor r13d,r10d
+ add r12d,r15d
+
+ mov r15d,ecx
+ add r12d,DWORD[rbp]
+ xor r14d,ecx
+
+ xor r15d,edx
+ ror r13d,6
+ mov ebx,edx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor ebx,edi
+ add r9d,r12d
+ add ebx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[rsp]
+ mov edi,DWORD[52+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add ebx,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[32+rsp]
+
+ add r12d,DWORD[60+rsp]
+ mov r13d,r9d
+ add r12d,edi
+ mov r14d,ebx
+ ror r13d,14
+ mov edi,r10d
+
+ xor r13d,r9d
+ ror r14d,9
+ xor edi,r11d
+
+ mov DWORD[60+rsp],r12d
+ xor r14d,ebx
+ and edi,r9d
+
+ ror r13d,5
+ add r12d,eax
+ xor edi,r11d
+
+ ror r14d,11
+ xor r13d,r9d
+ add r12d,edi
+
+ mov edi,ebx
+ add r12d,DWORD[rbp]
+ xor r14d,ebx
+
+ xor edi,ecx
+ ror r13d,6
+ mov eax,ecx
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor eax,r15d
+ add r8d,r12d
+ add eax,r12d
+
+ lea rbp,[20+rbp]
+ cmp BYTE[3+rbp],0
+ jnz NEAR $L$rounds_16_xx
+
+ mov rdi,QWORD[((64+0))+rsp]
+ add eax,r14d
+ lea rsi,[64+rsi]
+
+ add eax,DWORD[rdi]
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ add r10d,DWORD[24+rdi]
+ add r11d,DWORD[28+rdi]
+
+ cmp rsi,QWORD[((64+16))+rsp]
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+ jb NEAR $L$loop
+
+ mov rsi,QWORD[88+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha256_block_data_order:
+ALIGN 64
+
+K256: ; SHA-256 round-constant table. Each row of four dwords is emitted twice (two identical consecutive DD lines), so K[4k..4k+3] sits at byte offset 32*k.
+ DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 ; K[0..3]
+ DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 ; duplicate of the previous row
+ DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 ; K[4..7]
+ DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 ; K[8..11]
+ DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 ; K[12..15]
+ DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc ; K[16..19]
+ DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da ; K[20..23]
+ DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 ; K[24..27]
+ DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 ; K[28..31]
+ DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 ; K[32..35]
+ DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 ; K[36..39]
+ DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 ; K[40..43]
+ DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 ; K[44..47]
+ DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 ; K[48..51]
+ DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 ; K[52..55]
+ DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 ; K[56..59]
+ DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ; K[60..63]
+ DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ ; Offset K256+512 starts here (32 rows * 16 bytes above). The top byte of the first dword below is 0x00; the round loops test that byte to detect the end of the constants.
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f ; pshufb mask loaded from K256+512: reverses the four bytes inside every dword
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff ; NOTE(review): not referenced by the shaext/ssse3 code visible here; presumably a shuffle mask for another code path - confirm
+ DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+ DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 ; NOTE(review): same remark as the row pair above
+ DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 ; NUL-terminated ASCII banner, starts "SHA256 block tra"
+DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 ; "nsform for x86_6"
+DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 ; "4, CRYPTOGAMS by"
+DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 ; author e-mail address, continued on the next line
+DB 111,114,103,62,0 ; end of the address and the terminating NUL
+
+ALIGN 64
+sha256_block_data_order_shaext: ; SHA-256 block function built on the SHA extension opcodes (emitted below as raw DB bytes). Incoming rcx, rdx, r8 are copied to rdi, rsi, rdx.
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp ; rax = entry rsp: base of the xmm save area, copied back into rsp on exit
+$L$SEH_begin_sha256_block_data_order_shaext:
+ mov rdi,rcx ; rdi = pointer to the eight 32-bit state words (loaded below, stored back at the end)
+ mov rsi,rdx ; rsi = input pointer, advanced by 64 per loop iteration
+ mov rdx,r8 ; rdx = loop counter, one 64-byte block per iteration
+ ; NOTE(review): the counter is decremented before it is tested and there is no check for 0 here; presumably callers never pass 0 - confirm.
+
+_shaext_shortcut:
+ ; Reserve 88 bytes and save xmm6-xmm10 just below the entry rsp (addressed through rax).
+ lea rsp,[((-88))+rsp]
+ movaps XMMWORD[(-8-80)+rax],xmm6
+ movaps XMMWORD[(-8-64)+rax],xmm7
+ movaps XMMWORD[(-8-48)+rax],xmm8
+ movaps XMMWORD[(-8-32)+rax],xmm9
+ movaps XMMWORD[(-8-16)+rax],xmm10
+$L$prologue_shaext:
+ lea rcx,[((K256+128))] ; rcx = K256 biased by +128; the table offsets below are written as (n-128)
+ movdqu xmm1,XMMWORD[rdi] ; state words 0..3
+ movdqu xmm2,XMMWORD[16+rdi] ; state words 4..7
+ movdqa xmm7,XMMWORD[((512-128))+rcx] ; xmm7 = per-dword byte-reversal mask stored at K256+512
+ ; The shuffles below regroup the state: xmm1 ends up holding words 5,4,1,0 and xmm2 words 7,6,3,2 (low dword first).
+ pshufd xmm0,xmm1,0x1b
+ pshufd xmm1,xmm1,0xb1
+ pshufd xmm2,xmm2,0x1b
+ movdqa xmm8,xmm7 ; keep a copy of the mask; xmm7 is reused as scratch inside the loop
+DB 102,15,58,15,202,8 ; palignr xmm1,xmm2,8
+ punpcklqdq xmm2,xmm0
+ jmp NEAR $L$oop_shaext
+
+ALIGN 16
+$L$oop_shaext:
+ movdqu xmm3,XMMWORD[rsi] ; load the 64-byte input block into xmm3..xmm6
+ movdqu xmm4,XMMWORD[16+rsi]
+ movdqu xmm5,XMMWORD[32+rsi]
+DB 102,15,56,0,223 ; pshufb xmm3,xmm7 (byte-swap each dword)
+ movdqu xmm6,XMMWORD[48+rsi]
+ ; Every group below: xmm0 = table row + message words, then two sha256rnds2 (implicit operand xmm0), two rounds each.
+ movdqa xmm0,XMMWORD[((0-128))+rcx]
+ paddd xmm0,xmm3
+DB 102,15,56,0,231 ; pshufb xmm4,xmm7
+ movdqa xmm10,xmm2 ; save the incoming state (added back after the last round)
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e ; move the upper two dwords of xmm0 down for the next two rounds
+ nop
+ movdqa xmm9,xmm1 ; save the other half of the incoming state
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+
+ movdqa xmm0,XMMWORD[((32-128))+rcx]
+ paddd xmm0,xmm4
+DB 102,15,56,0,239 ; pshufb xmm5,xmm7
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ lea rsi,[64+rsi] ; advance the input pointer to the next block
+DB 15,56,204,220 ; sha256msg1 xmm3,xmm4
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+
+ movdqa xmm0,XMMWORD[((64-128))+rcx]
+ paddd xmm0,xmm5
+DB 102,15,56,0,247 ; pshufb xmm6,xmm7 (last use of the mask; xmm7 is scratch from here on)
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4 ; palignr xmm7,xmm5,4
+ nop
+ paddd xmm3,xmm7
+DB 15,56,204,229 ; sha256msg1 xmm4,xmm5
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+
+ movdqa xmm0,XMMWORD[((96-128))+rcx]
+ paddd xmm0,xmm6
+DB 15,56,205,222 ; sha256msg2 xmm3,xmm6
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4 ; palignr xmm7,xmm6,4
+ nop
+ paddd xmm4,xmm7
+DB 15,56,204,238 ; sha256msg1 xmm5,xmm6
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((128-128))+rcx]
+ paddd xmm0,xmm3
+DB 15,56,205,227 ; sha256msg2 xmm4,xmm3
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4 ; palignr xmm7,xmm3,4
+ nop
+ paddd xmm5,xmm7
+DB 15,56,204,243 ; sha256msg1 xmm6,xmm3
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((160-128))+rcx]
+ paddd xmm0,xmm4
+DB 15,56,205,236 ; sha256msg2 xmm5,xmm4
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4 ; palignr xmm7,xmm4,4
+ nop
+ paddd xmm6,xmm7
+DB 15,56,204,220 ; sha256msg1 xmm3,xmm4
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((192-128))+rcx]
+ paddd xmm0,xmm5
+DB 15,56,205,245 ; sha256msg2 xmm6,xmm5
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4 ; palignr xmm7,xmm5,4
+ nop
+ paddd xmm3,xmm7
+DB 15,56,204,229 ; sha256msg1 xmm4,xmm5
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((224-128))+rcx]
+ paddd xmm0,xmm6
+DB 15,56,205,222 ; sha256msg2 xmm3,xmm6
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4 ; palignr xmm7,xmm6,4
+ nop
+ paddd xmm4,xmm7
+DB 15,56,204,238 ; sha256msg1 xmm5,xmm6
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((256-128))+rcx]
+ paddd xmm0,xmm3
+DB 15,56,205,227 ; sha256msg2 xmm4,xmm3
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4 ; palignr xmm7,xmm3,4
+ nop
+ paddd xmm5,xmm7
+DB 15,56,204,243 ; sha256msg1 xmm6,xmm3
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((288-128))+rcx]
+ paddd xmm0,xmm4
+DB 15,56,205,236 ; sha256msg2 xmm5,xmm4
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4 ; palignr xmm7,xmm4,4
+ nop
+ paddd xmm6,xmm7
+DB 15,56,204,220 ; sha256msg1 xmm3,xmm4
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((320-128))+rcx]
+ paddd xmm0,xmm5
+DB 15,56,205,245 ; sha256msg2 xmm6,xmm5
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4 ; palignr xmm7,xmm5,4
+ nop
+ paddd xmm3,xmm7
+DB 15,56,204,229 ; sha256msg1 xmm4,xmm5
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((352-128))+rcx]
+ paddd xmm0,xmm6
+DB 15,56,205,222 ; sha256msg2 xmm3,xmm6
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4 ; palignr xmm7,xmm6,4
+ nop
+ paddd xmm4,xmm7
+DB 15,56,204,238 ; sha256msg1 xmm5,xmm6
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((384-128))+rcx]
+ paddd xmm0,xmm3
+DB 15,56,205,227 ; sha256msg2 xmm4,xmm3
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4 ; palignr xmm7,xmm3,4
+ nop
+ paddd xmm5,xmm7
+DB 15,56,204,243 ; sha256msg1 xmm6,xmm3
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ movdqa xmm0,XMMWORD[((416-128))+rcx]
+ paddd xmm0,xmm4
+DB 15,56,205,236 ; sha256msg2 xmm5,xmm4
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4 ; palignr xmm7,xmm4,4
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+ paddd xmm6,xmm7
+
+ movdqa xmm0,XMMWORD[((448-128))+rcx]
+ paddd xmm0,xmm5
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+DB 15,56,205,245 ; sha256msg2 xmm6,xmm5
+ movdqa xmm7,xmm8 ; restore the byte-reversal mask for the next iteration
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+
+ movdqa xmm0,XMMWORD[((480-128))+rcx]
+ paddd xmm0,xmm6
+ nop
+DB 15,56,203,209 ; sha256rnds2 xmm2,xmm1
+ pshufd xmm0,xmm0,0x0e
+ dec rdx ; sets ZF for the jnz below; none of the instructions in between are flag-writing
+ nop
+DB 15,56,203,202 ; sha256rnds2 xmm1,xmm2
+
+ paddd xmm2,xmm10 ; add the state saved at the top of the iteration
+ paddd xmm1,xmm9
+ jnz NEAR $L$oop_shaext
+ ; Undo the register regrouping done before the loop and store the state back in its original word order.
+ pshufd xmm2,xmm2,0xb1
+ pshufd xmm7,xmm1,0x1b
+ pshufd xmm1,xmm1,0xb1
+ punpckhqdq xmm1,xmm2
+DB 102,15,58,15,215,8 ; palignr xmm2,xmm7,8
+
+ movdqu XMMWORD[rdi],xmm1
+ movdqu XMMWORD[16+rdi],xmm2
+ movaps xmm6,XMMWORD[((-8-80))+rax] ; reload xmm6-xmm10 from the save area written in the prologue
+ movaps xmm7,XMMWORD[((-8-64))+rax]
+ movaps xmm8,XMMWORD[((-8-48))+rax]
+ movaps xmm9,XMMWORD[((-8-32))+rax]
+ movaps xmm10,XMMWORD[((-8-16))+rax]
+ mov rsp,rax ; release the 88-byte frame
+$L$epilogue_shaext:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha256_block_data_order_shaext:
+
+ALIGN 64
+sha256_block_data_order_ssse3: ; SHA-256 block function using scalar rounds interleaved with SSSE3 message-schedule code. Incoming rcx, rdx, r8 are copied to rdi, rsi, rdx.
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_ssse3:
+ mov rdi,rcx ; rdi = pointer to the eight 32-bit state words
+ mov rsi,rdx ; rsi = input pointer
+ mov rdx,r8 ; rdx = number of 64-byte blocks (turned into an end pointer below)
+ ; Frame built below, relative to the 64-byte-aligned rsp: 0..63 scratch for (constant + message word) values,
+ ; 64 = state pointer, 72 = input pointer, 80 = end-of-input pointer, 88 = rsp before the pushes, 96..159 = saved xmm6-xmm9.
+
+$L$ssse3_shortcut:
+ mov rax,rsp ; rax = rsp before the pushes; kept at [88+rsp] for the epilogue
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4 ; together with the scaled lea below: rdx = rsi + 64*blocks
+ sub rsp,160
+ lea rdx,[rdx*4+rsi]
+ and rsp,-64 ; align the frame to 64 bytes
+ mov QWORD[((64+0))+rsp],rdi
+ mov QWORD[((64+8))+rsp],rsi
+ mov QWORD[((64+16))+rsp],rdx
+ mov QWORD[88+rsp],rax
+
+ movaps XMMWORD[(64+32)+rsp],xmm6
+ movaps XMMWORD[(64+48)+rsp],xmm7
+ movaps XMMWORD[(64+64)+rsp],xmm8
+ movaps XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_ssse3:
+ ; Load the eight state words: a..d in eax,ebx,ecx,edx and e..h in r8d,r9d,r10d,r11d.
+ mov eax,DWORD[rdi]
+ mov ebx,DWORD[4+rdi]
+ mov ecx,DWORD[8+rdi]
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+
+
+ jmp NEAR $L$loop_ssse3
+ALIGN 16
+$L$loop_ssse3: ; per-block setup: load 64 input bytes, byte-swap them, and stage (constant + message word) for the first 16 rounds on the stack
+ movdqa xmm7,XMMWORD[((K256+512))] ; per-dword byte-reversal mask
+ movdqu xmm0,XMMWORD[rsi]
+ movdqu xmm1,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+DB 102,15,56,0,199 ; pshufb xmm0,xmm7
+ movdqu xmm3,XMMWORD[48+rsi]
+ lea rbp,[K256] ; rbp = constant-table cursor
+DB 102,15,56,0,207 ; pshufb xmm1,xmm7
+ movdqa xmm4,XMMWORD[rbp]
+ movdqa xmm5,XMMWORD[32+rbp]
+DB 102,15,56,0,215 ; pshufb xmm2,xmm7
+ paddd xmm4,xmm0
+ movdqa xmm6,XMMWORD[64+rbp]
+DB 102,15,56,0,223 ; pshufb xmm3,xmm7
+ movdqa xmm7,XMMWORD[96+rbp] ; the mask is no longer needed; xmm7 now holds a constant row
+ paddd xmm5,xmm1
+ paddd xmm6,xmm2
+ paddd xmm7,xmm3
+ movdqa XMMWORD[rsp],xmm4 ; stack slots 0..63 = constants + message words 0..15
+ mov r14d,eax ; r14d carries the working copy of a into the first round
+ movdqa XMMWORD[16+rsp],xmm5
+ mov edi,ebx ; edi = b xor c, consumed by the first round (rdi is reloaded from the frame after the rounds)
+ movdqa XMMWORD[32+rsp],xmm6
+ xor edi,ecx
+ movdqa XMMWORD[48+rsp],xmm7
+ mov r13d,r8d ; r13d carries the working copy of e into the first round
+ jmp NEAR $L$ssse3_00_47
+
+ALIGN 16
+$L$ssse3_00_47: ; each pass runs 16 scalar rounds from the stack slots while the SSE code computes the next 16 schedule words and restages them with the next constants
+ sub rbp,-128 ; advance the table cursor by 128 bytes (four duplicated rows)
+ ror r13d,14 ; r13d: rotate 14, xor e, rotate 5, xor e, rotate 6 = (e ror 25) xor (e ror 11) xor (e ror 6)
+ movdqa xmm4,xmm1
+ mov eax,r14d
+ mov r12d,r9d ; r12d: ((f xor g) and e) xor g
+ movdqa xmm7,xmm3
+ ror r14d,9 ; r14d: rotate 9, xor a, rotate 11, xor a, rotate 2 = (a ror 22) xor (a ror 13) xor (a ror 2)
+ xor r13d,r8d
+ xor r12d,r10d
+ ror r13d,5
+ xor r14d,eax
+DB 102,15,58,15,224,4 ; palignr xmm4,xmm0,4
+ and r12d,r8d
+ xor r13d,r8d
+DB 102,15,58,15,250,4 ; palignr xmm7,xmm2,4
+ add r11d,DWORD[rsp] ; h += staged (constant + word), slot 0
+ mov r15d,eax
+ xor r12d,r10d
+ ror r14d,11
+ movdqa xmm5,xmm4
+ xor r15d,ebx ; r15d = a xor b; with edi = b xor c from the previous round: edi = ((a xor b) and (b xor c)) xor b
+ add r11d,r12d
+ movdqa xmm6,xmm4
+ ror r13d,6
+ and edi,r15d
+ psrld xmm4,3 ; xmm4/xmm5/xmm6: shift-and-xor combination (right 3, 7, 18 and left 14, 25) of the palignr result
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ paddd xmm0,xmm7
+ ror r14d,2
+ add edx,r11d ; d += h
+ psrld xmm6,7
+ add r11d,edi
+ mov r13d,edx
+ pshufd xmm7,xmm3,250
+ add r14d,r11d ; r14d = value of a for the next round
+ ror r13d,14
+ pslld xmm5,14
+ mov r11d,r14d
+ mov r12d,r8d
+ pxor xmm4,xmm6
+ ror r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ ror r13d,5
+ psrld xmm6,11
+ xor r14d,r11d
+ pxor xmm4,xmm5
+ and r12d,edx
+ xor r13d,edx
+ pslld xmm5,11
+ add r10d,DWORD[4+rsp] ; slot 1
+ mov edi,r11d
+ pxor xmm4,xmm6
+ xor r12d,r9d
+ ror r14d,11
+ movdqa xmm6,xmm7
+ xor edi,eax
+ add r10d,r12d
+ pxor xmm4,xmm5
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ psrld xmm7,10
+ add r10d,r13d
+ xor r15d,eax
+ paddd xmm0,xmm4
+ ror r14d,2
+ add ecx,r10d
+ psrlq xmm6,17
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ pxor xmm7,xmm6
+ ror r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ ror r14d,9
+ psrlq xmm6,2
+ xor r13d,ecx
+ xor r12d,r8d
+ pxor xmm7,xmm6
+ ror r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ pshufd xmm7,xmm7,128
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp] ; slot 2
+ mov r15d,r10d
+ psrldq xmm7,8
+ xor r12d,r8d
+ ror r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ ror r13d,6
+ paddd xmm0,xmm7
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ pshufd xmm7,xmm0,80
+ xor edi,r11d
+ ror r14d,2
+ add ebx,r9d
+ movdqa xmm6,xmm7
+ add r9d,edi
+ mov r13d,ebx
+ psrld xmm7,10
+ add r14d,r9d
+ ror r13d,14
+ psrlq xmm6,17
+ mov r9d,r14d
+ mov r12d,ecx
+ pxor xmm7,xmm6
+ ror r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ ror r13d,5
+ xor r14d,r9d
+ psrlq xmm6,2
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp] ; slot 3
+ pxor xmm7,xmm6
+ mov edi,r9d
+ xor r12d,edx
+ ror r14d,11
+ pshufd xmm7,xmm7,8
+ xor edi,r10d
+ add r8d,r12d
+ movdqa xmm6,XMMWORD[rbp] ; next constants for the four words just finished in xmm0
+ ror r13d,6
+ and r15d,edi
+ pslldq xmm7,8
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ paddd xmm0,xmm7
+ ror r14d,2
+ add eax,r8d
+ add r8d,r15d
+ paddd xmm6,xmm0
+ mov r13d,eax
+ add r14d,r8d
+ movdqa XMMWORD[rsp],xmm6 ; restage slots 0..3 for the next pass
+ ror r13d,14
+ movdqa xmm4,xmm2
+ mov r8d,r14d
+ mov r12d,ebx
+ movdqa xmm7,xmm0
+ ror r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ ror r13d,5
+ xor r14d,r8d
+DB 102,15,58,15,225,4 ; palignr xmm4,xmm1,4
+ and r12d,eax
+ xor r13d,eax
+DB 102,15,58,15,251,4 ; palignr xmm7,xmm3,4
+ add edx,DWORD[16+rsp] ; slot 4
+ mov r15d,r8d
+ xor r12d,ecx
+ ror r14d,11
+ movdqa xmm5,xmm4
+ xor r15d,r9d
+ add edx,r12d
+ movdqa xmm6,xmm4
+ ror r13d,6
+ and edi,r15d
+ psrld xmm4,3
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ paddd xmm1,xmm7
+ ror r14d,2
+ add r11d,edx
+ psrld xmm6,7
+ add edx,edi
+ mov r13d,r11d
+ pshufd xmm7,xmm0,250
+ add r14d,edx
+ ror r13d,14
+ pslld xmm5,14
+ mov edx,r14d
+ mov r12d,eax
+ pxor xmm4,xmm6
+ ror r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ ror r13d,5
+ psrld xmm6,11
+ xor r14d,edx
+ pxor xmm4,xmm5
+ and r12d,r11d
+ xor r13d,r11d
+ pslld xmm5,11
+ add ecx,DWORD[20+rsp] ; slot 5
+ mov edi,edx
+ pxor xmm4,xmm6
+ xor r12d,ebx
+ ror r14d,11
+ movdqa xmm6,xmm7
+ xor edi,r8d
+ add ecx,r12d
+ pxor xmm4,xmm5
+ ror r13d,6
+ and r15d,edi
+ xor r14d,edx
+ psrld xmm7,10
+ add ecx,r13d
+ xor r15d,r8d
+ paddd xmm1,xmm4
+ ror r14d,2
+ add r10d,ecx
+ psrlq xmm6,17
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ pxor xmm7,xmm6
+ ror r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ ror r14d,9
+ psrlq xmm6,2
+ xor r13d,r10d
+ xor r12d,eax
+ pxor xmm7,xmm6
+ ror r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ pshufd xmm7,xmm7,128
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp] ; slot 6
+ mov r15d,ecx
+ psrldq xmm7,8
+ xor r12d,eax
+ ror r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ ror r13d,6
+ paddd xmm1,xmm7
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ pshufd xmm7,xmm1,80
+ xor edi,edx
+ ror r14d,2
+ add r9d,ebx
+ movdqa xmm6,xmm7
+ add ebx,edi
+ mov r13d,r9d
+ psrld xmm7,10
+ add r14d,ebx
+ ror r13d,14
+ psrlq xmm6,17
+ mov ebx,r14d
+ mov r12d,r10d
+ pxor xmm7,xmm6
+ ror r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ ror r13d,5
+ xor r14d,ebx
+ psrlq xmm6,2
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[28+rsp] ; slot 7
+ pxor xmm7,xmm6
+ mov edi,ebx
+ xor r12d,r11d
+ ror r14d,11
+ pshufd xmm7,xmm7,8
+ xor edi,ecx
+ add eax,r12d
+ movdqa xmm6,XMMWORD[32+rbp]
+ ror r13d,6
+ and r15d,edi
+ pslldq xmm7,8
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ paddd xmm1,xmm7
+ ror r14d,2
+ add r8d,eax
+ add eax,r15d
+ paddd xmm6,xmm1
+ mov r13d,r8d
+ add r14d,eax
+ movdqa XMMWORD[16+rsp],xmm6 ; restage slots 4..7
+ ror r13d,14
+ movdqa xmm4,xmm3
+ mov eax,r14d
+ mov r12d,r9d
+ movdqa xmm7,xmm1
+ ror r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ ror r13d,5
+ xor r14d,eax
+DB 102,15,58,15,226,4 ; palignr xmm4,xmm2,4
+ and r12d,r8d
+ xor r13d,r8d
+DB 102,15,58,15,248,4 ; palignr xmm7,xmm0,4
+ add r11d,DWORD[32+rsp] ; slot 8
+ mov r15d,eax
+ xor r12d,r10d
+ ror r14d,11
+ movdqa xmm5,xmm4
+ xor r15d,ebx
+ add r11d,r12d
+ movdqa xmm6,xmm4
+ ror r13d,6
+ and edi,r15d
+ psrld xmm4,3
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ paddd xmm2,xmm7
+ ror r14d,2
+ add edx,r11d
+ psrld xmm6,7
+ add r11d,edi
+ mov r13d,edx
+ pshufd xmm7,xmm1,250
+ add r14d,r11d
+ ror r13d,14
+ pslld xmm5,14
+ mov r11d,r14d
+ mov r12d,r8d
+ pxor xmm4,xmm6
+ ror r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ ror r13d,5
+ psrld xmm6,11
+ xor r14d,r11d
+ pxor xmm4,xmm5
+ and r12d,edx
+ xor r13d,edx
+ pslld xmm5,11
+ add r10d,DWORD[36+rsp] ; slot 9
+ mov edi,r11d
+ pxor xmm4,xmm6
+ xor r12d,r9d
+ ror r14d,11
+ movdqa xmm6,xmm7
+ xor edi,eax
+ add r10d,r12d
+ pxor xmm4,xmm5
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ psrld xmm7,10
+ add r10d,r13d
+ xor r15d,eax
+ paddd xmm2,xmm4
+ ror r14d,2
+ add ecx,r10d
+ psrlq xmm6,17
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ pxor xmm7,xmm6
+ ror r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ ror r14d,9
+ psrlq xmm6,2
+ xor r13d,ecx
+ xor r12d,r8d
+ pxor xmm7,xmm6
+ ror r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ pshufd xmm7,xmm7,128
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp] ; slot 10
+ mov r15d,r10d
+ psrldq xmm7,8
+ xor r12d,r8d
+ ror r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ ror r13d,6
+ paddd xmm2,xmm7
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ pshufd xmm7,xmm2,80
+ xor edi,r11d
+ ror r14d,2
+ add ebx,r9d
+ movdqa xmm6,xmm7
+ add r9d,edi
+ mov r13d,ebx
+ psrld xmm7,10
+ add r14d,r9d
+ ror r13d,14
+ psrlq xmm6,17
+ mov r9d,r14d
+ mov r12d,ecx
+ pxor xmm7,xmm6
+ ror r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ ror r13d,5
+ xor r14d,r9d
+ psrlq xmm6,2
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp] ; slot 11
+ pxor xmm7,xmm6
+ mov edi,r9d
+ xor r12d,edx
+ ror r14d,11
+ pshufd xmm7,xmm7,8
+ xor edi,r10d
+ add r8d,r12d
+ movdqa xmm6,XMMWORD[64+rbp]
+ ror r13d,6
+ and r15d,edi
+ pslldq xmm7,8
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ paddd xmm2,xmm7
+ ror r14d,2
+ add eax,r8d
+ add r8d,r15d
+ paddd xmm6,xmm2
+ mov r13d,eax
+ add r14d,r8d
+ movdqa XMMWORD[32+rsp],xmm6 ; restage slots 8..11
+ ror r13d,14
+ movdqa xmm4,xmm0
+ mov r8d,r14d
+ mov r12d,ebx
+ movdqa xmm7,xmm2
+ ror r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ ror r13d,5
+ xor r14d,r8d
+DB 102,15,58,15,227,4 ; palignr xmm4,xmm3,4
+ and r12d,eax
+ xor r13d,eax
+DB 102,15,58,15,249,4 ; palignr xmm7,xmm1,4
+ add edx,DWORD[48+rsp] ; slot 12
+ mov r15d,r8d
+ xor r12d,ecx
+ ror r14d,11
+ movdqa xmm5,xmm4
+ xor r15d,r9d
+ add edx,r12d
+ movdqa xmm6,xmm4
+ ror r13d,6
+ and edi,r15d
+ psrld xmm4,3
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ paddd xmm3,xmm7
+ ror r14d,2
+ add r11d,edx
+ psrld xmm6,7
+ add edx,edi
+ mov r13d,r11d
+ pshufd xmm7,xmm2,250
+ add r14d,edx
+ ror r13d,14
+ pslld xmm5,14
+ mov edx,r14d
+ mov r12d,eax
+ pxor xmm4,xmm6
+ ror r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ ror r13d,5
+ psrld xmm6,11
+ xor r14d,edx
+ pxor xmm4,xmm5
+ and r12d,r11d
+ xor r13d,r11d
+ pslld xmm5,11
+ add ecx,DWORD[52+rsp] ; slot 13
+ mov edi,edx
+ pxor xmm4,xmm6
+ xor r12d,ebx
+ ror r14d,11
+ movdqa xmm6,xmm7
+ xor edi,r8d
+ add ecx,r12d
+ pxor xmm4,xmm5
+ ror r13d,6
+ and r15d,edi
+ xor r14d,edx
+ psrld xmm7,10
+ add ecx,r13d
+ xor r15d,r8d
+ paddd xmm3,xmm4
+ ror r14d,2
+ add r10d,ecx
+ psrlq xmm6,17
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ pxor xmm7,xmm6
+ ror r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ ror r14d,9
+ psrlq xmm6,2
+ xor r13d,r10d
+ xor r12d,eax
+ pxor xmm7,xmm6
+ ror r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ pshufd xmm7,xmm7,128
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp] ; slot 14
+ mov r15d,ecx
+ psrldq xmm7,8
+ xor r12d,eax
+ ror r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ ror r13d,6
+ paddd xmm3,xmm7
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ pshufd xmm7,xmm3,80
+ xor edi,edx
+ ror r14d,2
+ add r9d,ebx
+ movdqa xmm6,xmm7
+ add ebx,edi
+ mov r13d,r9d
+ psrld xmm7,10
+ add r14d,ebx
+ ror r13d,14
+ psrlq xmm6,17
+ mov ebx,r14d
+ mov r12d,r10d
+ pxor xmm7,xmm6
+ ror r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ ror r13d,5
+ xor r14d,ebx
+ psrlq xmm6,2
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[60+rsp] ; slot 15
+ pxor xmm7,xmm6
+ mov edi,ebx
+ xor r12d,r11d
+ ror r14d,11
+ pshufd xmm7,xmm7,8
+ xor edi,ecx
+ add eax,r12d
+ movdqa xmm6,XMMWORD[96+rbp]
+ ror r13d,6
+ and r15d,edi
+ pslldq xmm7,8
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ paddd xmm3,xmm7
+ ror r14d,2
+ add r8d,eax
+ add eax,r15d
+ paddd xmm6,xmm3
+ mov r13d,r8d
+ add r14d,eax
+ movdqa XMMWORD[48+rsp],xmm6 ; restage slots 12..15
+ cmp BYTE[131+rbp],0 ; top byte of the dword at rbp+128; it is 0x00 only for the 0x00010203 mask stored right after the constants
+ jne NEAR $L$ssse3_00_47 ; taken twice: exits on the third pass, when rbp+128 reaches K256+512
+ ror r13d,14 ; last 16 rounds: same scalar round as in the loop above, with no SSE schedule work (slots 0..63 were staged by the final loop pass)
+ mov eax,r14d
+ mov r12d,r9d
+ ror r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ ror r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[rsp] ; slot 0
+ mov r15d,eax
+ xor r12d,r10d
+ ror r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ ror r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ ror r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ ror r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ ror r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[4+rsp] ; slot 1
+ mov edi,r11d
+ xor r12d,r9d
+ ror r14d,11
+ xor edi,eax
+ add r10d,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ ror r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ ror r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ ror r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ ror r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp] ; slot 2
+ mov r15d,r10d
+ xor r12d,r8d
+ ror r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ ror r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ ror r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ ror r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ ror r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp] ; slot 3
+ mov edi,r9d
+ xor r12d,edx
+ ror r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ ror r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ ror r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ ror r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ ror r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[16+rsp] ; slot 4
+ mov r15d,r8d
+ xor r12d,ecx
+ ror r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ ror r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ ror r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ ror r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ ror r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[20+rsp] ; slot 5
+ mov edi,edx
+ xor r12d,ebx
+ ror r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ ror r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ ror r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ ror r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ ror r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp] ; slot 6
+ mov r15d,ecx
+ xor r12d,eax
+ ror r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ ror r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ ror r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ ror r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ ror r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[28+rsp] ; slot 7
+ mov edi,ebx
+ xor r12d,r11d
+ ror r14d,11
+ xor edi,ecx
+ add eax,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ ror r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ ror r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ ror r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ ror r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[32+rsp] ; slot 8
+ mov r15d,eax
+ xor r12d,r10d
+ ror r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ ror r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ ror r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ ror r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ ror r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[36+rsp] ; slot 9
+ mov edi,r11d
+ xor r12d,r9d
+ ror r14d,11
+ xor edi,eax
+ add r10d,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ ror r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ ror r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ ror r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ ror r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp] ; slot 10
+ mov r15d,r10d
+ xor r12d,r8d
+ ror r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ ror r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ ror r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ ror r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ ror r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp] ; slot 11
+ mov edi,r9d
+ xor r12d,edx
+ ror r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ ror r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ ror r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ ror r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ ror r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[48+rsp] ; slot 12
+ mov r15d,r8d
+ xor r12d,ecx
+ ror r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ ror r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ ror r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ ror r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ ror r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[52+rsp] ; slot 13
+ mov edi,edx
+ xor r12d,ebx
+ ror r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ ror r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ ror r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ ror r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ ror r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp] ; slot 14
+ mov r15d,ecx
+ xor r12d,eax
+ ror r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ ror r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ ror r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ ror r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ ror r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[60+rsp] ; slot 15
+ mov edi,ebx
+ xor r12d,r11d
+ ror r14d,11
+ xor edi,ecx
+ add eax,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ ror r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ mov rdi,QWORD[((64+0))+rsp] ; reload the state pointer (edi was used as round scratch)
+ mov eax,r14d ; a = value produced by the last round
+ ; Add the previous state words to the working variables, then store the sums back as the new state.
+ add eax,DWORD[rdi]
+ lea rsi,[64+rsi] ; advance to the next input block
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ add r10d,DWORD[24+rdi]
+ add r11d,DWORD[28+rdi]
+
+ cmp rsi,QWORD[((64+16))+rsp] ; compare with the end pointer; the mov stores below leave the flags intact for the jb
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+ jb NEAR $L$loop_ssse3 ; more input left (unsigned compare on pointers)
+
+ mov rsi,QWORD[88+rsp] ; rsi = rsp value recorded before the six pushes
+
+ movaps xmm6,XMMWORD[((64+32))+rsp]
+ movaps xmm7,XMMWORD[((64+48))+rsp]
+ movaps xmm8,XMMWORD[((64+64))+rsp]
+ movaps xmm9,XMMWORD[((64+80))+rsp]
+ mov r15,QWORD[((-48))+rsi] ; pushed registers sit just below that recorded rsp, in push order rbx..r15
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi] ; drop the aligned frame and the pushes in one step
+
+$L$epilogue_ssse3:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha256_block_data_order_ssse3:
+
+ALIGN 64 ; entry point is placed on a 64-byte boundary
+sha256_block_data_order_avx: ; AVX code path; Win64 argument registers rcx/rdx/r8 are remapped to rdi/rsi/rdx below
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi ; rdi/rsi are callee-saved on Win64; both are reloaded from these slots just before the return
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_avx:
+ mov rdi,rcx ; rdi = 1st argument: pointer to eight 32-bit state words (loaded below, stored back after each 64-byte block)
+ mov rsi,rdx ; rsi = 2nd argument: input pointer, advanced by 64 per outer-loop iteration
+ mov rdx,r8 ; rdx = 3rd argument: scaled by 64 below, i.e. treated as a count of 64-byte blocks
+ ; Frame after "and rsp,-64": [rsp+0..63] = 16 dwords of (message word + table constant),
+ ; [rsp+64] = state pointer, [rsp+72] = input pointer, [rsp+80] = end-of-input pointer,
+ ; [rsp+88] = entry rsp, [rsp+96..159] = saved xmm6..xmm9.
+$L$avx_shortcut:
+ mov rax,rsp ; rax = stack pointer before the pushes; stored at [88+rsp] and used at exit to unwind
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4 ; rdx *= 16 ...
+ sub rsp,160 ; reserve 160 bytes of locals
+ lea rdx,[rdx*4+rsi] ; ... then *4 and + rsi: rdx = rsi + 64*count = first byte past the input
+ and rsp,-64 ; round rsp down to a 64-byte boundary (makes the vmovdqa/movaps stack accesses aligned)
+ mov QWORD[((64+0))+rsp],rdi ; save state pointer (rdi is reused as scratch in the rounds)
+ mov QWORD[((64+8))+rsp],rsi ; save input pointer
+ mov QWORD[((64+16))+rsp],rdx ; save end-of-input pointer (outer-loop bound)
+ mov QWORD[88+rsp],rax ; save entry rsp
+ ; xmm6..xmm9 are callee-saved under the Microsoft x64 ABI and are overwritten below
+ movaps XMMWORD[(64+32)+rsp],xmm6
+ movaps XMMWORD[(64+48)+rsp],xmm7
+ movaps XMMWORD[(64+64)+rsp],xmm8
+ movaps XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_avx:
+
+ vzeroupper
+ mov eax,DWORD[rdi] ; load the eight state words into eax, ebx, ecx, edx, r8d..r11d
+ mov ebx,DWORD[4+rdi]
+ mov ecx,DWORD[8+rdi]
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+ vmovdqa xmm8,XMMWORD[((K256+512+32))] ; vpshufb control vectors kept in xmm8/xmm9 for the whole call;
+ vmovdqa xmm9,XMMWORD[((K256+512+64))] ; K256 is defined outside this chunk, so their contents are not visible here
+ jmp NEAR $L$loop_avx
+ALIGN 16
+$L$loop_avx: ; outer loop: one iteration per 64 bytes of input at rsi
+ vmovdqa xmm7,XMMWORD[((K256+512))] ; vpshufb control for the input bytes (presumably a big-endian byte swap; table not visible here)
+ vmovdqu xmm0,XMMWORD[rsi] ; unaligned loads: 4 x 16 bytes = 16 message dwords
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm0,xmm0,xmm7
+ lea rbp,[K256] ; rbp = base of the constant table
+ vpshufb xmm1,xmm1,xmm7
+ vpshufb xmm2,xmm2,xmm7
+ vpaddd xmm4,xmm0,XMMWORD[rbp] ; xmm4..xmm7 = message words + table rows at rbp+0/32/64/96
+ vpshufb xmm3,xmm3,xmm7
+ vpaddd xmm5,xmm1,XMMWORD[32+rbp]
+ vpaddd xmm6,xmm2,XMMWORD[64+rbp]
+ vpaddd xmm7,xmm3,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[rsp],xmm4 ; park the 16 sums at [rsp+0..63] for the scalar rounds to read
+ mov r14d,eax ; r14d mirrors eax so the first round's "mov eax,r14d" is a no-op
+ vmovdqa XMMWORD[16+rsp],xmm5
+ mov edi,ebx
+ vmovdqa XMMWORD[32+rsp],xmm6
+ xor edi,ecx ; edi = ebx ^ ecx, consumed by the first round's majority computation
+ vmovdqa XMMWORD[48+rsp],xmm7
+ mov r13d,r8d ; r13d = copy of r8d, input to the first round's rotate/xor chain
+ jmp NEAR $L$avx_00_47
+
+ALIGN 16
+$L$avx_00_47: ; each pass: 16 scalar rounds interleaved with the vector update of xmm0..xmm3 for the next pass
+ sub rbp,-128 ; rbp += 128: step to the table rows that get added to the freshly computed words
+ vpalignr xmm4,xmm1,xmm0,4 ; xmm4 = xmm1:xmm0 shifted right by 4 bytes (dwords 1..3 of xmm0, dword 0 of xmm1)
+ shrd r13d,r13d,14 ; shrd of a register with itself is a rotate right; r13d is a copy of r8d here
+ mov eax,r14d ; eax takes the sum finished in r14d at the end of the previous round
+ mov r12d,r9d
+ vpalignr xmm7,xmm3,xmm2,4 ; xmm7 = dwords 1..3 of xmm2, dword 0 of xmm3
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ vpaddd xmm0,xmm0,xmm7
+ xor r13d,r8d
+ add r11d,DWORD[rsp] ; round reading [rsp+0]: r11d accumulates the stored word+constant
+ mov r15d,eax
+ vpsrld xmm7,xmm4,3
+ xor r12d,r10d ; r12d = ((r9d ^ r10d) AND r8d) ^ r10d: r9d bits where r8d is 1, r10d bits elsewhere
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ vpslld xmm5,xmm4,14
+ add r11d,r12d
+ shrd r13d,r13d,6 ; r13d = ror(r8d,25) ^ ror(r8d,11) ^ ror(r8d,6)
+ and edi,r15d ; edi held ebx ^ ecx on entry to this round
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx ; edi = ((ebx ^ ecx) AND (eax ^ ebx)) ^ ebx: bitwise majority of eax, ebx, ecx
+ vpshufd xmm7,xmm3,250 ; imm 250 selects dwords 2,2,3,3 of xmm3
+ shrd r14d,r14d,2 ; r14d = ror(eax,22) ^ ror(eax,13) ^ ror(eax,2)
+ add edx,r11d
+ add r11d,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,edx ; seed the next round's rotate chain
+ add r14d,r11d ; deferred: the next round starts by moving this into r11d
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ vpsrld xmm6,xmm7,10
+ add r10d,DWORD[4+rsp] ; round reading [rsp+4]: same steps with the register roles rotated by one
+ mov edi,r11d
+ xor r12d,r9d
+ vpxor xmm4,xmm4,xmm5 ; per dword x of the first vpalignr result: xmm4 = (x shr 3) ^ ror(x,7) ^ ror(x,18)
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ vpaddd xmm0,xmm0,xmm4
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ vpsrlq xmm7,xmm7,2
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ vpxor xmm6,xmm6,xmm7 ; even dwords now hold (y shr 10) ^ ror(y,17) ^ ror(y,19) for the duplicated source dwords y
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ vpshufb xmm6,xmm6,xmm8 ; reposition via the xmm8 control vector (contents not visible in this chunk)
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ vpaddd xmm0,xmm0,xmm6
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp] ; round reading [rsp+8]
+ vpshufd xmm7,xmm0,80 ; imm 80 selects dwords 0,0,1,1 of the partially updated xmm0
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ vpsrlq xmm7,xmm7,2
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ vpshufb xmm6,xmm6,xmm9 ; second half, repositioned via the xmm9 control vector
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ vpaddd xmm0,xmm0,xmm6 ; xmm0 now holds four new schedule words
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ vpaddd xmm6,xmm0,XMMWORD[rbp] ; new words + table row at [rbp+0]
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp] ; round reading [rsp+12]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ vmovdqa XMMWORD[rsp],xmm6 ; [rsp+0..15] have all been read above; overwrite them with the next pass's sums
+ vpalignr xmm4,xmm2,xmm1,4 ; second group: same vector recipe, now updating xmm1
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ vpalignr xmm7,xmm0,xmm3,4
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ vpaddd xmm1,xmm1,xmm7
+ xor r13d,eax
+ add edx,DWORD[16+rsp] ; round reading [rsp+16]
+ mov r15d,r8d
+ vpsrld xmm7,xmm4,3
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ vpslld xmm5,xmm4,14
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ vpshufd xmm7,xmm0,250
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ vpsrld xmm6,xmm7,10
+ add ecx,DWORD[20+rsp] ; round reading [rsp+20]
+ mov edi,edx
+ xor r12d,ebx
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ vpaddd xmm1,xmm1,xmm4
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ vpsrlq xmm7,xmm7,2
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ vpaddd xmm1,xmm1,xmm6
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp] ; round reading [rsp+24]
+ vpshufd xmm7,xmm1,80
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ vpsrlq xmm7,xmm7,2
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ vpaddd xmm1,xmm1,xmm6
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ vpaddd xmm6,xmm1,XMMWORD[32+rbp] ; new words + table row at [rbp+32]
+ xor r13d,r9d
+ add eax,DWORD[28+rsp] ; round reading [rsp+28]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ vmovdqa XMMWORD[16+rsp],xmm6 ; refresh [rsp+16..31] for the next pass
+ vpalignr xmm4,xmm3,xmm2,4 ; third group: updating xmm2
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ vpalignr xmm7,xmm1,xmm0,4
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ vpaddd xmm2,xmm2,xmm7
+ xor r13d,r8d
+ add r11d,DWORD[32+rsp] ; round reading [rsp+32]
+ mov r15d,eax
+ vpsrld xmm7,xmm4,3
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ vpslld xmm5,xmm4,14
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ vpshufd xmm7,xmm1,250
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ vpsrld xmm6,xmm7,10
+ add r10d,DWORD[36+rsp] ; round reading [rsp+36]
+ mov edi,r11d
+ xor r12d,r9d
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ vpaddd xmm2,xmm2,xmm4
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ vpsrlq xmm7,xmm7,2
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ vpaddd xmm2,xmm2,xmm6
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp] ; round reading [rsp+40]
+ vpshufd xmm7,xmm2,80
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ vpsrlq xmm7,xmm7,2
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ vpaddd xmm2,xmm2,xmm6
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ vpaddd xmm6,xmm2,XMMWORD[64+rbp] ; new words + table row at [rbp+64]
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp] ; round reading [rsp+44]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ vmovdqa XMMWORD[32+rsp],xmm6 ; refresh [rsp+32..47] for the next pass
+ vpalignr xmm4,xmm0,xmm3,4 ; fourth group: updating xmm3
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ vpalignr xmm7,xmm2,xmm1,4
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ vpaddd xmm3,xmm3,xmm7
+ xor r13d,eax
+ add edx,DWORD[48+rsp] ; round reading [rsp+48]
+ mov r15d,r8d
+ vpsrld xmm7,xmm4,3
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ vpslld xmm5,xmm4,14
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ vpshufd xmm7,xmm2,250
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ vpsrld xmm6,xmm7,10
+ add ecx,DWORD[52+rsp] ; round reading [rsp+52]
+ mov edi,edx
+ xor r12d,ebx
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ vpaddd xmm3,xmm3,xmm4
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ vpsrlq xmm7,xmm7,2
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ vpaddd xmm3,xmm3,xmm6
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp] ; round reading [rsp+56]
+ vpshufd xmm7,xmm3,80
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ vpsrlq xmm7,xmm7,2
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ vpaddd xmm3,xmm3,xmm6
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ vpaddd xmm6,xmm3,XMMWORD[96+rbp] ; new words + table row at [rbp+96]
+ xor r13d,r9d
+ add eax,DWORD[60+rsp] ; round reading [rsp+60]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ vmovdqa XMMWORD[48+rsp],xmm6 ; refresh [rsp+48..63] for the next pass
+ cmp BYTE[131+rbp],0 ; loop while the byte at rbp+131 is non-zero (presumably a zero byte in the table marks its end; table not visible here)
+ jne NEAR $L$avx_00_47
+ shrd r13d,r13d,14 ; fall-through after the loop: 16 more rounds on the sums left at [rsp+0..63], with no vector schedule work
+ mov eax,r14d
+ mov r12d,r9d
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[rsp] ; round reading [rsp+0]
+ mov r15d,eax
+ xor r12d,r10d ; r12d = ((r9d ^ r10d) AND r8d) ^ r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ shrd r13d,r13d,6 ; r13d = ror(r8d,25) ^ ror(r8d,11) ^ ror(r8d,6)
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx ; edi = bitwise majority of eax, ebx, ecx
+ shrd r14d,r14d,2 ; r14d = ror(eax,22) ^ ror(eax,13) ^ ror(eax,2)
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[4+rsp] ; round reading [rsp+4]: same steps, register roles rotated by one
+ mov edi,r11d
+ xor r12d,r9d
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp] ; round reading [rsp+8]
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp] ; round reading [rsp+12]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[16+rsp] ; round reading [rsp+16]
+ mov r15d,r8d
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[20+rsp] ; round reading [rsp+20]
+ mov edi,edx
+ xor r12d,ebx
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp] ; round reading [rsp+24]
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[28+rsp] ; round reading [rsp+28]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[32+rsp] ; round reading [rsp+32]
+ mov r15d,eax
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[36+rsp] ; round reading [rsp+36]
+ mov edi,r11d
+ xor r12d,r9d
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp] ; round reading [rsp+40]
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp] ; round reading [rsp+44]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[48+rsp] ; round reading [rsp+48]
+ mov r15d,r8d
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[52+rsp] ; round reading [rsp+52]
+ mov edi,edx
+ xor r12d,ebx
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp] ; round reading [rsp+56]
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[60+rsp] ; round reading [rsp+60] (last stored word)
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ mov rdi,QWORD[((64+0))+rsp] ; reload the state pointer saved in the prologue (edi was scratch during the rounds)
+ mov eax,r14d ; complete the deferred sum from the last round
+ ; fold the working registers into the state words in memory
+ add eax,DWORD[rdi]
+ lea rsi,[64+rsi] ; advance the input pointer by 64 bytes; lea leaves the flags untouched
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ add r10d,DWORD[24+rdi]
+ add r11d,DWORD[28+rdi]
+
+ cmp rsi,QWORD[((64+16))+rsp] ; compare with the saved end-of-input pointer; the mov stores below do not alter flags
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+ jb NEAR $L$loop_avx ; unsigned: more input remains while rsi is below the end pointer
+
+ mov rsi,QWORD[88+rsp] ; rsi = stack pointer value recorded in the prologue, before the six pushes
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((64+32))+rsp] ; restore the callee-saved xmm6..xmm9 from the frame
+ movaps xmm7,XMMWORD[((64+48))+rsp]
+ movaps xmm8,XMMWORD[((64+64))+rsp]
+ movaps xmm9,XMMWORD[((64+80))+rsp]
+ mov r15,QWORD[((-48))+rsi] ; pushed registers sit just below the recorded rsp: rbx at -8 down to r15 at -48
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi] ; drop the aligned frame and the pushes in one step
+
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp] ; reload rdi/rsi from the slots written by the WIN64 prologue
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha256_block_data_order_avx:
+
+ALIGN 64
+sha256_block_data_order_avx2:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_avx2:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+$L$avx2_shortcut:
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,608
+ shl rdx,4
+ and rsp,-256*4
+ lea rdx,[rdx*4+rsi]
+ add rsp,448
+ mov QWORD[((64+0))+rsp],rdi
+ mov QWORD[((64+8))+rsp],rsi
+ mov QWORD[((64+16))+rsp],rdx
+ mov QWORD[88+rsp],rax
+
+ movaps XMMWORD[(64+32)+rsp],xmm6
+ movaps XMMWORD[(64+48)+rsp],xmm7
+ movaps XMMWORD[(64+64)+rsp],xmm8
+ movaps XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_avx2:
+
+ vzeroupper
+ sub rsi,-16*4
+ mov eax,DWORD[rdi]
+ mov r12,rsi
+ mov ebx,DWORD[4+rdi]
+ cmp rsi,rdx
+ mov ecx,DWORD[8+rdi]
+ cmove r12,rsp
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+ vmovdqa ymm8,YMMWORD[((K256+512+32))]
+ vmovdqa ymm9,YMMWORD[((K256+512+64))]
+ jmp NEAR $L$oop_avx2
+ALIGN 16
+$L$oop_avx2:
+ vmovdqa ymm7,YMMWORD[((K256+512))]
+ vmovdqu xmm0,XMMWORD[((-64+0))+rsi]
+ vmovdqu xmm1,XMMWORD[((-64+16))+rsi]
+ vmovdqu xmm2,XMMWORD[((-64+32))+rsi]
+ vmovdqu xmm3,XMMWORD[((-64+48))+rsi]
+
+ vinserti128 ymm0,ymm0,XMMWORD[r12],1
+ vinserti128 ymm1,ymm1,XMMWORD[16+r12],1
+ vpshufb ymm0,ymm0,ymm7
+ vinserti128 ymm2,ymm2,XMMWORD[32+r12],1
+ vpshufb ymm1,ymm1,ymm7
+ vinserti128 ymm3,ymm3,XMMWORD[48+r12],1
+
+ lea rbp,[K256]
+ vpshufb ymm2,ymm2,ymm7
+ vpaddd ymm4,ymm0,YMMWORD[rbp]
+ vpshufb ymm3,ymm3,ymm7
+ vpaddd ymm5,ymm1,YMMWORD[32+rbp]
+ vpaddd ymm6,ymm2,YMMWORD[64+rbp]
+ vpaddd ymm7,ymm3,YMMWORD[96+rbp]
+ vmovdqa YMMWORD[rsp],ymm4
+ xor r14d,r14d
+ vmovdqa YMMWORD[32+rsp],ymm5
+ lea rsp,[((-64))+rsp]
+ mov edi,ebx
+ vmovdqa YMMWORD[rsp],ymm6
+ xor edi,ecx
+ vmovdqa YMMWORD[32+rsp],ymm7
+ mov r12d,r9d
+ sub rbp,-16*2*4
+ jmp NEAR $L$avx2_00_47
+
+ALIGN 16
+$L$avx2_00_47:
+ lea rsp,[((-64))+rsp]
+ vpalignr ymm4,ymm1,ymm0,4
+ add r11d,DWORD[((0+128))+rsp]
+ and r12d,r8d
+ rorx r13d,r8d,25
+ vpalignr ymm7,ymm3,ymm2,4
+ rorx r15d,r8d,11
+ lea eax,[r14*1+rax]
+ lea r11d,[r12*1+r11]
+ vpsrld ymm6,ymm4,7
+ andn r12d,r8d,r10d
+ xor r13d,r15d
+ rorx r14d,r8d,6
+ vpaddd ymm0,ymm0,ymm7
+ lea r11d,[r12*1+r11]
+ xor r13d,r14d
+ mov r15d,eax
+ vpsrld ymm7,ymm4,3
+ rorx r12d,eax,22
+ lea r11d,[r13*1+r11]
+ xor r15d,ebx
+ vpslld ymm5,ymm4,14
+ rorx r14d,eax,13
+ rorx r13d,eax,2
+ lea edx,[r11*1+rdx]
+ vpxor ymm4,ymm7,ymm6
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,ebx
+ vpshufd ymm7,ymm3,250
+ xor r14d,r13d
+ lea r11d,[rdi*1+r11]
+ mov r12d,r8d
+ vpsrld ymm6,ymm6,11
+ add r10d,DWORD[((4+128))+rsp]
+ and r12d,edx
+ rorx r13d,edx,25
+ vpxor ymm4,ymm4,ymm5
+ rorx edi,edx,11
+ lea r11d,[r14*1+r11]
+ lea r10d,[r12*1+r10]
+ vpslld ymm5,ymm5,11
+ andn r12d,edx,r9d
+ xor r13d,edi
+ rorx r14d,edx,6
+ vpxor ymm4,ymm4,ymm6
+ lea r10d,[r12*1+r10]
+ xor r13d,r14d
+ mov edi,r11d
+ vpsrld ymm6,ymm7,10
+ rorx r12d,r11d,22
+ lea r10d,[r13*1+r10]
+ xor edi,eax
+ vpxor ymm4,ymm4,ymm5
+ rorx r14d,r11d,13
+ rorx r13d,r11d,2
+ lea ecx,[r10*1+rcx]
+ vpsrlq ymm7,ymm7,17
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,eax
+ vpaddd ymm0,ymm0,ymm4
+ xor r14d,r13d
+ lea r10d,[r15*1+r10]
+ mov r12d,edx
+ vpxor ymm6,ymm6,ymm7
+ add r9d,DWORD[((8+128))+rsp]
+ and r12d,ecx
+ rorx r13d,ecx,25
+ vpsrlq ymm7,ymm7,2
+ rorx r15d,ecx,11
+ lea r10d,[r14*1+r10]
+ lea r9d,[r12*1+r9]
+ vpxor ymm6,ymm6,ymm7
+ andn r12d,ecx,r8d
+ xor r13d,r15d
+ rorx r14d,ecx,6
+ vpshufb ymm6,ymm6,ymm8
+ lea r9d,[r12*1+r9]
+ xor r13d,r14d
+ mov r15d,r10d
+ vpaddd ymm0,ymm0,ymm6
+ rorx r12d,r10d,22
+ lea r9d,[r13*1+r9]
+ xor r15d,r11d
+ vpshufd ymm7,ymm0,80
+ rorx r14d,r10d,13
+ rorx r13d,r10d,2
+ lea ebx,[r9*1+rbx]
+ vpsrld ymm6,ymm7,10
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r11d
+ vpsrlq ymm7,ymm7,17
+ xor r14d,r13d
+ lea r9d,[rdi*1+r9]
+ mov r12d,ecx
+ vpxor ymm6,ymm6,ymm7
+ add r8d,DWORD[((12+128))+rsp]
+ and r12d,ebx
+ rorx r13d,ebx,25
+ vpsrlq ymm7,ymm7,2
+ rorx edi,ebx,11
+ lea r9d,[r14*1+r9]
+ lea r8d,[r12*1+r8]
+ vpxor ymm6,ymm6,ymm7
+ andn r12d,ebx,edx
+ xor r13d,edi
+ rorx r14d,ebx,6
+ vpshufb ymm6,ymm6,ymm9
+ lea r8d,[r12*1+r8]
+ xor r13d,r14d
+ mov edi,r9d
+ vpaddd ymm0,ymm0,ymm6
+ rorx r12d,r9d,22
+ lea r8d,[r13*1+r8]
+ xor edi,r10d
+ vpaddd ymm6,ymm0,YMMWORD[rbp]
+ rorx r14d,r9d,13
+ rorx r13d,r9d,2
+ lea eax,[r8*1+rax]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r10d
+ xor r14d,r13d
+ lea r8d,[r15*1+r8]
+ mov r12d,ebx
+ vmovdqa YMMWORD[rsp],ymm6
+ vpalignr ymm4,ymm2,ymm1,4
+ add edx,DWORD[((32+128))+rsp]
+ and r12d,eax
+ rorx r13d,eax,25
+ vpalignr ymm7,ymm0,ymm3,4
+ rorx r15d,eax,11
+ lea r8d,[r14*1+r8]
+ lea edx,[r12*1+rdx]
+ vpsrld ymm6,ymm4,7
+ andn r12d,eax,ecx
+ xor r13d,r15d
+ rorx r14d,eax,6
+ vpaddd ymm1,ymm1,ymm7
+ lea edx,[r12*1+rdx]
+ xor r13d,r14d
+ mov r15d,r8d
+ vpsrld ymm7,ymm4,3
+ rorx r12d,r8d,22
+ lea edx,[r13*1+rdx]
+ xor r15d,r9d
+ vpslld ymm5,ymm4,14
+ rorx r14d,r8d,13
+ rorx r13d,r8d,2
+ lea r11d,[rdx*1+r11]
+ vpxor ymm4,ymm7,ymm6
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r9d
+ vpshufd ymm7,ymm0,250
+ xor r14d,r13d
+ lea edx,[rdi*1+rdx]
+ mov r12d,eax
+ vpsrld ymm6,ymm6,11
+ add ecx,DWORD[((36+128))+rsp]
+ and r12d,r11d
+ rorx r13d,r11d,25
+ vpxor ymm4,ymm4,ymm5
+ rorx edi,r11d,11
+ lea edx,[r14*1+rdx]
+ lea ecx,[r12*1+rcx]
+ vpslld ymm5,ymm5,11
+ andn r12d,r11d,ebx
+ xor r13d,edi
+ rorx r14d,r11d,6
+ vpxor ymm4,ymm4,ymm6
+ lea ecx,[r12*1+rcx]
+ xor r13d,r14d
+ mov edi,edx
+ vpsrld ymm6,ymm7,10
+ rorx r12d,edx,22
+ lea ecx,[r13*1+rcx]
+ xor edi,r8d
+ vpxor ymm4,ymm4,ymm5
+ rorx r14d,edx,13
+ rorx r13d,edx,2
+ lea r10d,[rcx*1+r10]
+ vpsrlq ymm7,ymm7,17
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r8d
+ vpaddd ymm1,ymm1,ymm4
+ xor r14d,r13d
+ lea ecx,[r15*1+rcx]
+ mov r12d,r11d
+ vpxor ymm6,ymm6,ymm7
+ add ebx,DWORD[((40+128))+rsp]
+ and r12d,r10d
+ rorx r13d,r10d,25
+ vpsrlq ymm7,ymm7,2
+ rorx r15d,r10d,11
+ lea ecx,[r14*1+rcx]
+ lea ebx,[r12*1+rbx]
+ vpxor ymm6,ymm6,ymm7
+ andn r12d,r10d,eax
+ xor r13d,r15d
+ rorx r14d,r10d,6
+ vpshufb ymm6,ymm6,ymm8
+ lea ebx,[r12*1+rbx]
+ xor r13d,r14d
+ mov r15d,ecx
+ vpaddd ymm1,ymm1,ymm6
+ rorx r12d,ecx,22
+ lea ebx,[r13*1+rbx]
+ xor r15d,edx
+ vpshufd ymm7,ymm1,80
+ rorx r14d,ecx,13
+ rorx r13d,ecx,2
+ lea r9d,[rbx*1+r9]
+ vpsrld ymm6,ymm7,10
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,edx
+ vpsrlq ymm7,ymm7,17
+ xor r14d,r13d
+ lea ebx,[rdi*1+rbx]
+ mov r12d,r10d
+ vpxor ymm6,ymm6,ymm7
+ add eax,DWORD[((44+128))+rsp]
+ and r12d,r9d
+ rorx r13d,r9d,25
+ vpsrlq ymm7,ymm7,2
+ rorx edi,r9d,11
+ lea ebx,[r14*1+rbx]
+ lea eax,[r12*1+rax]
+ vpxor ymm6,ymm6,ymm7
+ andn r12d,r9d,r11d
+ xor r13d,edi
+ rorx r14d,r9d,6
+ vpshufb ymm6,ymm6,ymm9
+ lea eax,[r12*1+rax]
+ xor r13d,r14d
+ mov edi,ebx
+ vpaddd ymm1,ymm1,ymm6
+ rorx r12d,ebx,22
+ lea eax,[r13*1+rax]
+ xor edi,ecx
+ vpaddd ymm6,ymm1,YMMWORD[32+rbp]
+ rorx r14d,ebx,13
+ rorx r13d,ebx,2
+ lea r8d,[rax*1+r8]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,ecx
+ xor r14d,r13d
+ lea eax,[r15*1+rax]
+ mov r12d,r9d
+ vmovdqa YMMWORD[32+rsp],ymm6
+ lea rsp,[((-64))+rsp]
+ vpalignr ymm4,ymm3,ymm2,4
+ add r11d,DWORD[((0+128))+rsp]
+ and r12d,r8d
+ rorx r13d,r8d,25
+ vpalignr ymm7,ymm1,ymm0,4
+ rorx r15d,r8d,11
+ lea eax,[r14*1+rax]
+ lea r11d,[r12*1+r11]
+ vpsrld ymm6,ymm4,7
+ andn r12d,r8d,r10d
+ xor r13d,r15d
+ rorx r14d,r8d,6
+ vpaddd ymm2,ymm2,ymm7
+ lea r11d,[r12*1+r11]
+ xor r13d,r14d
+ mov r15d,eax
+ vpsrld ymm7,ymm4,3
+ rorx r12d,eax,22
+ lea r11d,[r13*1+r11]
+ xor r15d,ebx
+ vpslld ymm5,ymm4,14
+ rorx r14d,eax,13
+ rorx r13d,eax,2
+ lea edx,[r11*1+rdx]
+ vpxor ymm4,ymm7,ymm6
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,ebx
+ vpshufd ymm7,ymm1,250
+ xor r14d,r13d
+ lea r11d,[rdi*1+r11]
+ mov r12d,r8d
+ vpsrld ymm6,ymm6,11
+ add r10d,DWORD[((4+128))+rsp]
+ and r12d,edx
+ rorx r13d,edx,25
+ vpxor ymm4,ymm4,ymm5
+ rorx edi,edx,11
+ lea r11d,[r14*1+r11]
+ lea r10d,[r12*1+r10]
+ vpslld ymm5,ymm5,11
+ andn r12d,edx,r9d
+ xor r13d,edi
+ rorx r14d,edx,6
+ vpxor ymm4,ymm4,ymm6
+ lea r10d,[r12*1+r10]
+ xor r13d,r14d
+ mov edi,r11d
+ vpsrld ymm6,ymm7,10
+ rorx r12d,r11d,22
+ lea r10d,[r13*1+r10]
+ xor edi,eax
+ vpxor ymm4,ymm4,ymm5
+ rorx r14d,r11d,13
+ rorx r13d,r11d,2
+ lea ecx,[r10*1+rcx]
+ vpsrlq ymm7,ymm7,17
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,eax
+ vpaddd ymm2,ymm2,ymm4
+ xor r14d,r13d
+ lea r10d,[r15*1+r10]
+ mov r12d,edx
+ vpxor ymm6,ymm6,ymm7
+ add r9d,DWORD[((8+128))+rsp]
+ and r12d,ecx
+ rorx r13d,ecx,25
+ vpsrlq ymm7,ymm7,2
+ rorx r15d,ecx,11
+ lea r10d,[r14*1+r10]
+ lea r9d,[r12*1+r9]
+ vpxor ymm6,ymm6,ymm7
+ andn r12d,ecx,r8d
+ xor r13d,r15d
+ rorx r14d,ecx,6
+ vpshufb ymm6,ymm6,ymm8
+ lea r9d,[r12*1+r9]
+ xor r13d,r14d
+ mov r15d,r10d
+ vpaddd ymm2,ymm2,ymm6
+ rorx r12d,r10d,22
+ lea r9d,[r13*1+r9]
+ xor r15d,r11d
+ vpshufd ymm7,ymm2,80
+ rorx r14d,r10d,13
+ rorx r13d,r10d,2
+ lea ebx,[r9*1+rbx]
+ vpsrld ymm6,ymm7,10
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r11d
+ vpsrlq ymm7,ymm7,17
+ xor r14d,r13d
+ lea r9d,[rdi*1+r9]
+ mov r12d,ecx
+ vpxor ymm6,ymm6,ymm7
+ add r8d,DWORD[((12+128))+rsp]
+ and r12d,ebx
+ rorx r13d,ebx,25
+ vpsrlq ymm7,ymm7,2
+ rorx edi,ebx,11
+ lea r9d,[r14*1+r9]
+ lea r8d,[r12*1+r8]
+ vpxor ymm6,ymm6,ymm7
+ andn r12d,ebx,edx
+ xor r13d,edi
+ rorx r14d,ebx,6
+ vpshufb ymm6,ymm6,ymm9
+ lea r8d,[r12*1+r8]
+ xor r13d,r14d
+ mov edi,r9d
+ vpaddd ymm2,ymm2,ymm6
+ rorx r12d,r9d,22
+ lea r8d,[r13*1+r8]
+ xor edi,r10d
+ vpaddd ymm6,ymm2,YMMWORD[64+rbp]
+ rorx r14d,r9d,13
+ rorx r13d,r9d,2
+ lea eax,[r8*1+rax]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r10d
+ xor r14d,r13d
+ lea r8d,[r15*1+r8]
+ mov r12d,ebx
+ vmovdqa YMMWORD[rsp],ymm6
+ vpalignr ymm4,ymm0,ymm3,4
+ add edx,DWORD[((32+128))+rsp]
+ and r12d,eax
+ rorx r13d,eax,25
+ vpalignr ymm7,ymm2,ymm1,4
+ rorx r15d,eax,11
+ lea r8d,[r14*1+r8]
+ lea edx,[r12*1+rdx]
+ vpsrld ymm6,ymm4,7
+ andn r12d,eax,ecx
+ xor r13d,r15d
+ rorx r14d,eax,6
+ vpaddd ymm3,ymm3,ymm7
+ lea edx,[r12*1+rdx]
+ xor r13d,r14d
+ mov r15d,r8d
+ vpsrld ymm7,ymm4,3
+ rorx r12d,r8d,22
+ lea edx,[r13*1+rdx]
+ xor r15d,r9d
+ vpslld ymm5,ymm4,14
+ rorx r14d,r8d,13
+ rorx r13d,r8d,2
+ lea r11d,[rdx*1+r11]
+ vpxor ymm4,ymm7,ymm6
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r9d
+ vpshufd ymm7,ymm2,250
+ xor r14d,r13d
+ lea edx,[rdi*1+rdx]
+ mov r12d,eax
+ vpsrld ymm6,ymm6,11
+ add ecx,DWORD[((36+128))+rsp]
+ and r12d,r11d
+ rorx r13d,r11d,25
+ vpxor ymm4,ymm4,ymm5
+ rorx edi,r11d,11
+ lea edx,[r14*1+rdx]
+ lea ecx,[r12*1+rcx]
+ vpslld ymm5,ymm5,11
+ andn r12d,r11d,ebx
+ xor r13d,edi
+ rorx r14d,r11d,6
+ vpxor ymm4,ymm4,ymm6
+ lea ecx,[r12*1+rcx]
+ xor r13d,r14d
+ mov edi,edx
+ vpsrld ymm6,ymm7,10
+ rorx r12d,edx,22
+ lea ecx,[r13*1+rcx]
+ xor edi,r8d
+ vpxor ymm4,ymm4,ymm5
+ rorx r14d,edx,13
+ rorx r13d,edx,2
+ lea r10d,[rcx*1+r10]
+ vpsrlq ymm7,ymm7,17
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r8d
+ vpaddd ymm3,ymm3,ymm4
+ xor r14d,r13d
+ lea ecx,[r15*1+rcx]
+ mov r12d,r11d
+ vpxor ymm6,ymm6,ymm7
+ add ebx,DWORD[((40+128))+rsp]
+ and r12d,r10d
+ rorx r13d,r10d,25
+ vpsrlq ymm7,ymm7,2
+ rorx r15d,r10d,11
+ lea ecx,[r14*1+rcx]
+ lea ebx,[r12*1+rbx]
+ vpxor ymm6,ymm6,ymm7
+ andn r12d,r10d,eax
+ xor r13d,r15d
+ rorx r14d,r10d,6
+ vpshufb ymm6,ymm6,ymm8
+ lea ebx,[r12*1+rbx]
+ xor r13d,r14d
+ mov r15d,ecx
+ vpaddd ymm3,ymm3,ymm6
+ rorx r12d,ecx,22
+ lea ebx,[r13*1+rbx]
+ xor r15d,edx
+ vpshufd ymm7,ymm3,80
+ rorx r14d,ecx,13
+ rorx r13d,ecx,2
+ lea r9d,[rbx*1+r9]
+ vpsrld ymm6,ymm7,10
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,edx
+ vpsrlq ymm7,ymm7,17
+ xor r14d,r13d
+ lea ebx,[rdi*1+rbx]
+ mov r12d,r10d
+ vpxor ymm6,ymm6,ymm7
+ add eax,DWORD[((44+128))+rsp]
+ and r12d,r9d
+ rorx r13d,r9d,25
+ vpsrlq ymm7,ymm7,2
+ rorx edi,r9d,11
+ lea ebx,[r14*1+rbx]
+ lea eax,[r12*1+rax]
+ vpxor ymm6,ymm6,ymm7
+ andn r12d,r9d,r11d
+ xor r13d,edi
+ rorx r14d,r9d,6
+ vpshufb ymm6,ymm6,ymm9
+ lea eax,[r12*1+rax]
+ xor r13d,r14d
+ mov edi,ebx
+ vpaddd ymm3,ymm3,ymm6
+ rorx r12d,ebx,22
+ lea eax,[r13*1+rax]
+ xor edi,ecx
+ vpaddd ymm6,ymm3,YMMWORD[96+rbp]
+ rorx r14d,ebx,13
+ rorx r13d,ebx,2
+ lea r8d,[rax*1+r8]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,ecx
+ xor r14d,r13d
+ lea eax,[r15*1+rax]
+ mov r12d,r9d
+ vmovdqa YMMWORD[32+rsp],ymm6
+ lea rbp,[128+rbp]
+ cmp BYTE[3+rbp],0
+ jne NEAR $L$avx2_00_47
+ add r11d,DWORD[((0+64))+rsp]
+ and r12d,r8d
+ rorx r13d,r8d,25
+ rorx r15d,r8d,11
+ lea eax,[r14*1+rax]
+ lea r11d,[r12*1+r11]
+ andn r12d,r8d,r10d
+ xor r13d,r15d
+ rorx r14d,r8d,6
+ lea r11d,[r12*1+r11]
+ xor r13d,r14d
+ mov r15d,eax
+ rorx r12d,eax,22
+ lea r11d,[r13*1+r11]
+ xor r15d,ebx
+ rorx r14d,eax,13
+ rorx r13d,eax,2
+ lea edx,[r11*1+rdx]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,ebx
+ xor r14d,r13d
+ lea r11d,[rdi*1+r11]
+ mov r12d,r8d
+ add r10d,DWORD[((4+64))+rsp]
+ and r12d,edx
+ rorx r13d,edx,25
+ rorx edi,edx,11
+ lea r11d,[r14*1+r11]
+ lea r10d,[r12*1+r10]
+ andn r12d,edx,r9d
+ xor r13d,edi
+ rorx r14d,edx,6
+ lea r10d,[r12*1+r10]
+ xor r13d,r14d
+ mov edi,r11d
+ rorx r12d,r11d,22
+ lea r10d,[r13*1+r10]
+ xor edi,eax
+ rorx r14d,r11d,13
+ rorx r13d,r11d,2
+ lea ecx,[r10*1+rcx]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,eax
+ xor r14d,r13d
+ lea r10d,[r15*1+r10]
+ mov r12d,edx
+ add r9d,DWORD[((8+64))+rsp]
+ and r12d,ecx
+ rorx r13d,ecx,25
+ rorx r15d,ecx,11
+ lea r10d,[r14*1+r10]
+ lea r9d,[r12*1+r9]
+ andn r12d,ecx,r8d
+ xor r13d,r15d
+ rorx r14d,ecx,6
+ lea r9d,[r12*1+r9]
+ xor r13d,r14d
+ mov r15d,r10d
+ rorx r12d,r10d,22
+ lea r9d,[r13*1+r9]
+ xor r15d,r11d
+ rorx r14d,r10d,13
+ rorx r13d,r10d,2
+ lea ebx,[r9*1+rbx]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r11d
+ xor r14d,r13d
+ lea r9d,[rdi*1+r9]
+ mov r12d,ecx
+ add r8d,DWORD[((12+64))+rsp]
+ and r12d,ebx
+ rorx r13d,ebx,25
+ rorx edi,ebx,11
+ lea r9d,[r14*1+r9]
+ lea r8d,[r12*1+r8]
+ andn r12d,ebx,edx
+ xor r13d,edi
+ rorx r14d,ebx,6
+ lea r8d,[r12*1+r8]
+ xor r13d,r14d
+ mov edi,r9d
+ rorx r12d,r9d,22
+ lea r8d,[r13*1+r8]
+ xor edi,r10d
+ rorx r14d,r9d,13
+ rorx r13d,r9d,2
+ lea eax,[r8*1+rax]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r10d
+ xor r14d,r13d
+ lea r8d,[r15*1+r8]
+ mov r12d,ebx
+ add edx,DWORD[((32+64))+rsp]
+ and r12d,eax
+ rorx r13d,eax,25
+ rorx r15d,eax,11
+ lea r8d,[r14*1+r8]
+ lea edx,[r12*1+rdx]
+ andn r12d,eax,ecx
+ xor r13d,r15d
+ rorx r14d,eax,6
+ lea edx,[r12*1+rdx]
+ xor r13d,r14d
+ mov r15d,r8d
+ rorx r12d,r8d,22
+ lea edx,[r13*1+rdx]
+ xor r15d,r9d
+ rorx r14d,r8d,13
+ rorx r13d,r8d,2
+ lea r11d,[rdx*1+r11]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r9d
+ xor r14d,r13d
+ lea edx,[rdi*1+rdx]
+ mov r12d,eax
+ add ecx,DWORD[((36+64))+rsp]
+ and r12d,r11d
+ rorx r13d,r11d,25
+ rorx edi,r11d,11
+ lea edx,[r14*1+rdx]
+ lea ecx,[r12*1+rcx]
+ andn r12d,r11d,ebx
+ xor r13d,edi
+ rorx r14d,r11d,6
+ lea ecx,[r12*1+rcx]
+ xor r13d,r14d
+ mov edi,edx
+ rorx r12d,edx,22
+ lea ecx,[r13*1+rcx]
+ xor edi,r8d
+ rorx r14d,edx,13
+ rorx r13d,edx,2
+ lea r10d,[rcx*1+r10]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r8d
+ xor r14d,r13d
+ lea ecx,[r15*1+rcx]
+ mov r12d,r11d
+ add ebx,DWORD[((40+64))+rsp]
+ and r12d,r10d
+ rorx r13d,r10d,25
+ rorx r15d,r10d,11
+ lea ecx,[r14*1+rcx]
+ lea ebx,[r12*1+rbx]
+ andn r12d,r10d,eax
+ xor r13d,r15d
+ rorx r14d,r10d,6
+ lea ebx,[r12*1+rbx]
+ xor r13d,r14d
+ mov r15d,ecx
+ rorx r12d,ecx,22
+ lea ebx,[r13*1+rbx]
+ xor r15d,edx
+ rorx r14d,ecx,13
+ rorx r13d,ecx,2
+ lea r9d,[rbx*1+r9]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,edx
+ xor r14d,r13d
+ lea ebx,[rdi*1+rbx]
+ mov r12d,r10d
+ add eax,DWORD[((44+64))+rsp]
+ and r12d,r9d
+ rorx r13d,r9d,25
+ rorx edi,r9d,11
+ lea ebx,[r14*1+rbx]
+ lea eax,[r12*1+rax]
+ andn r12d,r9d,r11d
+ xor r13d,edi
+ rorx r14d,r9d,6
+ lea eax,[r12*1+rax]
+ xor r13d,r14d
+ mov edi,ebx
+ rorx r12d,ebx,22
+ lea eax,[r13*1+rax]
+ xor edi,ecx
+ rorx r14d,ebx,13
+ rorx r13d,ebx,2
+ lea r8d,[rax*1+r8]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,ecx
+ xor r14d,r13d
+ lea eax,[r15*1+rax]
+ mov r12d,r9d
+ add r11d,DWORD[rsp]
+ and r12d,r8d
+ rorx r13d,r8d,25
+ rorx r15d,r8d,11
+ lea eax,[r14*1+rax]
+ lea r11d,[r12*1+r11]
+ andn r12d,r8d,r10d
+ xor r13d,r15d
+ rorx r14d,r8d,6
+ lea r11d,[r12*1+r11]
+ xor r13d,r14d
+ mov r15d,eax
+ rorx r12d,eax,22
+ lea r11d,[r13*1+r11]
+ xor r15d,ebx
+ rorx r14d,eax,13
+ rorx r13d,eax,2
+ lea edx,[r11*1+rdx]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,ebx
+ xor r14d,r13d
+ lea r11d,[rdi*1+r11]
+ mov r12d,r8d
+ add r10d,DWORD[4+rsp]
+ and r12d,edx
+ rorx r13d,edx,25
+ rorx edi,edx,11
+ lea r11d,[r14*1+r11]
+ lea r10d,[r12*1+r10]
+ andn r12d,edx,r9d
+ xor r13d,edi
+ rorx r14d,edx,6
+ lea r10d,[r12*1+r10]
+ xor r13d,r14d
+ mov edi,r11d
+ rorx r12d,r11d,22
+ lea r10d,[r13*1+r10]
+ xor edi,eax
+ rorx r14d,r11d,13
+ rorx r13d,r11d,2
+ lea ecx,[r10*1+rcx]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,eax
+ xor r14d,r13d
+ lea r10d,[r15*1+r10]
+ mov r12d,edx
+ add r9d,DWORD[8+rsp]
+ and r12d,ecx
+ rorx r13d,ecx,25
+ rorx r15d,ecx,11
+ lea r10d,[r14*1+r10]
+ lea r9d,[r12*1+r9]
+ andn r12d,ecx,r8d
+ xor r13d,r15d
+ rorx r14d,ecx,6
+ lea r9d,[r12*1+r9]
+ xor r13d,r14d
+ mov r15d,r10d
+ rorx r12d,r10d,22
+ lea r9d,[r13*1+r9]
+ xor r15d,r11d
+ rorx r14d,r10d,13
+ rorx r13d,r10d,2
+ lea ebx,[r9*1+rbx]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r11d
+ xor r14d,r13d
+ lea r9d,[rdi*1+r9]
+ mov r12d,ecx
+ add r8d,DWORD[12+rsp]
+ and r12d,ebx
+ rorx r13d,ebx,25
+ rorx edi,ebx,11
+ lea r9d,[r14*1+r9]
+ lea r8d,[r12*1+r8]
+ andn r12d,ebx,edx
+ xor r13d,edi
+ rorx r14d,ebx,6
+ lea r8d,[r12*1+r8]
+ xor r13d,r14d
+ mov edi,r9d
+ rorx r12d,r9d,22
+ lea r8d,[r13*1+r8]
+ xor edi,r10d
+ rorx r14d,r9d,13
+ rorx r13d,r9d,2
+ lea eax,[r8*1+rax]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r10d
+ xor r14d,r13d
+ lea r8d,[r15*1+r8]
+ mov r12d,ebx
+ add edx,DWORD[32+rsp]
+ and r12d,eax
+ rorx r13d,eax,25
+ rorx r15d,eax,11
+ lea r8d,[r14*1+r8]
+ lea edx,[r12*1+rdx]
+ andn r12d,eax,ecx
+ xor r13d,r15d
+ rorx r14d,eax,6
+ lea edx,[r12*1+rdx]
+ xor r13d,r14d
+ mov r15d,r8d
+ rorx r12d,r8d,22
+ lea edx,[r13*1+rdx]
+ xor r15d,r9d
+ rorx r14d,r8d,13
+ rorx r13d,r8d,2
+ lea r11d,[rdx*1+r11]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r9d
+ xor r14d,r13d
+ lea edx,[rdi*1+rdx]
+ mov r12d,eax
+ add ecx,DWORD[36+rsp]
+ and r12d,r11d
+ rorx r13d,r11d,25
+ rorx edi,r11d,11
+ lea edx,[r14*1+rdx]
+ lea ecx,[r12*1+rcx]
+ andn r12d,r11d,ebx
+ xor r13d,edi
+ rorx r14d,r11d,6
+ lea ecx,[r12*1+rcx]
+ xor r13d,r14d
+ mov edi,edx
+ rorx r12d,edx,22
+ lea ecx,[r13*1+rcx]
+ xor edi,r8d
+ rorx r14d,edx,13
+ rorx r13d,edx,2
+ lea r10d,[rcx*1+r10]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r8d
+ xor r14d,r13d
+ lea ecx,[r15*1+rcx]
+ mov r12d,r11d
+ add ebx,DWORD[40+rsp]
+ and r12d,r10d
+ rorx r13d,r10d,25
+ rorx r15d,r10d,11
+ lea ecx,[r14*1+rcx]
+ lea ebx,[r12*1+rbx]
+ andn r12d,r10d,eax
+ xor r13d,r15d
+ rorx r14d,r10d,6
+ lea ebx,[r12*1+rbx]
+ xor r13d,r14d
+ mov r15d,ecx
+ rorx r12d,ecx,22
+ lea ebx,[r13*1+rbx]
+ xor r15d,edx
+ rorx r14d,ecx,13
+ rorx r13d,ecx,2
+ lea r9d,[rbx*1+r9]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,edx
+ xor r14d,r13d
+ lea ebx,[rdi*1+rbx]
+ mov r12d,r10d
+ add eax,DWORD[44+rsp]
+ and r12d,r9d
+ rorx r13d,r9d,25
+ rorx edi,r9d,11
+ lea ebx,[r14*1+rbx]
+ lea eax,[r12*1+rax]
+ andn r12d,r9d,r11d
+ xor r13d,edi
+ rorx r14d,r9d,6
+ lea eax,[r12*1+rax]
+ xor r13d,r14d
+ mov edi,ebx
+ rorx r12d,ebx,22
+ lea eax,[r13*1+rax]
+ xor edi,ecx
+ rorx r14d,ebx,13
+ rorx r13d,ebx,2
+ lea r8d,[rax*1+r8]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,ecx
+ xor r14d,r13d
+ lea eax,[r15*1+rax]
+ mov r12d,r9d
+ mov rdi,QWORD[512+rsp]
+ add eax,r14d
+
+ lea rbp,[448+rsp]
+
+ add eax,DWORD[rdi]
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ add r10d,DWORD[24+rdi]
+ add r11d,DWORD[28+rdi]
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+
+ cmp rsi,QWORD[80+rbp]
+ je NEAR $L$done_avx2
+
+ xor r14d,r14d
+ mov edi,ebx
+ xor edi,ecx
+ mov r12d,r9d
+ jmp NEAR $L$ower_avx2
+ALIGN 16
+$L$ower_avx2:
+ add r11d,DWORD[((0+16))+rbp]
+ and r12d,r8d
+ rorx r13d,r8d,25
+ rorx r15d,r8d,11
+ lea eax,[r14*1+rax]
+ lea r11d,[r12*1+r11]
+ andn r12d,r8d,r10d
+ xor r13d,r15d
+ rorx r14d,r8d,6
+ lea r11d,[r12*1+r11]
+ xor r13d,r14d
+ mov r15d,eax
+ rorx r12d,eax,22
+ lea r11d,[r13*1+r11]
+ xor r15d,ebx
+ rorx r14d,eax,13
+ rorx r13d,eax,2
+ lea edx,[r11*1+rdx]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,ebx
+ xor r14d,r13d
+ lea r11d,[rdi*1+r11]
+ mov r12d,r8d
+ add r10d,DWORD[((4+16))+rbp]
+ and r12d,edx
+ rorx r13d,edx,25
+ rorx edi,edx,11
+ lea r11d,[r14*1+r11]
+ lea r10d,[r12*1+r10]
+ andn r12d,edx,r9d
+ xor r13d,edi
+ rorx r14d,edx,6
+ lea r10d,[r12*1+r10]
+ xor r13d,r14d
+ mov edi,r11d
+ rorx r12d,r11d,22
+ lea r10d,[r13*1+r10]
+ xor edi,eax
+ rorx r14d,r11d,13
+ rorx r13d,r11d,2
+ lea ecx,[r10*1+rcx]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,eax
+ xor r14d,r13d
+ lea r10d,[r15*1+r10]
+ mov r12d,edx
+ add r9d,DWORD[((8+16))+rbp]
+ and r12d,ecx
+ rorx r13d,ecx,25
+ rorx r15d,ecx,11
+ lea r10d,[r14*1+r10]
+ lea r9d,[r12*1+r9]
+ andn r12d,ecx,r8d
+ xor r13d,r15d
+ rorx r14d,ecx,6
+ lea r9d,[r12*1+r9]
+ xor r13d,r14d
+ mov r15d,r10d
+ rorx r12d,r10d,22
+ lea r9d,[r13*1+r9]
+ xor r15d,r11d
+ rorx r14d,r10d,13
+ rorx r13d,r10d,2
+ lea ebx,[r9*1+rbx]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r11d
+ xor r14d,r13d
+ lea r9d,[rdi*1+r9]
+ mov r12d,ecx
+ add r8d,DWORD[((12+16))+rbp]
+ and r12d,ebx
+ rorx r13d,ebx,25
+ rorx edi,ebx,11
+ lea r9d,[r14*1+r9]
+ lea r8d,[r12*1+r8]
+ andn r12d,ebx,edx
+ xor r13d,edi
+ rorx r14d,ebx,6
+ lea r8d,[r12*1+r8]
+ xor r13d,r14d
+ mov edi,r9d
+ rorx r12d,r9d,22
+ lea r8d,[r13*1+r8]
+ xor edi,r10d
+ rorx r14d,r9d,13
+ rorx r13d,r9d,2
+ lea eax,[r8*1+rax]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r10d
+ xor r14d,r13d
+ lea r8d,[r15*1+r8]
+ mov r12d,ebx
+ add edx,DWORD[((32+16))+rbp]
+ and r12d,eax
+ rorx r13d,eax,25
+ rorx r15d,eax,11
+ lea r8d,[r14*1+r8]
+ lea edx,[r12*1+rdx]
+ andn r12d,eax,ecx
+ xor r13d,r15d
+ rorx r14d,eax,6
+ lea edx,[r12*1+rdx]
+ xor r13d,r14d
+ mov r15d,r8d
+ rorx r12d,r8d,22
+ lea edx,[r13*1+rdx]
+ xor r15d,r9d
+ rorx r14d,r8d,13
+ rorx r13d,r8d,2
+ lea r11d,[rdx*1+r11]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,r9d
+ xor r14d,r13d
+ lea edx,[rdi*1+rdx]
+ mov r12d,eax
+ add ecx,DWORD[((36+16))+rbp]
+ and r12d,r11d
+ rorx r13d,r11d,25
+ rorx edi,r11d,11
+ lea edx,[r14*1+rdx]
+ lea ecx,[r12*1+rcx]
+ andn r12d,r11d,ebx
+ xor r13d,edi
+ rorx r14d,r11d,6
+ lea ecx,[r12*1+rcx]
+ xor r13d,r14d
+ mov edi,edx
+ rorx r12d,edx,22
+ lea ecx,[r13*1+rcx]
+ xor edi,r8d
+ rorx r14d,edx,13
+ rorx r13d,edx,2
+ lea r10d,[rcx*1+r10]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,r8d
+ xor r14d,r13d
+ lea ecx,[r15*1+rcx]
+ mov r12d,r11d
+ add ebx,DWORD[((40+16))+rbp]
+ and r12d,r10d
+ rorx r13d,r10d,25
+ rorx r15d,r10d,11
+ lea ecx,[r14*1+rcx]
+ lea ebx,[r12*1+rbx]
+ andn r12d,r10d,eax
+ xor r13d,r15d
+ rorx r14d,r10d,6
+ lea ebx,[r12*1+rbx]
+ xor r13d,r14d
+ mov r15d,ecx
+ rorx r12d,ecx,22
+ lea ebx,[r13*1+rbx]
+ xor r15d,edx
+ rorx r14d,ecx,13
+ rorx r13d,ecx,2
+ lea r9d,[rbx*1+r9]
+ and edi,r15d
+ xor r14d,r12d
+ xor edi,edx
+ xor r14d,r13d
+ lea ebx,[rdi*1+rbx]
+ mov r12d,r10d
+ add eax,DWORD[((44+16))+rbp]
+ and r12d,r9d
+ rorx r13d,r9d,25
+ rorx edi,r9d,11
+ lea ebx,[r14*1+rbx]
+ lea eax,[r12*1+rax]
+ andn r12d,r9d,r11d
+ xor r13d,edi
+ rorx r14d,r9d,6
+ lea eax,[r12*1+rax]
+ xor r13d,r14d
+ mov edi,ebx
+ rorx r12d,ebx,22
+ lea eax,[r13*1+rax]
+ xor edi,ecx
+ rorx r14d,ebx,13
+ rorx r13d,ebx,2
+ lea r8d,[rax*1+r8]
+ and r15d,edi
+ xor r14d,r12d
+ xor r15d,ecx
+ xor r14d,r13d
+ lea eax,[r15*1+rax]
+ mov r12d,r9d
+ lea rbp,[((-64))+rbp]
+ cmp rbp,rsp
+ jae NEAR $L$ower_avx2
+
+ mov rdi,QWORD[512+rsp]
+ add eax,r14d
+
+ lea rsp,[448+rsp]
+
+
+
+ add eax,DWORD[rdi]
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ lea rsi,[128+rsi]
+ add r10d,DWORD[24+rdi]
+ mov r12,rsi
+ add r11d,DWORD[28+rdi]
+ cmp rsi,QWORD[((64+16))+rsp]
+
+ mov DWORD[rdi],eax
+ cmove r12,rsp
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+
+ jbe NEAR $L$oop_avx2
+ lea rbp,[rsp]
+
+
+
+
+$L$done_avx2:
+ mov rsi,QWORD[88+rbp]
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((64+32))+rbp]
+ movaps xmm7,XMMWORD[((64+48))+rbp]
+ movaps xmm8,XMMWORD[((64+64))+rbp]
+ movaps xmm9,XMMWORD[((64+80))+rbp]
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue_avx2:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha256_block_data_order_avx2:
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+; se_handler -- Win64 language-specific exception handler referenced from the
+; .xdata records of the scalar, SSSE3, AVX and AVX2 sha256_block_data_order
+; routines. It rebuilds the caller's non-volatile registers in the CONTEXT
+; record and then lets RtlVirtualUnwind continue the unwind.
+; In:  r8 = CONTEXT record, r9 = DISPATCHER_CONTEXT (rcx/rdx are not read)
+; Out: eax = 1 (ExceptionContinueSearch)
+; Field names below follow the documented x64 CONTEXT / DISPATCHER_CONTEXT
+; layouts; the numeric offsets are what the code actually uses.
+se_handler:
+ push rsi ;preserve all non-volatile GPRs and flags; the body uses them as scratch
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ;shadow space plus four stack arguments for RtlVirtualUnwind
+
+ mov rax,QWORD[120+r8] ;context->Rax
+ mov rbx,QWORD[248+r8] ;context->Rip
+
+ mov rsi,QWORD[8+r9] ;disp->ImageBase
+ mov r11,QWORD[56+r9] ;disp->HandlerData (pair of image-relative labels from .xdata)
+
+ mov r10d,DWORD[r11] ;HandlerData[0]: prologue-end label
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue ;Rip before the prologue label: frame not built yet, unwind via Rax
+
+ mov rax,QWORD[152+r8] ;context->Rsp
+
+ mov r10d,DWORD[4+r11] ;HandlerData[1]: epilogue label
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue ;Rip at/after the epilogue label: registers already restored
+ lea r10,[$L$avx2_shortcut]
+ cmp rbx,r10
+ jb NEAR $L$not_in_avx2
+
+ and rax,-256*4 ;AVX2 body moves rsp inside its aligned frame: round down ...
+ add rax,448 ;... and step to the frame base that holds the saved values
+$L$not_in_avx2:
+ mov rsi,rax ;keep the frame base for the XMM save-area copy below
+ mov rax,QWORD[((64+24))+rax] ;saved original stack pointer
+
+ mov rbx,QWORD[((-8))+rax] ;reload the pushed non-volatile registers (rbx is reused here)
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx ;context->Rbx
+ mov QWORD[160+r8],rbp ;context->Rbp
+ mov QWORD[216+r8],r12 ;context->R12
+ mov QWORD[224+r8],r13 ;context->R13
+ mov QWORD[232+r8],r14 ;context->R14
+ mov QWORD[240+r8],r15 ;context->R15
+
+ lea r10,[$L$epilogue]
+ cmp QWORD[248+r8],r10 ;test the faulting Rip itself: rbx no longer holds it after the reload above
+ jb NEAR $L$in_prologue ;Rip below $L$epilogue: skip the XMM save-area copy
+
+ lea rsi,[((64+32))+rsi] ;XMM6.. save area in the frame
+ lea rdi,[512+r8] ;&context->Xmm6
+ mov ecx,8 ;8 qwords = xmm6..xmm9
+ DD 0xa548f3fc ;cld; rep movsq
+
+$L$in_prologue:
+ mov rdi,QWORD[8+rax] ;rdi/rsi were parked at 8/16 above the entry rsp by the WIN64 prologue
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax ;context->Rsp
+ mov QWORD[168+r8],rsi ;context->Rsi
+ mov QWORD[176+r8],rdi ;context->Rdi
+
+ mov rdi,QWORD[40+r9] ;disp->ContextRecord
+ mov rsi,r8 ;source: the CONTEXT we just patched
+ mov ecx,154 ;154 qwords = 1232 bytes, the size of an x64 CONTEXT
+ DD 0xa548f3fc ;cld; rep movsq
+
+ mov rsi,r9 ;rsi = DISPATCHER_CONTEXT (r8/r9 are about to become arguments)
+ xor rcx,rcx ;arg1: HandlerType = 0
+ mov rdx,QWORD[8+rsi] ;arg2: disp->ImageBase
+ mov r8,QWORD[rsi] ;arg3: disp->ControlPc
+ mov r9,QWORD[16+rsi] ;arg4: disp->FunctionEntry
+ mov r10,QWORD[40+rsi] ;disp->ContextRecord
+ lea r11,[56+rsi] ;&disp->HandlerData
+ lea r12,[24+rsi] ;&disp->EstablisherFrame
+ mov QWORD[32+rsp],r10 ;arg5
+ mov QWORD[40+rsp],r11 ;arg6
+ mov QWORD[48+rsp],r12 ;arg7
+ mov QWORD[56+rsp],rcx ;arg8: NULL
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1 ;ExceptionContinueSearch
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ DB 0F3h,0C3h ;repret
+
+
+ALIGN 16
+; shaext_handler -- exception handler named by the .xdata record of the
+; SHA-extension routine. When the faulting Rip lies between
+; $L$prologue_shaext and $L$epilogue_shaext it copies 10 qwords from 88 bytes
+; below context->Rax into the CONTEXT at offset 512, then finishes through
+; the tail shared with se_handler ($L$in_prologue), which also pops
+; everything pushed here.
+; In: r8 = CONTEXT record, r9 = DISPATCHER_CONTEXT (consumed by the shared tail)
+shaext_handler:
+ push rsi ;push order must mirror the pops in the shared tail
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rbx,QWORD[248+r8] ;faulting Rip
+ mov rax,QWORD[120+r8] ;context->Rax, used as the frame anchor by the shared tail
+
+ lea r10,[$L$prologue_shaext]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue ;before the prologue label: nothing to copy
+
+ lea r10,[$L$epilogue_shaext]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue ;at or past the epilogue label: nothing to copy
+
+ mov ecx,10 ;10 qwords = five 16-byte slots
+ lea rdi,[512+r8] ;destination inside the CONTEXT record
+ lea rsi,[rax-88] ;source save area, i.e. -8-80 from rax
+ cld
+ rep movsq ;same bytes as DD 0xa548f3fc
+
+ jmp NEAR $L$in_prologue
+
+section .pdata rdata align=4 ;Win64 function table: one {begin, end, unwind-info} image-relative triple per routine
+ALIGN 4
+ DD $L$SEH_begin_sha256_block_data_order wrt ..imagebase ;entry for sha256_block_data_order
+ DD $L$SEH_end_sha256_block_data_order wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_shaext wrt ..imagebase ;entry for the _shaext variant
+ DD $L$SEH_end_sha256_block_data_order_shaext wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_shaext wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase ;entry for the _ssse3 variant
+ DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase ;entry for the _avx variant
+ DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_avx2 wrt ..imagebase ;entry for the _avx2 variant
+ DD $L$SEH_end_sha256_block_data_order_avx2 wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_avx2 wrt ..imagebase
+section .xdata rdata align=8 ;unwind-info records referenced by the third field of each .pdata triple
+ALIGN 8
+$L$SEH_info_sha256_block_data_order:
+DB 9,0,0,0 ;UNWIND_INFO header: version 1 with the exception-handler flag, no prolog bytes, no unwind codes
+ DD se_handler wrt ..imagebase ;language-specific handler
+ DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase ;handler data: se_handler reads these as HandlerData[0] and HandlerData[1]
+$L$SEH_info_sha256_block_data_order_shaext:
+DB 9,0,0,0 ;same header as above
+ DD shaext_handler wrt ..imagebase ;no handler data follows: shaext_handler compares against its labels directly
+$L$SEH_info_sha256_block_data_order_ssse3:
+DB 9,0,0,0 ;same header as above
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase ;prologue/epilogue bounds handed to se_handler
+$L$SEH_info_sha256_block_data_order_avx:
+DB 9,0,0,0 ;same header as above
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase ;prologue/epilogue bounds handed to se_handler
+$L$SEH_info_sha256_block_data_order_avx2:
+DB 9,0,0,0 ;same header as above
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase ;prologue/epilogue bounds handed to se_handler
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm
new file mode 100644
index 0000000..5ddba53
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm
@@ -0,0 +1,5665 @@
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section .text code align=64
+
+
+EXTERN OPENSSL_ia32cap_P
+global sha512_block_data_order
+
+ALIGN 16
+sha512_block_data_order:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+ lea r11,[OPENSSL_ia32cap_P]
+ mov r9d,DWORD[r11]
+ mov r10d,DWORD[4+r11]
+ mov r11d,DWORD[8+r11]
+ test r10d,2048
+ jnz NEAR $L$xop_shortcut
+ and r11d,296
+ cmp r11d,296
+ je NEAR $L$avx2_shortcut
+ and r9d,1073741824
+ and r10d,268435968
+ or r10d,r9d
+ cmp r10d,1342177792
+ je NEAR $L$avx_shortcut
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4
+ sub rsp,16*8+4*8
+ lea rdx,[rdx*8+rsi]
+ and rsp,-64
+ mov QWORD[((128+0))+rsp],rdi
+ mov QWORD[((128+8))+rsp],rsi
+ mov QWORD[((128+16))+rsp],rdx
+ mov QWORD[152+rsp],rax
+
+$L$prologue:
+
+ mov rax,QWORD[rdi]
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop
+
+ALIGN 16
+$L$loop:
+ mov rdi,rbx
+ lea rbp,[K512]
+ xor rdi,rcx
+ mov r12,QWORD[rsi]
+ mov r13,r8
+ mov r14,rax
+ bswap r12
+ ror r13,23
+ mov r15,r9
+
+ xor r13,r8
+ ror r14,5
+ xor r15,r10
+
+ mov QWORD[rsp],r12
+ xor r14,rax
+ and r15,r8
+
+ ror r13,4
+ add r12,r11
+ xor r15,r10
+
+ ror r14,6
+ xor r13,r8
+ add r12,r15
+
+ mov r15,rax
+ add r12,QWORD[rbp]
+ xor r14,rax
+
+ xor r15,rbx
+ ror r13,14
+ mov r11,rbx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r11,rdi
+ add rdx,r12
+ add r11,r12
+
+ lea rbp,[8+rbp]
+ add r11,r14
+ mov r12,QWORD[8+rsi]
+ mov r13,rdx
+ mov r14,r11
+ bswap r12
+ ror r13,23
+ mov rdi,r8
+
+ xor r13,rdx
+ ror r14,5
+ xor rdi,r9
+
+ mov QWORD[8+rsp],r12
+ xor r14,r11
+ and rdi,rdx
+
+ ror r13,4
+ add r12,r10
+ xor rdi,r9
+
+ ror r14,6
+ xor r13,rdx
+ add r12,rdi
+
+ mov rdi,r11
+ add r12,QWORD[rbp]
+ xor r14,r11
+
+ xor rdi,rax
+ ror r13,14
+ mov r10,rax
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r10,r15
+ add rcx,r12
+ add r10,r12
+
+ lea rbp,[24+rbp]
+ add r10,r14
+ mov r12,QWORD[16+rsi]
+ mov r13,rcx
+ mov r14,r10
+ bswap r12
+ ror r13,23
+ mov r15,rdx
+
+ xor r13,rcx
+ ror r14,5
+ xor r15,r8
+
+ mov QWORD[16+rsp],r12
+ xor r14,r10
+ and r15,rcx
+
+ ror r13,4
+ add r12,r9
+ xor r15,r8
+
+ ror r14,6
+ xor r13,rcx
+ add r12,r15
+
+ mov r15,r10
+ add r12,QWORD[rbp]
+ xor r14,r10
+
+ xor r15,r11
+ ror r13,14
+ mov r9,r11
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r9,rdi
+ add rbx,r12
+ add r9,r12
+
+ lea rbp,[8+rbp]
+ add r9,r14
+ mov r12,QWORD[24+rsi]
+ mov r13,rbx
+ mov r14,r9
+ bswap r12
+ ror r13,23
+ mov rdi,rcx
+
+ xor r13,rbx
+ ror r14,5
+ xor rdi,rdx
+
+ mov QWORD[24+rsp],r12
+ xor r14,r9
+ and rdi,rbx
+
+ ror r13,4
+ add r12,r8
+ xor rdi,rdx
+
+ ror r14,6
+ xor r13,rbx
+ add r12,rdi
+
+ mov rdi,r9
+ add r12,QWORD[rbp]
+ xor r14,r9
+
+ xor rdi,r10
+ ror r13,14
+ mov r8,r10
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r8,r15
+ add rax,r12
+ add r8,r12
+
+ lea rbp,[24+rbp]
+ add r8,r14
+ mov r12,QWORD[32+rsi]
+ mov r13,rax
+ mov r14,r8
+ bswap r12
+ ror r13,23
+ mov r15,rbx
+
+ xor r13,rax
+ ror r14,5
+ xor r15,rcx
+
+ mov QWORD[32+rsp],r12
+ xor r14,r8
+ and r15,rax
+
+ ror r13,4
+ add r12,rdx
+ xor r15,rcx
+
+ ror r14,6
+ xor r13,rax
+ add r12,r15
+
+ mov r15,r8
+ add r12,QWORD[rbp]
+ xor r14,r8
+
+ xor r15,r9
+ ror r13,14
+ mov rdx,r9
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rdx,rdi
+ add r11,r12
+ add rdx,r12
+
+ lea rbp,[8+rbp]
+ add rdx,r14
+ mov r12,QWORD[40+rsi]
+ mov r13,r11
+ mov r14,rdx
+ bswap r12
+ ror r13,23
+ mov rdi,rax
+
+ xor r13,r11
+ ror r14,5
+ xor rdi,rbx
+
+ mov QWORD[40+rsp],r12
+ xor r14,rdx
+ and rdi,r11
+
+ ror r13,4
+ add r12,rcx
+ xor rdi,rbx
+
+ ror r14,6
+ xor r13,r11
+ add r12,rdi
+
+ mov rdi,rdx
+ add r12,QWORD[rbp]
+ xor r14,rdx
+
+ xor rdi,r8
+ ror r13,14
+ mov rcx,r8
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rcx,r15
+ add r10,r12
+ add rcx,r12
+
+ lea rbp,[24+rbp]
+ add rcx,r14
+ mov r12,QWORD[48+rsi]
+ mov r13,r10
+ mov r14,rcx
+ bswap r12
+ ror r13,23
+ mov r15,r11
+
+ xor r13,r10
+ ror r14,5
+ xor r15,rax
+
+ mov QWORD[48+rsp],r12
+ xor r14,rcx
+ and r15,r10
+
+ ror r13,4
+ add r12,rbx
+ xor r15,rax
+
+ ror r14,6
+ xor r13,r10
+ add r12,r15
+
+ mov r15,rcx
+ add r12,QWORD[rbp]
+ xor r14,rcx
+
+ xor r15,rdx
+ ror r13,14
+ mov rbx,rdx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rbx,rdi
+ add r9,r12
+ add rbx,r12
+
+ lea rbp,[8+rbp]
+ add rbx,r14
+ mov r12,QWORD[56+rsi]
+ mov r13,r9
+ mov r14,rbx
+ bswap r12
+ ror r13,23
+ mov rdi,r10
+
+ xor r13,r9
+ ror r14,5
+ xor rdi,r11
+
+ mov QWORD[56+rsp],r12
+ xor r14,rbx
+ and rdi,r9
+
+ ror r13,4
+ add r12,rax
+ xor rdi,r11
+
+ ror r14,6
+ xor r13,r9
+ add r12,rdi
+
+ mov rdi,rbx
+ add r12,QWORD[rbp]
+ xor r14,rbx
+
+ xor rdi,rcx
+ ror r13,14
+ mov rax,rcx
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rax,r15
+ add r8,r12
+ add rax,r12
+
+ lea rbp,[24+rbp]
+ add rax,r14
+ mov r12,QWORD[64+rsi]
+ mov r13,r8
+ mov r14,rax
+ bswap r12
+ ror r13,23
+ mov r15,r9
+
+ xor r13,r8
+ ror r14,5
+ xor r15,r10
+
+ mov QWORD[64+rsp],r12
+ xor r14,rax
+ and r15,r8
+
+ ror r13,4
+ add r12,r11
+ xor r15,r10
+
+ ror r14,6
+ xor r13,r8
+ add r12,r15
+
+ mov r15,rax
+ add r12,QWORD[rbp]
+ xor r14,rax
+
+ xor r15,rbx
+ ror r13,14
+ mov r11,rbx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r11,rdi
+ add rdx,r12
+ add r11,r12
+
+ lea rbp,[8+rbp]
+ add r11,r14
+ mov r12,QWORD[72+rsi]
+ mov r13,rdx
+ mov r14,r11
+ bswap r12
+ ror r13,23
+ mov rdi,r8
+
+ xor r13,rdx
+ ror r14,5
+ xor rdi,r9
+
+ mov QWORD[72+rsp],r12
+ xor r14,r11
+ and rdi,rdx
+
+ ror r13,4
+ add r12,r10
+ xor rdi,r9
+
+ ror r14,6
+ xor r13,rdx
+ add r12,rdi
+
+ mov rdi,r11
+ add r12,QWORD[rbp]
+ xor r14,r11
+
+ xor rdi,rax
+ ror r13,14
+ mov r10,rax
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r10,r15
+ add rcx,r12
+ add r10,r12
+
+ lea rbp,[24+rbp]
+ add r10,r14
+ mov r12,QWORD[80+rsi]
+ mov r13,rcx
+ mov r14,r10
+ bswap r12
+ ror r13,23
+ mov r15,rdx
+
+ xor r13,rcx
+ ror r14,5
+ xor r15,r8
+
+ mov QWORD[80+rsp],r12
+ xor r14,r10
+ and r15,rcx
+
+ ror r13,4
+ add r12,r9
+ xor r15,r8
+
+ ror r14,6
+ xor r13,rcx
+ add r12,r15
+
+ mov r15,r10
+ add r12,QWORD[rbp]
+ xor r14,r10
+
+ xor r15,r11
+ ror r13,14
+ mov r9,r11
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r9,rdi
+ add rbx,r12
+ add r9,r12
+
+ lea rbp,[8+rbp]
+ add r9,r14
+ mov r12,QWORD[88+rsi]
+ mov r13,rbx
+ mov r14,r9
+ bswap r12
+ ror r13,23
+ mov rdi,rcx
+
+ xor r13,rbx
+ ror r14,5
+ xor rdi,rdx
+
+ mov QWORD[88+rsp],r12
+ xor r14,r9
+ and rdi,rbx
+
+ ror r13,4
+ add r12,r8
+ xor rdi,rdx
+
+ ror r14,6
+ xor r13,rbx
+ add r12,rdi
+
+ mov rdi,r9
+ add r12,QWORD[rbp]
+ xor r14,r9
+
+ xor rdi,r10
+ ror r13,14
+ mov r8,r10
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r8,r15
+ add rax,r12
+ add r8,r12
+
+ lea rbp,[24+rbp]
+ add r8,r14
+ mov r12,QWORD[96+rsi]
+ mov r13,rax
+ mov r14,r8
+ bswap r12
+ ror r13,23
+ mov r15,rbx
+
+ xor r13,rax
+ ror r14,5
+ xor r15,rcx
+
+ mov QWORD[96+rsp],r12
+ xor r14,r8
+ and r15,rax
+
+ ror r13,4
+ add r12,rdx
+ xor r15,rcx
+
+ ror r14,6
+ xor r13,rax
+ add r12,r15
+
+ mov r15,r8
+ add r12,QWORD[rbp]
+ xor r14,r8
+
+ xor r15,r9
+ ror r13,14
+ mov rdx,r9
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rdx,rdi
+ add r11,r12
+ add rdx,r12
+
+ lea rbp,[8+rbp]
+ add rdx,r14
+ mov r12,QWORD[104+rsi]
+ mov r13,r11
+ mov r14,rdx
+ bswap r12
+ ror r13,23
+ mov rdi,rax
+
+ xor r13,r11
+ ror r14,5
+ xor rdi,rbx
+
+ mov QWORD[104+rsp],r12
+ xor r14,rdx
+ and rdi,r11
+
+ ror r13,4
+ add r12,rcx
+ xor rdi,rbx
+
+ ror r14,6
+ xor r13,r11
+ add r12,rdi
+
+ mov rdi,rdx
+ add r12,QWORD[rbp]
+ xor r14,rdx
+
+ xor rdi,r8
+ ror r13,14
+ mov rcx,r8
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rcx,r15
+ add r10,r12
+ add rcx,r12
+
+ lea rbp,[24+rbp]
+ add rcx,r14
+ mov r12,QWORD[112+rsi]
+ mov r13,r10
+ mov r14,rcx
+ bswap r12
+ ror r13,23
+ mov r15,r11
+
+ xor r13,r10
+ ror r14,5
+ xor r15,rax
+
+ mov QWORD[112+rsp],r12
+ xor r14,rcx
+ and r15,r10
+
+ ror r13,4
+ add r12,rbx
+ xor r15,rax
+
+ ror r14,6
+ xor r13,r10
+ add r12,r15
+
+ mov r15,rcx
+ add r12,QWORD[rbp]
+ xor r14,rcx
+
+ xor r15,rdx
+ ror r13,14
+ mov rbx,rdx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rbx,rdi
+ add r9,r12
+ add rbx,r12
+
+ lea rbp,[8+rbp]
+ add rbx,r14
+ mov r12,QWORD[120+rsi]
+ mov r13,r9
+ mov r14,rbx
+ bswap r12
+ ror r13,23
+ mov rdi,r10
+
+ xor r13,r9
+ ror r14,5
+ xor rdi,r11
+
+ mov QWORD[120+rsp],r12
+ xor r14,rbx
+ and rdi,r9
+
+ ror r13,4
+ add r12,rax
+ xor rdi,r11
+
+ ror r14,6
+ xor r13,r9
+ add r12,rdi
+
+ mov rdi,rbx
+ add r12,QWORD[rbp]
+ xor r14,rbx
+
+ xor rdi,rcx
+ ror r13,14
+ mov rax,rcx
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rax,r15
+ add r8,r12
+ add rax,r12
+
+ lea rbp,[24+rbp]
+ jmp NEAR $L$rounds_16_xx
+ALIGN 16
+$L$rounds_16_xx:
+ mov r13,QWORD[8+rsp]
+ mov r15,QWORD[112+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rax,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[72+rsp]
+
+ add r12,QWORD[rsp]
+ mov r13,r8
+ add r12,r15
+ mov r14,rax
+ ror r13,23
+ mov r15,r9
+
+ xor r13,r8
+ ror r14,5
+ xor r15,r10
+
+ mov QWORD[rsp],r12
+ xor r14,rax
+ and r15,r8
+
+ ror r13,4
+ add r12,r11
+ xor r15,r10
+
+ ror r14,6
+ xor r13,r8
+ add r12,r15
+
+ mov r15,rax
+ add r12,QWORD[rbp]
+ xor r14,rax
+
+ xor r15,rbx
+ ror r13,14
+ mov r11,rbx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r11,rdi
+ add rdx,r12
+ add r11,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[16+rsp]
+ mov rdi,QWORD[120+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r11,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[80+rsp]
+
+ add r12,QWORD[8+rsp]
+ mov r13,rdx
+ add r12,rdi
+ mov r14,r11
+ ror r13,23
+ mov rdi,r8
+
+ xor r13,rdx
+ ror r14,5
+ xor rdi,r9
+
+ mov QWORD[8+rsp],r12
+ xor r14,r11
+ and rdi,rdx
+
+ ror r13,4
+ add r12,r10
+ xor rdi,r9
+
+ ror r14,6
+ xor r13,rdx
+ add r12,rdi
+
+ mov rdi,r11
+ add r12,QWORD[rbp]
+ xor r14,r11
+
+ xor rdi,rax
+ ror r13,14
+ mov r10,rax
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r10,r15
+ add rcx,r12
+ add r10,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[24+rsp]
+ mov r15,QWORD[rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r10,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[88+rsp]
+
+ add r12,QWORD[16+rsp]
+ mov r13,rcx
+ add r12,r15
+ mov r14,r10
+ ror r13,23
+ mov r15,rdx
+
+ xor r13,rcx
+ ror r14,5
+ xor r15,r8
+
+ mov QWORD[16+rsp],r12
+ xor r14,r10
+ and r15,rcx
+
+ ror r13,4
+ add r12,r9
+ xor r15,r8
+
+ ror r14,6
+ xor r13,rcx
+ add r12,r15
+
+ mov r15,r10
+ add r12,QWORD[rbp]
+ xor r14,r10
+
+ xor r15,r11
+ ror r13,14
+ mov r9,r11
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r9,rdi
+ add rbx,r12
+ add r9,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[32+rsp]
+ mov rdi,QWORD[8+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r9,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[96+rsp]
+
+ add r12,QWORD[24+rsp]
+ mov r13,rbx
+ add r12,rdi
+ mov r14,r9
+ ror r13,23
+ mov rdi,rcx
+
+ xor r13,rbx
+ ror r14,5
+ xor rdi,rdx
+
+ mov QWORD[24+rsp],r12
+ xor r14,r9
+ and rdi,rbx
+
+ ror r13,4
+ add r12,r8
+ xor rdi,rdx
+
+ ror r14,6
+ xor r13,rbx
+ add r12,rdi
+
+ mov rdi,r9
+ add r12,QWORD[rbp]
+ xor r14,r9
+
+ xor rdi,r10
+ ror r13,14
+ mov r8,r10
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r8,r15
+ add rax,r12
+ add r8,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[40+rsp]
+ mov r15,QWORD[16+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r8,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[104+rsp]
+
+ add r12,QWORD[32+rsp]
+ mov r13,rax
+ add r12,r15
+ mov r14,r8
+ ror r13,23
+ mov r15,rbx
+
+ xor r13,rax
+ ror r14,5
+ xor r15,rcx
+
+ mov QWORD[32+rsp],r12
+ xor r14,r8
+ and r15,rax
+
+ ror r13,4
+ add r12,rdx
+ xor r15,rcx
+
+ ror r14,6
+ xor r13,rax
+ add r12,r15
+
+ mov r15,r8
+ add r12,QWORD[rbp]
+ xor r14,r8
+
+ xor r15,r9
+ ror r13,14
+ mov rdx,r9
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rdx,rdi
+ add r11,r12
+ add rdx,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[48+rsp]
+ mov rdi,QWORD[24+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rdx,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[112+rsp]
+
+ add r12,QWORD[40+rsp]
+ mov r13,r11
+ add r12,rdi
+ mov r14,rdx
+ ror r13,23
+ mov rdi,rax
+
+ xor r13,r11
+ ror r14,5
+ xor rdi,rbx
+
+ mov QWORD[40+rsp],r12
+ xor r14,rdx
+ and rdi,r11
+
+ ror r13,4
+ add r12,rcx
+ xor rdi,rbx
+
+ ror r14,6
+ xor r13,r11
+ add r12,rdi
+
+ mov rdi,rdx
+ add r12,QWORD[rbp]
+ xor r14,rdx
+
+ xor rdi,r8
+ ror r13,14
+ mov rcx,r8
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rcx,r15
+ add r10,r12
+ add rcx,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[56+rsp]
+ mov r15,QWORD[32+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rcx,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[120+rsp]
+
+ add r12,QWORD[48+rsp]
+ mov r13,r10
+ add r12,r15
+ mov r14,rcx
+ ror r13,23
+ mov r15,r11
+
+ xor r13,r10
+ ror r14,5
+ xor r15,rax
+
+ mov QWORD[48+rsp],r12
+ xor r14,rcx
+ and r15,r10
+
+ ror r13,4
+ add r12,rbx
+ xor r15,rax
+
+ ror r14,6
+ xor r13,r10
+ add r12,r15
+
+ mov r15,rcx
+ add r12,QWORD[rbp]
+ xor r14,rcx
+
+ xor r15,rdx
+ ror r13,14
+ mov rbx,rdx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rbx,rdi
+ add r9,r12
+ add rbx,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[64+rsp]
+ mov rdi,QWORD[40+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rbx,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[rsp]
+
+ add r12,QWORD[56+rsp]
+ mov r13,r9
+ add r12,rdi
+ mov r14,rbx
+ ror r13,23
+ mov rdi,r10
+
+ xor r13,r9
+ ror r14,5
+ xor rdi,r11
+
+ mov QWORD[56+rsp],r12
+ xor r14,rbx
+ and rdi,r9
+
+ ror r13,4
+ add r12,rax
+ xor rdi,r11
+
+ ror r14,6
+ xor r13,r9
+ add r12,rdi
+
+ mov rdi,rbx
+ add r12,QWORD[rbp]
+ xor r14,rbx
+
+ xor rdi,rcx
+ ror r13,14
+ mov rax,rcx
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rax,r15
+ add r8,r12
+ add rax,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[72+rsp]
+ mov r15,QWORD[48+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rax,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[8+rsp]
+
+ add r12,QWORD[64+rsp]
+ mov r13,r8
+ add r12,r15
+ mov r14,rax
+ ror r13,23
+ mov r15,r9
+
+ xor r13,r8
+ ror r14,5
+ xor r15,r10
+
+ mov QWORD[64+rsp],r12
+ xor r14,rax
+ and r15,r8
+
+ ror r13,4
+ add r12,r11
+ xor r15,r10
+
+ ror r14,6
+ xor r13,r8
+ add r12,r15
+
+ mov r15,rax
+ add r12,QWORD[rbp]
+ xor r14,rax
+
+ xor r15,rbx
+ ror r13,14
+ mov r11,rbx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r11,rdi
+ add rdx,r12
+ add r11,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[80+rsp]
+ mov rdi,QWORD[56+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r11,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[16+rsp]
+
+ add r12,QWORD[72+rsp]
+ mov r13,rdx
+ add r12,rdi
+ mov r14,r11
+ ror r13,23
+ mov rdi,r8
+
+ xor r13,rdx
+ ror r14,5
+ xor rdi,r9
+
+ mov QWORD[72+rsp],r12
+ xor r14,r11
+ and rdi,rdx
+
+ ror r13,4
+ add r12,r10
+ xor rdi,r9
+
+ ror r14,6
+ xor r13,rdx
+ add r12,rdi
+
+ mov rdi,r11
+ add r12,QWORD[rbp]
+ xor r14,r11
+
+ xor rdi,rax
+ ror r13,14
+ mov r10,rax
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r10,r15
+ add rcx,r12
+ add r10,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[88+rsp]
+ mov r15,QWORD[64+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r10,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[24+rsp]
+
+ add r12,QWORD[80+rsp]
+ mov r13,rcx
+ add r12,r15
+ mov r14,r10
+ ror r13,23
+ mov r15,rdx
+
+ xor r13,rcx
+ ror r14,5
+ xor r15,r8
+
+ mov QWORD[80+rsp],r12
+ xor r14,r10
+ and r15,rcx
+
+ ror r13,4
+ add r12,r9
+ xor r15,r8
+
+ ror r14,6
+ xor r13,rcx
+ add r12,r15
+
+ mov r15,r10
+ add r12,QWORD[rbp]
+ xor r14,r10
+
+ xor r15,r11
+ ror r13,14
+ mov r9,r11
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r9,rdi
+ add rbx,r12
+ add r9,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[96+rsp]
+ mov rdi,QWORD[72+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r9,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[32+rsp]
+
+ add r12,QWORD[88+rsp]
+ mov r13,rbx
+ add r12,rdi
+ mov r14,r9
+ ror r13,23
+ mov rdi,rcx
+
+ xor r13,rbx
+ ror r14,5
+ xor rdi,rdx
+
+ mov QWORD[88+rsp],r12
+ xor r14,r9
+ and rdi,rbx
+
+ ror r13,4
+ add r12,r8
+ xor rdi,rdx
+
+ ror r14,6
+ xor r13,rbx
+ add r12,rdi
+
+ mov rdi,r9
+ add r12,QWORD[rbp]
+ xor r14,r9
+
+ xor rdi,r10
+ ror r13,14
+ mov r8,r10
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r8,r15
+ add rax,r12
+ add r8,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[104+rsp]
+ mov r15,QWORD[80+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r8,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[40+rsp]
+
+ add r12,QWORD[96+rsp]
+ mov r13,rax
+ add r12,r15
+ mov r14,r8
+ ror r13,23
+ mov r15,rbx
+
+ xor r13,rax
+ ror r14,5
+ xor r15,rcx
+
+ mov QWORD[96+rsp],r12
+ xor r14,r8
+ and r15,rax
+
+ ror r13,4
+ add r12,rdx
+ xor r15,rcx
+
+ ror r14,6
+ xor r13,rax
+ add r12,r15
+
+ mov r15,r8
+ add r12,QWORD[rbp]
+ xor r14,r8
+
+ xor r15,r9
+ ror r13,14
+ mov rdx,r9
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rdx,rdi
+ add r11,r12
+ add rdx,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[112+rsp]
+ mov rdi,QWORD[88+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rdx,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[48+rsp]
+
+ add r12,QWORD[104+rsp]
+ mov r13,r11
+ add r12,rdi
+ mov r14,rdx
+ ror r13,23
+ mov rdi,rax
+
+ xor r13,r11
+ ror r14,5
+ xor rdi,rbx
+
+ mov QWORD[104+rsp],r12
+ xor r14,rdx
+ and rdi,r11
+
+ ror r13,4
+ add r12,rcx
+ xor rdi,rbx
+
+ ror r14,6
+ xor r13,r11
+ add r12,rdi
+
+ mov rdi,rdx
+ add r12,QWORD[rbp]
+ xor r14,rdx
+
+ xor rdi,r8
+ ror r13,14
+ mov rcx,r8
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rcx,r15
+ add r10,r12
+ add rcx,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[120+rsp]
+ mov r15,QWORD[96+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rcx,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[56+rsp]
+
+ add r12,QWORD[112+rsp]
+ mov r13,r10
+ add r12,r15
+ mov r14,rcx
+ ror r13,23
+ mov r15,r11
+
+ xor r13,r10
+ ror r14,5
+ xor r15,rax
+
+ mov QWORD[112+rsp],r12
+ xor r14,rcx
+ and r15,r10
+
+ ror r13,4
+ add r12,rbx
+ xor r15,rax
+
+ ror r14,6
+ xor r13,r10
+ add r12,r15
+
+ mov r15,rcx
+ add r12,QWORD[rbp]
+ xor r14,rcx
+
+ xor r15,rdx
+ ror r13,14
+ mov rbx,rdx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rbx,rdi
+ add r9,r12
+ add rbx,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[rsp]
+ mov rdi,QWORD[104+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rbx,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[64+rsp]
+
+ add r12,QWORD[120+rsp]
+ mov r13,r9
+ add r12,rdi
+ mov r14,rbx
+ ror r13,23
+ mov rdi,r10
+
+ xor r13,r9
+ ror r14,5
+ xor rdi,r11
+
+ mov QWORD[120+rsp],r12
+ xor r14,rbx
+ and rdi,r9
+
+ ror r13,4
+ add r12,rax
+ xor rdi,r11
+
+ ror r14,6
+ xor r13,r9
+ add r12,rdi
+
+ mov rdi,rbx
+ add r12,QWORD[rbp]
+ xor r14,rbx
+
+ xor rdi,rcx
+ ror r13,14
+ mov rax,rcx
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rax,r15
+ add r8,r12
+ add rax,r12
+
+ lea rbp,[24+rbp]
+ cmp BYTE[7+rbp],0
+ jnz NEAR $L$rounds_16_xx
+
+ mov rdi,QWORD[((128+0))+rsp]
+ add rax,r14
+ lea rsi,[128+rsi]
+
+ add rax,QWORD[rdi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop
+
+ mov rsi,QWORD[152+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha512_block_data_order:
+ALIGN 64
+
+K512: ; SHA-512 round constants, two per 16-byte row; each row is emitted twice, so one pair of constants occupies 32 bytes
+ DQ 0x428a2f98d728ae22,0x7137449123ef65cd
+ DQ 0x428a2f98d728ae22,0x7137449123ef65cd
+ DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ DQ 0x3956c25bf348b538,0x59f111f1b605d019
+ DQ 0x3956c25bf348b538,0x59f111f1b605d019
+ DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ DQ 0xd807aa98a3030242,0x12835b0145706fbe
+ DQ 0xd807aa98a3030242,0x12835b0145706fbe
+ DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ DQ 0x9bdc06a725c71235,0xc19bf174cf692694
+ DQ 0x9bdc06a725c71235,0xc19bf174cf692694
+ DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ DQ 0x983e5152ee66dfab,0xa831c66d2db43210
+ DQ 0x983e5152ee66dfab,0xa831c66d2db43210
+ DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ DQ 0x06ca6351e003826f,0x142929670a0e6e70
+ DQ 0x06ca6351e003826f,0x142929670a0e6e70
+ DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ DQ 0x650a73548baf63de,0x766a0abb3c77b2a8
+ DQ 0x650a73548baf63de,0x766a0abb3c77b2a8
+ DQ 0x81c2c92e47edaee6,0x92722c851482353b
+ DQ 0x81c2c92e47edaee6,0x92722c851482353b
+ DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ DQ 0xc24b8b70d0f89791,0xc76c51a30654be30
+ DQ 0xc24b8b70d0f89791,0xc76c51a30654be30
+ DQ 0xd192e819d6ef5218,0xd69906245565a910
+ DQ 0xd192e819d6ef5218,0xd69906245565a910
+ DQ 0xf40e35855771202a,0x106aa07032bbd1b8
+ DQ 0xf40e35855771202a,0x106aa07032bbd1b8
+ DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ DQ 0x748f82ee5defb2fc,0x78a5636f43172f60
+ DQ 0x748f82ee5defb2fc,0x78a5636f43172f60
+ DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ DQ 0x90befffa23631e28,0xa4506cebde82bde9
+ DQ 0x90befffa23631e28,0xa4506cebde82bde9
+ DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ DQ 0xca273eceea26619c,0xd186b8c721c0c207
+ DQ 0xca273eceea26619c,0xd186b8c721c0c207
+ DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ DQ 0x113f9804bef90dae,0x1b710b35131c471b
+ DQ 0x113f9804bef90dae,0x1b710b35131c471b
+ DQ 0x28db77f523047d84,0x32caab7b40c72493
+ DQ 0x28db77f523047d84,0x32caab7b40c72493
+ DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+ DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 ; end of the 80 constant rows (80 * 16 = 1280 bytes)
+
+ DQ 0x0001020304050607,0x08090a0b0c0d0e0f ; at K512+1280: vpshufb mask that reverses the bytes of each 64-bit lane; its zero top byte (offset 1287) also ends the $L$xop_00_47 loop
+ DQ 0x0001020304050607,0x08090a0b0c0d0e0f
+DB 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 ; ASCII banner: "SHA512 block transform for x86_64, CRYPTOGAMS by appro@openssl.org" (address is angle-bracketed in the bytes)
+DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
+DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+DB 111,114,103,62,0 ; NUL terminator
+
+ALIGN 64
+sha512_block_data_order_xop: ; XOP code path of the SHA-512 block function; arguments arrive in rcx, rdx, r8
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi ; rdi/rsi are parked at [8+rsp]/[16+rsp] and reloaded from there at $L$epilogue_xop
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_xop:
+ mov rdi,rcx ; rdi = 1st argument: pointer to the eight 64-bit state words
+ mov rsi,rdx ; rsi = 2nd argument: input pointer
+ mov rdx,r8 ; rdx = 3rd argument: count of 128-byte blocks
+
+
+
+$L$xop_shortcut:
+ mov rax,rsp ; rax = stack pointer before the pushes; kept at [152+rsp] to unwind the frame
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4 ; count*16 ...
+ sub rsp,256 ; reserve the local frame
+ lea rdx,[rdx*8+rsi] ; ... *8 + rsi: rdx = input + count*128 = end-of-input pointer
+ and rsp,-64 ; round the frame down to a 64-byte boundary
+ mov QWORD[((128+0))+rsp],rdi ; frame +128: state pointer
+ mov QWORD[((128+8))+rsp],rsi ; frame +136: input pointer
+ mov QWORD[((128+16))+rsp],rdx ; frame +144: end-of-input pointer
+ mov QWORD[152+rsp],rax ; frame +152: original stack pointer
+
+ movaps XMMWORD[(128+32)+rsp],xmm6 ; xmm6-xmm11 are used below, so they are saved at frame +160..+255
+ movaps XMMWORD[(128+48)+rsp],xmm7
+ movaps XMMWORD[(128+64)+rsp],xmm8
+ movaps XMMWORD[(128+80)+rsp],xmm9
+ movaps XMMWORD[(128+96)+rsp],xmm10
+ movaps XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_xop:
+
+ vzeroupper
+ mov rax,QWORD[rdi] ; load the eight state words into rax,rbx,rcx,rdx,r8,r9,r10,r11
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop_xop
+ALIGN 16
+$L$loop_xop: ; per-block setup: load 128 input bytes, byte-swap them, pre-add the first 16 constants
+ vmovdqa xmm11,XMMWORD[((K512+1280))] ; xmm11 = the 16-byte shuffle mask stored right after the constant rows
+ vmovdqu xmm0,XMMWORD[rsi] ; xmm0..xmm7 = the 16 message qwords (unaligned loads)
+ lea rbp,[((K512+128))] ; rbp = table cursor, biased by 128 so the displacements below run -128..+96
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vpshufb xmm0,xmm0,xmm11 ; reverse the bytes within each qword
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm1,xmm1,xmm11
+ vmovdqu xmm4,XMMWORD[64+rsi]
+ vpshufb xmm2,xmm2,xmm11
+ vmovdqu xmm5,XMMWORD[80+rsi]
+ vpshufb xmm3,xmm3,xmm11
+ vmovdqu xmm6,XMMWORD[96+rsi]
+ vpshufb xmm4,xmm4,xmm11
+ vmovdqu xmm7,XMMWORD[112+rsi]
+ vpshufb xmm5,xmm5,xmm11
+ vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] ; message pair + constant pair; 32-byte stride skips the duplicate table rows
+ vpshufb xmm6,xmm6,xmm11
+ vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
+ vpshufb xmm7,xmm7,xmm11
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] ; the mask in xmm11 is no longer needed and is overwritten here
+ vmovdqa XMMWORD[rsp],xmm8 ; [rsp..127+rsp] receives the 16 pre-added (message + constant) qwords
+ vpaddq xmm8,xmm4,XMMWORD[rbp]
+ vmovdqa XMMWORD[16+rsp],xmm9
+ vpaddq xmm9,xmm5,XMMWORD[32+rbp]
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ vmovdqa XMMWORD[48+rsp],xmm11
+ vpaddq xmm11,xmm7,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[64+rsp],xmm8
+ mov r14,rax ; seed the temporaries the first round expects: r14 = rax
+ vmovdqa XMMWORD[80+rsp],xmm9
+ mov rdi,rbx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ xor rdi,rcx ; rdi = rbx ^ rcx
+ vmovdqa XMMWORD[112+rsp],xmm11
+ mov r13,r8 ; r13 = r8
+ jmp NEAR $L$xop_00_47
+
+ALIGN 16
+$L$xop_00_47: ; one pass = 16 scalar rounds interleaved with the vector schedule update of xmm0..xmm7
+ add rbp,256 ; advance the table cursor by 16 constants (256 bytes because rows are duplicated)
+ vpalignr xmm8,xmm1,xmm0,8 ; group 0: update xmm0 while running the rounds that consume [rsp] and [8+rsp]
+ ror r13,23 ; r13 holds r8 here; rotates by 23,4,14 with xors in between give r8>>>14 ^ r8>>>18 ^ r8>>>41
+ mov rax,r14
+ vpalignr xmm11,xmm5,xmm4,8
+ mov r12,r9
+ ror r14,5 ; r14 holds rax; rotates by 5,6,28 with xors in between give rax>>>28 ^ rax>>>34 ^ rax>>>39
+DB 143,72,120,195,200,56 ; XOP encoding of: vprotq xmm9,xmm8,56 (rotate left 56 = rotate right 8)
+ xor r13,r8
+ xor r12,r10
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rax
+ vpaddq xmm0,xmm0,xmm11
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp] ; add the pre-computed (message + constant) qword for this round
+ mov r15,rax
+DB 143,72,120,195,209,7 ; XOP encoding of: vprotq xmm10,xmm9,7 (total rotate right 1)
+ xor r12,r10 ; r12 = ((r9 ^ r10) & r8) ^ r10
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,223,3 ; XOP encoding of: vprotq xmm11,xmm7,3 (rotate right 61)
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10 ; xmm8 = (x>>7) ^ (x>>>8) ^ (x>>>1) of the aligned pair
+ xor rdi,rbx
+ ror r14,28
+ vpsrlq xmm10,xmm7,6
+ add rdx,r11
+ add r11,rdi
+ vpaddq xmm0,xmm0,xmm8
+ mov r13,rdx
+ add r14,r11
+DB 143,72,120,195,203,42 ; XOP encoding of: vprotq xmm9,xmm11,42 (total rotate right 19)
+ ror r13,23
+ mov r11,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm9 ; xmm11 = (xmm7>>>61) ^ (xmm7>>6) ^ (xmm7>>>19)
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ vpaddq xmm0,xmm0,xmm11
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] ; new schedule pair + its constants
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[rsp],xmm10 ; refill stack slot 0 for the next pass
+ vpalignr xmm8,xmm2,xmm1,8 ; group 1: update xmm1; rounds consume [16+rsp] and [24+rsp]
+ ror r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm6,xmm5,8
+ mov r12,rdx
+ ror r14,5
+DB 143,72,120,195,200,56 ; vprotq xmm9,xmm8,56
+ xor r13,rcx
+ xor r12,r8
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r10
+ vpaddq xmm1,xmm1,xmm11
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+DB 143,72,120,195,209,7 ; vprotq xmm10,xmm9,7
+ xor r12,r8
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,216,3 ; vprotq xmm11,xmm0,3
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ ror r14,28
+ vpsrlq xmm10,xmm0,6
+ add rbx,r9
+ add r9,rdi
+ vpaddq xmm1,xmm1,xmm8
+ mov r13,rbx
+ add r14,r9
+DB 143,72,120,195,203,42 ; vprotq xmm9,xmm11,42
+ ror r13,23
+ mov r9,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ vpaddq xmm1,xmm1,xmm11
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[16+rsp],xmm10 ; refill stack slot 16
+ vpalignr xmm8,xmm3,xmm2,8 ; group 2: update xmm2; rounds consume [32+rsp] and [40+rsp]
+ ror r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm7,xmm6,8
+ mov r12,rbx
+ ror r14,5
+DB 143,72,120,195,200,56 ; vprotq xmm9,xmm8,56
+ xor r13,rax
+ xor r12,rcx
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r8
+ vpaddq xmm2,xmm2,xmm11
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+DB 143,72,120,195,209,7 ; vprotq xmm10,xmm9,7
+ xor r12,rcx
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,217,3 ; vprotq xmm11,xmm1,3
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ ror r14,28
+ vpsrlq xmm10,xmm1,6
+ add r11,rdx
+ add rdx,rdi
+ vpaddq xmm2,xmm2,xmm8
+ mov r13,r11
+ add r14,rdx
+DB 143,72,120,195,203,42 ; vprotq xmm9,xmm11,42
+ ror r13,23
+ mov rdx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ vpaddq xmm2,xmm2,xmm11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[32+rsp],xmm10 ; refill stack slot 32
+ vpalignr xmm8,xmm4,xmm3,8 ; group 3: update xmm3; rounds consume [48+rsp] and [56+rsp]
+ ror r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm0,xmm7,8
+ mov r12,r11
+ ror r14,5
+DB 143,72,120,195,200,56 ; vprotq xmm9,xmm8,56
+ xor r13,r10
+ xor r12,rax
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rcx
+ vpaddq xmm3,xmm3,xmm11
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+DB 143,72,120,195,209,7 ; vprotq xmm10,xmm9,7
+ xor r12,rax
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,218,3 ; vprotq xmm11,xmm2,3
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ ror r14,28
+ vpsrlq xmm10,xmm2,6
+ add r9,rbx
+ add rbx,rdi
+ vpaddq xmm3,xmm3,xmm8
+ mov r13,r9
+ add r14,rbx
+DB 143,72,120,195,203,42 ; vprotq xmm9,xmm11,42
+ ror r13,23
+ mov rbx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ vpaddq xmm3,xmm3,xmm11
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[48+rsp],xmm10 ; refill stack slot 48
+ vpalignr xmm8,xmm5,xmm4,8 ; group 4: update xmm4; rounds consume [64+rsp] and [72+rsp]
+ ror r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm1,xmm0,8
+ mov r12,r9
+ ror r14,5
+DB 143,72,120,195,200,56 ; vprotq xmm9,xmm8,56
+ xor r13,r8
+ xor r12,r10
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rax
+ vpaddq xmm4,xmm4,xmm11
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+DB 143,72,120,195,209,7 ; vprotq xmm10,xmm9,7
+ xor r12,r10
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,219,3 ; vprotq xmm11,xmm3,3
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ ror r14,28
+ vpsrlq xmm10,xmm3,6
+ add rdx,r11
+ add r11,rdi
+ vpaddq xmm4,xmm4,xmm8
+ mov r13,rdx
+ add r14,r11
+DB 143,72,120,195,203,42 ; vprotq xmm9,xmm11,42
+ ror r13,23
+ mov r11,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ vpaddq xmm4,xmm4,xmm11
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ vpaddq xmm10,xmm4,XMMWORD[rbp]
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[64+rsp],xmm10 ; refill stack slot 64
+ vpalignr xmm8,xmm6,xmm5,8 ; group 5: update xmm5; rounds consume [80+rsp] and [88+rsp]
+ ror r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm2,xmm1,8
+ mov r12,rdx
+ ror r14,5
+DB 143,72,120,195,200,56 ; vprotq xmm9,xmm8,56
+ xor r13,rcx
+ xor r12,r8
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r10
+ vpaddq xmm5,xmm5,xmm11
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+DB 143,72,120,195,209,7 ; vprotq xmm10,xmm9,7
+ xor r12,r8
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,220,3 ; vprotq xmm11,xmm4,3
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ ror r14,28
+ vpsrlq xmm10,xmm4,6
+ add rbx,r9
+ add r9,rdi
+ vpaddq xmm5,xmm5,xmm8
+ mov r13,rbx
+ add r14,r9
+DB 143,72,120,195,203,42 ; vprotq xmm9,xmm11,42
+ ror r13,23
+ mov r9,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ vpaddq xmm5,xmm5,xmm11
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ vpaddq xmm10,xmm5,XMMWORD[32+rbp]
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[80+rsp],xmm10 ; refill stack slot 80
+ vpalignr xmm8,xmm7,xmm6,8 ; group 6: update xmm6; rounds consume [96+rsp] and [104+rsp]
+ ror r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm3,xmm2,8
+ mov r12,rbx
+ ror r14,5
+DB 143,72,120,195,200,56 ; vprotq xmm9,xmm8,56
+ xor r13,rax
+ xor r12,rcx
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r8
+ vpaddq xmm6,xmm6,xmm11
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+DB 143,72,120,195,209,7 ; vprotq xmm10,xmm9,7
+ xor r12,rcx
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,221,3 ; vprotq xmm11,xmm5,3
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ ror r14,28
+ vpsrlq xmm10,xmm5,6
+ add r11,rdx
+ add rdx,rdi
+ vpaddq xmm6,xmm6,xmm8
+ mov r13,r11
+ add r14,rdx
+DB 143,72,120,195,203,42 ; vprotq xmm9,xmm11,42
+ ror r13,23
+ mov rdx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ vpaddq xmm6,xmm6,xmm11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[96+rsp],xmm10 ; refill stack slot 96
+ vpalignr xmm8,xmm0,xmm7,8 ; group 7: update xmm7; rounds consume [112+rsp] and [120+rsp]
+ ror r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm4,xmm3,8
+ mov r12,r11
+ ror r14,5
+DB 143,72,120,195,200,56 ; vprotq xmm9,xmm8,56
+ xor r13,r10
+ xor r12,rax
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rcx
+ vpaddq xmm7,xmm7,xmm11
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+DB 143,72,120,195,209,7 ; vprotq xmm10,xmm9,7
+ xor r12,rax
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,222,3 ; vprotq xmm11,xmm6,3
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ ror r14,28
+ vpsrlq xmm10,xmm6,6
+ add r9,rbx
+ add rbx,rdi
+ vpaddq xmm7,xmm7,xmm8
+ mov r13,r9
+ add r14,rbx
+DB 143,72,120,195,203,42 ; vprotq xmm9,xmm11,42
+ ror r13,23
+ mov rbx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ vpaddq xmm7,xmm7,xmm11
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ vpaddq xmm10,xmm7,XMMWORD[96+rbp]
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[112+rsp],xmm10 ; refill stack slot 112
+ cmp BYTE[135+rbp],0 ; top byte of the entry 128 bytes past the next group of constants; zero only when that entry is the shuffle mask after the table
+ jne NEAR $L$xop_00_47 ; taken on passes 1-3, falls through after the 4th (64 rounds done)
+ ror r13,23 ; final 16 rounds: no more schedule updates, each round only consumes a stack slot; this one uses [rsp]
+ mov rax,r14
+ mov r12,r9
+ ror r14,5
+ xor r13,r8
+ xor r12,r10
+ ror r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+ xor r12,r10
+ ror r14,6
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ ror r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ ror r13,23 ; round consuming [8+rsp]
+ mov r11,r14
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ ror r13,23 ; round consuming [16+rsp]
+ mov r10,r14
+ mov r12,rdx
+ ror r14,5
+ xor r13,rcx
+ xor r12,r8
+ ror r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ xor r12,r8
+ ror r14,6
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ ror r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ ror r13,23 ; round consuming [24+rsp]
+ mov r9,r14
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ ror r13,23 ; round consuming [32+rsp]
+ mov r8,r14
+ mov r12,rbx
+ ror r14,5
+ xor r13,rax
+ xor r12,rcx
+ ror r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ xor r12,rcx
+ ror r14,6
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ ror r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ ror r13,23 ; round consuming [40+rsp]
+ mov rdx,r14
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ ror r13,23 ; round consuming [48+rsp]
+ mov rcx,r14
+ mov r12,r11
+ ror r14,5
+ xor r13,r10
+ xor r12,rax
+ ror r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ xor r12,rax
+ ror r14,6
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ ror r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ ror r13,23 ; round consuming [56+rsp]
+ mov rbx,r14
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ ror r13,23 ; round consuming [64+rsp]
+ mov rax,r14
+ mov r12,r9
+ ror r14,5
+ xor r13,r8
+ xor r12,r10
+ ror r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ xor r12,r10
+ ror r14,6
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ ror r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ ror r13,23 ; round consuming [72+rsp]
+ mov r11,r14
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ ror r13,23 ; round consuming [80+rsp]
+ mov r10,r14
+ mov r12,rdx
+ ror r14,5
+ xor r13,rcx
+ xor r12,r8
+ ror r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ xor r12,r8
+ ror r14,6
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ ror r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ ror r13,23 ; round consuming [88+rsp]
+ mov r9,r14
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ ror r13,23 ; round consuming [96+rsp]
+ mov r8,r14
+ mov r12,rbx
+ ror r14,5
+ xor r13,rax
+ xor r12,rcx
+ ror r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ xor r12,rcx
+ ror r14,6
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ ror r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ ror r13,23 ; round consuming [104+rsp]
+ mov rdx,r14
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ ror r13,23 ; round consuming [112+rsp]
+ mov rcx,r14
+ mov r12,r11
+ ror r14,5
+ xor r13,r10
+ xor r12,rax
+ ror r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ xor r12,rax
+ ror r14,6
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ ror r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ ror r13,23 ; round consuming [120+rsp] (last of the block)
+ mov rbx,r14
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax ; r14 = completed first working word; the code that follows copies it into rax
+ mov rdi,QWORD[((128+0))+rsp] ; rdi was used as a scratch register in the rounds; reload the state pointer from the frame
+ mov rax,r14 ; take the first working word from r14, where the last round left it
+
+ add rax,QWORD[rdi] ; add the previous state words to the eight working registers
+ lea rsi,[128+rsi] ; advance the input pointer to the next 128-byte block (lea leaves flags untouched)
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp] ; compare against the end-of-input pointer; the mov stores below do not alter the flags
+
+ mov QWORD[rdi],rax ; write the updated state back
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop_xop ; unsigned compare: more input remains, process the next block
+
+ mov rsi,QWORD[152+rsp] ; rsi = stack pointer value recorded before the pushes
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((128+32))+rsp] ; restore xmm6-xmm11 from the frame
+ movaps xmm7,XMMWORD[((128+48))+rsp]
+ movaps xmm8,XMMWORD[((128+64))+rsp]
+ movaps xmm9,XMMWORD[((128+80))+rsp]
+ movaps xmm10,XMMWORD[((128+96))+rsp]
+ movaps xmm11,XMMWORD[((128+112))+rsp]
+ mov r15,QWORD[((-48))+rsi] ; reload the six pushed registers by offset from the saved stack pointer
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi] ; drop the aligned frame and the pushes in one step
+
+$L$epilogue_xop:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp] ; rdi/rsi come back from the slots written at function entry
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha512_block_data_order_xop:
+
+ALIGN 64
+sha512_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+$L$avx_shortcut:
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4
+ sub rsp,256
+ lea rdx,[rdx*8+rsi]
+ and rsp,-64
+ mov QWORD[((128+0))+rsp],rdi
+ mov QWORD[((128+8))+rsp],rsi
+ mov QWORD[((128+16))+rsp],rdx
+ mov QWORD[152+rsp],rax
+
+ movaps XMMWORD[(128+32)+rsp],xmm6
+ movaps XMMWORD[(128+48)+rsp],xmm7
+ movaps XMMWORD[(128+64)+rsp],xmm8
+ movaps XMMWORD[(128+80)+rsp],xmm9
+ movaps XMMWORD[(128+96)+rsp],xmm10
+ movaps XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_avx:
+
+ vzeroupper
+ mov rax,QWORD[rdi]
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop_avx
+ALIGN 16
+$L$loop_avx:
+ vmovdqa xmm11,XMMWORD[((K512+1280))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ lea rbp,[((K512+128))]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vpshufb xmm0,xmm0,xmm11
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm1,xmm1,xmm11
+ vmovdqu xmm4,XMMWORD[64+rsi]
+ vpshufb xmm2,xmm2,xmm11
+ vmovdqu xmm5,XMMWORD[80+rsi]
+ vpshufb xmm3,xmm3,xmm11
+ vmovdqu xmm6,XMMWORD[96+rsi]
+ vpshufb xmm4,xmm4,xmm11
+ vmovdqu xmm7,XMMWORD[112+rsi]
+ vpshufb xmm5,xmm5,xmm11
+ vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
+ vpshufb xmm6,xmm6,xmm11
+ vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
+ vpshufb xmm7,xmm7,xmm11
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
+ vmovdqa XMMWORD[rsp],xmm8
+ vpaddq xmm8,xmm4,XMMWORD[rbp]
+ vmovdqa XMMWORD[16+rsp],xmm9
+ vpaddq xmm9,xmm5,XMMWORD[32+rbp]
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ vmovdqa XMMWORD[48+rsp],xmm11
+ vpaddq xmm11,xmm7,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[64+rsp],xmm8
+ mov r14,rax
+ vmovdqa XMMWORD[80+rsp],xmm9
+ mov rdi,rbx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ xor rdi,rcx
+ vmovdqa XMMWORD[112+rsp],xmm11
+ mov r13,r8
+ jmp NEAR $L$avx_00_47
+
+ALIGN 16
+$L$avx_00_47: ; one pass = 16 scalar rounds interleaved with the vector update of X[0..15]
+ add rbp,256 ; advance the K512 pointer to the constants added to the updated X below
+ vpalignr xmm8,xmm1,xmm0,8 ; schedule X[0..1] (xmm0): xmm8 = X[1..2]
+ shrd r13,r13,23 ; shrd r,r,n == rotate right by n; r13 holds a copy of e
+ mov rax,r14 ; a = value staged in r14 by the previous round / loop setup
+ vpalignr xmm11,xmm5,xmm4,8 ; xmm11 = X[9..10]
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10 ; r12 = f^g
+ vpaddq xmm0,xmm0,xmm11 ; X[0..1] += X[9..10]
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[rsp] ; round 0 (h=r11): h += X[0]+K from stack slot 0
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10 ; r12 = ((f^g)&e)^g = Ch(e,f,g)
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx ; r15 = a^b (becomes b^c for the next round)
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14 ; r13 = ror(e,14)^ror(e,18)^ror(e,41)
+ and rdi,r15 ; rdi = (b^c)&(a^b)
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx ; rdi = ((a^b)&(b^c))^b = Maj(a,b,c)
+ shrd r14,r14,28 ; r14 = ror(a,28)^ror(a,34)^ror(a,39)
+ vpsrlq xmm11,xmm7,6
+ add rdx,r11 ; d += h+X+K+Ch+Sigma1
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9 ; xmm8 = ror1^ror8^shr7 of X[1..2]
+ mov r13,rdx ; copy of the next round's e
+ add r14,r11 ; r14 = next round's a
+ vpsllq xmm10,xmm7,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm0,xmm0,xmm8 ; X[0..1] += sigma0 term
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm7,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[8+rsp] ; round 1 (h=r10)
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9 ; xmm11 = ror19^ror61^shr6 of X[14..15] (xmm7)
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm0,xmm0,xmm11 ; X[0..1] += sigma1 term
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[rsp],xmm10 ; slots 0..1 <- new X[0..1]+K, read by the next pass
+ vpalignr xmm8,xmm2,xmm1,8 ; schedule X[2..3] (xmm1): same steps as for xmm0 above
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm6,xmm5,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm1,xmm1,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[16+rsp] ; round 2 (h=r9)
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm0,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm0,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm1,xmm1,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm0,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[24+rsp] ; round 3 (h=r8)
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm1,xmm1,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[16+rsp],xmm10 ; slots 2..3 <- new X[2..3]+K
+ vpalignr xmm8,xmm3,xmm2,8 ; schedule X[4..5] (xmm2)
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm7,xmm6,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm2,xmm2,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[32+rsp] ; round 4 (h=rdx)
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm1,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm1,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm2,xmm2,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm1,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[40+rsp] ; round 5 (h=rcx)
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm2,xmm2,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[32+rsp],xmm10 ; slots 4..5 <- new X[4..5]+K
+ vpalignr xmm8,xmm4,xmm3,8 ; schedule X[6..7] (xmm3)
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm0,xmm7,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm3,xmm3,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[48+rsp] ; round 6 (h=rbx)
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm2,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm2,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm3,xmm3,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm2,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[56+rsp] ; round 7 (h=rax)
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm3,xmm3,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[48+rsp],xmm10 ; slots 6..7 <- new X[6..7]+K
+ vpalignr xmm8,xmm5,xmm4,8 ; schedule X[8..9] (xmm4)
+ shrd r13,r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm1,xmm0,8
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10
+ vpaddq xmm4,xmm4,xmm11
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[64+rsp] ; round 8 (h=r11)
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm3,6
+ add rdx,r11
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rdx
+ add r14,r11
+ vpsllq xmm10,xmm3,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm4,xmm4,xmm8
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm3,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[72+rsp] ; round 9 (h=r10)
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm4,xmm4,xmm11
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm4,XMMWORD[rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[64+rsp],xmm10 ; slots 8..9 <- new X[8..9]+K
+ vpalignr xmm8,xmm6,xmm5,8 ; schedule X[10..11] (xmm5)
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm2,xmm1,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm5,xmm5,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[80+rsp] ; round 10 (h=r9)
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm4,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm4,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm5,xmm5,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm4,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[88+rsp] ; round 11 (h=r8)
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm5,xmm5,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm5,XMMWORD[32+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[80+rsp],xmm10 ; slots 10..11 <- new X[10..11]+K
+ vpalignr xmm8,xmm7,xmm6,8 ; schedule X[12..13] (xmm6)
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm3,xmm2,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm6,xmm6,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[96+rsp] ; round 12 (h=rdx)
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm5,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm5,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm6,xmm6,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm5,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[104+rsp] ; round 13 (h=rcx)
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm6,xmm6,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[96+rsp],xmm10 ; slots 12..13 <- new X[12..13]+K
+ vpalignr xmm8,xmm0,xmm7,8 ; schedule X[14..15] (xmm7)
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm4,xmm3,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm7,xmm7,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[112+rsp] ; round 14 (h=rbx)
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm6,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm6,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm7,xmm7,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm6,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[120+rsp] ; round 15 (h=rax)
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm7,xmm7,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm7,XMMWORD[96+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[112+rsp],xmm10 ; slots 14..15 <- new X[14..15]+K
+ cmp BYTE[135+rbp],0 ; probe one table byte; zero ends the loop (presumably an end-of-constants sentinel; table not visible here)
+ jne NEAR $L$avx_00_47
+ shrd r13,r13,23 ; 16 rounds with no schedule update: they consume the X+K sums already on the stack. shrd r,r,n == rotate right by n
+ mov rax,r14 ; a = value staged in r14 by the previous round
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10 ; r12 = f^g
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp] ; round 0 (h=r11): h += stack slot 0
+ mov r15,rax
+ xor r12,r10 ; r12 = ((f^g)&e)^g = Ch(e,f,g)
+ shrd r14,r14,6
+ xor r15,rbx ; r15 = a^b (becomes b^c for the next round)
+ add r11,r12
+ shrd r13,r13,14 ; r13 = ror(e,14)^ror(e,18)^ror(e,41)
+ and rdi,r15 ; rdi = (b^c)&(a^b)
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx ; rdi = ((a^b)&(b^c))^b = Maj(a,b,c)
+ shrd r14,r14,28 ; r14 = ror(a,28)^ror(a,34)^ror(a,39)
+ add rdx,r11 ; d += h+slot+Ch+Sigma1
+ add r11,rdi
+ mov r13,rdx ; copy of the next round's e
+ add r14,r11 ; r14 = next round's a
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[8+rsp] ; round 1 (h=r10)
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp] ; round 2 (h=r9)
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[24+rsp] ; round 3 (h=r8)
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp] ; round 4 (h=rdx)
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[40+rsp] ; round 5 (h=rcx)
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp] ; round 6 (h=rbx)
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[56+rsp] ; round 7 (h=rax)
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp] ; round 8 (h=r11)
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[72+rsp] ; round 9 (h=r10)
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp] ; round 10 (h=r9)
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[88+rsp] ; round 11 (h=r8)
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp] ; round 12 (h=rdx)
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[104+rsp] ; round 13 (h=rcx)
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp] ; round 14 (h=rbx)
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[120+rsp] ; round 15 (h=rax)
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax ; r14 = final a; it is copied into rax by the code that follows
+ mov rdi,QWORD[((128+0))+rsp] ; reload the pointer kept in frame slot 128+0; its first 64 bytes are the eight state qwords
+ mov rax,r14 ; complete the last round: a = r14
+
+ add rax,QWORD[rdi] ; feed-forward: working registers += stored state words 0..7
+ lea rsi,[128+rsi] ; advance the input pointer by one 128-byte block
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp] ; compare with frame slot 128+16 (presumably the end-of-input pointer; the AVX prologue is not visible here)
+
+ mov QWORD[rdi],rax ; write the updated state back; mov leaves the flags from cmp intact
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop_avx ; unsigned: another block while rsi is below the saved limit
+
+ mov rsi,QWORD[152+rsp] ; rsi = value kept in frame slot 152 (presumably the pre-push rsp; the AVX prologue is not visible here)
+
+ vzeroupper ; clear upper YMM halves before the legacy-SSE movaps below
+ movaps xmm6,XMMWORD[((128+32))+rsp] ; reload xmm6..xmm11 from the frame (callee-saved under the Microsoft x64 ABI)
+ movaps xmm7,XMMWORD[((128+48))+rsp]
+ movaps xmm8,XMMWORD[((128+64))+rsp]
+ movaps xmm9,XMMWORD[((128+80))+rsp]
+ movaps xmm10,XMMWORD[((128+96))+rsp]
+ movaps xmm11,XMMWORD[((128+112))+rsp]
+ mov r15,QWORD[((-48))+rsi] ; reload r15..rbx from the six qwords just below the address in rsi
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi] ; rsp = rsi, discarding the local frame
+
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha512_block_data_order_avx:
+
+ALIGN 64
+sha512_block_data_order_avx2: ; Win64 entry: arguments arrive in rcx, rdx, r8 and are moved to rdi, rsi, rdx below
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_avx2:
+ mov rdi,rcx ; rdi = arg1: pointer to the eight 64-bit state words (loaded from [rdi..56+rdi] below)
+ mov rsi,rdx ; rsi = arg2: input pointer
+ mov rdx,r8 ; rdx = arg3: scaled by 128 below, i.e. a count of 128-byte blocks
+
+
+
+$L$avx2_shortcut:
+ mov rax,rsp ; rax = rsp before the pushes; kept in frame slot 152
+
+ push rbx ; save rbx, rbp, r12..r15 (all are overwritten by this function)
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,1408
+ shl rdx,4 ; rdx *= 16 ...
+ and rsp,-256*8 ; round rsp down to a 2048-byte boundary
+ lea rdx,[rdx*8+rsi] ; ... then rdx = rsi + 128*count = end-of-input pointer
+ add rsp,1152
+ mov QWORD[((128+0))+rsp],rdi ; frame slot 128+0: state pointer
+ mov QWORD[((128+8))+rsp],rsi ; frame slot 128+8: input pointer
+ mov QWORD[((128+16))+rsp],rdx ; frame slot 128+16: end-of-input pointer (rdx is reused for a state word below)
+ mov QWORD[152+rsp],rax ; frame slot 152: pre-push rsp
+
+ movaps XMMWORD[(128+32)+rsp],xmm6 ; spill xmm6..xmm11 (callee-saved under the Microsoft x64 ABI)
+ movaps XMMWORD[(128+48)+rsp],xmm7
+ movaps XMMWORD[(128+64)+rsp],xmm8
+ movaps XMMWORD[(128+80)+rsp],xmm9
+ movaps XMMWORD[(128+96)+rsp],xmm10
+ movaps XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_avx2:
+
+ vzeroupper
+ sub rsi,-16*8 ; rsi += 128, so the current block is addressed as [-128+rsi]
+ mov rax,QWORD[rdi] ; rax,rbx,rcx,rdx,r8..r11 = state words 0..7
+ mov r12,rsi ; r12 = start of the following block
+ mov rbx,QWORD[8+rdi]
+ cmp rsi,rdx ; is the following block already the end of input?
+ mov rcx,QWORD[16+rdi]
+ cmove r12,rsp ; if so, point r12 at the stack instead (mov does not disturb the flags from cmp)
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$oop_avx2
+ALIGN 16
+$L$oop_avx2: ; load two 128-byte blocks (low lanes from rsi-128, high lanes from r12), shuffle bytes, pre-add K512
+ vmovdqu xmm0,XMMWORD[((-128))+rsi] ; low 128-bit lanes of ymm0..ymm7 <- block at rsi-128
+ vmovdqu xmm1,XMMWORD[((-128+16))+rsi]
+ vmovdqu xmm2,XMMWORD[((-128+32))+rsi]
+ lea rbp,[((K512+128))] ; constants pointer, biased by +128 so offsets -128..+96 are used below
+ vmovdqu xmm3,XMMWORD[((-128+48))+rsi]
+ vmovdqu xmm4,XMMWORD[((-128+64))+rsi]
+ vmovdqu xmm5,XMMWORD[((-128+80))+rsi]
+ vmovdqu xmm6,XMMWORD[((-128+96))+rsi]
+ vmovdqu xmm7,XMMWORD[((-128+112))+rsi]
+
+ vmovdqa ymm10,YMMWORD[1152+rbp] ; vpshufb control mask at K512+1280 (presumably a byte-swap mask; table not visible here)
+ vinserti128 ymm0,ymm0,XMMWORD[r12],1 ; high 128-bit lanes <- 128 bytes at r12
+ vinserti128 ymm1,ymm1,XMMWORD[16+r12],1
+ vpshufb ymm0,ymm0,ymm10
+ vinserti128 ymm2,ymm2,XMMWORD[32+r12],1
+ vpshufb ymm1,ymm1,ymm10
+ vinserti128 ymm3,ymm3,XMMWORD[48+r12],1
+ vpshufb ymm2,ymm2,ymm10
+ vinserti128 ymm4,ymm4,XMMWORD[64+r12],1
+ vpshufb ymm3,ymm3,ymm10
+ vinserti128 ymm5,ymm5,XMMWORD[80+r12],1
+ vpshufb ymm4,ymm4,ymm10
+ vinserti128 ymm6,ymm6,XMMWORD[96+r12],1
+ vpshufb ymm5,ymm5,ymm10
+ vinserti128 ymm7,ymm7,XMMWORD[112+r12],1
+
+ vpaddq ymm8,ymm0,YMMWORD[((-128))+rbp] ; both lanes: X[0..1] + 32 bytes of K512
+ vpshufb ymm6,ymm6,ymm10
+ vpaddq ymm9,ymm1,YMMWORD[((-96))+rbp]
+ vpshufb ymm7,ymm7,ymm10 ; last use of the mask; ymm10 is reused as a temporary below
+ vpaddq ymm10,ymm2,YMMWORD[((-64))+rbp]
+ vpaddq ymm11,ymm3,YMMWORD[((-32))+rbp]
+ vmovdqa YMMWORD[rsp],ymm8 ; X+K sums for words 0..7 -> 128 bytes at rsp
+ vpaddq ymm8,ymm4,YMMWORD[rbp]
+ vmovdqa YMMWORD[32+rsp],ymm9
+ vpaddq ymm9,ymm5,YMMWORD[32+rbp]
+ vmovdqa YMMWORD[64+rsp],ymm10
+ vpaddq ymm10,ymm6,YMMWORD[64+rbp]
+ vmovdqa YMMWORD[96+rsp],ymm11
+ lea rsp,[((-128))+rsp] ; move rsp down 128 bytes; sums for words 8..15 go below the first group
+ vpaddq ymm11,ymm7,YMMWORD[96+rbp]
+ vmovdqa YMMWORD[rsp],ymm8
+ xor r14,r14 ; r14 = 0; the round code adds r14 into "a" at the start of each round
+ vmovdqa YMMWORD[32+rsp],ymm9
+ mov rdi,rbx
+ vmovdqa YMMWORD[64+rsp],ymm10
+ xor rdi,rcx ; rdi = b^c, seed for the Maj computation of the first round
+ vmovdqa YMMWORD[96+rsp],ymm11
+ mov r12,r9 ; r12 = f (the second-block pointer in r12 is no longer needed)
+ add rbp,16*2*8 ; advance the K512 pointer by 256 bytes
+ jmp NEAR $L$avx2_00_47
+
+ALIGN 16
+$L$avx2_00_47:
+ lea rsp,[((-128))+rsp]
+ vpalignr ymm8,ymm1,ymm0,8
+ add r11,QWORD[((0+256))+rsp]
+ and r12,r8
+ rorx r13,r8,41
+ vpalignr ymm11,ymm5,ymm4,8
+ rorx r15,r8,18
+ lea rax,[r14*1+rax]
+ lea r11,[r12*1+r11]
+ vpsrlq ymm10,ymm8,1
+ andn r12,r8,r10
+ xor r13,r15
+ rorx r14,r8,14
+ vpaddq ymm0,ymm0,ymm11
+ vpsrlq ymm11,ymm8,7
+ lea r11,[r12*1+r11]
+ xor r13,r14
+ mov r15,rax
+ vpsllq ymm9,ymm8,56
+ vpxor ymm8,ymm11,ymm10
+ rorx r12,rax,39
+ lea r11,[r13*1+r11]
+ xor r15,rbx
+ vpsrlq ymm10,ymm10,7
+ vpxor ymm8,ymm8,ymm9
+ rorx r14,rax,34
+ rorx r13,rax,28
+ lea rdx,[r11*1+rdx]
+ vpsllq ymm9,ymm9,7
+ vpxor ymm8,ymm8,ymm10
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rbx
+ vpsrlq ymm11,ymm7,6
+ vpxor ymm8,ymm8,ymm9
+ xor r14,r13
+ lea r11,[rdi*1+r11]
+ mov r12,r8
+ vpsllq ymm10,ymm7,3
+ vpaddq ymm0,ymm0,ymm8
+ add r10,QWORD[((8+256))+rsp]
+ and r12,rdx
+ rorx r13,rdx,41
+ vpsrlq ymm9,ymm7,19
+ vpxor ymm11,ymm11,ymm10
+ rorx rdi,rdx,18
+ lea r11,[r14*1+r11]
+ lea r10,[r12*1+r10]
+ vpsllq ymm10,ymm10,42
+ vpxor ymm11,ymm11,ymm9
+ andn r12,rdx,r9
+ xor r13,rdi
+ rorx r14,rdx,14
+ vpsrlq ymm9,ymm9,42
+ vpxor ymm11,ymm11,ymm10
+ lea r10,[r12*1+r10]
+ xor r13,r14
+ mov rdi,r11
+ vpxor ymm11,ymm11,ymm9
+ rorx r12,r11,39
+ lea r10,[r13*1+r10]
+ xor rdi,rax
+ vpaddq ymm0,ymm0,ymm11
+ rorx r14,r11,34
+ rorx r13,r11,28
+ lea rcx,[r10*1+rcx]
+ vpaddq ymm10,ymm0,YMMWORD[((-128))+rbp]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rax
+ xor r14,r13
+ lea r10,[r15*1+r10]
+ mov r12,rdx
+ vmovdqa YMMWORD[rsp],ymm10
+ vpalignr ymm8,ymm2,ymm1,8
+ add r9,QWORD[((32+256))+rsp]
+ and r12,rcx
+ rorx r13,rcx,41
+ vpalignr ymm11,ymm6,ymm5,8
+ rorx r15,rcx,18
+ lea r10,[r14*1+r10]
+ lea r9,[r12*1+r9]
+ vpsrlq ymm10,ymm8,1
+ andn r12,rcx,r8
+ xor r13,r15
+ rorx r14,rcx,14
+ vpaddq ymm1,ymm1,ymm11
+ vpsrlq ymm11,ymm8,7
+ lea r9,[r12*1+r9]
+ xor r13,r14
+ mov r15,r10
+ vpsllq ymm9,ymm8,56
+ vpxor ymm8,ymm11,ymm10
+ rorx r12,r10,39
+ lea r9,[r13*1+r9]
+ xor r15,r11
+ vpsrlq ymm10,ymm10,7
+ vpxor ymm8,ymm8,ymm9
+ rorx r14,r10,34
+ rorx r13,r10,28
+ lea rbx,[r9*1+rbx]
+ vpsllq ymm9,ymm9,7
+ vpxor ymm8,ymm8,ymm10
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r11
+ vpsrlq ymm11,ymm0,6
+ vpxor ymm8,ymm8,ymm9
+ xor r14,r13
+ lea r9,[rdi*1+r9]
+ mov r12,rcx
+ vpsllq ymm10,ymm0,3
+ vpaddq ymm1,ymm1,ymm8
+ add r8,QWORD[((40+256))+rsp]
+ and r12,rbx
+ rorx r13,rbx,41
+ vpsrlq ymm9,ymm0,19
+ vpxor ymm11,ymm11,ymm10
+ rorx rdi,rbx,18
+ lea r9,[r14*1+r9]
+ lea r8,[r12*1+r8]
+ vpsllq ymm10,ymm10,42
+ vpxor ymm11,ymm11,ymm9
+ andn r12,rbx,rdx
+ xor r13,rdi
+ rorx r14,rbx,14
+ vpsrlq ymm9,ymm9,42
+ vpxor ymm11,ymm11,ymm10
+ lea r8,[r12*1+r8]
+ xor r13,r14
+ mov rdi,r9
+ vpxor ymm11,ymm11,ymm9
+ rorx r12,r9,39
+ lea r8,[r13*1+r8]
+ xor rdi,r10
+ vpaddq ymm1,ymm1,ymm11
+ rorx r14,r9,34
+ rorx r13,r9,28
+ lea rax,[r8*1+rax]
+ vpaddq ymm10,ymm1,YMMWORD[((-96))+rbp]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r10
+ xor r14,r13
+ lea r8,[r15*1+r8]
+ mov r12,rbx
+ vmovdqa YMMWORD[32+rsp],ymm10
+ vpalignr ymm8,ymm3,ymm2,8
+ add rdx,QWORD[((64+256))+rsp]
+ and r12,rax
+ rorx r13,rax,41
+ vpalignr ymm11,ymm7,ymm6,8
+ rorx r15,rax,18
+ lea r8,[r14*1+r8]
+ lea rdx,[r12*1+rdx]
+ vpsrlq ymm10,ymm8,1
+ andn r12,rax,rcx
+ xor r13,r15
+ rorx r14,rax,14
+ vpaddq ymm2,ymm2,ymm11
+ vpsrlq ymm11,ymm8,7
+ lea rdx,[r12*1+rdx]
+ xor r13,r14
+ mov r15,r8
+ vpsllq ymm9,ymm8,56
+ vpxor ymm8,ymm11,ymm10
+ rorx r12,r8,39
+ lea rdx,[r13*1+rdx]
+ xor r15,r9
+ vpsrlq ymm10,ymm10,7
+ vpxor ymm8,ymm8,ymm9
+ rorx r14,r8,34
+ rorx r13,r8,28
+ lea r11,[rdx*1+r11]
+ vpsllq ymm9,ymm9,7
+ vpxor ymm8,ymm8,ymm10
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r9
+ vpsrlq ymm11,ymm1,6
+ vpxor ymm8,ymm8,ymm9
+ xor r14,r13
+ lea rdx,[rdi*1+rdx]
+ mov r12,rax
+ vpsllq ymm10,ymm1,3
+ vpaddq ymm2,ymm2,ymm8
+ add rcx,QWORD[((72+256))+rsp]
+ and r12,r11
+ rorx r13,r11,41
+ vpsrlq ymm9,ymm1,19
+ vpxor ymm11,ymm11,ymm10
+ rorx rdi,r11,18
+ lea rdx,[r14*1+rdx]
+ lea rcx,[r12*1+rcx]
+ vpsllq ymm10,ymm10,42
+ vpxor ymm11,ymm11,ymm9
+ andn r12,r11,rbx
+ xor r13,rdi
+ rorx r14,r11,14
+ vpsrlq ymm9,ymm9,42
+ vpxor ymm11,ymm11,ymm10
+ lea rcx,[r12*1+rcx]
+ xor r13,r14
+ mov rdi,rdx
+ vpxor ymm11,ymm11,ymm9
+ rorx r12,rdx,39
+ lea rcx,[r13*1+rcx]
+ xor rdi,r8
+ vpaddq ymm2,ymm2,ymm11
+ rorx r14,rdx,34
+ rorx r13,rdx,28
+ lea r10,[rcx*1+r10]
+ vpaddq ymm10,ymm2,YMMWORD[((-64))+rbp]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r8
+ xor r14,r13
+ lea rcx,[r15*1+rcx]
+ mov r12,r11
+ vmovdqa YMMWORD[64+rsp],ymm10
+ vpalignr ymm8,ymm4,ymm3,8
+ add rbx,QWORD[((96+256))+rsp]
+ and r12,r10
+ rorx r13,r10,41
+ vpalignr ymm11,ymm0,ymm7,8
+ rorx r15,r10,18
+ lea rcx,[r14*1+rcx]
+ lea rbx,[r12*1+rbx]
+ vpsrlq ymm10,ymm8,1
+ andn r12,r10,rax
+ xor r13,r15
+ rorx r14,r10,14
+ vpaddq ymm3,ymm3,ymm11
+ vpsrlq ymm11,ymm8,7
+ lea rbx,[r12*1+rbx]
+ xor r13,r14
+ mov r15,rcx
+ vpsllq ymm9,ymm8,56
+ vpxor ymm8,ymm11,ymm10
+ rorx r12,rcx,39
+ lea rbx,[r13*1+rbx]
+ xor r15,rdx
+ vpsrlq ymm10,ymm10,7
+ vpxor ymm8,ymm8,ymm9
+ rorx r14,rcx,34
+ rorx r13,rcx,28
+ lea r9,[rbx*1+r9]
+ vpsllq ymm9,ymm9,7
+ vpxor ymm8,ymm8,ymm10
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rdx
+ vpsrlq ymm11,ymm2,6
+ vpxor ymm8,ymm8,ymm9
+ xor r14,r13
+ lea rbx,[rdi*1+rbx]
+ mov r12,r10
+ vpsllq ymm10,ymm2,3
+ vpaddq ymm3,ymm3,ymm8
+ add rax,QWORD[((104+256))+rsp]
+ and r12,r9
+ rorx r13,r9,41
+ vpsrlq ymm9,ymm2,19
+ vpxor ymm11,ymm11,ymm10
+ rorx rdi,r9,18
+ lea rbx,[r14*1+rbx]
+ lea rax,[r12*1+rax]
+ vpsllq ymm10,ymm10,42
+ vpxor ymm11,ymm11,ymm9
+ andn r12,r9,r11
+ xor r13,rdi
+ rorx r14,r9,14
+ vpsrlq ymm9,ymm9,42
+ vpxor ymm11,ymm11,ymm10
+ lea rax,[r12*1+rax]
+ xor r13,r14
+ mov rdi,rbx
+ vpxor ymm11,ymm11,ymm9
+ rorx r12,rbx,39
+ lea rax,[r13*1+rax]
+ xor rdi,rcx
+ vpaddq ymm3,ymm3,ymm11
+ rorx r14,rbx,34
+ rorx r13,rbx,28
+ lea r8,[rax*1+r8]
+ vpaddq ymm10,ymm3,YMMWORD[((-32))+rbp]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rcx
+ xor r14,r13
+ lea rax,[r15*1+rax]
+ mov r12,r9
+ vmovdqa YMMWORD[96+rsp],ymm10
+ lea rsp,[((-128))+rsp]
+ vpalignr ymm8,ymm5,ymm4,8
+ add r11,QWORD[((0+256))+rsp]
+ and r12,r8
+ rorx r13,r8,41
+ vpalignr ymm11,ymm1,ymm0,8
+ rorx r15,r8,18
+ lea rax,[r14*1+rax]
+ lea r11,[r12*1+r11]
+ vpsrlq ymm10,ymm8,1
+ andn r12,r8,r10
+ xor r13,r15
+ rorx r14,r8,14
+ vpaddq ymm4,ymm4,ymm11
+ vpsrlq ymm11,ymm8,7
+ lea r11,[r12*1+r11]
+ xor r13,r14
+ mov r15,rax
+ vpsllq ymm9,ymm8,56
+ vpxor ymm8,ymm11,ymm10
+ rorx r12,rax,39
+ lea r11,[r13*1+r11]
+ xor r15,rbx
+ vpsrlq ymm10,ymm10,7
+ vpxor ymm8,ymm8,ymm9
+ rorx r14,rax,34
+ rorx r13,rax,28
+ lea rdx,[r11*1+rdx]
+ vpsllq ymm9,ymm9,7
+ vpxor ymm8,ymm8,ymm10
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rbx
+ vpsrlq ymm11,ymm3,6
+ vpxor ymm8,ymm8,ymm9
+ xor r14,r13
+ lea r11,[rdi*1+r11]
+ mov r12,r8
+ vpsllq ymm10,ymm3,3
+ vpaddq ymm4,ymm4,ymm8
+ add r10,QWORD[((8+256))+rsp]
+ and r12,rdx
+ rorx r13,rdx,41
+ vpsrlq ymm9,ymm3,19
+ vpxor ymm11,ymm11,ymm10
+ rorx rdi,rdx,18
+ lea r11,[r14*1+r11]
+ lea r10,[r12*1+r10]
+ vpsllq ymm10,ymm10,42
+ vpxor ymm11,ymm11,ymm9
+ andn r12,rdx,r9
+ xor r13,rdi
+ rorx r14,rdx,14
+ vpsrlq ymm9,ymm9,42
+ vpxor ymm11,ymm11,ymm10
+ lea r10,[r12*1+r10]
+ xor r13,r14
+ mov rdi,r11
+ vpxor ymm11,ymm11,ymm9
+ rorx r12,r11,39
+ lea r10,[r13*1+r10]
+ xor rdi,rax
+ vpaddq ymm4,ymm4,ymm11
+ rorx r14,r11,34
+ rorx r13,r11,28
+ lea rcx,[r10*1+rcx]
+ vpaddq ymm10,ymm4,YMMWORD[rbp]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rax
+ xor r14,r13
+ lea r10,[r15*1+r10]
+ mov r12,rdx
+ vmovdqa YMMWORD[rsp],ymm10
+ vpalignr ymm8,ymm6,ymm5,8
+ add r9,QWORD[((32+256))+rsp]
+ and r12,rcx
+ rorx r13,rcx,41
+ vpalignr ymm11,ymm2,ymm1,8
+ rorx r15,rcx,18
+ lea r10,[r14*1+r10]
+ lea r9,[r12*1+r9]
+ vpsrlq ymm10,ymm8,1
+ andn r12,rcx,r8
+ xor r13,r15
+ rorx r14,rcx,14
+ vpaddq ymm5,ymm5,ymm11
+ vpsrlq ymm11,ymm8,7
+ lea r9,[r12*1+r9]
+ xor r13,r14
+ mov r15,r10
+ vpsllq ymm9,ymm8,56
+ vpxor ymm8,ymm11,ymm10
+ rorx r12,r10,39
+ lea r9,[r13*1+r9]
+ xor r15,r11
+ vpsrlq ymm10,ymm10,7
+ vpxor ymm8,ymm8,ymm9
+ rorx r14,r10,34
+ rorx r13,r10,28
+ lea rbx,[r9*1+rbx]
+ vpsllq ymm9,ymm9,7
+ vpxor ymm8,ymm8,ymm10
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r11
+ vpsrlq ymm11,ymm4,6
+ vpxor ymm8,ymm8,ymm9
+ xor r14,r13
+ lea r9,[rdi*1+r9]
+ mov r12,rcx
+ vpsllq ymm10,ymm4,3
+ vpaddq ymm5,ymm5,ymm8
+ add r8,QWORD[((40+256))+rsp]
+ and r12,rbx
+ rorx r13,rbx,41
+ vpsrlq ymm9,ymm4,19
+ vpxor ymm11,ymm11,ymm10
+ rorx rdi,rbx,18
+ lea r9,[r14*1+r9]
+ lea r8,[r12*1+r8]
+ vpsllq ymm10,ymm10,42
+ vpxor ymm11,ymm11,ymm9
+ andn r12,rbx,rdx
+ xor r13,rdi
+ rorx r14,rbx,14
+ vpsrlq ymm9,ymm9,42
+ vpxor ymm11,ymm11,ymm10
+ lea r8,[r12*1+r8]
+ xor r13,r14
+ mov rdi,r9
+ vpxor ymm11,ymm11,ymm9
+ rorx r12,r9,39
+ lea r8,[r13*1+r8]
+ xor rdi,r10
+ vpaddq ymm5,ymm5,ymm11
+ rorx r14,r9,34
+ rorx r13,r9,28
+ lea rax,[r8*1+rax]
+ vpaddq ymm10,ymm5,YMMWORD[32+rbp]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r10
+ xor r14,r13
+ lea r8,[r15*1+r8]
+ mov r12,rbx
+ vmovdqa YMMWORD[32+rsp],ymm10
+ vpalignr ymm8,ymm7,ymm6,8
+ add rdx,QWORD[((64+256))+rsp]
+ and r12,rax
+ rorx r13,rax,41
+ vpalignr ymm11,ymm3,ymm2,8
+ rorx r15,rax,18
+ lea r8,[r14*1+r8]
+ lea rdx,[r12*1+rdx]
+ vpsrlq ymm10,ymm8,1
+ andn r12,rax,rcx
+ xor r13,r15
+ rorx r14,rax,14
+ vpaddq ymm6,ymm6,ymm11
+ vpsrlq ymm11,ymm8,7
+ lea rdx,[r12*1+rdx]
+ xor r13,r14
+ mov r15,r8
+ vpsllq ymm9,ymm8,56
+ vpxor ymm8,ymm11,ymm10
+ rorx r12,r8,39
+ lea rdx,[r13*1+rdx]
+ xor r15,r9
+ vpsrlq ymm10,ymm10,7
+ vpxor ymm8,ymm8,ymm9
+ rorx r14,r8,34
+ rorx r13,r8,28
+ lea r11,[rdx*1+r11]
+ vpsllq ymm9,ymm9,7
+ vpxor ymm8,ymm8,ymm10
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r9
+ vpsrlq ymm11,ymm5,6
+ vpxor ymm8,ymm8,ymm9
+ xor r14,r13
+ lea rdx,[rdi*1+rdx]
+ mov r12,rax
+ vpsllq ymm10,ymm5,3
+ vpaddq ymm6,ymm6,ymm8
+ add rcx,QWORD[((72+256))+rsp]
+ and r12,r11
+ rorx r13,r11,41
+ vpsrlq ymm9,ymm5,19
+ vpxor ymm11,ymm11,ymm10
+ rorx rdi,r11,18
+ lea rdx,[r14*1+rdx]
+ lea rcx,[r12*1+rcx]
+ vpsllq ymm10,ymm10,42
+ vpxor ymm11,ymm11,ymm9
+ andn r12,r11,rbx
+ xor r13,rdi
+ rorx r14,r11,14
+ vpsrlq ymm9,ymm9,42
+ vpxor ymm11,ymm11,ymm10
+ lea rcx,[r12*1+rcx]
+ xor r13,r14
+ mov rdi,rdx
+ vpxor ymm11,ymm11,ymm9
+ rorx r12,rdx,39
+ lea rcx,[r13*1+rcx]
+ xor rdi,r8
+ vpaddq ymm6,ymm6,ymm11
+ rorx r14,rdx,34
+ rorx r13,rdx,28
+ lea r10,[rcx*1+r10]
+ vpaddq ymm10,ymm6,YMMWORD[64+rbp]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r8
+ xor r14,r13
+ lea rcx,[r15*1+rcx]
+ mov r12,r11
+ vmovdqa YMMWORD[64+rsp],ymm10
+ vpalignr ymm8,ymm0,ymm7,8
+ add rbx,QWORD[((96+256))+rsp]
+ and r12,r10
+ rorx r13,r10,41
+ vpalignr ymm11,ymm4,ymm3,8
+ rorx r15,r10,18
+ lea rcx,[r14*1+rcx]
+ lea rbx,[r12*1+rbx]
+ vpsrlq ymm10,ymm8,1
+ andn r12,r10,rax
+ xor r13,r15
+ rorx r14,r10,14
+ vpaddq ymm7,ymm7,ymm11
+ vpsrlq ymm11,ymm8,7
+ lea rbx,[r12*1+rbx]
+ xor r13,r14
+ mov r15,rcx
+ vpsllq ymm9,ymm8,56
+ vpxor ymm8,ymm11,ymm10
+ rorx r12,rcx,39
+ lea rbx,[r13*1+rbx]
+ xor r15,rdx
+ vpsrlq ymm10,ymm10,7
+ vpxor ymm8,ymm8,ymm9
+ rorx r14,rcx,34
+ rorx r13,rcx,28
+ lea r9,[rbx*1+r9]
+ vpsllq ymm9,ymm9,7
+ vpxor ymm8,ymm8,ymm10
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rdx
+ vpsrlq ymm11,ymm6,6
+ vpxor ymm8,ymm8,ymm9
+ xor r14,r13
+ lea rbx,[rdi*1+rbx]
+ mov r12,r10
+ vpsllq ymm10,ymm6,3
+ vpaddq ymm7,ymm7,ymm8
+ add rax,QWORD[((104+256))+rsp]
+ and r12,r9
+ rorx r13,r9,41
+ vpsrlq ymm9,ymm6,19
+ vpxor ymm11,ymm11,ymm10
+ rorx rdi,r9,18
+ lea rbx,[r14*1+rbx]
+ lea rax,[r12*1+rax]
+ vpsllq ymm10,ymm10,42
+ vpxor ymm11,ymm11,ymm9
+ andn r12,r9,r11
+ xor r13,rdi
+ rorx r14,r9,14
+ vpsrlq ymm9,ymm9,42
+ vpxor ymm11,ymm11,ymm10
+ lea rax,[r12*1+rax]
+ xor r13,r14
+ mov rdi,rbx
+ vpxor ymm11,ymm11,ymm9
+ rorx r12,rbx,39
+ lea rax,[r13*1+rax]
+ xor rdi,rcx
+ vpaddq ymm7,ymm7,ymm11
+ rorx r14,rbx,34
+ rorx r13,rbx,28
+ lea r8,[rax*1+r8]
+ vpaddq ymm10,ymm7,YMMWORD[96+rbp]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rcx
+ xor r14,r13
+ lea rax,[r15*1+rax]
+ mov r12,r9
+ vmovdqa YMMWORD[96+rsp],ymm10
+ lea rbp,[256+rbp]
+ cmp BYTE[((-121))+rbp],0
+ jne NEAR $L$avx2_00_47
+ add r11,QWORD[((0+128))+rsp]
+ and r12,r8
+ rorx r13,r8,41
+ rorx r15,r8,18
+ lea rax,[r14*1+rax]
+ lea r11,[r12*1+r11]
+ andn r12,r8,r10
+ xor r13,r15
+ rorx r14,r8,14
+ lea r11,[r12*1+r11]
+ xor r13,r14
+ mov r15,rax
+ rorx r12,rax,39
+ lea r11,[r13*1+r11]
+ xor r15,rbx
+ rorx r14,rax,34
+ rorx r13,rax,28
+ lea rdx,[r11*1+rdx]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rbx
+ xor r14,r13
+ lea r11,[rdi*1+r11]
+ mov r12,r8
+ add r10,QWORD[((8+128))+rsp]
+ and r12,rdx
+ rorx r13,rdx,41
+ rorx rdi,rdx,18
+ lea r11,[r14*1+r11]
+ lea r10,[r12*1+r10]
+ andn r12,rdx,r9
+ xor r13,rdi
+ rorx r14,rdx,14
+ lea r10,[r12*1+r10]
+ xor r13,r14
+ mov rdi,r11
+ rorx r12,r11,39
+ lea r10,[r13*1+r10]
+ xor rdi,rax
+ rorx r14,r11,34
+ rorx r13,r11,28
+ lea rcx,[r10*1+rcx]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rax
+ xor r14,r13
+ lea r10,[r15*1+r10]
+ mov r12,rdx
+ add r9,QWORD[((32+128))+rsp]
+ and r12,rcx
+ rorx r13,rcx,41
+ rorx r15,rcx,18
+ lea r10,[r14*1+r10]
+ lea r9,[r12*1+r9]
+ andn r12,rcx,r8
+ xor r13,r15
+ rorx r14,rcx,14
+ lea r9,[r12*1+r9]
+ xor r13,r14
+ mov r15,r10
+ rorx r12,r10,39
+ lea r9,[r13*1+r9]
+ xor r15,r11
+ rorx r14,r10,34
+ rorx r13,r10,28
+ lea rbx,[r9*1+rbx]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r11
+ xor r14,r13
+ lea r9,[rdi*1+r9]
+ mov r12,rcx
+ add r8,QWORD[((40+128))+rsp]
+ and r12,rbx
+ rorx r13,rbx,41
+ rorx rdi,rbx,18
+ lea r9,[r14*1+r9]
+ lea r8,[r12*1+r8]
+ andn r12,rbx,rdx
+ xor r13,rdi
+ rorx r14,rbx,14
+ lea r8,[r12*1+r8]
+ xor r13,r14
+ mov rdi,r9
+ rorx r12,r9,39
+ lea r8,[r13*1+r8]
+ xor rdi,r10
+ rorx r14,r9,34
+ rorx r13,r9,28
+ lea rax,[r8*1+rax]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r10
+ xor r14,r13
+ lea r8,[r15*1+r8]
+ mov r12,rbx
+ add rdx,QWORD[((64+128))+rsp]
+ and r12,rax
+ rorx r13,rax,41
+ rorx r15,rax,18
+ lea r8,[r14*1+r8]
+ lea rdx,[r12*1+rdx]
+ andn r12,rax,rcx
+ xor r13,r15
+ rorx r14,rax,14
+ lea rdx,[r12*1+rdx]
+ xor r13,r14
+ mov r15,r8
+ rorx r12,r8,39
+ lea rdx,[r13*1+rdx]
+ xor r15,r9
+ rorx r14,r8,34
+ rorx r13,r8,28
+ lea r11,[rdx*1+r11]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r9
+ xor r14,r13
+ lea rdx,[rdi*1+rdx]
+ mov r12,rax
+ add rcx,QWORD[((72+128))+rsp]
+ and r12,r11
+ rorx r13,r11,41
+ rorx rdi,r11,18
+ lea rdx,[r14*1+rdx]
+ lea rcx,[r12*1+rcx]
+ andn r12,r11,rbx
+ xor r13,rdi
+ rorx r14,r11,14
+ lea rcx,[r12*1+rcx]
+ xor r13,r14
+ mov rdi,rdx
+ rorx r12,rdx,39
+ lea rcx,[r13*1+rcx]
+ xor rdi,r8
+ rorx r14,rdx,34
+ rorx r13,rdx,28
+ lea r10,[rcx*1+r10]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r8
+ xor r14,r13
+ lea rcx,[r15*1+rcx]
+ mov r12,r11
+ add rbx,QWORD[((96+128))+rsp]
+ and r12,r10
+ rorx r13,r10,41
+ rorx r15,r10,18
+ lea rcx,[r14*1+rcx]
+ lea rbx,[r12*1+rbx]
+ andn r12,r10,rax
+ xor r13,r15
+ rorx r14,r10,14
+ lea rbx,[r12*1+rbx]
+ xor r13,r14
+ mov r15,rcx
+ rorx r12,rcx,39
+ lea rbx,[r13*1+rbx]
+ xor r15,rdx
+ rorx r14,rcx,34
+ rorx r13,rcx,28
+ lea r9,[rbx*1+r9]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rdx
+ xor r14,r13
+ lea rbx,[rdi*1+rbx]
+ mov r12,r10
+ add rax,QWORD[((104+128))+rsp]
+ and r12,r9
+ rorx r13,r9,41
+ rorx rdi,r9,18
+ lea rbx,[r14*1+rbx]
+ lea rax,[r12*1+rax]
+ andn r12,r9,r11
+ xor r13,rdi
+ rorx r14,r9,14
+ lea rax,[r12*1+rax]
+ xor r13,r14
+ mov rdi,rbx
+ rorx r12,rbx,39
+ lea rax,[r13*1+rax]
+ xor rdi,rcx
+ rorx r14,rbx,34
+ rorx r13,rbx,28
+ lea r8,[rax*1+r8]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rcx
+ xor r14,r13
+ lea rax,[r15*1+rax]
+ mov r12,r9
+ add r11,QWORD[rsp]
+ and r12,r8
+ rorx r13,r8,41
+ rorx r15,r8,18
+ lea rax,[r14*1+rax]
+ lea r11,[r12*1+r11]
+ andn r12,r8,r10
+ xor r13,r15
+ rorx r14,r8,14
+ lea r11,[r12*1+r11]
+ xor r13,r14
+ mov r15,rax
+ rorx r12,rax,39
+ lea r11,[r13*1+r11]
+ xor r15,rbx
+ rorx r14,rax,34
+ rorx r13,rax,28
+ lea rdx,[r11*1+rdx]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rbx
+ xor r14,r13
+ lea r11,[rdi*1+r11]
+ mov r12,r8
+ add r10,QWORD[8+rsp]
+ and r12,rdx
+ rorx r13,rdx,41
+ rorx rdi,rdx,18
+ lea r11,[r14*1+r11]
+ lea r10,[r12*1+r10]
+ andn r12,rdx,r9
+ xor r13,rdi
+ rorx r14,rdx,14
+ lea r10,[r12*1+r10]
+ xor r13,r14
+ mov rdi,r11
+ rorx r12,r11,39
+ lea r10,[r13*1+r10]
+ xor rdi,rax
+ rorx r14,r11,34
+ rorx r13,r11,28
+ lea rcx,[r10*1+rcx]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rax
+ xor r14,r13
+ lea r10,[r15*1+r10]
+ mov r12,rdx
+ add r9,QWORD[32+rsp]
+ and r12,rcx
+ rorx r13,rcx,41
+ rorx r15,rcx,18
+ lea r10,[r14*1+r10]
+ lea r9,[r12*1+r9]
+ andn r12,rcx,r8
+ xor r13,r15
+ rorx r14,rcx,14
+ lea r9,[r12*1+r9]
+ xor r13,r14
+ mov r15,r10
+ rorx r12,r10,39
+ lea r9,[r13*1+r9]
+ xor r15,r11
+ rorx r14,r10,34
+ rorx r13,r10,28
+ lea rbx,[r9*1+rbx]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r11
+ xor r14,r13
+ lea r9,[rdi*1+r9]
+ mov r12,rcx
+ add r8,QWORD[40+rsp]
+ and r12,rbx
+ rorx r13,rbx,41
+ rorx rdi,rbx,18
+ lea r9,[r14*1+r9]
+ lea r8,[r12*1+r8]
+ andn r12,rbx,rdx
+ xor r13,rdi
+ rorx r14,rbx,14
+ lea r8,[r12*1+r8]
+ xor r13,r14
+ mov rdi,r9
+ rorx r12,r9,39
+ lea r8,[r13*1+r8]
+ xor rdi,r10
+ rorx r14,r9,34
+ rorx r13,r9,28
+ lea rax,[r8*1+rax]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r10
+ xor r14,r13
+ lea r8,[r15*1+r8]
+ mov r12,rbx
+ add rdx,QWORD[64+rsp]
+ and r12,rax
+ rorx r13,rax,41
+ rorx r15,rax,18
+ lea r8,[r14*1+r8]
+ lea rdx,[r12*1+rdx]
+ andn r12,rax,rcx
+ xor r13,r15
+ rorx r14,rax,14
+ lea rdx,[r12*1+rdx]
+ xor r13,r14
+ mov r15,r8
+ rorx r12,r8,39
+ lea rdx,[r13*1+rdx]
+ xor r15,r9
+ rorx r14,r8,34
+ rorx r13,r8,28
+ lea r11,[rdx*1+r11]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r9
+ xor r14,r13
+ lea rdx,[rdi*1+rdx]
+ mov r12,rax
+ add rcx,QWORD[72+rsp]
+ and r12,r11
+ rorx r13,r11,41
+ rorx rdi,r11,18
+ lea rdx,[r14*1+rdx]
+ lea rcx,[r12*1+rcx]
+ andn r12,r11,rbx
+ xor r13,rdi
+ rorx r14,r11,14
+ lea rcx,[r12*1+rcx]
+ xor r13,r14
+ mov rdi,rdx
+ rorx r12,rdx,39
+ lea rcx,[r13*1+rcx]
+ xor rdi,r8
+ rorx r14,rdx,34
+ rorx r13,rdx,28
+ lea r10,[rcx*1+r10]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r8
+ xor r14,r13
+ lea rcx,[r15*1+rcx]
+ mov r12,r11
+ add rbx,QWORD[96+rsp]
+ and r12,r10
+ rorx r13,r10,41
+ rorx r15,r10,18
+ lea rcx,[r14*1+rcx]
+ lea rbx,[r12*1+rbx]
+ andn r12,r10,rax
+ xor r13,r15
+ rorx r14,r10,14
+ lea rbx,[r12*1+rbx]
+ xor r13,r14
+ mov r15,rcx
+ rorx r12,rcx,39
+ lea rbx,[r13*1+rbx]
+ xor r15,rdx
+ rorx r14,rcx,34
+ rorx r13,rcx,28
+ lea r9,[rbx*1+r9]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rdx
+ xor r14,r13
+ lea rbx,[rdi*1+rbx]
+ mov r12,r10
+ add rax,QWORD[104+rsp]
+ and r12,r9
+ rorx r13,r9,41
+ rorx rdi,r9,18
+ lea rbx,[r14*1+rbx]
+ lea rax,[r12*1+rax]
+ andn r12,r9,r11
+ xor r13,rdi
+ rorx r14,r9,14
+ lea rax,[r12*1+rax]
+ xor r13,r14
+ mov rdi,rbx
+ rorx r12,rbx,39
+ lea rax,[r13*1+rax]
+ xor rdi,rcx
+ rorx r14,rbx,34
+ rorx r13,rbx,28
+ lea r8,[rax*1+r8]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rcx
+ xor r14,r13
+ lea rax,[r15*1+rax]
+ mov r12,r9
+ mov rdi,QWORD[1280+rsp]
+ add rax,r14
+
+ lea rbp,[1152+rsp]
+
+ add rax,QWORD[rdi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+
+ cmp rsi,QWORD[144+rbp]
+ je NEAR $L$done_avx2
+
+ xor r14,r14
+ mov rdi,rbx
+ xor rdi,rcx
+ mov r12,r9
+ jmp NEAR $L$ower_avx2
+ALIGN 16
+$L$ower_avx2:
+ add r11,QWORD[((0+16))+rbp]
+ and r12,r8
+ rorx r13,r8,41
+ rorx r15,r8,18
+ lea rax,[r14*1+rax]
+ lea r11,[r12*1+r11]
+ andn r12,r8,r10
+ xor r13,r15
+ rorx r14,r8,14
+ lea r11,[r12*1+r11]
+ xor r13,r14
+ mov r15,rax
+ rorx r12,rax,39
+ lea r11,[r13*1+r11]
+ xor r15,rbx
+ rorx r14,rax,34
+ rorx r13,rax,28
+ lea rdx,[r11*1+rdx]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rbx
+ xor r14,r13
+ lea r11,[rdi*1+r11]
+ mov r12,r8
+ add r10,QWORD[((8+16))+rbp]
+ and r12,rdx
+ rorx r13,rdx,41
+ rorx rdi,rdx,18
+ lea r11,[r14*1+r11]
+ lea r10,[r12*1+r10]
+ andn r12,rdx,r9
+ xor r13,rdi
+ rorx r14,rdx,14
+ lea r10,[r12*1+r10]
+ xor r13,r14
+ mov rdi,r11
+ rorx r12,r11,39
+ lea r10,[r13*1+r10]
+ xor rdi,rax
+ rorx r14,r11,34
+ rorx r13,r11,28
+ lea rcx,[r10*1+rcx]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rax
+ xor r14,r13
+ lea r10,[r15*1+r10]
+ mov r12,rdx
+ add r9,QWORD[((32+16))+rbp]
+ and r12,rcx
+ rorx r13,rcx,41
+ rorx r15,rcx,18
+ lea r10,[r14*1+r10]
+ lea r9,[r12*1+r9]
+ andn r12,rcx,r8
+ xor r13,r15
+ rorx r14,rcx,14
+ lea r9,[r12*1+r9]
+ xor r13,r14
+ mov r15,r10
+ rorx r12,r10,39
+ lea r9,[r13*1+r9]
+ xor r15,r11
+ rorx r14,r10,34
+ rorx r13,r10,28
+ lea rbx,[r9*1+rbx]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r11
+ xor r14,r13
+ lea r9,[rdi*1+r9]
+ mov r12,rcx
+ add r8,QWORD[((40+16))+rbp]
+ and r12,rbx
+ rorx r13,rbx,41
+ rorx rdi,rbx,18
+ lea r9,[r14*1+r9]
+ lea r8,[r12*1+r8]
+ andn r12,rbx,rdx
+ xor r13,rdi
+ rorx r14,rbx,14
+ lea r8,[r12*1+r8]
+ xor r13,r14
+ mov rdi,r9
+ rorx r12,r9,39
+ lea r8,[r13*1+r8]
+ xor rdi,r10
+ rorx r14,r9,34
+ rorx r13,r9,28
+ lea rax,[r8*1+rax]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r10
+ xor r14,r13
+ lea r8,[r15*1+r8]
+ mov r12,rbx
+ add rdx,QWORD[((64+16))+rbp]
+ and r12,rax
+ rorx r13,rax,41
+ rorx r15,rax,18
+ lea r8,[r14*1+r8]
+ lea rdx,[r12*1+rdx]
+ andn r12,rax,rcx
+ xor r13,r15
+ rorx r14,rax,14
+ lea rdx,[r12*1+rdx]
+ xor r13,r14
+ mov r15,r8
+ rorx r12,r8,39
+ lea rdx,[r13*1+rdx]
+ xor r15,r9
+ rorx r14,r8,34
+ rorx r13,r8,28
+ lea r11,[rdx*1+r11]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,r9
+ xor r14,r13
+ lea rdx,[rdi*1+rdx]
+ mov r12,rax
+ add rcx,QWORD[((72+16))+rbp]
+ and r12,r11
+ rorx r13,r11,41
+ rorx rdi,r11,18
+ lea rdx,[r14*1+rdx]
+ lea rcx,[r12*1+rcx]
+ andn r12,r11,rbx
+ xor r13,rdi
+ rorx r14,r11,14
+ lea rcx,[r12*1+rcx]
+ xor r13,r14
+ mov rdi,rdx
+ rorx r12,rdx,39
+ lea rcx,[r13*1+rcx]
+ xor rdi,r8
+ rorx r14,rdx,34
+ rorx r13,rdx,28
+ lea r10,[rcx*1+r10]
+ and r15,rdi
+ xor r14,r12
+ xor r15,r8
+ xor r14,r13
+ lea rcx,[r15*1+rcx]
+ mov r12,r11
+ add rbx,QWORD[((96+16))+rbp]
+ and r12,r10
+ rorx r13,r10,41
+ rorx r15,r10,18
+ lea rcx,[r14*1+rcx]
+ lea rbx,[r12*1+rbx]
+ andn r12,r10,rax
+ xor r13,r15
+ rorx r14,r10,14
+ lea rbx,[r12*1+rbx]
+ xor r13,r14
+ mov r15,rcx
+ rorx r12,rcx,39
+ lea rbx,[r13*1+rbx]
+ xor r15,rdx
+ rorx r14,rcx,34
+ rorx r13,rcx,28
+ lea r9,[rbx*1+r9]
+ and rdi,r15
+ xor r14,r12
+ xor rdi,rdx
+ xor r14,r13
+ lea rbx,[rdi*1+rbx]
+ mov r12,r10
+ add rax,QWORD[((104+16))+rbp]
+ and r12,r9
+ rorx r13,r9,41
+ rorx rdi,r9,18
+ lea rbx,[r14*1+rbx]
+ lea rax,[r12*1+rax]
+ andn r12,r9,r11
+ xor r13,rdi
+ rorx r14,r9,14
+ lea rax,[r12*1+rax]
+ xor r13,r14
+ mov rdi,rbx
+ rorx r12,rbx,39
+ lea rax,[r13*1+rax]
+ xor rdi,rcx
+ rorx r14,rbx,34
+ rorx r13,rbx,28
+ lea r8,[rax*1+r8]
+ and r15,rdi
+ xor r14,r12
+ xor r15,rcx
+ xor r14,r13
+ lea rax,[r15*1+rax]
+ mov r12,r9
+ lea rbp,[((-128))+rbp]
+ cmp rbp,rsp
+ jae NEAR $L$ower_avx2
+
+ mov rdi,QWORD[1280+rsp]
+ add rax,r14
+
+ lea rsp,[1152+rsp]
+
+
+
+ add rax,QWORD[rdi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ lea rsi,[256+rsi]
+ add r10,QWORD[48+rdi]
+ mov r12,rsi
+ add r11,QWORD[56+rdi]
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ cmove r12,rsp
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+
+ jbe NEAR $L$oop_avx2
+ lea rbp,[rsp]
+
+
+
+
+$L$done_avx2:
+ mov rsi,QWORD[152+rbp]
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((128+32))+rbp]
+ movaps xmm7,XMMWORD[((128+48))+rbp]
+ movaps xmm8,XMMWORD[((128+64))+rbp]
+ movaps xmm9,XMMWORD[((128+80))+rbp]
+ movaps xmm10,XMMWORD[((128+96))+rbp]
+ movaps xmm11,XMMWORD[((128+112))+rbp]
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue_avx2:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha512_block_data_order_avx2:
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+; se_handler -- Win64 language-specific exception handler referenced by every
+; $L$SEH_info_sha512_block_data_order* record in .xdata.
+;
+; In (Microsoft x64 handler convention):
+;   r8 = CONTEXT of the interrupted frame
+;   r9 = DISPATCHER_CONTEXT; HandlerData[0]/[1] are the image-relative
+;        prologue/epilogue labels of the function that was interrupted
+; Out: eax = 1 (ExceptionContinueSearch)
+;
+; The handler patches the CONTEXT so it describes the caller's frame (Rsp, Rsi,
+; Rdi, Rbx, Rbp, R12-R15 and, past $L$epilogue, Xmm6-Xmm11), copies it to
+; disp->ContextRecord and hands over to RtlVirtualUnwind.
+; CONTEXT / DISPATCHER_CONTEXT field names below follow the Windows x64 headers.
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64 ; 32 bytes of shadow space + stack args 5..8 of RtlVirtualUnwind
+
+ mov rax,QWORD[120+r8] ; context->Rax (presumably the caller's rsp while still in the prologue -- prologue not visible here)
+ mov rbx,QWORD[248+r8] ; context->Rip
+
+ mov rsi,QWORD[8+r9] ; disp->ImageBase
+ mov r11,QWORD[56+r9] ; disp->HandlerData
+
+ mov r10d,DWORD[r11] ; HandlerData[0]
+ lea r10,[r10*1+rsi] ; absolute address of the prologue label
+ cmp rbx,r10
+ jb NEAR $L$in_prologue ; Rip is before the prologue label
+
+ mov rax,QWORD[152+r8] ; context->Rsp
+
+ mov r10d,DWORD[4+r11] ; HandlerData[1]
+ lea r10,[r10*1+rsi] ; absolute address of the epilogue label
+ cmp rbx,r10
+ jae NEAR $L$in_prologue ; Rip is at/after the epilogue label
+ lea r10,[$L$avx2_shortcut]
+ cmp rbx,r10
+ jb NEAR $L$not_in_avx2
+
+ and rax,-256*8 ; AVX2 code: round Rsp down to a 2048-byte boundary ...
+ add rax,1152 ; ... and add the same 1152 the body uses (lea rsp,[1152+rsp]) to reach its frame
+$L$not_in_avx2:
+ mov rsi,rax ; keep the frame address for the Xmm copy below
+ mov rax,QWORD[((128+24))+rax] ; caller's rsp as stored in the frame
+
+ mov rbx,QWORD[((-8))+rax] ; callee-saved registers sit just below the caller's rsp
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx ; context->Rbx
+ mov QWORD[160+r8],rbp ; context->Rbp
+ mov QWORD[216+r8],r12 ; context->R12
+ mov QWORD[224+r8],r13 ; context->R13
+ mov QWORD[232+r8],r14 ; context->R14
+ mov QWORD[240+r8],r15 ; context->R15
+
+ lea r10,[$L$epilogue]
+ cmp QWORD[248+r8],r10 ; test context->Rip itself: rbx was just reloaded with the saved Rbx and no longer holds Rip
+ jb NEAR $L$in_prologue ; Rip inside the first (non-SIMD) function: skip the Xmm copy
+
+ lea rsi,[((128+32))+rsi] ; Xmm6..Xmm11 save area inside the frame
+ lea rdi,[512+r8] ; &context->Xmm6
+ mov ecx,12 ; 12 qwords = six 16-byte registers
+ DD 0xa548f3fc ; cld; rep movsq
+
+$L$in_prologue:
+ mov rdi,QWORD[8+rax] ; caller's rdi/rsi, stored at 8/16 above the caller's rsp
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax ; context->Rsp
+ mov QWORD[168+r8],rsi ; context->Rsi
+ mov QWORD[176+r8],rdi ; context->Rdi
+
+ mov rdi,QWORD[40+r9] ; disp->ContextRecord (destination)
+ mov rsi,r8 ; patched context (source)
+ mov ecx,154 ; sizeof(CONTEXT) in qwords
+ DD 0xa548f3fc ; cld; rep movsq
+
+ mov rsi,r9
+ xor rcx,rcx ; arg1: HandlerType = UNW_FLAG_NHANDLER
+ mov rdx,QWORD[8+rsi] ; arg2: disp->ImageBase
+ mov r8,QWORD[rsi] ; arg3: disp->ControlPc
+ mov r9,QWORD[16+rsi] ; arg4: disp->FunctionEntry
+ mov r10,QWORD[40+rsi] ; disp->ContextRecord
+ lea r11,[56+rsi] ; &disp->HandlerData
+ lea r12,[24+rsi] ; &disp->EstablisherFrame
+ mov QWORD[32+rsp],r10 ; arg5
+ mov QWORD[40+rsp],r11 ; arg6
+ mov QWORD[48+rsp],r12 ; arg7
+ mov QWORD[56+rsp],rcx ; arg8: NULL
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1 ; ExceptionContinueSearch
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ DB 0F3h,0C3h ;repret
+
+section .pdata rdata align=4
+ALIGN 4
+; Function table: one entry of three image-relative dwords per routine --
+; begin label, end label, and the matching $L$SEH_info_* record in .xdata.
+; Entry 1: sha512_block_data_order
+ DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order wrt ..imagebase
+; Entry 2: sha512_block_data_order_xop
+ DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase
+; Entry 3: sha512_block_data_order_avx
+ DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase
+; Entry 4: sha512_block_data_order_avx2
+ DD $L$SEH_begin_sha512_block_data_order_avx2 wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_avx2 wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_avx2 wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+; Unwind records, one per routine. Each record is:
+;   DB 9,0,0,0      header bytes; per the Windows UNWIND_INFO layout the first
+;                   byte is version 1 with the exception-handler flag set, and
+;                   the remaining bytes declare no prologue and no unwind codes
+;   DD se_handler   image-relative address of the handler
+;   DD label,label  handler data: se_handler reads these two image-relative
+;                   labels as HandlerData[0] and HandlerData[1] and compares
+;                   the interrupted Rip against them
+$L$SEH_info_sha512_block_data_order:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_xop:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_avx:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_avx2:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase