From 384e6de4c7e35e37fb3d6fbeb32ddcb5eb0d3d3f Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 3 Feb 2017 12:07:16 +0100 Subject: x86_64 assembly pack: Win64 SEH face-lift. - harmonize handlers with guidelines and themselves; - fix some bugs in handlers; - add missing handlers in chacha and ecp_nistz256 modules; Reviewed-by: Rich Salz --- crypto/aes/asm/aes-x86_64.pl | 42 +++--- crypto/aes/asm/aesni-sha256-x86_64.pl | 55 ++++--- crypto/aes/asm/aesni-x86_64.pl | 274 +++++++++++++++++----------------- crypto/aes/asm/bsaes-x86_64.pl | 161 +++++++++++--------- 4 files changed, 274 insertions(+), 258 deletions(-) (limited to 'crypto/aes') diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl index ae7fde2..5eecfdf 100755 --- a/crypto/aes/asm/aes-x86_64.pl +++ b/crypto/aes/asm/aes-x86_64.pl @@ -599,6 +599,7 @@ $code.=<<___; .hidden asm_AES_encrypt asm_AES_encrypt: AES_encrypt: + mov %rsp,%rax push %rbx push %rbp push %r12 @@ -607,7 +608,6 @@ AES_encrypt: push %r15 # allocate frame "above" key schedule - mov %rsp,%r10 lea -63(%rdx),%rcx # %rdx is key argument and \$-64,%rsp sub %rsp,%rcx @@ -617,7 +617,7 @@ AES_encrypt: sub \$32,%rsp mov %rsi,16(%rsp) # save out - mov %r10,24(%rsp) # save real stack pointer + mov %rax,24(%rsp) # save original stack pointer .Lenc_prologue: mov %rdx,$key @@ -649,13 +649,13 @@ AES_encrypt: mov $s2,8($out) mov $s3,12($out) - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lenc_epilogue: ret .size AES_encrypt,.-AES_encrypt @@ -1197,6 +1197,7 @@ $code.=<<___; .hidden asm_AES_decrypt asm_AES_decrypt: AES_decrypt: + mov %rsp,%rax push %rbx push %rbp push %r12 @@ -1205,7 +1206,6 @@ AES_decrypt: push %r15 # allocate frame "above" key schedule - mov %rsp,%r10 lea -63(%rdx),%rcx # %rdx is key argument and \$-64,%rsp sub %rsp,%rcx @@ -1215,7 +1215,7 @@ AES_decrypt: sub \$32,%rsp mov %rsi,16(%rsp) # save out - mov %r10,24(%rsp) # save real stack pointer + mov %rax,24(%rsp) # save original stack pointer .Ldec_prologue: mov %rdx,$key @@ -1249,13 +1249,13 @@ AES_decrypt: mov $s2,8($out) mov $s3,12($out) - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Ldec_epilogue: ret .size AES_decrypt,.-AES_decrypt @@ -1675,10 +1675,9 @@ AES_cbc_encrypt: mov %r9d,%r9d # clear upper half of enc lea .LAES_Te(%rip),$sbox + lea .LAES_Td(%rip),%r10 cmp \$0,%r9 - jne .Lcbc_picked_te - lea .LAES_Td(%rip),$sbox -.Lcbc_picked_te: + cmoveq %r10,$sbox mov OPENSSL_ia32cap_P(%rip),%r10d cmp \$$speed_limit,%rdx @@ -2580,7 +2579,6 @@ block_se_handler: jae .Lin_block_prologue mov 24(%rax),%rax # pull saved real stack pointer - lea 48(%rax),%rax # adjust... mov -8(%rax),%rbx mov -16(%rax),%rbp diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl index ba4964a..2d6424f 100644 --- a/crypto/aes/asm/aesni-sha256-x86_64.pl +++ b/crypto/aes/asm/aesni-sha256-x86_64.pl @@ -341,13 +341,13 @@ $code.=<<___; ${func}_xop: .Lxop_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -363,7 +363,7 @@ ${func}_xop: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -617,13 +617,13 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_xop: ret .size ${func}_xop,.-${func}_xop @@ -639,13 +639,13 @@ $code.=<<___; ${func}_avx: .Lavx_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -661,7 +661,7 @@ ${func}_avx: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -868,13 +868,13 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_avx: ret .size ${func}_avx,.-${func}_avx @@ -935,13 +935,13 @@ $code.=<<___; ${func}_avx2: .Lavx2_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp and \$-256*$SZ,%rsp # align stack frame add \$`2*$SZ*($rounds-8)`,%rsp @@ -958,7 +958,7 @@ ${func}_avx2: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -1205,13 +1205,13 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_avx2: ret .size ${func}_avx2,.-${func}_avx2 @@ -1569,7 +1569,6 @@ ___ $code.=<<___; mov %rax,%rsi # put aside Rsp mov 16*$SZ+7*8(%rax),%rax # pull $_rsp - lea 48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index 443f2f7..8ae6dbf 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -1172,7 +1172,7 @@ ___ # with zero-round key xor. { my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); -my ($key0,$ctr)=("${key_}d","${ivp}d"); +my ($key0,$ctr)=("%ebp","${ivp}d"); my $frame_size = 0x80 + ($win64?160:0); $code.=<<___; @@ -1201,26 +1201,25 @@ $code.=<<___; .align 16 .Lctr32_bulk: - lea (%rsp),%rax + lea (%rsp),$key_ # use $key_ as frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8($key_) # offload everything + movaps %xmm7,-0x98($key_) + movaps %xmm8,-0x88($key_) + movaps %xmm9,-0x78($key_) + movaps %xmm10,-0x68($key_) + movaps %xmm11,-0x58($key_) + movaps %xmm12,-0x48($key_) + movaps %xmm13,-0x38($key_) + movaps %xmm14,-0x28($key_) + movaps %xmm15,-0x18($key_) .Lctr32_body: ___ $code.=<<___; - lea -8(%rax),%rbp # 8 16-byte words on top of stack are counter values # xor-ed with zero-round key @@ -1692,26 +1691,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps %xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8($key_),%xmm6 + movaps %xmm0,-0xa8($key_) # clear stack + movaps -0x98($key_),%xmm7 + movaps %xmm0,-0x98($key_) + movaps -0x88($key_),%xmm8 + movaps %xmm0,-0x88($key_) + movaps -0x78($key_),%xmm9 + movaps %xmm0,-0x78($key_) + movaps -0x68($key_),%xmm10 + movaps %xmm0,-0x68($key_) + movaps -0x58($key_),%xmm11 + movaps %xmm0,-0x58($key_) + movaps -0x48($key_),%xmm12 + movaps %xmm0,-0x48($key_) + movaps -0x38($key_),%xmm13 + movaps %xmm0,-0x38($key_) + movaps -0x28($key_),%xmm14 + movaps %xmm0,-0x28($key_) + movaps -0x18($key_),%xmm15 + movaps %xmm0,-0x18($key_) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -1722,8 +1721,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0x70(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8($key_),%rbp + lea ($key_),%rsp .Lctr32_epilogue: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks @@ -1740,32 +1739,32 @@ my @tweak=map("%xmm$_",(10..15)); my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); my $frame_size = 0x70 + ($win64?160:0); +my $key_ = "%rbp"; # override so that we can use %r11 as FP $code.=<<___; .globl aesni_xts_encrypt .type aesni_xts_encrypt,\@function,6 .align 16 aesni_xts_encrypt: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8(%r11) # offload everything + movaps %xmm7,-0x98(%r11) + movaps %xmm8,-0x88(%r11) + movaps %xmm9,-0x78(%r11) + movaps %xmm10,-0x68(%r11) + movaps %xmm11,-0x58(%r11) + movaps %xmm12,-0x48(%r11) + movaps %xmm13,-0x38(%r11) + movaps %xmm14,-0x28(%r11) + movaps %xmm15,-0x18(%r11) .Lxts_enc_body: ___ $code.=<<___; - lea -8(%rax),%rbp movups ($ivp),$inout0 # load clear-text tweak mov 240(%r8),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds @@ -2183,26 +2182,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps %xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8(%r11),%xmm6 + movaps %xmm0,-0xa8(%r11) # clear stack + movaps -0x98(%r11),%xmm7 + movaps %xmm0,-0x98(%r11) + movaps -0x88(%r11),%xmm8 + movaps %xmm0,-0x88(%r11) + movaps -0x78(%r11),%xmm9 + movaps %xmm0,-0x78(%r11) + movaps -0x68(%r11),%xmm10 + movaps %xmm0,-0x68(%r11) + movaps -0x58(%r11),%xmm11 + movaps %xmm0,-0x58(%r11) + movaps -0x48(%r11),%xmm12 + movaps %xmm0,-0x48(%r11) + movaps -0x38(%r11),%xmm13 + movaps %xmm0,-0x38(%r11) + movaps -0x28(%r11),%xmm14 + movaps %xmm0,-0x28(%r11) + movaps -0x18(%r11),%xmm15 + movaps %xmm0,-0x18(%r11) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -2212,8 +2211,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0x60(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lxts_enc_epilogue: ret .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -2224,26 +2223,25 @@ $code.=<<___; .type aesni_xts_decrypt,\@function,6 .align 16 aesni_xts_decrypt: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8(%r11) # offload everything + movaps %xmm7,-0x98(%r11) + movaps %xmm8,-0x88(%r11) + movaps %xmm9,-0x78(%r11) + movaps %xmm10,-0x68(%r11) + movaps %xmm11,-0x58(%r11) + movaps %xmm12,-0x48(%r11) + movaps %xmm13,-0x38(%r11) + movaps %xmm14,-0x28(%r11) + movaps %xmm15,-0x18(%r11) .Lxts_dec_body: ___ $code.=<<___; - lea -8(%rax),%rbp movups ($ivp),$inout0 # load clear-text tweak mov 240($key2),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds @@ -2687,26 +2685,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps %xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8(%r11),%xmm6 + movaps %xmm0,-0xa8(%r11) # clear stack + movaps -0x98(%r11),%xmm7 + movaps %xmm0,-0x98(%r11) + movaps -0x88(%r11),%xmm8 + movaps %xmm0,-0x88(%r11) + movaps -0x78(%r11),%xmm9 + movaps %xmm0,-0x78(%r11) + movaps -0x68(%r11),%xmm10 + movaps %xmm0,-0x68(%r11) + movaps -0x58(%r11),%xmm11 + movaps %xmm0,-0x58(%r11) + movaps -0x48(%r11),%xmm12 + movaps %xmm0,-0x48(%r11) + movaps -0x38(%r11),%xmm13 + movaps %xmm0,-0x38(%r11) + movaps -0x28(%r11),%xmm14 + movaps %xmm0,-0x28(%r11) + movaps -0x18(%r11),%xmm15 + movaps %xmm0,-0x18(%r11) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -2716,8 +2714,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0x60(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lxts_dec_epilogue: ret .size aesni_xts_decrypt,.-aesni_xts_decrypt @@ -2943,6 +2941,7 @@ $code.=<<___ if (!$win64); pxor %xmm13,%xmm13 pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 + lea 0x28(%rsp),%rax ___ $code.=<<___ if ($win64); movaps 0x00(%rsp),%xmm6 @@ -2967,14 +2966,14 @@ $code.=<<___ if ($win64); movaps %xmm0,0x90(%rsp) lea 0xa0+0x28(%rsp),%rax .Locb_enc_pop: - lea 0xa0(%rsp),%rsp ___ $code.=<<___; - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp .Locb_enc_epilogue: ret .size aesni_ocb_encrypt,.-aesni_ocb_encrypt @@ -3410,6 +3409,7 @@ $code.=<<___ if (!$win64); pxor %xmm13,%xmm13 pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 + lea 0x28(%rsp),%rax ___ $code.=<<___ if ($win64); movaps 0x00(%rsp),%xmm6 @@ -3434,14 +3434,14 @@ $code.=<<___ if ($win64); movaps %xmm0,0x90(%rsp) lea 0xa0+0x28(%rsp),%rax .Locb_dec_pop: - lea 0xa0(%rsp),%rsp ___ $code.=<<___; - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp .Locb_dec_epilogue: ret .size aesni_ocb_decrypt,.-aesni_ocb_decrypt @@ -3650,7 +3650,6 @@ ___ { my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); -my $inp_=$key_; $code.=<<___; .globl ${PREFIX}_cbc_encrypt @@ -3732,7 +3731,7 @@ $code.=<<___; jmp .Lcbc_ret .align 16 .Lcbc_decrypt_bulk: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded @@ -3750,8 +3749,11 @@ $code.=<<___ if ($win64); movaps %xmm15,0xa0(%rsp) .Lcbc_decrypt_body: ___ + +my $inp_=$key_="%rbp"; # reassign $key_ + $code.=<<___; - lea -8(%rax),%rbp + mov $key,$key_ # [re-]backup $key [after reassignment] movups ($ivp),$iv mov $rnds_,$rounds cmp \$0x50,$len @@ -3791,7 +3793,7 @@ $code.=<<___; pxor $rndkey0,$inout1 $movkey 0x10-0x70($key),$rndkey1 pxor $rndkey0,$inout2 - xor $inp_,$inp_ + mov \$-1,$inp_ cmp \$0x70,$len # is there at least 0x60 bytes ahead? pxor $rndkey0,$inout3 pxor $rndkey0,$inout4 @@ -3807,8 +3809,8 @@ $code.=<<___; aesdec $rndkey1,$inout4 aesdec $rndkey1,$inout5 aesdec $rndkey1,$inout6 - setnc ${inp_}b - shl \$7,$inp_ + adc \$0,$inp_ + and \$128,$inp_ aesdec $rndkey1,$inout7 add $inp,$inp_ $movkey 0x30-0x70($key),$rndkey1 @@ -4172,8 +4174,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0xa0(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lcbc_ret: ret .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt @@ -4744,13 +4746,16 @@ ctr_xts_se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail - mov 160($context),%rax # pull context->Rbp - lea -0xa0(%rax),%rsi # %xmm save area + mov 208($context),%rax # pull context->R11 + + lea -0xa8(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - jmp .Lcommon_rbp_tail + mov -8(%rax),%rbp # restore saved %rbp + mov %rbp,160($context) # restore context->Rbp + jmp .Lcommon_seh_tail .size ctr_xts_se_handler,.-ctr_xts_se_handler .type ocb_se_handler,\@abi-omnipotent @@ -4834,9 +4839,13 @@ cbc_se_handler: cmp %r10,%rbx # context->Rip<"prologue" label jb .Lcommon_seh_tail + mov 120($context),%rax # pull context->Rax + lea .Lcbc_decrypt_body(%rip),%r10 cmp %r10,%rbx # context->RipRsp lea .Lcbc_ret(%rip),%r10 cmp %r10,%rbx # context->Rip>="epilogue" label @@ -4847,15 +4856,10 @@ cbc_se_handler: mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq -.Lcommon_rbp_tail: - mov 160($context),%rax # pull context->Rbp - mov (%rax),%rbp # restore saved %rbp - lea 8(%rax),%rax # adjust stack pointer - mov %rbp,160($context) # restore context->Rbp - jmp .Lcommon_seh_tail + mov 208($context),%rax # pull context->R11 -.Lrestore_cbc_rax: - mov 120($context),%rax + mov -8(%rax),%rbp # restore saved %rbp + mov %rbp,160($context) # restore context->Rbp .Lcommon_seh_tail: mov 8(%rax),%rdi diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl index 0d49947..cce5795 100644 --- a/crypto/aes/asm/bsaes-x86_64.pl +++ b/crypto/aes/asm/bsaes-x86_64.pl @@ -1334,7 +1334,7 @@ $code.=<<___; cmp %rax, %rbp jb .Lecb_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1347,17 +1347,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lecb_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lecb_enc_epilogue: ret .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks @@ -1536,7 +1536,7 @@ $code.=<<___; cmp %rax, %rbp jb .Lecb_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1549,17 +1549,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lecb_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lecb_dec_epilogue: ret .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks @@ -1826,7 +1826,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lcbc_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1839,17 +1839,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lcbc_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lcbc_dec_epilogue: ret .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt @@ -2058,7 +2058,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lctr_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2071,17 +2071,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lctr_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lctr_enc_epilogue: ret .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks @@ -2448,7 +2448,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lxts_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2461,17 +2461,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lxts_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lxts_enc_epilogue: ret .size bsaes_xts_encrypt,.-bsaes_xts_encrypt @@ -2855,7 +2855,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lxts_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2868,17 +2868,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lxts_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lxts_dec_epilogue: ret .size bsaes_xts_decrypt,.-bsaes_xts_decrypt @@ -2974,31 +2974,34 @@ se_handler: mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->RipRsp + cmp %r10,%rbx # context->Rip<=prologue label + jbe .Lin_prologue mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=tail label + jae .Lin_tail + mov 160($context),%rax # pull context->Rbp lea 0x40(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - lea 0xa0(%rax),%rax # adjust stack pointer - - mov 0x70(%rax),%rbp - mov 0x68(%rax),%rbx - mov 0x60(%rax),%r12 - mov 0x58(%rax),%r13 - mov 0x50(%rax),%r14 - mov 0x48(%rax),%r15 - lea 0x78(%rax),%rax # adjust stack pointer + lea 0xa0+0x78(%rax),%rax # adjust stack pointer + +.Lin_tail: + mov -48(%rax),%rbp + mov -40(%rax),%rbx + mov -32(%rax),%r12 + mov -24(%rax),%r13 + mov -16(%rax),%r14 + mov -8(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 @@ -3079,28 +3082,40 @@ $code.=<<___ if ($ecb); .byte 9,0,0,0 .rva se_handler .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] + .rva .Lecb_enc_tail + .long 0 .Lecb_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] + .rva .Lecb_dec_tail + .long 0 ___ $code.=<<___; .Lcbc_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] + .rva .Lcbc_dec_tail + .long 0 .Lctr_enc_info: .byte 9,0,0,0 .rva se_handler .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] + .rva .Lctr_enc_tail + .long 0 .Lxts_enc_info: .byte 9,0,0,0 .rva se_handler .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] + .rva .Lxts_enc_tail + .long 0 .Lxts_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] + .rva .Lxts_dec_tail + .long 0 ___ } -- cgit v1.1