aboutsummaryrefslogtreecommitdiff
path: root/crypto/modes
diff options
context:
space:
mode:
author Andy Polyakov <appro@openssl.org> 2013-09-09 21:43:21 +0200
committer Andy Polyakov <appro@openssl.org> 2013-09-09 21:43:21 +0200
commit 7a1a12232a84621271bf808107f3be9a2df5121a (patch)
tree f6f300bed433e775378ac23f41345b98a086f570 /crypto/modes
parent 72a158703bf2b33f4eba6920302941560f7a848d (diff)
download openssl-7a1a12232a84621271bf808107f3be9a2df5121a.zip
openssl-7a1a12232a84621271bf808107f3be9a2df5121a.tar.gz
openssl-7a1a12232a84621271bf808107f3be9a2df5121a.tar.bz2
crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization.
Avoid occasional up to 8% performance drops.
Diffstat (limited to 'crypto/modes')
-rw-r--r-- crypto/modes/asm/aesni-gcm-x86_64.pl 34
1 files changed, 28 insertions, 6 deletions
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
index 3198714..3781933 100644
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -21,8 +21,8 @@
# justify. This module is based on combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
-# pressure with notable relative improvement on upcoming Haswell
-# processor. [Exact performance numbers to be added at launch.]
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -422,17 +422,28 @@ $code.=<<___;
vzeroupper
vmovdqu ($ivp),$T1 # input counter value
- sub \$128,%rsp
+ add \$-128,%rsp
mov 12($ivp),$counter
lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
vmovdqu ($Xip),$Xi # load Xi
- and \$-64,%rsp # ensure stack alignment
+ and \$-128,%rsp # ensure stack alignment
vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
lea 0x80($key),$key # size optimization
lea 0x20+0x20($Xip),$Xip # size optimization
mov 0xf0-0x80($key),$rounds
vpshufb $Ii,$Xi,$Xi
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Ldec_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Ldec_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Ldec_no_key_aliasing:
+
vmovdqu 0x50($inp),$Z3 # I[5]
lea ($inp),$in0
vmovdqu 0x40($inp),$Z0
@@ -621,14 +632,25 @@ $code.=<<___;
vzeroupper
vmovdqu ($ivp),$T1 # input counter value
- sub \$128,%rsp
+ add \$-128,%rsp
mov 12($ivp),$counter
lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
lea 0x80($key),$key # size optimization
vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
- and \$-64,%rsp # ensure stack alignment
+ and \$-128,%rsp # ensure stack alignment
mov 0xf0-0x80($key),$rounds
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Lenc_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Lenc_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Lenc_no_key_aliasing:
+
lea ($out),$in0
lea -0xc0($out,$len),$end0
shr \$4,$len