crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization.

Avoid occasional performance drops of up to 8% by moving the stack frame when it would alias with the key.
author: Andy Polyakov <appro@openssl.org> 2013-09-09 21:43:21 +0200
committer: Andy Polyakov <appro@openssl.org> 2013-09-09 21:43:21 +0200
commit: 7a1a12232a84621271bf808107f3be9a2df5121a (patch)
tree: f6f300bed433e775378ac23f41345b98a086f570 /crypto/modes
parent: 72a158703bf2b33f4eba6920302941560f7a848d (diff)
download: openssl-7a1a12232a84621271bf808107f3be9a2df5121a.zip
openssl-7a1a12232a84621271bf808107f3be9a2df5121a.tar.gz
openssl-7a1a12232a84621271bf808107f3be9a2df5121a.tar.bz2
1 files changed, 28 insertions, 6 deletions
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
index 3198714..3781933 100644
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -21,8 +21,8 @@
 # justify. This module is based on combination of Intel submissions,
 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
-# pressure with notable relative improvement on upcoming Haswell
-# processor. [Exact performance numbers to be added at launch.]
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -422,17 +422,28 @@ $code.=<<___;
 	vzeroupper
 
 	vmovdqu		($ivp),$T1		# input counter value
-	sub		\$128,%rsp
+	add		\$-128,%rsp
 	mov		12($ivp),$counter
 	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
 	vmovdqu		($Xip),$Xi		# load Xi
-	and		\$-64,%rsp		# ensure stack alignment
+	and		\$-128,%rsp		# ensure stack alignment
 	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
 	lea		0x80($key),$key		# size optimization
 	lea		0x20+0x20($Xip),$Xip	# size optimization
 	mov		0xf0-0x80($key),$rounds
 	vpshufb		$Ii,$Xi,$Xi
 
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Ldec_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Ldec_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Ldec_no_key_aliasing:
+
 	vmovdqu		0x50($inp),$Z3		# I[5]
 	lea		($inp),$in0
 	vmovdqu		0x40($inp),$Z0
@@ -621,14 +632,25 @@ $code.=<<___;
 	vzeroupper
 
 	vmovdqu		($ivp),$T1		# input counter value
-	sub		\$128,%rsp
+	add		\$-128,%rsp
 	mov		12($ivp),$counter
 	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
 	lea		0x80($key),$key		# size optimization
 	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
-	and		\$-64,%rsp		# ensure stack alignment
+	and		\$-128,%rsp		# ensure stack alignment
 	mov		0xf0-0x80($key),$rounds
 
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Lenc_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Lenc_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Lenc_no_key_aliasing:
+
 	lea		($out),$in0
 	lea		-0xc0($out,$len),$end0
 	shr		\$4,$len
author	Andy Polyakov <appro@openssl.org>	2013-09-09 21:43:21 +0200
committer	Andy Polyakov <appro@openssl.org>	2013-09-09 21:43:21 +0200
commit	7a1a12232a84621271bf808107f3be9a2df5121a (patch)
tree	f6f300bed433e775378ac23f41345b98a086f570 /crypto/modes
parent	72a158703bf2b33f4eba6920302941560f7a848d (diff)
download	openssl-7a1a12232a84621271bf808107f3be9a2df5121a.zip openssl-7a1a12232a84621271bf808107f3be9a2df5121a.tar.gz openssl-7a1a12232a84621271bf808107f3be9a2df5121a.tar.bz2