aesni-sha256-x86_64.pl: fix crash on AMD Jaguar.

It was also found that the stitch performs suboptimally on AMD Jaguar, hence its execution is limited to XOP-capable and Intel processors.

Reviewed-by: Kurt Roeckx <kurt@openssl.org>
author: Andy Polyakov <appro@openssl.org> 2015-11-04 23:57:06 +0100
committer: Andy Polyakov <appro@openssl.org> 2015-11-16 13:06:10 +0100
commit: a5fd24d19bbb586b1c6d235c2021e9bead22c9f5 (patch)
tree: c0075407f89726a669a66748a2550982f01f6d88
parent: 39e46af6bb3f1ad7f5c0dee8e3d13e2daf9a0160 (diff)
download: openssl-a5fd24d19bbb586b1c6d235c2021e9bead22c9f5.zip
openssl-a5fd24d19bbb586b1c6d235c2021e9bead22c9f5.tar.gz
openssl-a5fd24d19bbb586b1c6d235c2021e9bead22c9f5.tar.bz2
2 files changed, 13 insertions, 5 deletions
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index 74dad44..8a81994 100644
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -140,11 +140,8 @@ $code.=<<___ if ($avx>1);
 	je	${func}_avx2
 ___
 $code.=<<___;
-	and	\$`1<<30`,%eax			# mask "Intel CPU" bit
-	and	\$`1<<28|1<<9`,%r10d		# mask AVX+SSSE3 bits
-	or	%eax,%r10d
-	cmp	\$`1<<28|1<<9|1<<30`,%r10d
-	je	${func}_avx
+	and	\$`1<<28`,%r10d			# check for AVX
+	jnz	${func}_avx
 	ud2
 ___
 						}
diff --git a/crypto/evp/e_aes_cbc_hmac_sha256.c b/crypto/evp/e_aes_cbc_hmac_sha256.c
index 411c770..63f6e48 100644
--- a/crypto/evp/e_aes_cbc_hmac_sha256.c
+++ b/crypto/evp/e_aes_cbc_hmac_sha256.c
@@ -498,7 +498,18 @@ static int aesni_cbc_hmac_sha256_cipher(EVP_CIPHER_CTX *ctx,
             iv = AES_BLOCK_SIZE;
 
 #  if defined(STITCHED_CALL)
+        /*
+         * Assembly stitch handles AVX-capable processors, but its
+         * performance is not optimal on AMD Jaguar, ~40% worse, for
+         * unknown reasons. Incidentally processor in question supports
+         * AVX, but not AMD-specific XOP extension, which can be used
+         * to identify it and avoid stitch invocation. So after we
+         * establish that the current CPU supports AVX, we also check that
+         * it is either an XOP-capable Bulldozer-based or a GenuineIntel one.
+         */
         if (OPENSSL_ia32cap_P[1] & (1 << (60 - 32)) && /* AVX? */
+            ((OPENSSL_ia32cap_P[1] & (1 << (43 - 32))) /* XOP? */
+             | (OPENSSL_ia32cap_P[0] & (1<<30))) &&    /* "Intel CPU"? */
             plen > (sha_off + iv) &&
             (blocks = (plen - (sha_off + iv)) / SHA256_CBLOCK)) {
             SHA256_Update(&key->md, in + iv, sha_off);
author	Andy Polyakov <appro@openssl.org>	2015-11-04 23:57:06 +0100
committer	Andy Polyakov <appro@openssl.org>	2015-11-16 13:06:10 +0100
commit	a5fd24d19bbb586b1c6d235c2021e9bead22c9f5 (patch)
tree	c0075407f89726a669a66748a2550982f01f6d88
parent	39e46af6bb3f1ad7f5c0dee8e3d13e2daf9a0160 (diff)
download	openssl-a5fd24d19bbb586b1c6d235c2021e9bead22c9f5.zip openssl-a5fd24d19bbb586b1c6d235c2021e9bead22c9f5.tar.gz openssl-a5fd24d19bbb586b1c6d235c2021e9bead22c9f5.tar.bz2