author     David Benjamin <davidben@google.com>  2024-05-18 09:47:39 -0400
committer  Boringssl LUCI CQ <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com>  2024-06-11 17:46:11 +0000
commit     962432c687f67f8df1aa6e3dd364fbc88fea4ed8 (patch)
tree       4fd90bda42147bd9d07476834c81f0decdb5fef1 /crypto
parent     a220a6024f66c123019b5c080f6bd8bcaf75448c (diff)
Remove OPENSSL_ia32cap_P references from AES-NI assembly
The AES-NI key schedule functions have two versions, dating to OpenSSL's 23f6eec71dbd472044db7dc854599f1de14a1f48. That commit cites RT#3576. Unfortunately, OpenSSL purged their old RT bugs without keeping any archives, so this context is now lost. Some archives of the openssl-dev discussion (also predating OpenSSL's archives) give most of the context: https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ

Broadly, although AES-NI has an aeskeygenassist instruction for the key schedule, it is apparently faster overall to ignore it and use aesenclast instead. But aesenclast is slower on older processors, so the assembly checked for AVX && !XOP as a proxy. (Note we always set XOP to false, even though this likely wasn't a capability check but a proxy for pre-Zen AMD chips.)

It is unclear whether the aeskeygenassist version is still worthwhile. However, the aesenclast version requires SSSE3. SSSE3 long predates AES-NI, but it's not clear whether AES-NI implies SSSE3; in OpenSSL, the CCM AES-NI assembly seems to assume it does. For now, I've preserved the pair of them.

There are now only two assembly files with OPENSSL_ia32cap_P references!

Bug: 673
Change-Id: I990b1393d780db4caf074c184ce8bbd182da6e29
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68690
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
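[Editorial sketch, not part of the patch: a minimal C-intrinsics illustration of the two AES-128 key-expansion steps the message contrasts. The patch itself keeps both paths in perlasm; the helper names and the demo function below are hypothetical, and the code assumes a compiler with AES-NI and SSSE3 intrinsics enabled (e.g. -maes -mssse3).]

/* Fold the previous round key into itself so each 32-bit word accumulates the
 * XOR of all lower words: w0, w0^w1, w0^w1^w2, w0^w1^w2^w3. */
#include <immintrin.h>
#include <stdint.h>

static inline __m128i fold_key(__m128i prev) {
  prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
  prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
  return _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
}

/* "base" step: AESKEYGENASSIST computes SubWord/RotWord directly. Lane 3 of
 * its result is RotWord(SubWord(w3)) ^ rcon; broadcast it and fold it in. */
static inline __m128i expand128_base(__m128i prev, __m128i assist) {
  assist = _mm_shuffle_epi32(assist, 0xff);
  return _mm_xor_si128(fold_key(prev), assist);
}

/* "alt" step: PSHUFB broadcasts RotWord(w3) into all four columns. With every
 * column equal, the ShiftRows inside AESENCLAST is a no-op, so the instruction
 * yields SubWord(RotWord(w3)) ^ rcon in every lane (SubWord and RotWord
 * commute byte-wise, so this matches the base path). This needs SSSE3 for
 * PSHUFB but no AESKEYGENASSIST. */
static inline __m128i expand128_alt(__m128i prev, __m128i rcon) {
  const __m128i rot_word = _mm_setr_epi8(13, 14, 15, 12, 13, 14, 15, 12,
                                         13, 14, 15, 12, 13, 14, 15, 12);
  __m128i assist =
      _mm_aesenclast_si128(_mm_shuffle_epi8(prev, rot_word), rcon);
  return _mm_xor_si128(fold_key(prev), assist);
}

/* First expansion step of an AES-128 schedule, computed both ways; the two
 * outputs are identical. */
static inline void demo(const uint8_t user_key[16], __m128i out[2]) {
  __m128i k0 = _mm_loadu_si128((const __m128i *)user_key);
  out[0] = expand128_base(k0, _mm_aeskeygenassist_si128(k0, 0x01));
  out[1] = expand128_alt(k0, _mm_set1_epi32(1));
}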
Diffstat (limited to 'crypto')
-rw-r--r--  crypto/fipsmodule/aes/aes.c                |   8
-rw-r--r--  crypto/fipsmodule/aes/aes_test.cc          |  11
-rw-r--r--  crypto/fipsmodule/aes/asm/aesni-x86.pl     | 302
-rw-r--r--  crypto/fipsmodule/aes/asm/aesni-x86_64.pl  | 342
-rw-r--r--  crypto/fipsmodule/aes/internal.h           |  22
-rw-r--r--  crypto/perlasm/x86asm.pl                   |   6
6 files changed, 392 insertions, 299 deletions
diff --git a/crypto/fipsmodule/aes/aes.c b/crypto/fipsmodule/aes/aes.c
index 56dfbe2..7eab5ac 100644
--- a/crypto/fipsmodule/aes/aes.c
+++ b/crypto/fipsmodule/aes/aes.c
@@ -116,4 +116,12 @@ int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) {
}
return ret;
}
+
+int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) {
+ if (aes_hw_set_encrypt_key_alt_preferred()) {
+ return aes_hw_set_encrypt_key_alt(user_key, bits, key);
+ } else {
+ return aes_hw_set_encrypt_key_base(user_key, bits, key);
+ }
+}
#endif
diff --git a/crypto/fipsmodule/aes/aes_test.cc b/crypto/fipsmodule/aes/aes_test.cc
index dc90067..07feabb 100644
--- a/crypto/fipsmodule/aes/aes_test.cc
+++ b/crypto/fipsmodule/aes/aes_test.cc
@@ -347,7 +347,16 @@ TEST(AESTest, ABI) {
}
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
- ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key, kKey, bits, &key), 0);
+ ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key_base, kKey, bits, &key), 0);
+ if (aes_hw_set_encrypt_key_alt_capable()) {
+ AES_KEY alt;
+ ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_encrypt_key_alt, kKey, bits, &alt),
+ 0);
+ EXPECT_EQ(alt.rounds, key.rounds);
+ for (unsigned i = 0; i <= alt.rounds; i++) {
+ EXPECT_EQ(alt.rd_key[i], key.rd_key[i]);
+ }
+ }
CHECK_ABI_SEH(aes_hw_encrypt_key_to_decrypt_key, &key);
#else
ASSERT_EQ(CHECK_ABI_SEH(aes_hw_set_decrypt_key, kKey, bits, &key), 0);
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86.pl b/crypto/fipsmodule/aes/asm/aesni-x86.pl
index d8fdfb8..077be94 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86.pl
@@ -83,7 +83,6 @@ open OUT,">$output";
&asm_init($ARGV[0]);
-&external_label("OPENSSL_ia32cap_P");
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST")
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
@@ -2109,18 +2108,15 @@ if ($PREFIX eq $AESNI_PREFIX) {
######################################################################
# Mechanical port from aesni-x86_64.pl.
-#
-# _aesni_set_encrypt_key is private interface,
-# input:
-# "eax" const unsigned char *userKey
-# $rounds int bits
-# $key AES_KEY *key
-# output:
-# "eax" return code
-# $round rounds
-
-&function_begin_B("_aesni_set_encrypt_key");
- &push ("ebp");
+
+# int $PREFIX_set_encrypt_key_base (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key_base");
+ &record_function_hit(3);
+
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
&push ("ebx");
&call (&label("pic"));
@@ -2128,12 +2124,9 @@ if ($PREFIX eq $AESNI_PREFIX) {
&blindpop("ebx");
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
- &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
- &mov ("ebp",&DWP(4,"ebp"));
&lea ($key,&DWP(16,$key));
- &and ("ebp",1<<28|1<<11); # AVX and XOP bits
&cmp ($rounds,256);
&je (&label("14rounds"));
&cmp ($rounds,192);
@@ -2142,9 +2135,6 @@ if ($PREFIX eq $AESNI_PREFIX) {
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
- &cmp ("ebp",1<<28);
- &je (&label("10rounds_alt"));
-
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
@@ -2184,75 +2174,8 @@ if ($PREFIX eq $AESNI_PREFIX) {
&xorps ("xmm0","xmm1");
&ret();
-&set_label("10rounds_alt",16);
- &movdqa ("xmm5",&QWP(0x00,"ebx"));
- &mov ($rounds,8);
- &movdqa ("xmm4",&QWP(0x20,"ebx"));
- &movdqa ("xmm2","xmm0");
- &movdqu (&QWP(-16,$key),"xmm0");
-
-&set_label("loop_key128");
- &pshufb ("xmm0","xmm5");
- &aesenclast ("xmm0","xmm4");
- &pslld ("xmm4",1);
- &lea ($key,&DWP(16,$key));
-
- &movdqa ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm2","xmm3");
-
- &pxor ("xmm0","xmm2");
- &movdqu (&QWP(-16,$key),"xmm0");
- &movdqa ("xmm2","xmm0");
-
- &dec ($rounds);
- &jnz (&label("loop_key128"));
-
- &movdqa ("xmm4",&QWP(0x30,"ebx"));
-
- &pshufb ("xmm0","xmm5");
- &aesenclast ("xmm0","xmm4");
- &pslld ("xmm4",1);
-
- &movdqa ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm2","xmm3");
-
- &pxor ("xmm0","xmm2");
- &movdqu (&QWP(0,$key),"xmm0");
-
- &movdqa ("xmm2","xmm0");
- &pshufb ("xmm0","xmm5");
- &aesenclast ("xmm0","xmm4");
-
- &movdqa ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm3","xmm2");
- &pslldq ("xmm2",4);
- &pxor ("xmm2","xmm3");
-
- &pxor ("xmm0","xmm2");
- &movdqu (&QWP(16,$key),"xmm0");
-
- &mov ($rounds,9);
- &mov (&DWP(96,$key),$rounds);
-
- &jmp (&label("good_key"));
-
&set_label("12rounds",16);
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
- &cmp ("ebp",1<<28);
- &je (&label("12rounds_alt"));
&mov ($rounds,11);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
@@ -2305,50 +2228,9 @@ if ($PREFIX eq $AESNI_PREFIX) {
&lea ($key,&DWP(32,$key));
&jmp (&label("key_192b_warm"));
-&set_label("12rounds_alt",16);
- &movdqa ("xmm5",&QWP(0x10,"ebx"));
- &movdqa ("xmm4",&QWP(0x20,"ebx"));
- &mov ($rounds,8);
- &movdqu (&QWP(-16,$key),"xmm0");
-
-&set_label("loop_key192");
- &movq (&QWP(0,$key),"xmm2");
- &movdqa ("xmm1","xmm2");
- &pshufb ("xmm2","xmm5");
- &aesenclast ("xmm2","xmm4");
- &pslld ("xmm4",1);
- &lea ($key,&DWP(24,$key));
-
- &movdqa ("xmm3","xmm0");
- &pslldq ("xmm0",4);
- &pxor ("xmm3","xmm0");
- &pslldq ("xmm0",4);
- &pxor ("xmm3","xmm0");
- &pslldq ("xmm0",4);
- &pxor ("xmm0","xmm3");
-
- &pshufd ("xmm3","xmm0",0xff);
- &pxor ("xmm3","xmm1");
- &pslldq ("xmm1",4);
- &pxor ("xmm3","xmm1");
-
- &pxor ("xmm0","xmm2");
- &pxor ("xmm2","xmm3");
- &movdqu (&QWP(-16,$key),"xmm0");
-
- &dec ($rounds);
- &jnz (&label("loop_key192"));
-
- &mov ($rounds,11);
- &mov (&DWP(32,$key),$rounds);
-
- &jmp (&label("good_key"));
-
&set_label("14rounds",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
&lea ($key,&DWP(16,$key));
- &cmp ("ebp",1<<28);
- &je (&label("14rounds_alt"));
&mov ($rounds,13);
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
@@ -2409,7 +2291,157 @@ if ($PREFIX eq $AESNI_PREFIX) {
&xorps ("xmm2","xmm1");
&ret();
+&set_label("good_key");
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &xor ("eax","eax");
+ &pop ("ebx");
+ &ret ();
+
+&set_label("bad_keybits",4);
+ &pxor ("xmm0","xmm0");
+ &mov ("eax",-2);
+ &pop ("ebx");
+ &ret ();
+&function_end_B("${PREFIX}_set_encrypt_key_base");
+
+# int $PREFIX_set_encrypt_key_alt (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key_alt");
+ &record_function_hit(3);
+
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &push ("ebx");
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
+
+ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &lea ($key,&DWP(16,$key));
+ &cmp ($rounds,256);
+ &je (&label("14rounds_alt"));
+ &cmp ($rounds,192);
+ &je (&label("12rounds_alt"));
+ &cmp ($rounds,128);
+ &jne (&label("bad_keybits"));
+
+&set_label("10rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x00,"ebx"));
+ &mov ($rounds,8);
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &movdqa ("xmm2","xmm0");
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+&set_label("loop_key128");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(16,$key));
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(-16,$key),"xmm0");
+ &movdqa ("xmm2","xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key128"));
+
+ &movdqa ("xmm4",&QWP(0x30,"ebx"));
+
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(0,$key),"xmm0");
+
+ &movdqa ("xmm2","xmm0");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(16,$key),"xmm0");
+
+ &mov ($rounds,9);
+ &mov (&DWP(96,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
+&set_label("12rounds_alt",16);
+ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+ &movdqa ("xmm5",&QWP(0x10,"ebx"));
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &mov ($rounds,8);
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+&set_label("loop_key192");
+ &movq (&QWP(0,$key),"xmm2");
+ &movdqa ("xmm1","xmm2");
+ &pshufb ("xmm2","xmm5");
+ &aesenclast ("xmm2","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(24,$key));
+
+ &movdqa ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm0","xmm3");
+
+ &pshufd ("xmm3","xmm0",0xff);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+
+ &pxor ("xmm0","xmm2");
+ &pxor ("xmm2","xmm3");
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key192"));
+
+ &mov ($rounds,11);
+ &mov (&DWP(32,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
&set_label("14rounds_alt",16);
+ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
+ &lea ($key,&DWP(16,$key));
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,7);
@@ -2467,28 +2499,14 @@ if ($PREFIX eq $AESNI_PREFIX) {
&pxor ("xmm5","xmm5");
&xor ("eax","eax");
&pop ("ebx");
- &pop ("ebp");
&ret ();
&set_label("bad_keybits",4);
&pxor ("xmm0","xmm0");
&mov ("eax",-2);
&pop ("ebx");
- &pop ("ebp");
- &ret ();
-&function_end_B("_aesni_set_encrypt_key");
-
-# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
-# AES_KEY *key)
-&function_begin_B("${PREFIX}_set_encrypt_key");
- &record_function_hit(3);
-
- &mov ("eax",&wparam(0));
- &mov ($rounds,&wparam(1));
- &mov ($key,&wparam(2));
- &call ("_aesni_set_encrypt_key");
&ret ();
-&function_end_B("${PREFIX}_set_encrypt_key");
+&function_end_B("${PREFIX}_set_encrypt_key_alt");
# void $PREFIX_encrypt_key_to_decrypt_key (AES_KEY *key)
&function_begin_B("${PREFIX}_encrypt_key_to_decrypt_key");
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 8b6036e..ab45749 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -211,7 +211,6 @@ $movkey = $PREFIX eq "aes_hw" ? "movups" : "movups";
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code=".text\n";
-$code.=".extern OPENSSL_ia32cap_P\n";
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@@ -3245,11 +3244,14 @@ ___
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
+# There are two variants of this function, one which uses aeskeygenassist
+# ("base") and one which uses aesenclast + pshufb ("alt"). See aes/internal.h
+# for details.
$code.=<<___;
-.globl ${PREFIX}_set_encrypt_key
-.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.globl ${PREFIX}_set_encrypt_key_base
+.type ${PREFIX}_set_encrypt_key_base,\@abi-omnipotent
.align 16
-${PREFIX}_set_encrypt_key:
+${PREFIX}_set_encrypt_key_base:
.cfi_startproc
.seh_startproc
_CET_ENDBR
@@ -3262,9 +3264,6 @@ ${PREFIX}_set_encrypt_key:
.seh_endprologue
movups ($inp),%xmm0 # pull first 128 bits of *userKey
xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
- leaq OPENSSL_ia32cap_P(%rip),%r10
- movl 4(%r10),%r10d
- and \$`1<<28|1<<11`,%r10d # AVX and XOP bits
lea 16($key),%rax # %rax is used as modifiable copy of $key
cmp \$256,$bits
je .L14rounds
@@ -3275,8 +3274,6 @@ ${PREFIX}_set_encrypt_key:
.L10rounds:
mov \$9,$bits # 10 rounds for 128-bit key
- cmp \$`1<<28`,%r10d # AVX, bit no XOP
- je .L10rounds_alt
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
@@ -3305,7 +3302,182 @@ ${PREFIX}_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
-.L10rounds_alt:
+.L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax, %rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16($inp),%xmm2 # remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ mov \$-2,%rax
+.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ add \$8,%rsp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_endproc
+.seh_endproc
+
+.align 16
+.Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+.size ${PREFIX}_set_encrypt_key_base,.-${PREFIX}_set_encrypt_key_base
+
+.globl ${PREFIX}_set_encrypt_key_alt
+.type ${PREFIX}_set_encrypt_key_alt,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key_alt:
+.cfi_startproc
+.seh_startproc
+ _CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb \$1,BORINGSSL_function_hit+3(%rip)
+#endif
+ sub \$8,%rsp
+.cfi_adjust_cfa_offset 8
+.seh_stackalloc 8
+.seh_endprologue
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ lea 16($key),%rax # %rax is used as modifiable copy of $key
+ cmp \$256,$bits
+ je .L14rounds_alt
+ cmp \$192,$bits
+ je .L12rounds_alt
+ cmp \$128,$bits
+ jne .Lbad_keybits_alt
+
+ mov \$9,$bits # 10 rounds for 128-bit key
movdqa .Lkey_rotate(%rip),%xmm5
mov \$8,%r10d
movdqa .Lkey_rcon1(%rip),%xmm4
@@ -3369,39 +3541,12 @@ ${PREFIX}_set_encrypt_key:
mov $bits,96(%rax) # 240($key)
xor %eax,%eax
- jmp .Lenc_key_ret
+ jmp .Lenc_key_ret_alt
.align 16
-.L12rounds:
+.L12rounds_alt:
movq 16($inp),%xmm2 # remaining 1/3 of *userKey
mov \$11,$bits # 12 rounds for 192
- cmp \$`1<<28`,%r10d # AVX, but no XOP
- je .L12rounds_alt
-
- $movkey %xmm0,($key) # round 0
- aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
- call .Lkey_expansion_192a_cold
- aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
- call .Lkey_expansion_192b
- aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
- call .Lkey_expansion_192a
- aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
- call .Lkey_expansion_192b
- aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
- call .Lkey_expansion_192a
- aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
- call .Lkey_expansion_192b
- aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
- call .Lkey_expansion_192a
- aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
- call .Lkey_expansion_192b
- $movkey %xmm0,(%rax)
- mov $bits,48(%rax) # 240(%rdx)
- xor %rax, %rax
- jmp .Lenc_key_ret
-
-.align 16
-.L12rounds_alt:
movdqa .Lkey_rotate192(%rip),%xmm5
movdqa .Lkey_rcon1(%rip),%xmm4
mov \$8,%r10d
@@ -3439,51 +3584,13 @@ ${PREFIX}_set_encrypt_key:
mov $bits,32(%rax) # 240($key)
xor %eax,%eax
- jmp .Lenc_key_ret
+ jmp .Lenc_key_ret_alt
.align 16
-.L14rounds:
+.L14rounds_alt:
movups 16($inp),%xmm2 # remaining half of *userKey
mov \$13,$bits # 14 rounds for 256
lea 16(%rax),%rax
- cmp \$`1<<28`,%r10d # AVX, but no XOP
- je .L14rounds_alt
-
- $movkey %xmm0,($key) # round 0
- $movkey %xmm2,16($key) # round 1
- aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
- call .Lkey_expansion_256a_cold
- aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
- call .Lkey_expansion_256b
- aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
- call .Lkey_expansion_256a
- aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
- call .Lkey_expansion_256b
- aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
- call .Lkey_expansion_256a
- aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
- call .Lkey_expansion_256b
- aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
- call .Lkey_expansion_256a
- aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
- call .Lkey_expansion_256b
- aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
- call .Lkey_expansion_256a
- aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
- call .Lkey_expansion_256b
- aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
- call .Lkey_expansion_256a
- aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
- call .Lkey_expansion_256b
- aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
- call .Lkey_expansion_256a
- $movkey %xmm0,(%rax)
- mov $bits,16(%rax) # 240(%rdx)
- xor %rax,%rax
- jmp .Lenc_key_ret
-
-.align 16
-.L14rounds_alt:
movdqa .Lkey_rotate(%rip),%xmm5
movdqa .Lkey_rcon1(%rip),%xmm4
mov \$7,%r10d
@@ -3534,12 +3641,12 @@ ${PREFIX}_set_encrypt_key:
.Ldone_key256:
mov $bits,16(%rax) # 240($key)
xor %eax,%eax
- jmp .Lenc_key_ret
+ jmp .Lenc_key_ret_alt
.align 16
-.Lbad_keybits:
+.Lbad_keybits_alt:
mov \$-2,%rax
-.Lenc_key_ret:
+.Lenc_key_ret_alt:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
@@ -3551,76 +3658,7 @@ ${PREFIX}_set_encrypt_key:
ret
.cfi_endproc
.seh_endproc
-
-.align 16
-.Lkey_expansion_128:
- $movkey %xmm0,(%rax)
- lea 16(%rax),%rax
-.Lkey_expansion_128_cold:
- shufps \$0b00010000,%xmm0,%xmm4
- xorps %xmm4, %xmm0
- shufps \$0b10001100,%xmm0,%xmm4
- xorps %xmm4, %xmm0
- shufps \$0b11111111,%xmm1,%xmm1 # critical path
- xorps %xmm1,%xmm0
- ret
-
-.align 16
-.Lkey_expansion_192a:
- $movkey %xmm0,(%rax)
- lea 16(%rax),%rax
-.Lkey_expansion_192a_cold:
- movaps %xmm2, %xmm5
-.Lkey_expansion_192b_warm:
- shufps \$0b00010000,%xmm0,%xmm4
- movdqa %xmm2,%xmm3
- xorps %xmm4,%xmm0
- shufps \$0b10001100,%xmm0,%xmm4
- pslldq \$4,%xmm3
- xorps %xmm4,%xmm0
- pshufd \$0b01010101,%xmm1,%xmm1 # critical path
- pxor %xmm3,%xmm2
- pxor %xmm1,%xmm0
- pshufd \$0b11111111,%xmm0,%xmm3
- pxor %xmm3,%xmm2
- ret
-
-.align 16
-.Lkey_expansion_192b:
- movaps %xmm0,%xmm3
- shufps \$0b01000100,%xmm0,%xmm5
- $movkey %xmm5,(%rax)
- shufps \$0b01001110,%xmm2,%xmm3
- $movkey %xmm3,16(%rax)
- lea 32(%rax),%rax
- jmp .Lkey_expansion_192b_warm
-
-.align 16
-.Lkey_expansion_256a:
- $movkey %xmm2,(%rax)
- lea 16(%rax),%rax
-.Lkey_expansion_256a_cold:
- shufps \$0b00010000,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps \$0b10001100,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps \$0b11111111,%xmm1,%xmm1 # critical path
- xorps %xmm1,%xmm0
- ret
-
-.align 16
-.Lkey_expansion_256b:
- $movkey %xmm0,(%rax)
- lea 16(%rax),%rax
-
- shufps \$0b00010000,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps \$0b10001100,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps \$0b10101010,%xmm1,%xmm1 # critical path
- xorps %xmm1,%xmm2
- ret
-.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size ${PREFIX}_set_encrypt_key_alt,.-${PREFIX}_set_encrypt_key_alt
___
}
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index e7f55d2..7d2db3b 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -79,7 +79,27 @@ void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
// On x86 and x86_64, |aes_hw_set_decrypt_key| is implemented in terms of
// |aes_hw_set_encrypt_key| and a conversion function.
void aes_hw_encrypt_key_to_decrypt_key(AES_KEY *key);
-#endif
+
+// There are two variants of this function, one which uses aeskeygenassist
+// ("base") and one which uses aesenclast + pshufb ("alt"). aesenclast is
+// overall faster but is slower on some older processors. It doesn't use AVX,
+// but AVX is used as a proxy to detecting this. See
+// https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ
+//
+// TODO(davidben): It is unclear if the aeskeygenassist version is still
+// worthwhile. However, the aesenclast version requires SSSE3. SSSE3 long
+// predates AES-NI, but it's not clear if AES-NI implies SSSE3. In OpenSSL, the
+// CCM AES-NI assembly seems to assume it does.
+OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_capable(void) {
+ return hwaes_capable() && CRYPTO_is_SSSE3_capable();
+}
+OPENSSL_INLINE int aes_hw_set_encrypt_key_alt_preferred(void) {
+ return hwaes_capable() && CRYPTO_is_AVX_capable();
+}
+int aes_hw_set_encrypt_key_base(const uint8_t *user_key, int bits,
+ AES_KEY *key);
+int aes_hw_set_encrypt_key_alt(const uint8_t *user_key, int bits, AES_KEY *key);
+#endif // OPENSSL_X86 || OPENSSL_X86_64
#else
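[Editorial note: a hypothetical caller-side sketch of how the internal.h helpers above are meant to compose. It mirrors the aes_hw_set_encrypt_key wrapper added in aes.c earlier in this patch and is not additional API.]

/* Assuming AES-NI was already detected via hwaes_capable(): prefer the
 * aesenclast-based schedule where the AVX proxy says it should be faster,
 * otherwise fall back to the aeskeygenassist version. */
static int set_encrypt_key_dispatch(const uint8_t *user_key, int bits,
                                    AES_KEY *key) {
  if (aes_hw_set_encrypt_key_alt_preferred()) {
    return aes_hw_set_encrypt_key_alt(user_key, bits, key);
  }
  return aes_hw_set_encrypt_key_base(user_key, bits, key);
}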
diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl
index f0d6310..b10eaf6 100644
--- a/crypto/perlasm/x86asm.pl
+++ b/crypto/perlasm/x86asm.pl
@@ -42,10 +42,10 @@ sub ::record_function_hit
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST");
&push("ebx");
&push("edx");
- &call(&label("pic"));
- &set_label("pic");
+ &call(&label("pic_for_function_hit"));
+ &set_label("pic_for_function_hit");
&blindpop("ebx");
- &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic"),"ebx"));
+ &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic_for_function_hit"),"ebx"));
&mov("edx", 1);
&movb(&BP(0, "ebx"), "dl");
&pop("edx");