diff options
Diffstat (limited to 'src/crypto/chacha')
-rwxr-xr-x | src/crypto/chacha/asm/chacha-armv4.pl | 50 | ||||
-rwxr-xr-x | src/crypto/chacha/asm/chacha-armv8.pl | 33 | ||||
-rwxr-xr-x | src/crypto/chacha/asm/chacha-x86_64.pl | 98 | ||||
-rw-r--r-- | src/crypto/chacha/chacha.c | 35 | ||||
-rw-r--r-- | src/crypto/chacha/chacha_test.cc | 44 | ||||
-rw-r--r-- | src/crypto/chacha/internal.h | 52 |
6 files changed, 174 insertions, 138 deletions
diff --git a/src/crypto/chacha/asm/chacha-armv4.pl b/src/crypto/chacha/asm/chacha-armv4.pl index 1f5ceff..fd92fdb 100755 --- a/src/crypto/chacha/asm/chacha-armv4.pl +++ b/src/crypto/chacha/asm/chacha-armv4.pl @@ -196,46 +196,16 @@ $code.=<<___; .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral .Lone: .long 1,0,0,0 -#if __ARM_MAX_ARCH__>=7 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.LChaCha20_ctr32 -#else -.word -1 -#endif -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,%function +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function .align 5 -ChaCha20_ctr32: -.LChaCha20_ctr32: +ChaCha20_ctr32_nohw: ldr r12,[sp,#0] @ pull pointer to counter and nonce stmdb sp!,{r0-r2,r4-r11,lr} -#if __ARM_ARCH<7 && !defined(__thumb2__) - sub r14,pc,#16 @ ChaCha20_ctr32 -#else - adr r14,.LChaCha20_ctr32 -#endif - cmp r2,#0 @ len==0? -#ifdef __thumb2__ - itt eq -#endif - addeq sp,sp,#4*3 - beq .Lno_data -#if __ARM_MAX_ARCH__>=7 - cmp r2,#192 @ test len - bls .Lshort - ldr r4,[r14,#-32] - ldr r4,[r14,r4] -# ifdef __APPLE__ - ldr r4,[r4] -# endif - tst r4,#ARMV7_NEON - bne .LChaCha20_neon -.Lshort: -#endif + adr r14,.Lsigma ldmia r12,{r4-r7} @ load counter and nonce sub sp,sp,#4*(16) @ off-load area - sub r14,r14,#64 @ .Lsigma stmdb sp!,{r4-r7} @ copy counter and nonce ldmia r3,{r4-r11} @ load key ldmia r14,{r0-r3} @ load sigma @@ -626,9 +596,8 @@ $code.=<<___; .Ldone: add sp,sp,#4*(32+3) -.Lno_data: ldmia sp!,{r4-r11,pc} -.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ {{{ @@ -670,12 +639,12 @@ $code.=<<___; .arch armv7-a .fpu neon -.type ChaCha20_neon,%function +.globl ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function .align 5 -ChaCha20_neon: +ChaCha20_ctr32_neon: ldr r12,[sp,#0] @ pull pointer to counter and nonce stmdb sp!,{r0-r2,r4-r11,lr} -.LChaCha20_neon: adr r14,.Lsigma vstmdb sp!,{d8-d15} @ ABI spec says so stmdb sp!,{r0-r3} @@ -1150,8 +1119,7 @@ $code.=<<___; vldmia sp,{d8-d15} add sp,sp,#4*(16+3) ldmia sp!,{r4-r11,pc} -.size ChaCha20_neon,.-ChaCha20_neon -.comm OPENSSL_armcap_P,4,4 +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon #endif ___ }}} diff --git a/src/crypto/chacha/asm/chacha-armv8.pl b/src/crypto/chacha/asm/chacha-armv8.pl index a519b5f..6818da2 100755 --- a/src/crypto/chacha/asm/chacha-armv8.pl +++ b/src/crypto/chacha/asm/chacha-armv8.pl @@ -122,9 +122,6 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); $code.=<<___; #include <openssl/arm_arch.h> -.extern OPENSSL_armcap_P -.hidden OPENSSL_armcap_P - .section .rodata .align 5 @@ -136,24 +133,10 @@ $code.=<<___; .text -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,%function +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function .align 5 -ChaCha20_ctr32: - AARCH64_VALID_CALL_TARGET - cbz $len,.Labort -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp @x[0],:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp @x[0],:pg_hi21:OPENSSL_armcap_P -#endif - cmp $len,#192 - b.lo .Lshort - ldr w17,[@x[0],:lo12:OPENSSL_armcap_P] - tst w17,#ARMV7_NEON - b.ne ChaCha20_neon - -.Lshort: +ChaCha20_ctr32_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -276,7 +259,6 @@ $code.=<<___; ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER -.Labort: ret .align 4 @@ -334,7 +316,7 @@ $code.=<<___; ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret -.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ {{{ @@ -375,9 +357,10 @@ my ($a,$b,$c,$d,$t)=@_; $code.=<<___; -.type ChaCha20_neon,%function +.globl ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function .align 5 -ChaCha20_neon: +ChaCha20_ctr32_neon: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -690,7 +673,7 @@ $code.=<<___; ldp x29,x30,[sp],#96 AARCH64_VALIDATE_LINK_REGISTER ret -.size ChaCha20_neon,.-ChaCha20_neon +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon ___ { my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; diff --git a/src/crypto/chacha/asm/chacha-x86_64.pl b/src/crypto/chacha/asm/chacha-x86_64.pl index 418044c..6d26b71 100755 --- a/src/crypto/chacha/asm/chacha-x86_64.pl +++ b/src/crypto/chacha/asm/chacha-x86_64.pl @@ -76,8 +76,6 @@ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; $code.=<<___; .text -.extern OPENSSL_ia32cap_P - .section .rodata .align 64 .Lzero: @@ -226,24 +224,12 @@ my @x=map("\"$_\"",@x); ######################################################################## # Generic code path that handles all lengths on pre-SSSE3 processors. $code.=<<___; -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,\@function,5 +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,\@function,5 .align 64 -ChaCha20_ctr32: +ChaCha20_ctr32_nohw: .cfi_startproc _CET_ENDBR - cmp \$0,$len - je .Lno_data - mov OPENSSL_ia32cap_P+4(%rip),%r10 -___ -$code.=<<___ if ($avx>2); - bt \$48,%r10 # check for AVX512F - jc .LChaCha20_avx512 -___ -$code.=<<___; - test \$`1<<(41-32)`,%r10d - jnz .LChaCha20_ssse3 - push %rbx .cfi_push rbx push %rbp @@ -415,7 +401,7 @@ $code.=<<___; .Lno_data: ret .cfi_endproc -.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ ######################################################################## @@ -450,19 +436,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round my $xframe = $win64 ? 32+8 : 8; $code.=<<___; -.type ChaCha20_ssse3,\@function,5 +.globl ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,\@function,5 .align 32 -ChaCha20_ssse3: -.LChaCha20_ssse3: +ChaCha20_ctr32_ssse3: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 ___ $code.=<<___; - cmp \$128,$len # we might throw away some data, - ja .LChaCha20_4x # but overall it won't be slower - -.Ldo_sse3_after_all: sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); @@ -572,7 +555,7 @@ $code.=<<___; .Lssse3_epilogue: ret .cfi_endproc -.size ChaCha20_ssse3,.-ChaCha20_ssse3 +.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3 ___ } @@ -710,29 +693,17 @@ my @x=map("\"$_\"",@xx); my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; -.type ChaCha20_4x,\@function,5 +.globl ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,\@function,5 .align 32 -ChaCha20_4x: -.LChaCha20_4x: +ChaCha20_ctr32_ssse3_4x: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 mov %r10,%r11 ___ -$code.=<<___ if ($avx>1); - shr \$32,%r10 # OPENSSL_ia32cap_P+8 - test \$`1<<5`,%r10 # test AVX2 - jnz .LChaCha20_8x -___ $code.=<<___; - cmp \$192,$len - ja .Lproceed4x - - and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE - cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE - je .Ldo_sse3_after_all # to detect Atom - -.Lproceed4x: sub \$0x140+$xframe,%rsp ___ ################ stack layout @@ -1160,7 +1131,7 @@ $code.=<<___; .L4x_epilogue: ret .cfi_endproc -.size ChaCha20_4x,.-ChaCha20_4x +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x ___ } @@ -1289,11 +1260,12 @@ my @x=map("\"$_\"",@xx); my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; -.type ChaCha20_8x,\@function,5 +.globl ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,\@function,5 .align 32 -ChaCha20_8x: -.LChaCha20_8x: +ChaCha20_ctr32_avx2: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame register .cfi_def_cfa_register r9 sub \$0x280+$xframe,%rsp @@ -1805,7 +1777,7 @@ $code.=<<___; .L8x_epilogue: ret .cfi_endproc -.size ChaCha20_8x,.-ChaCha20_8x +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 ___ } @@ -2715,22 +2687,22 @@ full_handler: .section .pdata .align 4 - .rva .LSEH_begin_ChaCha20_ctr32 - .rva .LSEH_end_ChaCha20_ctr32 - .rva .LSEH_info_ChaCha20_ctr32 + .rva .LSEH_begin_ChaCha20_ctr32_nohw + .rva .LSEH_end_ChaCha20_ctr32_nohw + .rva .LSEH_info_ChaCha20_ctr32_nohw - .rva .LSEH_begin_ChaCha20_ssse3 - .rva .LSEH_end_ChaCha20_ssse3 - .rva .LSEH_info_ChaCha20_ssse3 + .rva .LSEH_begin_ChaCha20_ctr32_ssse3 + .rva .LSEH_end_ChaCha20_ctr32_ssse3 + .rva .LSEH_info_ChaCha20_ctr32_ssse3 - .rva .LSEH_begin_ChaCha20_4x - .rva .LSEH_end_ChaCha20_4x - .rva .LSEH_info_ChaCha20_4x + .rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_end_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_info_ChaCha20_ctr32_ssse3_4x ___ $code.=<<___ if ($avx>1); - .rva .LSEH_begin_ChaCha20_8x - .rva .LSEH_end_ChaCha20_8x - .rva .LSEH_info_ChaCha20_8x + .rva .LSEH_begin_ChaCha20_ctr32_avx2 + .rva .LSEH_end_ChaCha20_ctr32_avx2 + .rva .LSEH_info_ChaCha20_ctr32_avx2 ___ $code.=<<___ if ($avx>2); .rva .LSEH_begin_ChaCha20_avx512 @@ -2744,22 +2716,22 @@ ___ $code.=<<___; .section .xdata .align 8 -.LSEH_info_ChaCha20_ctr32: +.LSEH_info_ChaCha20_ctr32_nohw: .byte 9,0,0,0 .rva se_handler -.LSEH_info_ChaCha20_ssse3: +.LSEH_info_ChaCha20_ctr32_ssse3: .byte 9,0,0,0 .rva ssse3_handler .rva .Lssse3_body,.Lssse3_epilogue -.LSEH_info_ChaCha20_4x: +.LSEH_info_ChaCha20_ctr32_ssse3_4x: .byte 9,0,0,0 .rva full_handler .rva .L4x_body,.L4x_epilogue ___ $code.=<<___ if ($avx>1); -.LSEH_info_ChaCha20_8x: +.LSEH_info_ChaCha20_ctr32_avx2: .byte 9,0,0,0 .rva full_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] diff --git a/src/crypto/chacha/chacha.c b/src/crypto/chacha/chacha.c index a4d88c0..68c0c5d 100644 --- a/src/crypto/chacha/chacha.c +++ b/src/crypto/chacha/chacha.c @@ -60,7 +60,40 @@ void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32], OPENSSL_memcpy(&out[16], &x[12], sizeof(uint32_t) * 4); } -#if defined(CHACHA20_ASM) +#if defined(CHACHA20_ASM_NOHW) +static void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]) { +#if defined(CHACHA20_ASM_NEON) + if (ChaCha20_ctr32_neon_capable(in_len)) { + ChaCha20_ctr32_neon(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_AVX2) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + ChaCha20_ctr32_avx2(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + ChaCha20_ctr32_ssse3_4x(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + ChaCha20_ctr32_ssse3(out, in, in_len, key, counter); + return; + } +#endif + if (in_len > 0) { + ChaCha20_ctr32_nohw(out, in, in_len, key, counter); + } +} +#endif + +#if defined(CHACHA20_ASM) || defined(CHACHA20_ASM_NOHW) void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, const uint8_t key[32], const uint8_t nonce[12], diff --git a/src/crypto/chacha/chacha_test.cc b/src/crypto/chacha/chacha_test.cc index d4e5332..ff7bfd9 100644 --- a/src/crypto/chacha/chacha_test.cc +++ b/src/crypto/chacha/chacha_test.cc @@ -347,7 +347,40 @@ TEST(ChaChaTest, CounterOverflow) { } } -#if defined(CHACHA20_ASM) && defined(SUPPORTS_ABI_TEST) +#if defined(SUPPORTS_ABI_TEST) + +static void check_abi(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]) { +#if defined(CHACHA20_ASM) + CHECK_ABI(ChaCha20_ctr32, out, in, in_len, key, counter); +#endif +#if defined(CHACHA20_ASM_NEON) + if (ChaCha20_ctr32_neon_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_neon, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_AVX2) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_avx2, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_ssse3_4x, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_ssse3, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_NOHW) + if (in_len > 0) { + CHECK_ABI(ChaCha20_ctr32_nohw, out, in, in_len, key, counter); + } +#endif +} + TEST(ChaChaTest, ABI) { uint32_t key[8]; OPENSSL_memcpy(key, kKey, sizeof(key)); @@ -357,14 +390,15 @@ TEST(ChaChaTest, ABI) { auto buf = std::make_unique<uint8_t[]>(sizeof(kInput)); for (size_t len = 0; len <= 32; len++) { SCOPED_TRACE(len); - CHECK_ABI(ChaCha20_ctr32, buf.get(), kInput, len, key, kCounterNonce); + check_abi(buf.get(), kInput, len, key, kCounterNonce); } for (size_t len : {32 * 2, 32 * 4, 32 * 8, 32 * 16, 32 * 24}) { SCOPED_TRACE(len); - CHECK_ABI(ChaCha20_ctr32, buf.get(), kInput, len, key, kCounterNonce); + check_abi(buf.get(), kInput, len, key, kCounterNonce); // Cover the partial block paths. - CHECK_ABI(ChaCha20_ctr32, buf.get(), kInput, len + 15, key, kCounterNonce); + check_abi(buf.get(), kInput, len + 15, key, kCounterNonce); } } -#endif // CHACHA20_ASM && SUPPORTS_ABI_TEST + +#endif // SUPPORTS_ABI_TEST diff --git a/src/crypto/chacha/internal.h b/src/crypto/chacha/internal.h index 5f442ec..48eb033 100644 --- a/src/crypto/chacha/internal.h +++ b/src/crypto/chacha/internal.h @@ -17,6 +17,8 @@ #include <openssl/base.h> +#include "../internal.h" + #if defined(__cplusplus) extern "C" { #endif @@ -27,11 +29,49 @@ extern "C" { void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32], const uint8_t nonce[16]); -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) + #define CHACHA20_ASM +#elif !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) + +#define CHACHA20_ASM_NOHW + +#define CHACHA20_ASM_NEON +OPENSSL_INLINE int ChaCha20_ctr32_neon_capable(size_t len) { + return (len >= 192) && CRYPTO_is_NEON_capable(); +} +void ChaCha20_ctr32_neon(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) +#define CHACHA20_ASM_NOHW + +#define CHACHA20_ASM_AVX2 +OPENSSL_INLINE int ChaCha20_ctr32_avx2_capable(size_t len) { + return (len > 128) && CRYPTO_is_AVX2_capable(); +} +void ChaCha20_ctr32_avx2(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3_4X +OPENSSL_INLINE int ChaCha20_ctr32_ssse3_4x_capable(size_t len) { + int capable = (len > 128) && CRYPTO_is_SSSE3_capable(); + int faster = (len > 192) || !CRYPTO_cpu_perf_is_like_silvermont(); + return capable && faster; +} +void ChaCha20_ctr32_ssse3_4x(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3 +OPENSSL_INLINE int ChaCha20_ctr32_ssse3_capable(size_t len) { + return (len > 128) && CRYPTO_is_SSSE3_capable(); +} +void ChaCha20_ctr32_ssse3(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); +#endif + +#if defined(CHACHA20_ASM) // ChaCha20_ctr32 encrypts |in_len| bytes from |in| and writes the result to // |out|. If |in| and |out| alias, they must be equal. // @@ -44,6 +84,12 @@ void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len, const uint32_t key[8], const uint32_t counter[4]); #endif +#if defined(CHACHA20_ASM_NOHW) +// ChaCha20_ctr32_nohw is like |ChaCha20_ctr32| except |in_len| must be nonzero. +void ChaCha20_ctr32_nohw(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); +#endif + #if defined(__cplusplus) } // extern C |