diff options
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/x86/cpu-features.c | 14 | ||||
-rw-r--r-- | sysdeps/x86/cpu-features.h | 6 | ||||
-rw-r--r-- | sysdeps/x86_64/dl-machine.h | 24 | ||||
-rw-r--r-- | sysdeps/x86_64/dl-trampoline.S | 22 | ||||
-rw-r--r-- | sysdeps/x86_64/dl-trampoline.h | 104 |
5 files changed, 167 insertions, 3 deletions
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 218ff2b..b1915cd 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -134,6 +134,20 @@ init_cpu_features (struct cpu_features *cpu_features) break; } } + + /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow. + If XGETBV supports ECX == 1, use _dl_runtime_resolve_opt. */ + cpu_features->feature[index_Use_dl_runtime_resolve_slow] + |= bit_Use_dl_runtime_resolve_slow; + if (cpu_features->max_cpuid >= 0xd) + { + unsigned int eax; + + __cpuid_count (0xd, 1, eax, ebx, ecx, edx); + if ((eax & (1 << 2)) != 0) + cpu_features->feature[index_Use_dl_runtime_resolve_opt] + |= bit_Use_dl_runtime_resolve_opt; + } } /* This spells out "AuthenticAMD". */ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h index e354920..22c9eaf 100644 --- a/sysdeps/x86/cpu-features.h +++ b/sysdeps/x86/cpu-features.h @@ -35,6 +35,8 @@ #define bit_I686 (1 << 15) #define bit_Prefer_MAP_32BIT_EXEC (1 << 16) #define bit_Prefer_No_VZEROUPPER (1 << 17) +#define bit_Use_dl_runtime_resolve_opt (1 << 18) +#define bit_Use_dl_runtime_resolve_slow (1 << 19) /* CPUID Feature flags. 
*/ @@ -101,6 +103,8 @@ # define index_I686 FEATURE_INDEX_1*FEATURE_SIZE # define index_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE # define index_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE +# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE +# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1*FEATURE_SIZE # if defined (_LIBC) && !IS_IN (nonlib) @@ -255,6 +259,8 @@ extern const struct cpu_features *__get_cpu_features (void) # define index_I686 FEATURE_INDEX_1 # define index_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1 # define index_Prefer_No_VZEROUPPER FEATURE_INDEX_1 +# define index_Use_dl_runtime_resolve_opt FEATURE_INDEX_1 +# define index_Use_dl_runtime_resolve_slow FEATURE_INDEX_1 #endif /* !__ASSEMBLER__ */ diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index 980ca73..e6a17e3 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -68,7 +68,10 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) Elf64_Addr *got; extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden; extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden; extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; @@ -118,9 +121,26 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) indicated by the offset on the stack, and then jump to the resolved address. 
*/ if (HAS_ARCH_FEATURE (AVX512F_Usable)) - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512; + { + if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt)) + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt; + else + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx512; + } else if (HAS_ARCH_FEATURE (AVX_Usable)) - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx; + { + if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt)) + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt; + else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow)) + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow; + else + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_avx; + } else *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse; } diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 39b8771..3f812b8 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -18,6 +18,7 @@ #include <config.h> #include <sysdep.h> +#include <cpu-features.h> #include <link-defines.h> #ifndef DL_STACK_ALIGNMENT @@ -87,9 +88,11 @@ # endif # define VEC(i) zmm##i # define _dl_runtime_resolve _dl_runtime_resolve_avx512 +# define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt # define _dl_runtime_profile _dl_runtime_profile_avx512 # include "dl-trampoline.h" # undef _dl_runtime_resolve +# undef _dl_runtime_resolve_opt # undef _dl_runtime_profile # undef VEC # undef VMOV @@ -98,6 +101,8 @@ #else strong_alias (_dl_runtime_resolve_avx, _dl_runtime_resolve_avx512) .hidden _dl_runtime_resolve_avx512 +strong_alias (_dl_runtime_resolve_avx_opt, _dl_runtime_resolve_avx512_opt) + .hidden _dl_runtime_resolve_avx512_opt strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512) .hidden _dl_runtime_profile_avx512 #endif @@ -111,9 +116,11 @@ strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512) 
#endif #define VEC(i) ymm##i #define _dl_runtime_resolve _dl_runtime_resolve_avx +#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx_opt #define _dl_runtime_profile _dl_runtime_profile_avx #include "dl-trampoline.h" #undef _dl_runtime_resolve +#undef _dl_runtime_resolve_opt #undef _dl_runtime_profile #undef VEC #undef VMOV @@ -133,3 +140,18 @@ strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512) #define _dl_runtime_profile _dl_runtime_profile_sse #undef RESTORE_AVX #include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef _dl_runtime_profile +#undef VMOV +#undef VMOVA + +/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt + to preserve the full vector registers with zero upper bits. */ +#define VMOVA vmovdqa +#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT +# define VMOV vmovdqa +#else +# define VMOV vmovdqu +#endif +#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex +#include "dl-trampoline.h" diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h index b90836a..abe4471 100644 --- a/sysdeps/x86_64/dl-trampoline.h +++ b/sysdeps/x86_64/dl-trampoline.h @@ -50,6 +50,105 @@ #endif .text +#ifdef _dl_runtime_resolve_opt +/* Use the smallest vector registers to preserve the full YMM/ZMM + registers to avoid SSE transition penalty. */ + +# if VEC_SIZE == 32 +/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero + and preserve %xmm0 - %xmm7 registers with the zero upper bits. Since + there is no SSE transition penalty on AVX512 processors which don't + support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't + provided. 
*/ + .globl _dl_runtime_resolve_avx_slow + .hidden _dl_runtime_resolve_avx_slow + .type _dl_runtime_resolve_avx_slow, @function + .align 16 +_dl_runtime_resolve_avx_slow: + cfi_startproc + cfi_adjust_cfa_offset(16) # Incorporate PLT + vorpd %ymm0, %ymm1, %ymm8 + vorpd %ymm2, %ymm3, %ymm9 + vorpd %ymm4, %ymm5, %ymm10 + vorpd %ymm6, %ymm7, %ymm11 + vorpd %ymm8, %ymm9, %ymm9 + vorpd %ymm10, %ymm11, %ymm10 + vpcmpeqd %xmm8, %xmm8, %xmm8 + vorpd %ymm9, %ymm10, %ymm10 + vptest %ymm10, %ymm8 + # Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any + # %ymm0 - %ymm7 registers aren't zero. + PRESERVE_BND_REGS_PREFIX + jnc _dl_runtime_resolve_avx + # Use vzeroupper to avoid SSE transition penalty. + vzeroupper + # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits + # when the upper 128 bits of %ymm0 - %ymm7 registers are zero. + PRESERVE_BND_REGS_PREFIX + jmp _dl_runtime_resolve_sse_vex + cfi_adjust_cfa_offset(-16) # Restore PLT adjustment + cfi_endproc + .size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow +# endif + +/* Use XGETBV with ECX == 1 to check which bits in vector registers are + non-zero and only preserve the non-zero lower bits with zero upper + bits. */ + .globl _dl_runtime_resolve_opt + .hidden _dl_runtime_resolve_opt + .type _dl_runtime_resolve_opt, @function + .align 16 +_dl_runtime_resolve_opt: + cfi_startproc + cfi_adjust_cfa_offset(16) # Incorporate PLT + pushq %rax + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rax, 0) + pushq %rcx + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rcx, 0) + pushq %rdx + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%rdx, 0) + movl $1, %ecx + xgetbv + movl %eax, %r11d + popq %rdx + cfi_adjust_cfa_offset(-8) + cfi_restore (%rdx) + popq %rcx + cfi_adjust_cfa_offset(-8) + cfi_restore (%rcx) + popq %rax + cfi_adjust_cfa_offset(-8) + cfi_restore (%rax) +# if VEC_SIZE == 32 + # For YMM registers, check if YMM state is in use. 
+ andl $bit_YMM_state, %r11d + # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if + # YMM state isn't in use. + PRESERVE_BND_REGS_PREFIX + jz _dl_runtime_resolve_sse_vex +# elif VEC_SIZE == 64 + # For ZMM registers, check if YMM state and ZMM state are in + # use. + andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d + cmpl $bit_YMM_state, %r11d + # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if + # neither YMM state nor ZMM state are in use. + PRESERVE_BND_REGS_PREFIX + jl _dl_runtime_resolve_sse_vex + # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if + # ZMM state isn't in use. + PRESERVE_BND_REGS_PREFIX + je _dl_runtime_resolve_avx +# else +# error Unsupported VEC_SIZE! +# endif + cfi_adjust_cfa_offset(-16) # Restore PLT adjustment + cfi_endproc + .size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt +#endif .globl _dl_runtime_resolve .hidden _dl_runtime_resolve .type _dl_runtime_resolve, @function @@ -162,7 +261,10 @@ _dl_runtime_resolve: .size _dl_runtime_resolve, .-_dl_runtime_resolve -#ifndef PROF +/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included + twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex. + But we don't need another _dl_runtime_profile for XMM registers. */ +#if !defined PROF && defined _dl_runtime_profile # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0 # error LR_VECTOR_OFFSET must be multples of VEC_SIZE # endif |