diff options
author | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-07-24 14:47:23 +0300 |
---|---|---|
committer | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-07-24 14:47:23 +0300 |
commit | 99017161354321845d11dce4fcd3abfebc5dd0d5 (patch) | |
tree | 50c62fe44aef915a84b1eb5fb0ad787e39f5a210 /sysdeps | |
parent | 3bcea719ddd6ce399d7bccb492c40af77d216e42 (diff) | |
download | glibc-99017161354321845d11dce4fcd3abfebc5dd0d5.zip glibc-99017161354321845d11dce4fcd3abfebc5dd0d5.tar.gz glibc-99017161354321845d11dce4fcd3abfebc5dd0d5.tar.bz2 |
Fixed several libmvec bugs found during testing on KNL hardware.
This fixes the AVX512 IFUNC implementations, the wrappers to the
AVX2 versions, and the KNL expf implementation.
* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
implementation.
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 1 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/svml_d_wrapper_impl.h | 202 | ||||
-rw-r--r-- | sysdeps/x86_64/fpu/svml_s_wrapper_impl.h | 101 |
15 files changed, 201 insertions, 223 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S index ba3b66f..d0f4f27 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN8v_cos) .type _ZGVeN8v_cos, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN8v_cos_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN8v_cos_knl(%rip), %rax + jnz 2f + leaq _ZGVeN8v_cos_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN8v_cos) #define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S index 8f837fb..7b7c07d 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN8v_exp) .type _ZGVeN8v_exp, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN8v_exp_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN8v_exp_knl(%rip), %rax + jnz 2f + leaq _ZGVeN8v_exp_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN8v_exp) #define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S index 2f9e9d8..76375fd 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S @@ -23,16 +23,16 @@ ENTRY 
(_ZGVeN8v_log) .type _ZGVeN8v_log, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN8v_log_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN8v_log_knl(%rip), %rax + jnz 2f + leaq _ZGVeN8v_log_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN8v_log) #define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S index 3b11511..c1e5e76 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN8vv_pow) .type _ZGVeN8vv_pow, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN8vv_pow_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN8vv_pow_knl(%rip), %rax + jnz 2f + leaq _ZGVeN8vv_pow_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN8vv_pow) #define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S index ba63102..131f2f4 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN8v_sin) .type _ZGVeN8v_sin, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN8v_sin_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: 
leaq _ZGVeN8v_sin_knl(%rip), %rax + jnz 2f + leaq _ZGVeN8v_sin_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN8v_sin) #define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S index 7228ba5..e331090 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN8vvv_sincos) .type _ZGVeN8vvv_sincos, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN8vvv_sincos_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN8vvv_sincos_knl(%rip), %rax + jnz 2f + leaq _ZGVeN8vvv_sincos_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN8vvv_sincos) #define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S index 91564de..0654d3c 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN16v_cosf) .type _ZGVeN16v_cosf, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN16v_cosf_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN16v_cosf_knl(%rip), %rax + jnz 2f + leaq _ZGVeN16v_cosf_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq 
_ZGVeN16v_cosf_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN16v_cosf) #define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S index 3b3489d..62858eb 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN16v_expf) .type _ZGVeN16v_expf, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN16v_expf_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN16v_expf_knl(%rip), %rax + jnz 2f + leaq _ZGVeN16v_expf_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN16v_expf) #define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S index cb807e0..ec69055 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S @@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf The table lookup is skipped if k = 0. For low accuracy approximation, exp(r) ~ 1 or 1+r. 
*/ + pushq %rbp cfi_adjust_cfa_offset (8) cfi_rel_offset (%rbp, 0) movq %rsp, %rbp diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S index 8756750..68c57e4 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN16v_logf) .type _ZGVeN16v_logf, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN16v_logf_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN16v_logf_knl(%rip), %rax + jnz 2f + leaq _ZGVeN16v_logf_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN16v_logf) #define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S index a4ba4fb..3aa9f95 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN16vv_powf) .type _ZGVeN16vv_powf, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN16vv_powf_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN16vv_powf_knl(%rip), %rax + jnz 2f + leaq _ZGVeN16vv_powf_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN16vv_powf) #define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S index 
0a1753e..bdcabab 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN16vvv_sincosf) .type _ZGVeN16vvv_sincosf, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax + jnz 2f + leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN16vvv_sincosf) #define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S index 7ed637b..3ec78a0 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S @@ -23,16 +23,16 @@ ENTRY (_ZGVeN16v_sinf) .type _ZGVeN16v_sinf, @gnu_indirect_function cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1 + jne 1f call __init_cpu_features 1: leaq _ZGVeN16v_sinf_skx(%rip), %rax testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) - jnz 3 -2: leaq _ZGVeN16v_sinf_knl(%rip), %rax + jnz 2f + leaq _ZGVeN16v_sinf_knl(%rip), %rax testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) - jnz 3 + jnz 2f leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax -3: ret +2: ret END (_ZGVeN16v_sinf) #define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h index bd93b8e..5c0ff89 100644 --- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h +++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h @@ -194,39 +194,39 @@ /* AVX512 ISA version as wrapper to AVX2 ISA version. 
*/ .macro WRAPPER_IMPL_AVX512 callee - pushq %rbp + pushq %rbp cfi_adjust_cfa_offset (8) cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp + movq %rsp, %rbp cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $64, %rsp -/* Below is encoding for vmovaps %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x04 - .byte 0x24 -/* Below is encoding for vmovapd (%rsp), %ymm0. */ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x04 - .byte 0x24 - call HIDDEN_JUMPTARGET(\callee) -/* Below is encoding for vmovapd 32(%rsp), %ymm0. */ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x44 - .byte 0x24 - .byte 0x20 - call HIDDEN_JUMPTARGET(\callee) - movq %rbp, %rsp + andq $-64, %rsp + subq $128, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 64(%rsp) + vmovupd 32(%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 96(%rsp) +/* Below is encoding for vmovups 64(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x01 + movq %rbp, %rsp cfi_def_cfa_register (%rsp) - popq %rbp + popq %rbp cfi_adjust_cfa_offset (-8) cfi_restore (%rbp) ret @@ -234,61 +234,50 @@ /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */ .macro WRAPPER_IMPL_AVX512_ff callee - pushq %rbp + pushq %rbp cfi_adjust_cfa_offset (8) cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp + movq %rsp, %rbp cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $128, %rsp -/* Below is encoding for vmovaps %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x04 - .byte 0x24 -/* Below is encoding for vmovaps %zmm1, 64(%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x4c - .byte 0x24 -/* Below is encoding for vmovapd (%rsp), %ymm0. 
*/ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x04 - .byte 0x24 -/* Below is encoding for vmovapd 64(%rsp), %ymm1. */ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x4c - .byte 0x24 - .byte 0x40 - call HIDDEN_JUMPTARGET(\callee) -/* Below is encoding for vmovapd 32(%rsp), %ymm0. */ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x44 - .byte 0x24 - .byte 0x20 -/* Below is encoding for vmovapd 96(%rsp), %ymm1. */ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x4c - .byte 0x24 - .byte 0x60 - call HIDDEN_JUMPTARGET(\callee) - movq %rbp, %rsp + andq $-64, %rsp + subq $192, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovups %zmm1, 64(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x01 + vmovupd (%rsp), %ymm0 + vmovupd 64(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 128(%rsp) + vmovupd 32(%rsp), %ymm0 + vmovupd 96(%rsp), %ymm1 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 160(%rsp) +/* Below is encoding for vmovups 128(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x02 + movq %rbp, %rsp cfi_def_cfa_register (%rsp) - popq %rbp + popq %rbp cfi_adjust_cfa_offset (-8) cfi_restore (%rbp) ret @@ -310,61 +299,26 @@ cfi_rel_offset (%r13, 0) subq $176, %rsp movq %rsi, %r13 -/* Below is encoding for vmovaps %zmm0, (%rsp). */ +/* Below is encoding for vmovups %zmm0, (%rsp). */ .byte 0x62 .byte 0xf1 .byte 0x7c .byte 0x48 - .byte 0x29 + .byte 0x11 .byte 0x04 .byte 0x24 movq %rdi, %r12 -/* Below is encoding for vmovapd (%rsp), %ymm0. */ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x04 - .byte 0x24 + vmovupd (%rsp), %ymm0 call HIDDEN_JUMPTARGET(\callee) -/* Below is encoding for vmovapd 32(%rsp), %ymm0. 
*/ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x44 - .byte 0x24 - .byte 0x20 + vmovupd 32(%rsp), %ymm0 lea 64(%rsp), %rdi lea 96(%rsp), %rsi call HIDDEN_JUMPTARGET(\callee) -/* Below is encoding for vmovapd 64(%rsp), %ymm0. */ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x44 - .byte 0x24 - .byte 0x40 -/* Below is encoding for vmovapd 96(%rsp), %ymm1. */ - .byte 0xc5 - .byte 0xfd - .byte 0x28 - .byte 0x4c - .byte 0x24 - .byte 0x60 -/* Below is encoding for vmovapd %ymm0, 32(%r12). */ - .byte 0xc4 - .byte 0xc1 - .byte 0x7d - .byte 0x29 - .byte 0x44 - .byte 0x24 - .byte 0x20 -/* Below is encoding for vmovapd %ymm1, 32(%r13). */ - .byte 0xc4 - .byte 0xc1 - .byte 0x7d - .byte 0x29 - .byte 0x4d - .byte 0x20 + vmovupd 64(%rsp), %ymm0 + vmovupd 96(%rsp), %ymm1 + vmovupd %ymm0, 32(%r12) + vmovupd %ymm1, 32(%r13) + vzeroupper addq $176, %rsp popq %r13 cfi_adjust_cfa_offset (-8) diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h index 66bb081..d255d19 100644 --- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h +++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h @@ -239,28 +239,39 @@ /* AVX512 ISA version as wrapper to AVX2 ISA version. */ .macro WRAPPER_IMPL_AVX512 callee - pushq %rbp + pushq %rbp cfi_adjust_cfa_offset (8) cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp + movq %rsp, %rbp cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $64, %rsp -/* Below is encoding for vmovaps %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x04 - .byte 0x24 - vmovaps (%rsp), %ymm0 - call HIDDEN_JUMPTARGET(\callee) - vmovaps 32(%rsp), %ymm0 - call HIDDEN_JUMPTARGET(\callee) - movq %rbp, %rsp + andq $-64, %rsp + subq $128, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). 
*/ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 64(%rsp) + vmovupd 32(%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 96(%rsp) +/* Below is encoding for vmovups 64(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x01 + movq %rbp, %rsp cfi_def_cfa_register (%rsp) - popq %rbp + popq %rbp cfi_adjust_cfa_offset (-8) cfi_restore (%rbp) ret @@ -274,29 +285,41 @@ movq %rsp, %rbp cfi_def_cfa_register (%rbp) andq $-64, %rsp - subq $128, %rsp -/* Below is encoding for vmovaps %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x04 - .byte 0x24 -/* Below is encoding for vmovaps %zmm1, 64(%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x4c - .byte 0x24 - vmovaps (%rsp), %ymm0 - vmovaps 64(%rsp), %ymm1 + subq $192, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovups %zmm1, 64(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x01 + vmovups (%rsp), %ymm0 + vmovups 64(%rsp), %ymm1 call HIDDEN_JUMPTARGET(\callee) - vmovaps 32(%rsp), %ymm0 - vmovaps 96(%rsp), %ymm1 + vmovups %ymm0, 128(%rsp) + vmovups 32(%rsp), %ymm0 + vmovups 96(%rsp), %ymm1 call HIDDEN_JUMPTARGET(\callee) + vmovups %ymm0, 160(%rsp) +/* Below is encoding for vmovups 128(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x02 movq %rbp, %rsp cfi_def_cfa_register (%rsp) popq %rbp |