aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Senkevich <andrew.senkevich@intel.com>2015-07-24 14:47:23 +0300
committerAndrew Senkevich <andrew.senkevich@intel.com>2015-07-24 14:47:23 +0300
commit99017161354321845d11dce4fcd3abfebc5dd0d5 (patch)
tree50c62fe44aef915a84b1eb5fb0ad787e39f5a210
parent3bcea719ddd6ce399d7bccb492c40af77d216e42 (diff)
downloadglibc-99017161354321845d11dce4fcd3abfebc5dd0d5.zip
glibc-99017161354321845d11dce4fcd3abfebc5dd0d5.tar.gz
glibc-99017161354321845d11dce4fcd3abfebc5dd0d5.tar.bz2
Fixed several libmvec bugs found during testing on KNL hardware.
AVX512 IFUNC implementations, implementations of wrappers to AVX2 versions
and KNL expf implementation fixed.

	* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
	* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
	implementation.
-rw-r--r--ChangeLog19
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S1
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S10
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S10
-rw-r--r--sysdeps/x86_64/fpu/svml_d_wrapper_impl.h202
-rw-r--r--sysdeps/x86_64/fpu/svml_s_wrapper_impl.h101
16 files changed, 220 insertions, 223 deletions
diff --git a/ChangeLog b/ChangeLog
index 6f6016d..3e22413 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-07-24 Andrew Senkevich <andrew.senkevich@intel.com>
+
+ * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
+ * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
+ implementation.
+
2015-07-24 Szabolcs Nagy <szabolcs.nagy@arm.com>
[BZ #17711]
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index ba3b66f..d0f4f27 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8v_cos)
.type _ZGVeN8v_cos, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8v_cos_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8v_cos_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8v_cos_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8v_cos)
#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 8f837fb..7b7c07d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8v_exp)
.type _ZGVeN8v_exp, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8v_exp_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8v_exp_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8v_exp_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8v_exp)
#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 2f9e9d8..76375fd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8v_log)
.type _ZGVeN8v_log, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8v_log_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8v_log_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8v_log_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8v_log)
#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index 3b11511..c1e5e76 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8vv_pow)
.type _ZGVeN8vv_pow, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8vv_pow_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8vv_pow_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8vv_pow_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8vv_pow)
#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index ba63102..131f2f4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8v_sin)
.type _ZGVeN8v_sin, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8v_sin_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8v_sin_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8v_sin_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8v_sin)
#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index 7228ba5..e331090 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8vvv_sincos)
.type _ZGVeN8vvv_sincos, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8vvv_sincos_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8vvv_sincos_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8vvv_sincos_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8vvv_sincos)
#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 91564de..0654d3c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16v_cosf)
.type _ZGVeN16v_cosf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16v_cosf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16v_cosf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16v_cosf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16v_cosf)
#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 3b3489d..62858eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16v_expf)
.type _ZGVeN16v_expf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16v_expf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16v_expf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16v_expf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16v_expf)
#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index cb807e0..ec69055 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
The table lookup is skipped if k = 0.
For low accuracy approximation, exp(r) ~ 1 or 1+r. */
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
movq %rsp, %rbp
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 8756750..68c57e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16v_logf)
.type _ZGVeN16v_logf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16v_logf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16v_logf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16v_logf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16v_logf)
#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index a4ba4fb..3aa9f95 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16vv_powf)
.type _ZGVeN16vv_powf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16vv_powf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16vv_powf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16vv_powf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16vv_powf)
#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index 0a1753e..bdcabab 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16vvv_sincosf)
.type _ZGVeN16vvv_sincosf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16vvv_sincosf)
#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 7ed637b..3ec78a0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16v_sinf)
.type _ZGVeN16v_sinf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16v_sinf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16v_sinf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16v_sinf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16v_sinf)
#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index bd93b8e..5c0ff89 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
.macro WRAPPER_IMPL_AVX512 callee
- pushq %rbp
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
+ movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
- call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
- call HIDDEN_JUMPTARGET(\callee)
- movq %rbp, %rsp
+ andq $-64, %rsp
+ subq $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+ vmovupd (%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 64(%rsp)
+ vmovupd 32(%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x01
+ movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
- popq %rbp
+ popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
@@ -234,61 +234,50 @@
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
.macro WRAPPER_IMPL_AVX512_ff callee
- pushq %rbp
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
+ movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x4c
- .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x40
- call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x60
- call HIDDEN_JUMPTARGET(\callee)
- movq %rbp, %rsp
+ andq $-64, %rsp
+ subq $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x4c
+ .byte 0x24
+ .byte 0x01
+ vmovupd (%rsp), %ymm0
+ vmovupd 64(%rsp), %ymm1
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 128(%rsp)
+ vmovupd 32(%rsp), %ymm0
+ vmovupd 96(%rsp), %ymm1
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x02
+ movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
- popq %rbp
+ popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
@@ -310,61 +299,26 @@
cfi_rel_offset (%r13, 0)
subq $176, %rsp
movq %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
+/* Below is encoding for vmovups %zmm0, (%rsp). */
.byte 0x62
.byte 0xf1
.byte 0x7c
.byte 0x48
- .byte 0x29
+ .byte 0x11
.byte 0x04
.byte 0x24
movq %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
+ vmovupd (%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
+ vmovupd 32(%rsp), %ymm0
lea 64(%rsp), %rdi
lea 96(%rsp), %rsi
call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x40
-/* Below is encoding for vmovapd 96(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x60
-/* Below is encoding for vmovapd %ymm0, 32(%r12). */
- .byte 0xc4
- .byte 0xc1
- .byte 0x7d
- .byte 0x29
- .byte 0x44
- .byte 0x24
- .byte 0x20
-/* Below is encoding for vmovapd %ymm1, 32(%r13). */
- .byte 0xc4
- .byte 0xc1
- .byte 0x7d
- .byte 0x29
- .byte 0x4d
- .byte 0x20
+ vmovupd 64(%rsp), %ymm0
+ vmovupd 96(%rsp), %ymm1
+ vmovupd %ymm0, 32(%r12)
+ vmovupd %ymm1, 32(%r13)
+ vzeroupper
addq $176, %rsp
popq %r13
cfi_adjust_cfa_offset (-8)
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index 66bb081..d255d19 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -239,28 +239,39 @@
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
.macro WRAPPER_IMPL_AVX512 callee
- pushq %rbp
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
+ movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
- vmovaps (%rsp), %ymm0
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps 32(%rsp), %ymm0
- call HIDDEN_JUMPTARGET(\callee)
- movq %rbp, %rsp
+ andq $-64, %rsp
+ subq $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+ vmovupd (%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 64(%rsp)
+ vmovupd 32(%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x01
+ movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
- popq %rbp
+ popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
@@ -274,29 +285,41 @@
movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
- subq $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x4c
- .byte 0x24
- vmovaps (%rsp), %ymm0
- vmovaps 64(%rsp), %ymm1
+ subq $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x4c
+ .byte 0x24
+ .byte 0x01
+ vmovups (%rsp), %ymm0
+ vmovups 64(%rsp), %ymm1
call HIDDEN_JUMPTARGET(\callee)
- vmovaps 32(%rsp), %ymm0
- vmovaps 96(%rsp), %ymm1
+ vmovups %ymm0, 128(%rsp)
+ vmovups 32(%rsp), %ymm0
+ vmovups 96(%rsp), %ymm1
call HIDDEN_JUMPTARGET(\callee)
+ vmovups %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x02
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp