about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: H.J. Lu <hjl.tools@gmail.com> 2017-08-23 06:16:12 -0700
committer: H.J. Lu <hjl.tools@gmail.com> 2017-08-23 06:26:44 -0700
commit: b9eaca8fa0a9628a992e0f1478aaadde576804e1 (patch)
tree: 10d8dcd9696ac057b485e94e7a003d3856de71dc
parent: 5a706f649de3952271930a8340db4ca8aa50f485 (diff)
download: glibc-b9eaca8fa0a9628a992e0f1478aaadde576804e1.zip
glibc-b9eaca8fa0a9628a992e0f1478aaadde576804e1.tar.gz
glibc-b9eaca8fa0a9628a992e0f1478aaadde576804e1.tar.bz2
x86_64: Replace AVX512F .byte sequences with instructions
Since binutils 2.25 or later is required to build glibc, we can replace AVX512F .byte sequences with AVX512F instructions.

Tested on x86-64 and x32. There are no code differences in libmvec.so and libmvec.a.

* sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Replace AVX512F .byte sequences with AVX512F instructions.
* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Likewise.
* sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S: Likewise.
-rw-r--r-- ChangeLog 12
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S 19
-rw-r--r-- sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S 63
-rw-r--r-- sysdeps/x86_64/fpu/svml_d_sincos8_core.S 41
-rw-r--r-- sysdeps/x86_64/fpu/svml_d_wrapper_impl.h 57
-rw-r--r-- sysdeps/x86_64/fpu/svml_s_sincosf16_core.S 85
-rw-r--r-- sysdeps/x86_64/fpu/svml_s_wrapper_impl.h 57
7 files changed, 44 insertions, 290 deletions
diff --git a/ChangeLog b/ChangeLog
index 43c8880..a58de05 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2017-08-23 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Replace AVX512F
+ .byte sequences with AVX512F instructions.
+ * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Likewise.
+ * sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S:
+ Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
+ Likewise.
+
2017-08-22 Szabolcs Nagy <szabolcs.nagy@arm.com>
Steve Ellcey <sellcey@cavium.com>
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index c920755..3667faa 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -599,24 +599,9 @@ libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $256, %rsp
- /* Encoding for vmovups %zmm1, 128(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x4c
- .byte 0x24
- .byte 0x02
+ vmovups %zmm1, 128(%rsp)
lea (%rsp), %rdi
- /* Encoding for vmovups %zmm2, 192(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x57
- .byte 0x03
+ vmovups %zmm2, 192(%rdi)
lea 64(%rsp), %rsi
call HIDDEN_JUMPTARGET(\callee)
movq 128(%rsp), %rdx
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index f73ab7d..8fa4255 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -510,40 +510,11 @@ libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $384, %rsp
- /* Encoding for vmovups %zmm1, 128(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x4c
- .byte 0x24
- .byte 0x02
+ vmovups %zmm1, 128(%rsp)
lea (%rsp), %rdi
- /* Encoding for vmovups %zmm2, 192(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x57
- .byte 0x03
- /* Encoding for vmovups %zmm3, 256(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x5f
- .byte 0x04
- /* Encoding for vmovups %zmm4, 320(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x67
- .byte 0x05
+ vmovups %zmm2, 192(%rdi)
+ vmovups %zmm3, 256(%rdi)
+ vmovups %zmm4, 320(%rdi)
lea 64(%rsp), %rsi
call HIDDEN_JUMPTARGET(\callee)
movq 128(%rsp), %rdx
@@ -661,30 +632,8 @@ libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
leal -112(%rbp), %esi
leal -176(%rbp), %edi
subl $296, %esp
- /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */
- .byte 0x67
- .byte 0x62
- .byte 0xf1
- .byte 0xfd
- .byte 0x48
- .byte 0x7f
- .byte 0x8d
- .byte 0x10
- .byte 0xff
- .byte 0xff
- .byte 0xff
- /* Encoding for vmovdqa64 %zmm2, -304(%ebp). */
- .byte 0x67
- .byte 0x62
- .byte 0xf1
- .byte 0xfd
- .byte 0x48
- .byte 0x7f
- .byte 0x95
- .byte 0xd0
- .byte 0xfe
- .byte 0xff
- .byte 0xff
+ vmovdqa64 %zmm1, -240(%ebp)
+ vmovdqa64 %zmm2, -304(%ebp)
call HIDDEN_JUMPTARGET(\callee)
movl -240(%ebp), %eax
vmovss -176(%ebp), %xmm0
diff --git a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
index c104539..cdea304 100644
--- a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
@@ -35,32 +35,10 @@ END (_ZGVeN8vl8l8_sincos)
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $320, %rsp
- /* Encoding for vmovups %zmm0, 256(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x44
- .byte 0x24
- .byte 0x04
+ vmovups %zmm0, 256(%rsp)
lea (%rsp), %rdi
- /* Encoding for vmovups %zmm1, 128(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x4f
- .byte 0x02
- /* Encoding for vmovups %zmm2, 192(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x57
- .byte 0x03
+ vmovups %zmm1, 128(%rdi)
+ vmovups %zmm2, 192(%rdi)
lea 64(%rsp), %rsi
call HIDDEN_JUMPTARGET(\callee)
vmovdqu 288(%rsp), %ymm0
@@ -142,18 +120,7 @@ END (_ZGVeN8vl8l8_sincos)
subl $280, %esp
vmovdqa %ymm1, -208(%ebp)
vmovdqa %ymm2, -240(%ebp)
- /* Encoding for vmovapd %zmm0, -304(%ebp). */
- .byte 0x67
- .byte 0x62
- .byte 0xf1
- .byte 0xfd
- .byte 0x48
- .byte 0x29
- .byte 0x85
- .byte 0xd0
- .byte 0xfe
- .byte 0xff
- .byte 0xff
+ vmovapd %zmm0, -304(%ebp)
call HIDDEN_JUMPTARGET(\callee)
leal 32(%r12), %esi
vmovupd -272(%ebp), %ymm0
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index 625eb66..3933644 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -201,29 +201,14 @@
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $128, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x04
- .byte 0x24
+ vmovups %zmm0, (%rsp)
vmovupd (%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)
vmovupd %ymm0, 64(%rsp)
vmovupd 32(%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)
vmovupd %ymm0, 96(%rsp)
-/* Below is encoding for vmovups 64(%rsp), %zmm0. */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x10
- .byte 0x44
- .byte 0x24
- .byte 0x01
+ vmovups 64(%rsp), %zmm0
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
@@ -241,23 +226,8 @@
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $192, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovups %zmm1, 64(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x4c
- .byte 0x24
- .byte 0x01
+ vmovups %zmm0, (%rsp)
+ vmovups %zmm1, 64(%rsp)
vmovupd (%rsp), %ymm0
vmovupd 64(%rsp), %ymm1
call HIDDEN_JUMPTARGET(\callee)
@@ -266,15 +236,7 @@
vmovupd 96(%rsp), %ymm1
call HIDDEN_JUMPTARGET(\callee)
vmovupd %ymm0, 160(%rsp)
-/* Below is encoding for vmovups 128(%rsp), %zmm0. */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x10
- .byte 0x44
- .byte 0x24
- .byte 0x02
+ vmovups 128(%rsp), %zmm0
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
@@ -299,14 +261,7 @@
cfi_rel_offset (%r13, 0)
subq $176, %rsp
movq %rsi, %r13
-/* Below is encoding for vmovups %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x04
- .byte 0x24
+ vmovups %zmm0, (%rsp)
movq %rdi, %r12
vmovupd (%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
index d86c913..8ebcebb 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
@@ -35,48 +35,12 @@ END (_ZGVeN16vl4l4_sincosf)
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $448, %rsp
- /* Encoding for vmovups %zmm0, 384(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x44
- .byte 0x24
- .byte 0x06
+ vmovups %zmm0, 384(%rsp)
lea (%rsp), %rdi
- /* Encoding for vmovups %zmm1, 128(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x4f
- .byte 0x02
- /* Encoding for vmovups %zmm2, 192(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x57
- .byte 0x03
- /* Encoding for vmovups %zmm3, 256(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x5f
- .byte 0x04
- /* Encoding for vmovups %zmm4, 320(%rdi). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x67
- .byte 0x05
+ vmovups %zmm1, 128(%rdi)
+ vmovups %zmm2, 192(%rdi)
+ vmovups %zmm3, 256(%rdi)
+ vmovups %zmm4, 320(%rdi)
lea 64(%rsp), %rsi
call HIDDEN_JUMPTARGET(\callee)
vmovdqu 416(%rsp), %ymm0
@@ -204,42 +168,9 @@ END (_ZGVeN16vl4l4_sincosf)
.cfi_escape 0x10,0x3,0x2,0x76,0x68
movq %rdi, %rbx
subl $344, %esp
- /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */
- .byte 0x67
- .byte 0x62
- .byte 0xf1
- .byte 0xfd
- .byte 0x48
- .byte 0x7f
- .byte 0x8d
- .byte 0x10
- .byte 0xff
- .byte 0xff
- .byte 0xff
- /* Encoding for vmovdqa64 %zmm2, -304(%ebp). */
- .byte 0x67
- .byte 0x62
- .byte 0xf1
- .byte 0xfd
- .byte 0x48
- .byte 0x7f
- .byte 0x95
- .byte 0xd0
- .byte 0xfe
- .byte 0xff
- .byte 0xff
- /* Encoding for vmovaps %zmm0, -368(%ebp). */
- .byte 0x67
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x85
- .byte 0x90
- .byte 0xfe
- .byte 0xff
- .byte 0xff
+ vmovdqa64 %zmm1, -240(%ebp)
+ vmovdqa64 %zmm2, -304(%ebp)
+ vmovaps %zmm0, -368(%ebp)
call HIDDEN_JUMPTARGET(\callee)
leal 32(%r12), %esi
vmovups -336(%ebp), %ymm0
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index cd6d583..00b86cd 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -246,29 +246,14 @@
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $128, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x04
- .byte 0x24
+ vmovups %zmm0, (%rsp)
vmovupd (%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)
vmovupd %ymm0, 64(%rsp)
vmovupd 32(%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)
vmovupd %ymm0, 96(%rsp)
-/* Below is encoding for vmovups 64(%rsp), %zmm0. */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x10
- .byte 0x44
- .byte 0x24
- .byte 0x01
+ vmovups 64(%rsp), %zmm0
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
@@ -286,23 +271,8 @@
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $192, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovups %zmm1, 64(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x11
- .byte 0x4c
- .byte 0x24
- .byte 0x01
+ vmovups %zmm0, (%rsp)
+ vmovups %zmm1, 64(%rsp)
vmovups (%rsp), %ymm0
vmovups 64(%rsp), %ymm1
call HIDDEN_JUMPTARGET(\callee)
@@ -311,15 +281,7 @@
vmovups 96(%rsp), %ymm1
call HIDDEN_JUMPTARGET(\callee)
vmovups %ymm0, 160(%rsp)
-/* Below is encoding for vmovups 128(%rsp), %zmm0. */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x10
- .byte 0x44
- .byte 0x24
- .byte 0x02
+ vmovups 128(%rsp), %zmm0
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
@@ -340,14 +302,7 @@
pushq %r13
subq $176, %rsp
movq %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
+ vmovaps %zmm0, (%rsp)
movq %rdi, %r12
vmovaps (%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)