author    Noah Goldstein <goldstein.w.n@gmail.com>  2022-11-18 16:13:32 -0800
committer Noah Goldstein <goldstein.w.n@gmail.com>  2022-11-27 20:22:49 -0800
commit    f704192911c6c7b65a54beab3ab369fca7609a5d (patch)
tree      577ab06e06659f4acafd17c290ac02605f628b49 /sysdeps/x86_64/fpu
parent    72f6a5a0ed25d14e6dab8f54878fd46ebaee2dd5 (diff)
x86/fpu: Factor out shared avx2/avx512 code in svml_{s|d}_wrapper_impl.h

The code is exactly the same for the two, so it is better to maintain
only one version.  All math and mathvec tests pass on x86.
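For context, these macros are consumed by the per-function wrapper files,
which this patch does not touch.  A minimal sketch of such an instantiation,
mirroring the in-tree AVX2 cosine wrapper (the symbol names follow the
existing x86-64 vector ABI mangling and are shown for illustration, not
added by this patch):

    /* Sketch: the 4-lane double-precision AVX2 cos wrapper built on the
       2-lane SSE kernel.  WRAPPER_IMPL_AVX spills the ymm0 input, calls
       the callee once per 16-byte half, and recombines the two results
       with vinsertf128.  Mirrors sysdeps/x86_64/fpu/svml_d_cos4_core.S.  */
            .text
    ENTRY (_ZGVdN4v_cos)
    WRAPPER_IMPL_AVX _ZGVbN2v_cos
    END (_ZGVdN4v_cos)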
Diffstat (limited to 'sysdeps/x86_64/fpu')
 -rw-r--r--  sysdeps/x86_64/fpu/svml_d_wrapper_impl.h  | 172
 -rw-r--r--  sysdeps/x86_64/fpu/svml_s_wrapper_impl.h  | 172
 -rw-r--r--  sysdeps/x86_64/fpu/svml_sd_wrapper_impl.h | 190
 3 files changed, 192 insertions(+), 342 deletions(-)
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index 9900f85..f63b49f 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -82,174 +82,4 @@
ret
.endm
-/* AVX/AVX2 ISA version as wrapper to SSE ISA version. */
-.macro WRAPPER_IMPL_AVX callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-32, %rsp
- subq $32, %rsp
- vmovaps %ymm0, (%rsp)
- vzeroupper
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps %xmm0, (%rsp)
- vmovaps 16(%rsp), %xmm0
- call HIDDEN_JUMPTARGET(\callee)
- /* combine xmm0 (return of second call) with result of first
- call (saved on stack). Might be worth exploring logic that
- uses `vpblend` and reads in ymm1 using -16(rsp). */
- vmovaps (%rsp), %xmm1
- vinsertf128 $1, %xmm0, %ymm1, %ymm0
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */
-.macro WRAPPER_IMPL_AVX_ff callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-32, %rsp
- subq $64, %rsp
- vmovaps %ymm0, (%rsp)
- vmovaps %ymm1, 32(%rsp)
- vzeroupper
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps 48(%rsp), %xmm1
- vmovaps %xmm0, (%rsp)
- vmovaps 16(%rsp), %xmm0
- call HIDDEN_JUMPTARGET(\callee)
- /* combine xmm0 (return of second call) with result of first
- call (saved on stack). Might be worth exploring logic that
- uses `vpblend` and reads in ymm1 using -16(rsp). */
- vmovaps (%rsp), %xmm1
- vinsertf128 $1, %xmm0, %ymm1, %ymm0
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */
-.macro WRAPPER_IMPL_AVX_fFF callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- andq $-32, %rsp
- subq $32, %rsp
- vmovaps %ymm0, (%rsp)
- pushq %rbx
- pushq %r14
- movq %rdi, %rbx
- movq %rsi, %r14
- vzeroupper
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps 32(%rsp), %xmm0
- leaq 16(%rbx), %rdi
- leaq 16(%r14), %rsi
- call HIDDEN_JUMPTARGET(\callee)
- popq %r14
- popq %rbx
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* AVX512 ISA version as wrapper to AVX2 ISA version. */
-.macro WRAPPER_IMPL_AVX512 callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
- vmovups %zmm0, (%rsp)
- call HIDDEN_JUMPTARGET(\callee)
- vmovupd %ymm0, (%rsp)
- vmovupd 32(%rsp), %ymm0
- call HIDDEN_JUMPTARGET(\callee)
- /* combine ymm0 (return of second call) with result of first
- call (saved on stack). */
- vmovaps (%rsp), %ymm1
- vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
-.macro WRAPPER_IMPL_AVX512_ff callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- addq $-128, %rsp
- vmovups %zmm0, (%rsp)
- vmovups %zmm1, 64(%rsp)
- /* ymm0 and ymm1 are already set. */
- call HIDDEN_JUMPTARGET(\callee)
- vmovups 96(%rsp), %ymm1
- vmovaps %ymm0, (%rsp)
- vmovups 32(%rsp), %ymm0
- call HIDDEN_JUMPTARGET(\callee)
- /* combine ymm0 (return of second call) with result of first
- call (saved on stack). */
- vmovaps (%rsp), %ymm1
- vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
-.macro WRAPPER_IMPL_AVX512_fFF callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
- vmovaps %zmm0, (%rsp)
- pushq %rbx
- pushq %r14
- movq %rdi, %rbx
- movq %rsi, %r14
- /* ymm0 is already set. */
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps 48(%rsp), %ymm0
- leaq 32(%rbx), %rdi
- leaq 32(%r14), %rsi
- call HIDDEN_JUMPTARGET(\callee)
- popq %r14
- popq %rbx
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
+#include "svml_sd_wrapper_impl.h"
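With the double-precision header reduced to this include, the shared macros
continue to back the wider wrappers unchanged.  A hedged sketch of the
AVX512-over-AVX2 case, mirroring the in-tree 8-lane cosine wrapper (symbol
names from the existing vector ABI, illustrative only):

    /* Sketch: WRAPPER_IMPL_AVX512 spills the zmm0 input, calls the AVX2
       callee once per 32-byte half, and recombines the halves with
       vinserti64x4.  Mirrors sysdeps/x86_64/fpu/svml_d_cos8_core.S.  */
            .text
    ENTRY (_ZGVeN8v_cos)
    WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
    END (_ZGVeN8v_cos)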
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index fd9b363..8d8e5ef 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -118,174 +118,4 @@
ret
.endm
-/* AVX/AVX2 ISA version as wrapper to SSE ISA version. */
-.macro WRAPPER_IMPL_AVX callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-32, %rsp
- subq $32, %rsp
- vmovaps %ymm0, (%rsp)
- vzeroupper
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps %xmm0, (%rsp)
- vmovaps 16(%rsp), %xmm0
- call HIDDEN_JUMPTARGET(\callee)
- /* combine xmm0 (return of second call) with result of first
- call (saved on stack). Might be worth exploring logic that
- uses `vpblend` and reads in ymm1 using -16(rsp). */
- vmovaps (%rsp), %xmm1
- vinsertf128 $1, %xmm0, %ymm1, %ymm0
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */
-.macro WRAPPER_IMPL_AVX_ff callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-32, %rsp
- subq $64, %rsp
- vmovaps %ymm0, (%rsp)
- vmovaps %ymm1, 32(%rsp)
- vzeroupper
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps 48(%rsp), %xmm1
- vmovaps %xmm0, (%rsp)
- vmovaps 16(%rsp), %xmm0
- call HIDDEN_JUMPTARGET(\callee)
- /* combine xmm0 (return of second call) with result of first
- call (saved on stack). Might be worth exploring logic that
- uses `vpblend` and reads in ymm1 using -16(rsp). */
- vmovaps (%rsp), %xmm1
- vinsertf128 $1, %xmm0, %ymm1, %ymm0
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */
-.macro WRAPPER_IMPL_AVX_fFF callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- andq $-32, %rsp
- subq $32, %rsp
- vmovaps %ymm0, (%rsp)
- pushq %rbx
- pushq %r14
- movq %rdi, %rbx
- movq %rsi, %r14
- vzeroupper
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps 32(%rsp), %xmm0
- leaq 16(%rbx), %rdi
- leaq 16(%r14), %rsi
- call HIDDEN_JUMPTARGET(\callee)
- popq %r14
- popq %rbx
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* AVX512 ISA version as wrapper to AVX2 ISA version. */
-.macro WRAPPER_IMPL_AVX512 callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
- vmovups %zmm0, (%rsp)
- call HIDDEN_JUMPTARGET(\callee)
- vmovupd %ymm0, (%rsp)
- vmovupd 32(%rsp), %ymm0
- call HIDDEN_JUMPTARGET(\callee)
- /* combine ymm0 (return of second call) with result of first
- call (saved on stack). */
- vmovaps (%rsp), %ymm1
- vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
-.macro WRAPPER_IMPL_AVX512_ff callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- addq $-128, %rsp
- vmovups %zmm0, (%rsp)
- vmovups %zmm1, 64(%rsp)
- /* ymm0 and ymm1 are already set. */
- call HIDDEN_JUMPTARGET(\callee)
- vmovups 96(%rsp), %ymm1
- vmovaps %ymm0, (%rsp)
- vmovups 32(%rsp), %ymm0
- call HIDDEN_JUMPTARGET(\callee)
- /* combine ymm0 (return of second call) with result of first
- call (saved on stack). */
- vmovaps (%rsp), %ymm1
- vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
-
-/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
-.macro WRAPPER_IMPL_AVX512_fFF callee
- pushq %rbp
- cfi_adjust_cfa_offset (8)
- cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
- cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
- vmovaps %zmm0, (%rsp)
- pushq %rbx
- pushq %r14
- movq %rdi, %rbx
- movq %rsi, %r14
- /* ymm0 is already set. */
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps 48(%rsp), %ymm0
- leaq 32(%rbx), %rdi
- leaq 32(%r14), %rsi
- call HIDDEN_JUMPTARGET(\callee)
- popq %r14
- popq %rbx
- movq %rbp, %rsp
- cfi_def_cfa_register (%rsp)
- popq %rbp
- cfi_adjust_cfa_offset (-8)
- cfi_restore (%rbp)
- ret
-.endm
+#include "svml_sd_wrapper_impl.h"
diff --git a/sysdeps/x86_64/fpu/svml_sd_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_sd_wrapper_impl.h
new file mode 100644
index 0000000..bd934ad
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_sd_wrapper_impl.h
@@ -0,0 +1,190 @@
+/* Common float/double wrapper implementations of vector math
+ functions.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* AVX/AVX2 ISA version as wrapper to SSE ISA version. */
+.macro WRAPPER_IMPL_AVX callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-32, %rsp
+ subq $32, %rsp
+ vmovaps %ymm0, (%rsp)
+ vzeroupper
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovaps %xmm0, (%rsp)
+ vmovaps 16(%rsp), %xmm0
+ call HIDDEN_JUMPTARGET(\callee)
+ /* combine xmm0 (return of second call) with result of first
+ call (saved on stack). Might be worth exploring logic that
+ uses `vpblend` and reads in ymm1 using -16(rsp). */
+ vmovaps (%rsp), %xmm1
+ vinsertf128 $1, %xmm0, %ymm1, %ymm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
+
+/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */
+.macro WRAPPER_IMPL_AVX_ff callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-32, %rsp
+ subq $64, %rsp
+ vmovaps %ymm0, (%rsp)
+ vmovaps %ymm1, 32(%rsp)
+ vzeroupper
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovaps 48(%rsp), %xmm1
+ vmovaps %xmm0, (%rsp)
+ vmovaps 16(%rsp), %xmm0
+ call HIDDEN_JUMPTARGET(\callee)
+ /* combine xmm0 (return of second call) with result of first
+ call (saved on stack). Might be worth exploring logic that
+ uses `vpblend` and reads in ymm1 using -16(rsp). */
+ vmovaps (%rsp), %xmm1
+ vinsertf128 $1, %xmm0, %ymm1, %ymm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
+
+/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. */
+.macro WRAPPER_IMPL_AVX_fFF callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ andq $-32, %rsp
+ subq $32, %rsp
+ vmovaps %ymm0, (%rsp)
+ pushq %rbx
+ pushq %r14
+ movq %rdi, %rbx
+ movq %rsi, %r14
+ vzeroupper
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovaps 32(%rsp), %xmm0
+ leaq 16(%rbx), %rdi
+ leaq 16(%r14), %rsi
+ call HIDDEN_JUMPTARGET(\callee)
+ popq %r14
+ popq %rbx
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
+
+/* AVX512 ISA version as wrapper to AVX2 ISA version. */
+.macro WRAPPER_IMPL_AVX512 callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $64, %rsp
+ vmovups %zmm0, (%rsp)
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, (%rsp)
+ vmovupd 32(%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ /* combine ymm0 (return of second call) with result of first
+ call (saved on stack). */
+ vmovaps (%rsp), %ymm1
+ vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
+
+/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
+.macro WRAPPER_IMPL_AVX512_ff callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ addq $-128, %rsp
+ vmovups %zmm0, (%rsp)
+ vmovups %zmm1, 64(%rsp)
+ /* ymm0 and ymm1 are already set. */
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovups 96(%rsp), %ymm1
+ vmovaps %ymm0, (%rsp)
+ vmovups 32(%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ /* combine ymm0 (return of second call) with result of first
+ call (saved on stack). */
+ vmovaps (%rsp), %ymm1
+ vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
+
+/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
+.macro WRAPPER_IMPL_AVX512_fFF callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $64, %rsp
+ vmovaps %zmm0, (%rsp)
+ pushq %rbx
+ pushq %r14
+ movq %rdi, %rbx
+ movq %rsi, %r14
+ /* ymm0 is already set. */
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovaps 48(%rsp), %ymm0
+ leaq 32(%rbx), %rdi
+ leaq 32(%r14), %rsi
+ call HIDDEN_JUMPTARGET(\callee)
+ popq %r14
+ popq %rbx
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
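The `_fFF` variants take two output pointers in %rdi/%rsi, which is why they
preserve %rbx/%r14 across the calls and advance both pointers by one
narrower-vector width (16 bytes for AVX over SSE, 32 bytes for AVX512 over
AVX2) before the second call.  A hedged sketch of an instantiation,
mirroring the in-tree sincos wrappers (symbol names from the existing
vector ABI, not introduced by this patch):

    /* Sketch: the AVX2 sincos wrapper over the 2-lane SSE kernel.  The
       first call handles lanes 0-1 with the caller's sin/cos result
       pointers; the second handles lanes 2-3 with both pointers advanced
       by 16 bytes.  Mirrors sysdeps/x86_64/fpu/svml_d_sincos4_core.S.  */
            .text
    ENTRY (_ZGVdN4vvv_sincos)
    WRAPPER_IMPL_AVX_fFF _ZGVbN2vvv_sincos
    END (_ZGVdN4vvv_sincos)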