diff options
author | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-06-18 17:55:55 +0300 |
---|---|---|
committer | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-06-18 17:55:55 +0300 |
commit | c9a8c526acd185176e486bee4624039740f8c435 (patch) | |
tree | 1f199d8fb0bb0ce9b0bdb21c86c3c213a514c22a /sysdeps | |
parent | 8aa92022e2e7cb5470b6e252020140c05b8013ed (diff) | |
download | glibc-c9a8c526acd185176e486bee4624039740f8c435.zip glibc-c9a8c526acd185176e486bee4624039740f8c435.tar.gz glibc-c9a8c526acd185176e486bee4624039740f8c435.tar.bz2 |
Vector sincos for x86_64 and tests.
This is an implementation of vectorized sincos with SSE, AVX,
AVX2 and AVX512 versions, following the Vector ABI
<https://groups.google.com/forum/#!topic/x86-64-abi/LmppCfN1rZ4>.
* NEWS: Mention addition of x86_64 vector sincos.
* bits/libm-simd-decl-stubs.h: Added stubs for sincos.
* math/math.h (__MATHDECL_VEC): New macro.
* math/bits/mathcalls.h: Added sincos declaration with __MATHDECL_VEC.
* math/gen-libm-have-vector-test.sh: Added generation of sincos wrapper
declaration under condition.
* math/test-vec-loop.h (TEST_VEC_LOOP): Refactored.
* math/test-double-vlen2.h: Added wrapper for sincos tests, reflected
TEST_VEC_LOOP change.
* math/test-double-vlen4.h: Likewise.
* math/test-double-vlen8.h: Likewise.
* math/test-float-vlen16.h: Reflected TEST_VEC_LOOP change.
* math/test-float-vlen4.h: Likewise.
* math/test-float-vlen8.h: Likewise.
* sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New symbols added.
* sysdeps/x86/fpu/bits/math-vector.h: Added sincos SIMD declaration.
* sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files.
* sysdeps/x86_64/fpu/Versions: New versions added.
* sysdeps/x86_64/fpu/libm-test-ulps: Regenerated.
* sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines):
Added build of SSE, AVX2 and AVX512 IFUNC versions.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S: New file.
* sysdeps/x86_64/fpu/svml_d_sincos2_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_sincos4_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S: New file.
* sysdeps/x86_64/fpu/svml_d_sincos8_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_sincos_data.S: New file.
* sysdeps/x86_64/fpu/svml_d_sincos_data.h: New file.
* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Added wrappers for sincos.
* sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c: Vector sincos tests.
* sysdeps/x86_64/fpu/test-double-vlen2.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-avx2.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen8.c: Likewise.
Diffstat (limited to 'sysdeps')
27 files changed, 1776 insertions, 1 deletions
diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist index 4f9c040..6c45844 100644 --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist @@ -5,6 +5,7 @@ GLIBC_2.22 _ZGVbN2v_log F _ZGVbN2v_sin F _ZGVbN2vv_pow F + _ZGVbN2vvv_sincos F _ZGVbN4v_cosf F _ZGVbN4v_expf F _ZGVbN4v_logf F @@ -15,6 +16,7 @@ GLIBC_2.22 _ZGVcN4v_log F _ZGVcN4v_sin F _ZGVcN4vv_pow F + _ZGVcN4vvv_sincos F _ZGVcN8v_cosf F _ZGVcN8v_expf F _ZGVcN8v_logf F @@ -25,6 +27,7 @@ GLIBC_2.22 _ZGVdN4v_log F _ZGVdN4v_sin F _ZGVdN4vv_pow F + _ZGVdN4vvv_sincos F _ZGVdN8v_cosf F _ZGVdN8v_expf F _ZGVdN8v_logf F @@ -40,3 +43,4 @@ GLIBC_2.22 _ZGVeN8v_log F _ZGVeN8v_sin F _ZGVeN8vv_pow F + _ZGVeN8vvv_sincos F diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h index 9e53bdf..f684ff5 100644 --- a/sysdeps/x86/fpu/bits/math-vector.h +++ b/sysdeps/x86/fpu/bits/math-vector.h @@ -36,6 +36,8 @@ # define __DECL_SIMD_sin __DECL_SIMD_x86_64 # undef __DECL_SIMD_sinf # define __DECL_SIMD_sinf __DECL_SIMD_x86_64 +# undef __DECL_SIMD_sincos +# define __DECL_SIMD_sincos __DECL_SIMD_x86_64 # undef __DECL_SIMD_log # define __DECL_SIMD_log __DECL_SIMD_x86_64 # undef __DECL_SIMD_logf diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile index 20b22f0..9c28d62 100644 --- a/sysdeps/x86_64/fpu/Makefile +++ b/sysdeps/x86_64/fpu/Makefile @@ -7,6 +7,8 @@ libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \ svml_s_cosf8_core svml_s_cosf16_core svml_s_cosf_data \ svml_s_sinf4_core svml_s_sinf8_core_avx \ svml_s_sinf8_core svml_s_sinf16_core svml_s_sinf_data \ + svml_d_sincos2_core svml_d_sincos4_core_avx \ + svml_d_sincos4_core svml_d_sincos8_core svml_d_sincos_data \ svml_d_log2_core svml_d_log4_core_avx svml_d_log4_core \ svml_d_log8_core svml_d_log_data svml_s_logf4_core \ svml_s_logf8_core_avx svml_s_logf8_core svml_s_logf16_core \ diff --git a/sysdeps/x86_64/fpu/Versions 
b/sysdeps/x86_64/fpu/Versions index 1aa7937..d950f58 100644 --- a/sysdeps/x86_64/fpu/Versions +++ b/sysdeps/x86_64/fpu/Versions @@ -2,6 +2,7 @@ libmvec { GLIBC_2.22 { _ZGVbN2v_cos; _ZGVcN4v_cos; _ZGVdN4v_cos; _ZGVeN8v_cos; _ZGVbN2v_sin; _ZGVcN4v_sin; _ZGVdN4v_sin; _ZGVeN8v_sin; + _ZGVbN2vvv_sincos; _ZGVcN4vvv_sincos; _ZGVdN4vvv_sincos; _ZGVeN8vvv_sincos; _ZGVbN2v_log; _ZGVcN4v_log; _ZGVdN4v_log; _ZGVeN8v_log; _ZGVbN2v_exp; _ZGVcN4v_exp; _ZGVdN4v_exp; _ZGVeN8v_exp; _ZGVbN2vv_pow; _ZGVcN4vv_pow; _ZGVdN4vv_pow; _ZGVeN8vv_pow; diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps index e5ec939..74b1af5 100644 --- a/sysdeps/x86_64/fpu/libm-test-ulps +++ b/sysdeps/x86_64/fpu/libm-test-ulps @@ -2031,6 +2031,18 @@ idouble: 1 ildouble: 3 ldouble: 3 +Function: "sincos_vlen2": +double: 1 + +Function: "sincos_vlen4": +double: 1 + +Function: "sincos_vlen4_avx2": +double: 1 + +Function: "sincos_vlen8": +double: 1 + Function: "sinh": double: 2 float: 2 diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index 8f3c457..9e510db 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -57,7 +57,8 @@ libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \ svml_d_cos8_core_avx512 svml_d_sin2_core_sse4 \ svml_d_sin4_core_avx2 svml_d_sin8_core_avx512 \ svml_d_log2_core_sse4 svml_d_log4_core_avx2 \ - svml_d_log8_core_avx512 \ + svml_d_log8_core_avx512 svml_d_sincos2_core_sse4 \ + svml_d_sincos4_core_avx2 svml_d_sincos8_core_avx512 \ svml_s_cosf4_core_sse4 svml_s_cosf8_core_avx2 \ svml_s_cosf16_core_avx512 svml_s_sinf4_core_sse4 \ svml_s_sinf8_core_avx2 svml_s_sinf16_core_avx512 \ diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S new file mode 100644 index 0000000..e8e5771 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S @@ -0,0 +1,38 @@ +/* Multiple versions of vectorized 
sincos. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2vvv_sincos) /* IFUNC resolver: returns in %rax the address of the variant to use */ + .type _ZGVbN2vvv_sincos, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f /* KIND field nonzero: skip the __init_cpu_features call */ + call __init_cpu_features +1: leaq _ZGVbN2vvv_sincos_sse4(%rip), %rax /* candidate: SSE4 variant */ + testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 2f /* SSE4_1 bit clear: use the SSE2 variant instead */ + ret +2: leaq _ZGVbN2vvv_sincos_sse2(%rip), %rax + ret +END (_ZGVbN2vvv_sincos) +libmvec_hidden_def (_ZGVbN2vvv_sincos) + /* Rename the symbol so the file included next presumably provides the _sse2 fallback referenced above. */ +#define _ZGVbN2vvv_sincos _ZGVbN2vvv_sincos_sse2 +#include "../svml_d_sincos2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S new file mode 100644 index 0000000..0b37c7c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S @@ -0,0 +1,314 @@ +/* Function sincos vectorized with SSE4. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_sincos_data.h" + + .text +ENTRY (_ZGVbN2vvv_sincos_sse4) +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + movq __svml_dsincos_data@GOTPCREL(%rip), %rax + movups %xmm11, 160(%rsp) + movups %xmm12, 144(%rsp) + movups __dSignMask(%rax), %xmm11 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + movaps %xmm11, %xmm4 + +/* Grab sign bit from argument */ + movaps %xmm11, %xmm7 + movups __dInvPI(%rax), %xmm5 + andnps %xmm0, %xmm4 + +/* SinY = X'*InvPi + RS : right shifter add */ + mulpd %xmm4, %xmm5 + addpd __dRShifter(%rax), %xmm5 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + movaps %xmm5, %xmm12 + andps %xmm0, %xmm7 + +/* SinN = Y - RS : right shifter sub */ + subpd __dRShifter(%rax), %xmm5 + movups %xmm10, 176(%rsp) + psllq $63, %xmm12 + movups __dPI1(%rax), %xmm10 + +/* SinR = X' - SinN*Pi1 */ + movaps %xmm10, %xmm1 + mulpd %xmm5, %xmm1 + movups __dPI2(%rax), %xmm6 + +/* SinR = SinR - SinN*Pi2 */ + movaps %xmm6, %xmm2 + mulpd %xmm5, %xmm2 + movups %xmm13, 112(%rsp) + movaps %xmm4, 
%xmm13 + subpd %xmm1, %xmm13 + subpd %xmm2, %xmm13 + +/* Sine result sign: SinRSign = SignMask & SinR */ + movaps %xmm11, %xmm2 + +/* CosR = X' - CosN*Pi1 */ + movaps %xmm4, %xmm1 + movups __dOneHalf(%rax), %xmm3 + andps %xmm13, %xmm2 + +/* Set SinRSign to 0.5 */ + orps %xmm2, %xmm3 + +/* Update CosRSign and CosSignRes signs */ + xorps %xmm11, %xmm2 + +/* CosN = SinN +(-)0.5 */ + addpd %xmm5, %xmm3 + cmpnlepd __dRangeVal(%rax), %xmm4 + mulpd %xmm3, %xmm10 + +/* CosR = CosR - CosN*Pi2 */ + mulpd %xmm3, %xmm6 + subpd %xmm10, %xmm1 + movmskpd %xmm4, %ecx + movups __dPI3(%rax), %xmm10 + xorps %xmm12, %xmm2 + subpd %xmm6, %xmm1 + +/* SinR = SinR - SinN*Pi3 */ + movaps %xmm10, %xmm6 + +/* Final reconstruction. + Combine Sin result's sign */ + xorps %xmm7, %xmm12 + mulpd %xmm5, %xmm6 + +/* CosR = CosR - CosN*Pi3 */ + mulpd %xmm3, %xmm10 + subpd %xmm6, %xmm13 + subpd %xmm10, %xmm1 + movups __dPI4(%rax), %xmm6 + +/* SinR = SinR - SinN*Pi4 */ + mulpd %xmm6, %xmm5 + +/* CosR = CosR - CosN*Pi4 */ + mulpd %xmm6, %xmm3 + subpd %xmm5, %xmm13 + subpd %xmm3, %xmm1 + +/* SinR2 = SinR^2 */ + movaps %xmm13, %xmm6 + +/* CosR2 = CosR^2 */ + movaps %xmm1, %xmm10 + mulpd %xmm13, %xmm6 + mulpd %xmm1, %xmm10 + +/* Polynomial approximation */ + movups __dC7(%rax), %xmm5 + movaps %xmm5, %xmm3 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC6(%rax), %xmm3 + addpd __dC6(%rax), %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC5(%rax), %xmm3 + addpd __dC5(%rax), %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm10, %xmm5 + addpd __dC4(%rax), %xmm3 + addpd __dC4(%rax), %xmm5 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + mulpd %xmm10, %xmm5 + addpd __dC3(%rax), %xmm3 + addpd __dC3(%rax), %xmm5 + +/* SinPoly = C2 + SinR2*SinPoly */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C2 + CosR2*CosPoly */ + mulpd %xmm10, %xmm5 + addpd __dC2(%rax), %xmm3 + addpd __dC2(%rax), %xmm5 + +/* 
SinPoly = C1 + SinR2*SinPoly */ + mulpd %xmm6, %xmm3 + +/* CosPoly = C1 + CosR2*CosPoly */ + mulpd %xmm10, %xmm5 + addpd __dC1(%rax), %xmm3 + addpd __dC1(%rax), %xmm5 + +/* SinPoly = SinR2*SinPoly */ + mulpd %xmm3, %xmm6 + +/* CosPoly = CosR2*CosPoly */ + mulpd %xmm5, %xmm10 + +/* SinPoly = SinR*SinPoly */ + mulpd %xmm13, %xmm6 + +/* CosPoly = CosR*CosPoly */ + mulpd %xmm1, %xmm10 + addpd %xmm6, %xmm13 + addpd %xmm10, %xmm1 + +/* Update Sin result's sign */ + xorps %xmm12, %xmm13 + +/* Update Cos result's sign */ + xorps %xmm2, %xmm1 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: /* Common exit: reload spilled xmm10-xmm13, store sin through %rdi and cos through %rsi. NOTE(review): the vvv suffix suggests vector-typed arguments under the Vector ABI, yet scalar pointers are used here -- confirm. */ + cfi_remember_state + movups 176(%rsp), %xmm10 + movaps %xmm13, (%rdi) + movups 160(%rsp), %xmm11 + movups 144(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups %xmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: /* Slow path, entered when the range-check mask in %ecx is nonzero: spill the input and both fast-path results, then redo each flagged lane with scalar sin and cos. */ + cfi_restore_state + movups %xmm0, 128(%rsp) + movups %xmm13, 192(%rsp) + movups %xmm1, 256(%rsp) + je .LBL_1_2 /* NOTE(review): tests the flags of the testl before the jne that leads here; presumably the stores in between leave flags untouched, so this looks never taken -- confirm. */ + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 48(%rsp) + movups %xmm9, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 64(%rsp) + movq %r12, 104(%rsp) + cfi_offset_rel_rsp (12, 104) + movb %dl, %r12b + movq %r13, 96(%rsp) + cfi_offset_rel_rsp (13, 96) + movl %eax, %r13d + movq %r14, 88(%rsp) + cfi_offset_rel_rsp (14, 88) + movl %ecx, %r14d + movq %r15, 80(%rsp) + cfi_offset_rel_rsp (15, 80) + movq %rbx, 72(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: /* %r12b counts lane pairs, %r13d is the even lane's bit index in the mask held in %r14d */ + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 48(%rsp), %xmm8 + movq %rbx, %rdi + movups 32(%rsp), %xmm9 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 64(%rsp), %rsi + movq 104(%rsp), %r12 + cfi_restore (%r12) + movq 96(%rsp), %r13 + cfi_restore (%r13) + movq 88(%rsp), %r14 + cfi_restore (%r14) + movq 80(%rsp), %r15 + cfi_restore (%r15) + movq 
72(%rsp), %rbx + movups 192(%rsp), %xmm13 + movups 256(%rsp), %xmm1 + jmp .LBL_1_2 + +.LBL_1_10: /* odd lane of the pair: input at 136+16*pair, sin to 200+16*pair, cos to 264+16*pair */ + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 136(%rsp,%r15), %xmm0 + + call sin@PLT + + movsd %xmm0, 200(%rsp,%r15) + movsd 136(%rsp,%r15), %xmm0 + + call cos@PLT + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: /* even lane of the pair: input at 128+16*pair, sin to 192+16*pair, cos to 256+16*pair */ + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 128(%rsp,%r15), %xmm0 + + call sin@PLT + + movsd %xmm0, 192(%rsp,%r15) + movsd 128(%rsp,%r15), %xmm0 + + call cos@PLT + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 +END (_ZGVbN2vvv_sincos_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S new file mode 100644 index 0000000..64744ff --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S @@ -0,0 +1,38 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVdN4vvv_sincos) /* IFUNC resolver: returns in %rax the address of the variant to use */ + .type _ZGVdN4vvv_sincos, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f /* KIND field nonzero: skip the __init_cpu_features call */ + call __init_cpu_features +1: leaq _ZGVdN4vvv_sincos_avx2(%rip), %rax /* candidate: AVX2 variant */ + testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip) + jz 2f /* AVX2_Usable bit clear: use the SSE wrapper instead */ + ret +2: leaq _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax + ret +END (_ZGVdN4vvv_sincos) +libmvec_hidden_def (_ZGVdN4vvv_sincos) + /* Rename the symbol so the file included next presumably provides the _sse_wrapper fallback referenced above. */ +#define _ZGVdN4vvv_sincos _ZGVdN4vvv_sincos_sse_wrapper +#include "../svml_d_sincos4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S new file mode 100644 index 0000000..ec1ccc6 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S @@ -0,0 +1,277 @@ +/* Function sincos vectorized with AVX2. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_sincos_data.h" + + .text +ENTRY (_ZGVdN4vvv_sincos_avx2) +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. */ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + movq __svml_dsincos_data@GOTPCREL(%rip), %rax + vmovups %ymm14, 288(%rsp) + vmovups %ymm8, 352(%rsp) + vmovupd __dSignMask(%rax), %ymm6 + vmovupd __dInvPI(%rax), %ymm2 + vmovupd __dPI1_FMA(%rax), %ymm5 + vmovups %ymm9, 224(%rsp) + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vandnpd %ymm0, %ymm6, %ymm1 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %ymm1, %ymm2 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm2, %ymm4 + +/* SinN = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %ymm2, %ymm2 + +/* SinR = X' - SinN*Pi1 */ + vmovdqa %ymm1, %ymm14 + vfnmadd231pd %ymm2, %ymm5, %ymm14 + +/* SinR = SinR - SinN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm2, %ymm14 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vandpd %ymm14, %ymm6, %ymm7 + +/* Set SinRSign to 0.5 */ + vorpd __dOneHalf(%rax), %ymm7, %ymm3 + +/* CosN = SinN +(-)0.5 */ + vaddpd %ymm3, %ymm2, %ymm3 + +/* CosR = X' - CosN*Pi1 */ + vmovdqa %ymm1, %ymm8 + vfnmadd231pd %ymm3, %ymm5, %ymm8 + vmovupd __dPI3_FMA(%rax), %ymm5 + vcmpnle_uqpd __dRangeVal(%rax), %ymm1, %ymm1 + +/* CosR = CosR - CosN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm3, %ymm8 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %ymm14, %ymm5, %ymm2 + +/* CosR = CosR - CosN*Pi3 */ + 
vfnmadd213pd %ymm8, %ymm5, %ymm3 + vmovupd __dC6(%rax), %ymm8 + +/* SinR2 = SinR^2 */ + vmulpd %ymm2, %ymm2, %ymm14 + +/* CosR2 = CosR^2 */ + vmulpd %ymm3, %ymm3, %ymm5 + +/* Grab SignX */ + vandpd %ymm0, %ymm6, %ymm9 + +/* Update CosRSign and CosSignRes signs */ + vxorpd %ymm6, %ymm7, %ymm6 + vxorpd %ymm6, %ymm4, %ymm7 + +/* Update sign SinSignRes */ + vxorpd %ymm9, %ymm4, %ymm6 + +/* Polynomial approximation */ + vmovupd __dC7(%rax), %ymm4 + vmovdqa %ymm8, %ymm9 + vfmadd231pd __dC7(%rax), %ymm14, %ymm9 + vfmadd213pd %ymm8, %ymm5, %ymm4 + vfmadd213pd __dC5(%rax), %ymm14, %ymm9 + vfmadd213pd __dC5(%rax), %ymm5, %ymm4 + vfmadd213pd __dC4(%rax), %ymm14, %ymm9 + vfmadd213pd __dC4(%rax), %ymm5, %ymm4 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm14, %ymm9 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm5, %ymm4 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd __dC2(%rax), %ymm14, %ymm9 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd __dC2(%rax), %ymm5, %ymm4 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd __dC1(%rax), %ymm14, %ymm9 + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd __dC1(%rax), %ymm5, %ymm4 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %ymm14, %ymm9, %ymm8 + +/* CosPoly = CosR2*CosPoly */ + vmulpd %ymm5, %ymm4, %ymm4 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %ymm2, %ymm2, %ymm8 + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %ymm3, %ymm3, %ymm4 + vmovmskpd %ymm1, %ecx + +/* Final reconstruction + Update Sin result's sign */ + vxorpd %ymm6, %ymm8, %ymm3 + +/* Update Cos result's sign */ + vxorpd %ymm7, %ymm4, %ymm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: /* Common exit: reload spilled ymm8, ymm9, ymm14, store cos through %rsi and sin through %rdi. NOTE(review): the vvv suffix suggests vector-typed arguments under the Vector ABI, yet scalar pointers are used here -- confirm. */ + cfi_remember_state + vmovups 352(%rsp), %ymm8 + vmovups 224(%rsp), %ymm9 + vmovups 288(%rsp), %ymm14 + vmovupd %ymm2, (%rsi) + vmovdqa %ymm3, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + 
+.LBL_1_3: /* Slow path, entered when the range-check mask in %ecx is nonzero: spill the input and both fast-path results, then redo each flagged lane with scalar sin and cos. */ + cfi_restore_state + vmovupd %ymm0, 256(%rsp) + vmovupd %ymm3, 320(%rsp) + vmovupd %ymm2, 384(%rsp) + je .LBL_1_2 /* NOTE(review): tests the flags of the testl before the jne that leads here; presumably the stores in between leave flags untouched, so this looks never taken -- confirm. */ + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm10, 128(%rsp) + vmovups %ymm11, 96(%rsp) + vmovups %ymm12, 64(%rsp) + vmovups %ymm13, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 160(%rsp) + movq %r12, 200(%rsp) + cfi_offset_rel_rsp (12, 200) + movb %dl, %r12b + movq %r13, 192(%rsp) + cfi_offset_rel_rsp (13, 192) + movl %eax, %r13d + movq %r14, 184(%rsp) + cfi_offset_rel_rsp (14, 184) + movl %ecx, %r14d + movq %r15, 176(%rsp) + cfi_offset_rel_rsp (15, 176) + movq %rbx, 168(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: /* %r12b counts lane pairs, %r13d is the even lane's bit index in the mask held in %r14d */ + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 128(%rsp), %ymm10 + movq %rbx, %rdi + vmovups 96(%rsp), %ymm11 + vmovups 64(%rsp), %ymm12 + vmovups 32(%rsp), %ymm13 + vmovups (%rsp), %ymm15 + vmovupd 320(%rsp), %ymm3 + vmovupd 384(%rsp), %ymm2 + movq 160(%rsp), %rsi + movq 200(%rsp), %r12 + cfi_restore (%r12) + movq 192(%rsp), %r13 + cfi_restore (%r13) + movq 184(%rsp), %r14 + cfi_restore (%r14) + movq 176(%rsp), %r15 + cfi_restore (%r15) + movq 168(%rsp), %rbx + jmp .LBL_1_2 + +.LBL_1_10: /* odd lane of the pair: input at 264+16*pair, sin to 328+16*pair, cos to 392+16*pair */ + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 264(%rsp,%r15), %xmm0 + vzeroupper + + call sin@PLT + + vmovsd %xmm0, 328(%rsp,%r15) + vmovsd 264(%rsp,%r15), %xmm0 + + call cos@PLT + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: /* even lane of the pair: input at 256+16*pair, sin to 320+16*pair, cos to 384+16*pair */ + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 256(%rsp,%r15), %xmm0 + vzeroupper + + call sin@PLT + + vmovsd %xmm0, 320(%rsp,%r15) + vmovsd 256(%rsp,%r15), %xmm0 + + call cos@PLT + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END (_ZGVdN4vvv_sincos_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S new file mode 100644 index 0000000..7228ba5 --- /dev/null 
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S @@ -0,0 +1,39 @@ +/* Multiple versions of vectorized sincos. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVeN8vvv_sincos) /* IFUNC resolver: returns in %rax the address of the variant to use */ + .type _ZGVeN8vvv_sincos, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f /* forward reference to local label 1: skip the __init_cpu_features call when the KIND field is nonzero */ + call __init_cpu_features +1: leaq _ZGVeN8vvv_sincos_skx(%rip), %rax /* candidate: SKX variant */ + testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip) + jnz 3f /* AVX512DQ_Usable set: return the SKX variant */ +2: leaq _ZGVeN8vvv_sincos_knl(%rip), %rax /* candidate: KNL variant */ + testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip) + jnz 3f /* AVX512F_Usable set: return the KNL variant */ + leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax /* neither bit set: return the AVX2 wrapper */ +3: ret +END (_ZGVeN8vvv_sincos) + /* Rename the symbol so the file included next presumably provides the _avx2_wrapper fallback referenced above. */ +#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper +#include "../svml_d_sincos8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S new file mode 100644 index 0000000..fcbf393 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S @@ -0,0 +1,593 @@ +/* Function sincos vectorized with AVX-512. KNL and SKX versions. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_sincos_data.h" +#include "svml_d_wrapper_impl.h" + +/* + ALGORITHM DESCRIPTION: + + ( low accuracy ( < 4ulp ) or enhanced performance + ( half of correct mantissa ) implementation ) + + Argument representation: + arg = N*Pi + R + + Result calculation: + sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R) + arg + Pi/2 = (N'*Pi + R') + cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R') + sin(R), sin(R') are approximated by corresponding polynomial. 
*/ + + .text +ENTRY (_ZGVeN8vvv_sincos_knl) +#ifndef HAVE_AVX512_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_dsincos_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm4 + movq $-1, %rdx + vmovups __dSignMask(%rax), %zmm12 + vmovups __dInvPI(%rax), %zmm5 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vpandnq %zmm4, %zmm12, %zmm3 + vmovups __dPI1_FMA(%rax), %zmm7 + vmovups __dPI3_FMA(%rax), %zmm9 + +/* SinR = X' - SinN*Pi1 */ + vmovaps %zmm3, %zmm8 + +/* CosR = X' - CosN*Pi1 */ + vmovaps %zmm3, %zmm10 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5 + vmovups __dC6(%rax), %zmm13 + +/* SinN = Y - RS : right shifter sub */ + vsubpd __dRShifter(%rax), %zmm5, %zmm1 + vmovaps %zmm13, %zmm14 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm5, %zmm2 + vcmppd $22, __dRangeVal(%rax), %zmm3, %k1 + +/* Update CosRSign and CosSignRes signs */ + vmovaps %zmm12, %zmm5 + vfnmadd231pd %zmm1, %zmm7, %zmm8 + +/* SinR = SinR - SinN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vpandq %zmm8, %zmm12, %zmm11 + +/* Set SinRSign to 0.5 */ + vporq __dOneHalf(%rax), %zmm11, %zmm6 + vpternlogq $150, %zmm2, %zmm11, %zmm5 + +/* Update sign SinSignRes */ + vpternlogq $120, %zmm4, %zmm12, %zmm2 + +/* Polynomial approximation */ + vmovups __dC7(%rax), %zmm11 + +/* CosN = SinN +(-)0.5 */ + vaddpd %zmm6, %zmm1, %zmm0 + +/* SinR = SinR - SinN*Pi3 */ + vfnmadd213pd %zmm8, %zmm9, %zmm1 + vfnmadd231pd %zmm0, %zmm7, %zmm10 + +/* SinR2 = SinR^2 */ + vmulpd %zmm1, %zmm1, %zmm15 + +/* Grab SignX + CosR = CosR - CosN*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10 + vfmadd231pd __dC7(%rax), %zmm15, %zmm14 + +/* CosR = CosR - CosN*Pi3 */ + vfnmadd213pd 
%zmm10, %zmm9, %zmm0 + vfmadd213pd __dC5(%rax), %zmm15, %zmm14 + +/* CosR2 = CosR^2 */ + vmulpd %zmm0, %zmm0, %zmm12 + vfmadd213pd __dC4(%rax), %zmm15, %zmm14 + vfmadd213pd %zmm13, %zmm12, %zmm11 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm15, %zmm14 + vfmadd213pd __dC5(%rax), %zmm12, %zmm11 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd __dC2(%rax), %zmm15, %zmm14 + vfmadd213pd __dC4(%rax), %zmm12, %zmm11 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd __dC1(%rax), %zmm15, %zmm14 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm12, %zmm11 + +/* SinPoly = SinR2*SinPoly */ + vmulpd %zmm15, %zmm14, %zmm13 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd __dC2(%rax), %zmm12, %zmm11 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm1, %zmm1, %zmm13 + vpbroadcastq %rdx, %zmm1{%k1}{z} + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd __dC1(%rax), %zmm12, %zmm11 + vptestmq %zmm1, %zmm1, %k0 + kmovw %k0, %ecx + +/* CosPoly = CosR2*CosPoly */ + vmulpd %zmm12, %zmm11, %zmm14 + movzbl %cl, %ecx + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %zmm0, %zmm0, %zmm14 + +/* Final reconstruction. 
+ Update Sin result's sign */ + vpxorq %zmm2, %zmm13, %zmm0 + +/* Update Cos result's sign */ + vpxorq %zmm5, %zmm14, %zmm2 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: /* Common exit: store sin through %rdi and cos through %rsi. NOTE(review): the vvv suffix suggests vector-typed arguments under the Vector ABI, yet scalar pointers are used here -- confirm. */ + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm2, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_1_3: /* Slow path, entered when the range-check mask in %ecx is nonzero: spill the input and both fast-path results, then redo each flagged lane with scalar sin and cos. */ + cfi_restore_state + vmovups %zmm4, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm2, 1280(%rsp) + je .LBL_1_2 /* NOTE(review): tests the flags of the testl before the jne that leads here; presumably the stores in between leave flags untouched, so this looks never taken -- confirm. */ + + xorb %dl, %dl + kmovw %k4, 1048(%rsp) + xorl %eax, %eax + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_1_6: /* %r12b counts lane pairs, %r13d is the even lane's bit index in the mask held in %r14d */ + btl %r13d, %r14d + jc .LBL_1_13 + +.LBL_1_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_1_10 + +.LBL_1_8: + addb $1, %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_1_6 + + movq %rbx, %rdi + kmovw 1048(%rsp), %k4 + movq 1056(%rsp), %rsi + kmovw 1040(%rsp), %k5 + movq 1096(%rsp), %r12 + cfi_restore (%r12) + kmovw 1032(%rsp), %k6 + movq 1088(%rsp), %r13 + cfi_restore (%r13) + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + 
vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm2 + jmp .LBL_1_2 + +.LBL_1_10: /* odd lane of the pair: input at 1160+16*pair, sin to 1224+16*pair, cos to 1288+16*pair */ + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + + call sin@PLT + + vmovsd %xmm0, 1224(%rsp,%r15) + vmovsd 1160(%rsp,%r15), %xmm0 + + call cos@PLT + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_13: /* even lane of the pair: input at 1152+16*pair, sin to 1216+16*pair, cos to 1280+16*pair */ + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + + call sin@PLT + + vmovsd %xmm0, 1216(%rsp,%r15) + vmovsd 1152(%rsp,%r15), %xmm0 + + call cos@PLT + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_1_7 + +#endif +END (_ZGVeN8vvv_sincos_knl) + +ENTRY (_ZGVeN8vvv_sincos_skx) +#ifndef HAVE_AVX512_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos +#else + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $1344, %rsp + movq __svml_dsincos_data@GOTPCREL(%rip), %rax + vmovaps %zmm0, %zmm8 + vmovups __dSignMask(%rax), %zmm4 + vmovups __dInvPI(%rax), %zmm9 + vmovups __dRShifter(%rax), %zmm10 + vmovups __dPI1_FMA(%rax), %zmm13 + vmovups __dPI2_FMA(%rax), %zmm14 + vmovups __dOneHalf(%rax), %zmm11 + vmovups __dPI3_FMA(%rax), %zmm2 + +/* ARGUMENT RANGE REDUCTION: + Absolute argument: X' = |X| */ + vandnpd %zmm8, %zmm4, %zmm7 + +/* SinY = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm10, %zmm7, %zmm9 + vcmppd $18, __dRangeVal(%rax), %zmm7, %k1 + +/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm9, %zmm6 + +/* SinN = Y - RS : right shifter sub */ + vsubpd %zmm10, %zmm9, %zmm5 + vmovups 
__dC5(%rax), %zmm9 + vmovups __dC4(%rax), %zmm10 + +/* SinR = X' - SinN*Pi1 */ + vmovaps %zmm7, %zmm15 + vfnmadd231pd %zmm5, %zmm13, %zmm15 + +/* SinR = SinR - SinN*Pi2 */ + vfnmadd231pd %zmm5, %zmm14, %zmm15 + +/* Sine result sign: SinRSign = SignMask & SinR */ + vandpd %zmm15, %zmm4, %zmm1 + +/* Apply SinRSign to 0.5, giving +(-)0.5 with the sign of SinR */ + vorpd %zmm1, %zmm11, %zmm12 + vmovups __dC3(%rax), %zmm11 /* zmm11 is reused for C3 from here on */ + +/* CosN = SinN +(-)0.5 */ + vaddpd %zmm12, %zmm5, %zmm3 + +/* SinR = SinR - SinN*Pi3 (result replaces SinN in zmm5) */ + vfnmadd213pd %zmm15, %zmm2, %zmm5 + vmovups __dC2(%rax), %zmm12 + +/* SinR2 = SinR^2 */ + vmulpd %zmm5, %zmm5, %zmm15 + +/* CosR = X' - CosN*Pi1 */ + vmovaps %zmm7, %zmm0 + vfnmadd231pd %zmm3, %zmm13, %zmm0 + vmovups __dC1(%rax), %zmm13 + +/* Grab SignX + CosR = CosR - CosN*Pi2 */ + vfnmadd231pd %zmm3, %zmm14, %zmm0 + +/* CosR = CosR - CosN*Pi3 (result replaces CosN in zmm3) */ + vfnmadd213pd %zmm0, %zmm2, %zmm3 + +/* Polynomial approximation */ + vmovups __dC7(%rax), %zmm0 + +/* Update CosRSign and CosSignRes signs: imm8 150 (0x96) is a three-way XOR, so zmm2 = SignMask ^ SinRSign ^ SinSignRes */ + vmovaps %zmm4, %zmm2 + vpternlogq $150, %zmm6, %zmm1, %zmm2 + +/* Update sign SinSignRes: imm8 120 (0x78) is A ^ (B & C), so zmm6 = SinSignRes ^ (SignMask & X) */ + vpternlogq $120, %zmm8, %zmm4, %zmm6 + +/* CosR2 = CosR^2 */ + vmulpd %zmm3, %zmm3, %zmm1 + vmovups __dC6(%rax), %zmm4 + vmovaps %zmm0, %zmm14 /* both Horner chains start from C7: zmm0 for Cos, zmm14 for Sin */ + vfmadd213pd %zmm4, %zmm1, %zmm0 + vfmadd213pd %zmm4, %zmm15, %zmm14 + vfmadd213pd %zmm9, %zmm1, %zmm0 + vfmadd213pd %zmm9, %zmm15, %zmm14 + vfmadd213pd %zmm10, %zmm1, %zmm0 + vfmadd213pd %zmm10, %zmm15, %zmm14 + +/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */ + vfmadd213pd %zmm11, %zmm1, %zmm0 + +/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */ + vfmadd213pd %zmm11, %zmm15, %zmm14 + +/* CosPoly = C2 + CosR2*CosPoly */ + vfmadd213pd %zmm12, %zmm1, %zmm0 + +/* SinPoly = C2 + SinR2*SinPoly */ + vfmadd213pd %zmm12, %zmm15, %zmm14 + +/* CosPoly = C1 + CosR2*CosPoly */ + vfmadd213pd %zmm13, %zmm1, %zmm0 + +/* SinPoly = C1 + SinR2*SinPoly */ + vfmadd213pd %zmm13, %zmm15, %zmm14 + +/* CosPoly = CosR2*CosPoly */ + vmulpd %zmm1, %zmm0, %zmm1 + 
+/* SinPoly = SinR2*SinPoly */ + vmulpd %zmm15, %zmm14, %zmm4 + +/* CosPoly = CosR*CosPoly */ + vfmadd213pd %zmm3, %zmm3, %zmm1 + +/* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 + vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 + +/* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 + +/* Final reconstruction. + Update Sin result's sign */ + vxorpd %zmm6, %zmm4, %zmm0 + vpandnq %zmm7, %zmm7, %zmm3{%k1} + vcmppd $3, %zmm3, %zmm3, %k0 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovups %zmm0, (%rdi) + vmovups %zmm1, (%rsi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm8, 1152(%rsp) + vmovups %zmm0, 1216(%rsp) + vmovups %zmm1, 1280(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %eax, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %ecx, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + movq %rbx, 1064(%rsp) + movq %rdi, %rbx + cfi_remember_state + +.LBL_2_6: + btl %r13d, %r14d + jc .LBL_2_13 + +.LBL_2_7: + lea 1(%r13), %esi + btl %esi, %r14d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r13d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + 
movq %rbx, %rdi + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm0 + vmovups 1280(%rsp), %zmm1 + movq 1056(%rsp), %rsi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + movq 1064(%rsp), %rbx + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call sin@PLT + + vmovsd %xmm0, 1224(%rsp,%r15) + vmovsd 1160(%rsp,%r15), %xmm0 + + call cos@PLT + + vmovsd %xmm0, 1288(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_13: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call sin@PLT + + vmovsd %xmm0, 1216(%rsp,%r15) + vmovsd 1152(%rsp,%r15), %xmm0 + + call cos@PLT + + vmovsd %xmm0, 1280(%rsp,%r15) + jmp .LBL_2_7 + +#endif +END (_ZGVeN8vvv_sincos_skx) + + .section .rodata, "a" +.L_2il0floatpacket.15: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.15,@object diff --git a/sysdeps/x86_64/fpu/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/svml_d_sincos2_core.S new file mode 100644 index 0000000..bd089e1 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_sincos2_core.S @@ -0,0 +1,29 @@ +/* Function sincos vectorized with SSE2. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2vvv_sincos) +WRAPPER_IMPL_SSE2_fFF sincos +END (_ZGVbN2vvv_sincos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2vvv_sincos) +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/svml_d_sincos4_core.S new file mode 100644 index 0000000..d67cd30 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_sincos4_core.S @@ -0,0 +1,29 @@ +/* Function sincos vectorized with AVX2, wrapper version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4vvv_sincos) +WRAPPER_IMPL_AVX_fFF _ZGVbN2vvv_sincos +END (_ZGVdN4vvv_sincos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4vvv_sincos) +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S new file mode 100644 index 0000000..4f3f15a --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S @@ -0,0 +1,25 @@ +/* Function sincos vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4vvv_sincos) +WRAPPER_IMPL_AVX_fFF _ZGVbN2vvv_sincos +END (_ZGVcN4vvv_sincos) diff --git a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S new file mode 100644 index 0000000..e7f7121 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S @@ -0,0 +1,25 @@ +/* Function sincos vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8vvv_sincos) +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos +END (_ZGVeN8vvv_sincos) diff --git a/sysdeps/x86_64/fpu/svml_d_sincos_data.S b/sysdeps/x86_64/fpu/svml_d_sincos_data.S new file mode 100644 index 0000000..6749ba6 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_sincos_data.S @@ -0,0 +1,111 @@ +/* Data for function sincos. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "svml_d_sincos_data.h" + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function sincos. 
+ The table may contain polynomial, reduction, lookup coefficients and + other coefficients obtained through different methods of research and + experimental work. */ + + .globl __svml_dsincos_data +__svml_dsincos_data: + +/* General purpose constants: + dSignMask */ +double_vector __dSignMask 0x8000000000000000 + +/* dAbsMask */ +double_vector __dAbsMask 0x7fffffffffffffff + +/* dRangeVal (bit pattern of 2^23) */ +double_vector __dRangeVal 0x4160000000000000 + +/* HalfPI */ +double_vector __dHalfPI 0x3ff921fb54442d18 + +/* InvPI */ +double_vector __dInvPI 0x3fd45f306dc9c883 + +/* dRShifter (bit pattern of 1.5 * 2^52) */ +double_vector __dRShifter 0x4338000000000000 + +/* dOneHalf (bit pattern of 0.5) */ +double_vector __dOneHalf 0x3fe0000000000000 + +/* Range reduction PI-based constants: + PI1 */ +double_vector __dPI1 0x400921fb40000000 + +/* PI2 */ +double_vector __dPI2 0x3e84442d00000000 + +/* PI3 */ +double_vector __dPI3 0x3d08469880000000 + +/* PI4 */ +double_vector __dPI4 0x3b88cc51701b839a + +/* Range reduction PI-based constants if FMA available: + PI1_FMA */ +double_vector __dPI1_FMA 0x400921fb54442d18 + +/* PI2_FMA */ +double_vector __dPI2_FMA 0x3ca1a62633145c06 + +/* PI3_FMA */ +double_vector __dPI3_FMA 0x395c1cd129024e09 + +/* HalfPI1 */ +double_vector __dHalfPI1 0x3ff921fc00000000 + +/* HalfPI2 */ +double_vector __dHalfPI2 0xbea5777a00000000 + +/* HalfPI3 */ +double_vector __dHalfPI3 0xbd473dcc00000000 + +/* HalfPI4 */ +double_vector __dHalfPI4 0x3bf898cc51701b84 + +/* Polynomial coefficients (relative error 2^(-52.115)): + C1 */ +double_vector __dC1 0xbfc55555555554a7 + +/* C2 */ +double_vector __dC2 0x3f8111111110a4a8 + +/* C3 */ +double_vector __dC3 0xbf2a01a019a5b86d + +/* C4 */ +double_vector __dC4 0x3ec71de38030fea0 + +/* C5 */ +double_vector __dC5 0xbe5ae63546002231 + +/* C6 */ +double_vector __dC6 0x3de60e6857a2f220 + +/* C7 */ +double_vector __dC7 0xbd69f0d60811aac8 + .type __svml_dsincos_data,@object + .size __svml_dsincos_data,.-__svml_dsincos_data diff --git a/sysdeps/x86_64/fpu/svml_d_sincos_data.h 
b/sysdeps/x86_64/fpu/svml_d_sincos_data.h new file mode 100644 index 0000000..cc316dc --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_sincos_data.h @@ -0,0 +1,57 @@ +/* Offsets for data table for function sincos. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef D_SINCOS_DATA_H +#define D_SINCOS_DATA_H + +#define __dSignMask 0 +#define __dAbsMask 64 +#define __dRangeVal 128 +#define __dHalfPI 192 +#define __dInvPI 256 +#define __dRShifter 320 +#define __dOneHalf 384 +#define __dPI1 448 +#define __dPI2 512 +#define __dPI3 576 +#define __dPI4 640 +#define __dPI1_FMA 704 +#define __dPI2_FMA 768 +#define __dPI3_FMA 832 +#define __dHalfPI1 896 +#define __dHalfPI2 960 +#define __dHalfPI3 1024 +#define __dHalfPI4 1088 +#define __dC1 1152 +#define __dC2 1216 +#define __dC3 1280 +#define __dC4 1344 +#define __dC5 1408 +#define __dC6 1472 +#define __dC7 1536 + +.macro double_vector offset value +.if .-__svml_dsincos_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h index 25465cd..bd93b8e 100644 --- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h +++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h @@ -54,6 +54,47 @@ ret .endm +/* 3 argument 
SSE2 ISA version as wrapper to scalar. Takes the input vector in %xmm0 and two output pointers in %rdi and %rsi, and calls CALLEE once per lane. */ +.macro WRAPPER_IMPL_SSE2_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + pushq %rbx + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbx, 0) + movq %rdi, %rbp /* %rbp = first output pointer, kept across both calls */ + movq %rsi, %rbx /* %rbx = second output pointer, kept across both calls */ + subq $40, %rsp /* scratch: input vector at 0(%rsp), scalar result slots at 16(%rsp) and 24(%rsp) */ + cfi_adjust_cfa_offset(40) + leaq 16(%rsp), %rsi + leaq 24(%rsp), %rdi + movaps %xmm0, (%rsp) /* spill both input lanes */ + call \callee@PLT /* lane 0: callee is expected to store through %rdi (24(%rsp)) and %rsi (16(%rsp)) */ + leaq 16(%rsp), %rsi /* pointer arguments are recomputed for the second call */ + leaq 24(%rsp), %rdi + movsd 24(%rsp), %xmm0 + movapd (%rsp), %xmm1 + movsd %xmm0, 0(%rbp) /* lane 0 value from the %rdi slot -> first output, element 0 */ + unpckhpd %xmm1, %xmm1 /* bring input lane 1 down to the low half */ + movsd 16(%rsp), %xmm0 + movsd %xmm0, (%rbx) /* lane 0 value from the %rsi slot -> second output, element 0 */ + movapd %xmm1, %xmm0 + call \callee@PLT /* lane 1, same scratch slots */ + movsd 24(%rsp), %xmm0 + movsd %xmm0, 8(%rbp) /* first output, element 1 */ + movsd 16(%rsp), %xmm0 + movsd %xmm0, 8(%rbx) /* second output, element 1 */ + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + popq %rbx + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbx) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + /* AVX/AVX2 ISA version as wrapper to SSE ISA version. */ .macro WRAPPER_IMPL_AVX callee pushq %rbp @@ -108,6 +149,49 @@ ret .endm +/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version. 
Takes the input vector in %ymm0 and two output pointers in %rdi and %rsi; results are copied to the caller's arrays with unaligned stores so no alignment beyond that of double is required. */ +.macro WRAPPER_IMPL_AVX_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp /* realign so the 16-byte scratch slots below can use aligned moves */ + pushq %r13 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r13, 0) /* NOTE(review): the CFA is %rbp-based at this point, so these push adjustments look suspect -- confirm the unwind info */ + pushq %r14 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r14, 0) + subq $48, %rsp /* scratch: two 16-byte result slots at 0 and 16, upper input half at 32 */ + movq %rsi, %r14 /* %r14 = second output pointer, kept across both calls */ + movq %rdi, %r13 /* %r13 = first output pointer, kept across both calls */ + vextractf128 $1, %ymm0, 32(%rsp) /* save the upper two lanes for the second call */ + vzeroupper + call HIDDEN_JUMPTARGET(\callee) /* lower two lanes; %rdi/%rsi still hold the caller's pointers */ + vmovaps 32(%rsp), %xmm0 + lea (%rsp), %rdi + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) /* upper two lanes; results go to the scratch slots */ + vmovapd (%rsp), %xmm0 + vmovapd 16(%rsp), %xmm1 + vmovupd %xmm0, 16(%r13) /* unaligned store: the caller's pointer may be only 8-byte aligned */ + vmovupd %xmm1, 16(%r14) /* unaligned store, as above */ + addq $48, %rsp + popq %r14 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r14) + popq %r13 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r13) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + /* AVX512 ISA version as wrapper to AVX2 ISA version. */ .macro WRAPPER_IMPL_AVX512 callee pushq %rbp @@ -209,3 +293,89 @@ cfi_restore (%rbp) ret .endm + +/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version. Input vector in %zmm0, output pointers in %rdi and %rsi; instructions are hand-encoded as bytes, and results are copied to the caller's arrays with unaligned stores. */ +.macro WRAPPER_IMPL_AVX512_fFF callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp /* realign so the scratch area below can use aligned moves */ + pushq %r12 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r12, 0) /* NOTE(review): the CFA is %rbp-based at this point, so these push adjustments look suspect -- confirm the unwind info */ + pushq %r13 + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%r13, 0) + subq $176, %rsp /* scratch: input vector at 0..63, 32-byte result slots at 64 and 96 */ + movq %rsi, %r13 /* %r13 = second output pointer, kept across both calls */ +/* Below is encoding for vmovaps %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x29 + .byte 0x04 + .byte 0x24 + movq %rdi, %r12 /* %r12 = first output pointer, kept across both calls */ +/* Below is encoding for vmovapd (%rsp), %ymm0. */ + .byte 0xc5 + .byte 0xfd + .byte 0x28 + .byte 0x04 + .byte 0x24 + call HIDDEN_JUMPTARGET(\callee) /* lower four lanes; %rdi/%rsi still hold the caller's pointers */ +/* Below is encoding for vmovapd 32(%rsp), %ymm0. 
*/ + .byte 0xc5 + .byte 0xfd + .byte 0x28 + .byte 0x44 + .byte 0x24 + .byte 0x20 + lea 64(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) /* upper four lanes; results go to the scratch slots */ +/* Below is encoding for vmovapd 64(%rsp), %ymm0. */ + .byte 0xc5 + .byte 0xfd + .byte 0x28 + .byte 0x44 + .byte 0x24 + .byte 0x40 +/* Below is encoding for vmovapd 96(%rsp), %ymm1. */ + .byte 0xc5 + .byte 0xfd + .byte 0x28 + .byte 0x4c + .byte 0x24 + .byte 0x60 +/* Below is encoding for vmovupd %ymm0, 32(%r12) (unaligned: the caller's pointer may be only 8-byte aligned). */ + .byte 0xc4 + .byte 0xc1 + .byte 0x7d + .byte 0x11 + .byte 0x44 + .byte 0x24 + .byte 0x20 +/* Below is encoding for vmovupd %ymm1, 32(%r13) (unaligned, as above). */ + .byte 0xc4 + .byte 0xc1 + .byte 0x7d + .byte 0x11 + .byte 0x4d + .byte 0x20 + addq $176, %rsp + popq %r13 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r13) + popq %r12 + cfi_adjust_cfa_offset (-8) + cfi_restore (%r12) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c index d7897aa..806bd99 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c @@ -24,6 +24,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin) +VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow) diff --git a/sysdeps/x86_64/fpu/test-double-vlen2.c b/sysdeps/x86_64/fpu/test-double-vlen2.c index 83cab4d..2b68964 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen2.c +++ b/sysdeps/x86_64/fpu/test-double-vlen2.c @@ -20,6 +20,7 @@ #define TEST_VECTOR_cos 1 #define TEST_VECTOR_sin 1 +#define TEST_VECTOR_sincos 1 #define TEST_VECTOR_log 1 #define TEST_VECTOR_exp 1 #define TEST_VECTOR_pow 1 diff --git 
a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c index 82159ce..a12b897 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVdN4v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin) +VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVdN4vvv_sincos) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow) diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c index 771dc89..56723ab 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c @@ -23,6 +23,7 @@ #define TEST_VECTOR_cos 1 #define TEST_VECTOR_sin 1 +#define TEST_VECTOR_sincos 1 #define TEST_VECTOR_log 1 #define TEST_VECTOR_exp 1 #define TEST_VECTOR_pow 1 diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c index 9421a59..7525af7 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c @@ -24,6 +24,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVcN4v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin) +VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow) diff --git a/sysdeps/x86_64/fpu/test-double-vlen4.c b/sysdeps/x86_64/fpu/test-double-vlen4.c index 8da28c8..679397f 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4.c @@ -20,6 +20,7 @@ #define TEST_VECTOR_cos 1 #define TEST_VECTOR_sin 1 +#define TEST_VECTOR_sincos 1 #define TEST_VECTOR_log 1 #define TEST_VECTOR_exp 1 #define TEST_VECTOR_pow 1 diff --git 
a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c index 4bdcdd4..fcc4fa4 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c @@ -24,6 +24,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVeN8v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin) +VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVeN8vvv_sincos) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow) diff --git a/sysdeps/x86_64/fpu/test-double-vlen8.c b/sysdeps/x86_64/fpu/test-double-vlen8.c index bf832b1..1e23b83 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen8.c +++ b/sysdeps/x86_64/fpu/test-double-vlen8.c @@ -20,6 +20,7 @@ #define TEST_VECTOR_cos 1 #define TEST_VECTOR_sin 1 +#define TEST_VECTOR_sincos 1 #define TEST_VECTOR_log 1 #define TEST_VECTOR_exp 1 #define TEST_VECTOR_pow 1 |