/* Function acos vectorized with AVX2. Copyright (C) 2021 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see https://www.gnu.org/licenses/. */ /* * ALGORITHM DESCRIPTION: * * SelMask = (|x| >= 0.5) ? 1 : 0; * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) * */ /* Offsets for data table __svml_dacos_data_internal */ #define SgnBit 0 #define OneHalf 32 #define SmallNorm 64 #define MOne 96 #define Two 128 #define sqrt_coeff 160 #define poly_coeff 288 #define PiH 672 #define Pi2H 704 #include .text .section .text.avx2,"ax",@progbits ENTRY(_ZGVdN4v_acos_avx2) pushq %rbp cfi_def_cfa_offset(16) movq %rsp, %rbp cfi_def_cfa(6, 16) cfi_offset(6, -16) andq $-32, %rsp subq $96, %rsp vmovupd __svml_dacos_data_internal(%rip), %ymm6 vmovupd OneHalf+__svml_dacos_data_internal(%rip), %ymm7 vmovapd %ymm0, %ymm5 /* x = -|arg| */ vorpd %ymm5, %ymm6, %ymm4 /* Y = 0.5 + 0.5*(-x) */ vfmadd231pd %ymm4, %ymm7, %ymm7 /* x^2 */ vmulpd %ymm4, %ymm4, %ymm8 /* S ~ 2*sqrt(Y) */ vmovupd sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0 vcmplt_oqpd SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm12 vminpd %ymm7, %ymm8, %ymm2 /* NaN processed in special branch (so wind test passed) */ vcmpnge_uqpd MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm9 vcvtpd2ps %ymm7, %xmm10 vmovupd poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm8 vcmpnlt_uqpd %ymm7, %ymm2, %ymm1 vrsqrtps %xmm10, %xmm11 vfmadd213pd poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm8 vcvtps2pd %xmm11, %ymm13 vmovupd poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm11 vandnpd %ymm13, %ymm12, %ymm14 vmulpd %ymm14, %ymm14, %ymm15 vfmadd213pd poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm11 vmulpd %ymm2, %ymm2, %ymm13 vmovupd poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm12 vmulpd %ymm13, %ymm13, %ymm10 vfmadd213pd poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm12 vandpd %ymm5, %ymm6, %ymm3 vaddpd %ymm7, %ymm7, %ymm6 vmulpd %ymm6, %ymm14, %ymm7 vfmsub213pd Two+__svml_dacos_data_internal(%rip), %ymm15, %ymm6 vmovupd poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm14 vfmadd213pd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 vmulpd %ymm6, %ymm7, %ymm15 vfmadd213pd poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm14 vfmadd213pd sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 vfmadd213pd sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 /* polynomial */ vmovupd poly_coeff+__svml_dacos_data_internal(%rip), %ymm6 vfnmadd213pd %ymm7, %ymm15, %ymm0 vfmadd213pd poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm6 vblendvpd %ymm1, %ymm0, %ymm4, %ymm0 vfmadd213pd %ymm8, %ymm13, %ymm6 vmovmskpd %ymm9, %edx vmovupd poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm9 vfmadd213pd poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm9 vfmadd213pd %ymm9, %ymm13, %ymm11 vfmadd213pd %ymm11, %ymm10, %ymm6 vfmadd213pd %ymm12, %ymm13, %ymm6 vfmadd213pd %ymm14, %ymm13, %ymm6 vmulpd %ymm6, %ymm2, %ymm9 /* X