author     Sunil K Pandey <skpgkp2@gmail.com>   2022-01-12 11:02:19 -0800
committer  Sunil K Pandey <skpgkp2@gmail.com>   2022-01-12 13:23:22 -0800
commit     49e2bf58d57758df244eb621d63cedd2ab6d1971 (patch)
tree       0fc8b8525c5e1efdc8f125c7a050e505ea7e39c9
parent     fcfc9086815bf0d277ad47a90ee3fda4c37acca8 (diff)
x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]
This patch fixes the accuracy of the SSE4.2 libmvec atan2 function for the
following inputs, bringing the error below 4 ulps:

    {0x1.bcab29da0e947p-54,  0x1.bc41f4d2294b8p-54}   4.19888 ulps
    {0x1.b836ed678be29p-588, 0x1.b7be6f5a03a8cp-588}  4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S | 321
1 file changed, 173 insertions(+), 148 deletions(-)
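
For context, the ulp figures quoted above can be reproduced approximately with a
small stand-alone check. The sketch below is not the glibc test harness: it
hand-declares the 2-lane libmvec entry point _ZGVbN2vv_atan2 (name and signature
assumed from the x86_64 vector ABI; the SSE4.2 body patched here is only selected
by the ifunc dispatcher on hardware without wider vector variants), and it measures
error against a long double atan2l reference rather than the MPFR-based reference
used by glibc's own tests. Build with something like "gcc ulp-check.c -lmvec -lm".

/* Rough ulp check for the two inputs quoted in the commit message.
   Assumes a glibc with libmvec; build: gcc ulp-check.c -lmvec -lm  */
#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

/* Hand-declared 2-lane vector variant exported by libmvec; the SSE4.2
   implementation patched above is one of the bodies behind it.  */
extern __m128d _ZGVbN2vv_atan2 (__m128d y, __m128d x);

/* Error of GOT in ulps, measured against a long double reference.  */
static double
ulp_err (double got, double y, double x)
{
  long double ref = atan2l ((long double) y, (long double) x);
  double one_ulp = nextafter ((double) ref, INFINITY) - (double) ref;
  return (double) (fabsl ((long double) got - ref) / one_ulp);
}

int
main (void)
{
  double y[2] = { 0x1.bcab29da0e947p-54, 0x1.b836ed678be29p-588 };
  double x[2] = { 0x1.bc41f4d2294b8p-54, 0x1.b7be6f5a03a8cp-588 };
  double r[2];

  _mm_storeu_pd (r, _ZGVbN2vv_atan2 (_mm_loadu_pd (y), _mm_loadu_pd (x)));
  for (int i = 0; i < 2; i++)
    printf ("atan2 (%a, %a) = %a  (%.5f ulp)\n",
            y[i], x[i], r[i], ulp_err (r[i], y[i], x[i]));
  return 0;
}
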
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
index 4983051..138ff2f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
@@ -65,7 +65,7 @@
ENTRY(_ZGVbN2vv_atan2_sse4)
subq $88, %rsp
cfi_def_cfa_offset(96)
- movaps %xmm0, %xmm8
+ movaps %xmm1, %xmm11
/*
* #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
* Cannot be replaced by VQRCP(D, dR0, dB);
* Argument Absolute values
*/
- movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
+ movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
+ movaps %xmm0, %xmm10
movaps %xmm1, %xmm9
- movaps %xmm4, %xmm1
- andps %xmm8, %xmm4
- andps %xmm9, %xmm1
- movaps %xmm4, %xmm2
- cmpnltpd %xmm1, %xmm2
+ andps %xmm10, %xmm1
+ andps %xmm11, %xmm9
+ movaps %xmm1, %xmm4
+ cmpnltpd %xmm9, %xmm4
/* Argument signs */
- movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
- movaps %xmm2, %xmm0
- movups dPIO2+__svml_datan2_data_internal(%rip), %xmm5
- movaps %xmm3, %xmm7
- movaps %xmm3, %xmm6
+ movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
+ movaps %xmm4, %xmm0
+ movaps %xmm5, %xmm8
+ movaps %xmm5, %xmm7
/*
* 1) If y<x then a= y, b=x, PIO2=0
* 2) If y>x then a=-x, b=y, PIO2=Pi/2
*/
- orps %xmm1, %xmm3
- movaps %xmm2, %xmm10
- andps %xmm2, %xmm5
- andnps %xmm4, %xmm0
- andps %xmm2, %xmm3
- andnps %xmm1, %xmm10
- andps %xmm4, %xmm2
- orps %xmm3, %xmm0
- orps %xmm2, %xmm10
- divpd %xmm10, %xmm0
- movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
-
-/* if x<0, dPI = Pi, else dPI =0 */
- movaps %xmm9, %xmm3
+ orps %xmm9, %xmm5
+ andnps %xmm1, %xmm0
+ andps %xmm4, %xmm5
+ andps %xmm11, %xmm8
+ movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
+ orps %xmm5, %xmm0
+ movaps %xmm4, %xmm5
+ andps %xmm4, %xmm6
+ andnps %xmm9, %xmm5
+ andps %xmm1, %xmm4
+ orps %xmm4, %xmm5
+ andps %xmm10, %xmm7
+ divpd %xmm5, %xmm0
+ movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
+ xorl %edx, %edx
/* Check if y and x are on main path. */
- pshufd $221, %xmm1, %xmm12
- andps %xmm9, %xmm7
- psubd %xmm11, %xmm12
- andps %xmm8, %xmm6
- movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
- xorl %edx, %edx
- movups %xmm4, 16(%rsp)
+ pshufd $221, %xmm9, %xmm3
xorl %eax, %eax
- pshufd $221, %xmm4, %xmm14
- movdqa %xmm12, %xmm4
- pcmpgtd %xmm13, %xmm4
- pcmpeqd %xmm13, %xmm12
- por %xmm12, %xmm4
+ pshufd $221, %xmm1, %xmm13
+ psubd %xmm2, %xmm3
+ psubd %xmm2, %xmm13
+ movdqa %xmm3, %xmm4
+ movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
+ movdqa %xmm13, %xmm14
+ pcmpgtd %xmm12, %xmm4
+ pcmpeqd %xmm12, %xmm3
+ pcmpgtd %xmm12, %xmm14
+ pcmpeqd %xmm12, %xmm13
/* Polynomial. */
movaps %xmm0, %xmm12
+ por %xmm3, %xmm4
mulpd %xmm0, %xmm12
- cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
- psubd %xmm11, %xmm14
- movdqa %xmm14, %xmm15
- pcmpeqd %xmm13, %xmm14
- pcmpgtd %xmm13, %xmm15
- por %xmm14, %xmm15
- movaps %xmm12, %xmm14
- mulpd %xmm12, %xmm14
- por %xmm15, %xmm4
- movaps %xmm14, %xmm15
- mulpd %xmm14, %xmm15
- movmskps %xmm4, %ecx
- movups %xmm10, (%rsp)
- movups dA19+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm10
- movups dA18+__svml_datan2_data_internal(%rip), %xmm13
- movups dA17+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA15+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA14+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA13+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA11+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA10+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA09+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA07+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA06+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA05+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA03+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm12, %xmm10
- addpd dA02+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA01+__svml_datan2_data_internal(%rip), %xmm11
- addpd %xmm10, %xmm13
- mulpd %xmm11, %xmm12
- mulpd %xmm13, %xmm14
- movups dA16+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA12+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA08+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA04+__svml_datan2_data_internal(%rip), %xmm2
-
-/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
- mulpd %xmm2, %xmm15
- addpd %xmm12, %xmm15
- addpd %xmm14, %xmm15
+
+/* P = A19*R2 + A18 */
+ movups dA19+__svml_datan2_data_internal(%rip), %xmm15
+ movaps %xmm11, %xmm2
+ mulpd %xmm12, %xmm15
+ addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A17 */
+ mulpd %xmm12, %xmm15
+ addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A16 */
+ mulpd %xmm12, %xmm15
+ addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A15 */
+ mulpd %xmm12, %xmm15
+ addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A14 */
+ mulpd %xmm12, %xmm15
+ addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A13 */
+ mulpd %xmm12, %xmm15
+ addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A12 */
+ mulpd %xmm12, %xmm15
+ addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A11 */
+ mulpd %xmm12, %xmm15
+ addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A10 */
+ mulpd %xmm12, %xmm15
+ addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A09 */
+ mulpd %xmm12, %xmm15
+ addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A08 */
+ mulpd %xmm12, %xmm15
+ addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A07 */
+ mulpd %xmm12, %xmm15
+ addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A06 */
+ mulpd %xmm12, %xmm15
+ addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A05 */
+ mulpd %xmm12, %xmm15
+ addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A04 */
+ mulpd %xmm12, %xmm15
+ addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A03 */
+ mulpd %xmm12, %xmm15
+ addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A02 */
+ mulpd %xmm12, %xmm15
+ addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A01 */
+ mulpd %xmm12, %xmm15
+ addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 */
+ mulpd %xmm15, %xmm12
/*
* Reconstruction.
* dP=(R+R*dP) + dPIO2
*/
- mulpd %xmm0, %xmm15
- addpd %xmm15, %xmm0
- addpd %xmm5, %xmm0
- andps __svml_datan2_data_internal(%rip), %xmm3
+ mulpd %xmm0, %xmm12
+ addpd %xmm12, %xmm0
+
+/* if x<0, dPI = Pi, else dPI =0 */
+ movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
+ por %xmm13, %xmm14
+ cmplepd %xmm3, %xmm2
+ addpd %xmm6, %xmm0
+ andps __svml_datan2_data_internal(%rip), %xmm2
+ orps %xmm8, %xmm0
+ addpd %xmm2, %xmm0
+ por %xmm14, %xmm4
orps %xmm7, %xmm0
- addpd %xmm3, %xmm0
+ movmskps %xmm4, %ecx
/* Special branch for fast (vector) processing of zero arguments */
- movups 16(%rsp), %xmm11
- orps %xmm6, %xmm0
testb $3, %cl
/* Go to auxilary branch */
jne L(AUX_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
/* Return from auxilary branch
* for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
/* Restore registers
* and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
*/
L(SPECIAL_VALUES_BRANCH):
- movups %xmm8, 32(%rsp)
- movups %xmm9, 48(%rsp)
+ movups %xmm10, 32(%rsp)
+ movups %xmm11, 48(%rsp)
movups %xmm0, 64(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
*/
L(AUX_BRANCH):
-/* Check if at least on of Y or Y is zero: iAXAYZERO */
- movups dZERO+__svml_datan2_data_internal(%rip), %xmm2
-
/* Check if both X & Y are not NaNs: iXYnotNAN */
- movaps %xmm9, %xmm12
- movaps %xmm8, %xmm10
- cmpordpd %xmm9, %xmm12
- cmpordpd %xmm8, %xmm10
- cmpeqpd %xmm2, %xmm1
- cmpeqpd %xmm2, %xmm11
- andps %xmm10, %xmm12
- orps %xmm11, %xmm1
- pshufd $221, %xmm1, %xmm1
- pshufd $221, %xmm12, %xmm11
+ movaps %xmm11, %xmm13
+ movaps %xmm10, %xmm12
+ cmpordpd %xmm11, %xmm13
+ cmpordpd %xmm10, %xmm12
-/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
- pand %xmm11, %xmm1
-
-/* Exclude from previous callout mask zero (and not NaN) arguments */
- movdqa %xmm1, %xmm13
- pandn %xmm4, %xmm13
+/* Check if at least on of Y or Y is zero: iAXAYZERO */
+ cmpeqpd %xmm3, %xmm9
+ cmpeqpd %xmm3, %xmm1
/*
* Path for zero arguments (at least one of both)
* Check if both args are zeros (den. is zero)
*/
- movups (%rsp), %xmm4
- cmpeqpd %xmm2, %xmm4
+ cmpeqpd %xmm3, %xmm5
+ andps %xmm12, %xmm13
+ orps %xmm1, %xmm9
+ pshufd $221, %xmm9, %xmm1
+ pshufd $221, %xmm13, %xmm9
-/* Go to callout */
- movmskps %xmm13, %edx
+/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
+ pand %xmm9, %xmm1
+
+/* Exclude from previous callout mask zero (and not NaN) arguments */
+ movdqa %xmm1, %xmm14
+ pandn %xmm4, %xmm14
/* Set sPIO2 to zero if den. is zero */
- movaps %xmm4, %xmm15
- andps %xmm2, %xmm4
- andnps %xmm5, %xmm15
- andl $3, %edx
- orps %xmm4, %xmm15
- pshufd $221, %xmm9, %xmm5
- orps %xmm7, %xmm15
+ movaps %xmm5, %xmm4
+ andnps %xmm6, %xmm4
+ andps %xmm3, %xmm5
/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
- pshufd $221, %xmm2, %xmm7
- pcmpgtd %xmm5, %xmm7
- pshufd $80, %xmm7, %xmm14
- andps %xmm3, %xmm14
- addpd %xmm14, %xmm15
+ pshufd $221, %xmm3, %xmm3
+ orps %xmm5, %xmm4
+ pshufd $221, %xmm11, %xmm5
+ orps %xmm8, %xmm4
+ pcmpgtd %xmm5, %xmm3
+ pshufd $80, %xmm3, %xmm6
+ andps %xmm2, %xmm6
+ addpd %xmm6, %xmm4
+
+/* Go to callout */
+ movmskps %xmm14, %edx
/* Merge results from main and spec path */
- pshufd $80, %xmm1, %xmm3
- orps %xmm6, %xmm15
- movdqa %xmm3, %xmm6
- andps %xmm3, %xmm15
- andnps %xmm0, %xmm6
- movaps %xmm6, %xmm0
- orps %xmm15, %xmm0
+ pshufd $80, %xmm1, %xmm2
+ orps %xmm7, %xmm4
+ movdqa %xmm2, %xmm7
+ andps %xmm2, %xmm4
+ andnps %xmm0, %xmm7
+ andl $3, %edx
+ movaps %xmm7, %xmm0
+ orps %xmm4, %xmm0
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
END(_ZGVbN2vv_atan2_sse4)
.section .rodata, "a"
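
For readers who do not want to trace the xmm register shuffling above: the visible
algorithmic change in this patch is that the degree-19 odd atan polynomial is now
evaluated with a single Horner chain in r2 = r*r, where the old code split it into
several interleaved partial chains; the reduction and reconstruction steps are
otherwise the same. The C sketch below mirrors that main path under stated
assumptions: the coefficients dA01..dA19 live in __svml_datan2_data_internal (not
shown in this hunk) and are therefore taken as a parameter, and the zero/NaN/
out-of-range inputs that go through the callout and auxiliary branches are omitted.

/* Structural sketch of the main path above, not a drop-in replacement:
   coefficient values and special-case handling are omitted.
   A[0] = dA01 ... A[18] = dA19.  */
#include <math.h>

static double
atan2_main_path (double y, double x, const double A[19])
{
  double ay = fabs (y), ax = fabs (x);
  double a, b, pio2;

  /* 1) If |y| <  |x| then a =  |y|, b = |x|, PIO2 = 0
     2) If |y| >= |x| then a = -|x|, b = |y|, PIO2 = Pi/2  */
  if (ay < ax)
    {
      a = ay;  b = ax;  pio2 = 0.0;
    }
  else
    {
      a = -ax; b = ay;  pio2 = M_PI_2;
    }

  double r = a / b;          /* reduced argument, |r| <= 1 */
  double r2 = r * r;

  /* Horner evaluation: P = ((A19*r2 + A18)*r2 + ...)*r2 + A01  */
  double p = A[18];
  for (int i = 17; i >= 0; i--)
    p = p * r2 + A[i];

  /* Reconstruction: atan(r) ~= r + r*(P*r2) (A00 == 1.0 is implicit),
     add PIO2, fold in Pi for x < 0, then apply the sign of y.  */
  double res = pio2 + (r + r * (p * r2));
  if (x < 0.0)
    res = M_PI - res;
  return copysign (res, y);
}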