author     Sunil K Pandey <skpgkp2@gmail.com>   2022-01-12 11:02:19 -0800
committer  Sunil K Pandey <skpgkp2@gmail.com>   2022-01-12 13:23:22 -0800
commit     49e2bf58d57758df244eb621d63cedd2ab6d1971 (patch)
tree       0fc8b8525c5e1efdc8f125c7a050e505ea7e39c9
parent     fcfc9086815bf0d277ad47a90ee3fda4c37acca8 (diff)
x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]
This patch fixes the accuracy of the SSE4.2 libmvec atan2 function for the
following inputs, bringing the error below 4 ulps:

    {0x1.bcab29da0e947p-54,  0x1.bc41f4d2294b8p-54}   4.19888 ulps
    {0x1.b836ed678be29p-588, 0x1.b7be6f5a03a8cp-588}  4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S | 321
1 file changed, 173 insertions(+), 148 deletions(-)
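
For context, the ulp figures quoted above can be reproduced approximately with a
small stand-alone check. The sketch below is not the glibc test harness: it
hand-declares the 2-lane libmvec entry point _ZGVbN2vv_atan2 (name and signature
assumed from the x86_64 vector ABI; the SSE4.2 body patched here is only selected
by the ifunc dispatcher on hardware without wider vector variants), and it measures
error against a long double atan2l reference rather than the MPFR-based reference
used by glibc's own tests. Build with something like "gcc ulp-check.c -lmvec -lm".

/* Rough ulp check for the two inputs quoted in the commit message.
   Assumes a glibc with libmvec; build: gcc ulp-check.c -lmvec -lm  */
#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

/* Hand-declared 2-lane vector variant exported by libmvec; the SSE4.2
   implementation patched above is one of the bodies behind it.  */
extern __m128d _ZGVbN2vv_atan2 (__m128d y, __m128d x);

/* Error of GOT in ulps, measured against a long double reference.  */
static double
ulp_err (double got, double y, double x)
{
  long double ref = atan2l ((long double) y, (long double) x);
  double one_ulp = nextafter ((double) ref, INFINITY) - (double) ref;
  return (double) (fabsl ((long double) got - ref) / one_ulp);
}

int
main (void)
{
  double y[2] = { 0x1.bcab29da0e947p-54, 0x1.b836ed678be29p-588 };
  double x[2] = { 0x1.bc41f4d2294b8p-54, 0x1.b7be6f5a03a8cp-588 };
  double r[2];

  _mm_storeu_pd (r, _ZGVbN2vv_atan2 (_mm_loadu_pd (y), _mm_loadu_pd (x)));
  for (int i = 0; i < 2; i++)
    printf ("atan2 (%a, %a) = %a  (%.5f ulp)\n",
            y[i], x[i], r[i], ulp_err (r[i], y[i], x[i]));
  return 0;
}
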
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
index 4983051..138ff2f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
@@ -65,7 +65,7 @@
ENTRY(_ZGVbN2vv_atan2_sse4)
subq $88, %rsp
cfi_def_cfa_offset(96)
- movaps %xmm0, %xmm8
+ movaps %xmm1, %xmm11
/*
* #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
* Cannot be replaced by VQRCP(D, dR0, dB);
* Argument Absolute values
*/
- movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
+ movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
+ movaps %xmm0, %xmm10
movaps %xmm1, %xmm9
- movaps %xmm4, %xmm1
- andps %xmm8, %xmm4
- andps %xmm9, %xmm1
- movaps %xmm4, %xmm2
- cmpnltpd %xmm1, %xmm2
+ andps %xmm10, %xmm1
+ andps %xmm11, %xmm9
+ movaps %xmm1, %xmm4
+ cmpnltpd %xmm9, %xmm4
/* Argument signs */
- movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
- movaps %xmm2, %xmm0
- movups dPIO2+__svml_datan2_data_internal(%rip), %xmm5
- movaps %xmm3, %xmm7
- movaps %xmm3, %xmm6
+ movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
+ movaps %xmm4, %xmm0
+ movaps %xmm5, %xmm8
+ movaps %xmm5, %xmm7
/*
* 1) If y<x then a= y, b=x, PIO2=0
* 2) If y>x then a=-x, b=y, PIO2=Pi/2
*/
- orps %xmm1, %xmm3
- movaps %xmm2, %xmm10
- andps %xmm2, %xmm5
- andnps %xmm4, %xmm0
- andps %xmm2, %xmm3
- andnps %xmm1, %xmm10
- andps %xmm4, %xmm2
- orps %xmm3, %xmm0
- orps %xmm2, %xmm10
- divpd %xmm10, %xmm0
- movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
-
-/* if x<0, dPI = Pi, else dPI =0 */
- movaps %xmm9, %xmm3
+ orps %xmm9, %xmm5
+ andnps %xmm1, %xmm0
+ andps %xmm4, %xmm5
+ andps %xmm11, %xmm8
+ movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
+ orps %xmm5, %xmm0
+ movaps %xmm4, %xmm5
+ andps %xmm4, %xmm6
+ andnps %xmm9, %xmm5
+ andps %xmm1, %xmm4
+ orps %xmm4, %xmm5
+ andps %xmm10, %xmm7
+ divpd %xmm5, %xmm0
+ movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
+ xorl %edx, %edx
/* Check if y and x are on main path. */
- pshufd $221, %xmm1, %xmm12
- andps %xmm9, %xmm7
- psubd %xmm11, %xmm12
- andps %xmm8, %xmm6
- movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
- xorl %edx, %edx
- movups %xmm4, 16(%rsp)
+ pshufd $221, %xmm9, %xmm3
xorl %eax, %eax
- pshufd $221, %xmm4, %xmm14
- movdqa %xmm12, %xmm4
- pcmpgtd %xmm13, %xmm4
- pcmpeqd %xmm13, %xmm12
- por %xmm12, %xmm4
+ pshufd $221, %xmm1, %xmm13
+ psubd %xmm2, %xmm3
+ psubd %xmm2, %xmm13
+ movdqa %xmm3, %xmm4
+ movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
+ movdqa %xmm13, %xmm14
+ pcmpgtd %xmm12, %xmm4
+ pcmpeqd %xmm12, %xmm3
+ pcmpgtd %xmm12, %xmm14
+ pcmpeqd %xmm12, %xmm13
/* Polynomial. */
movaps %xmm0, %xmm12
+ por %xmm3, %xmm4
mulpd %xmm0, %xmm12
- cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
- psubd %xmm11, %xmm14
- movdqa %xmm14, %xmm15
- pcmpeqd %xmm13, %xmm14
- pcmpgtd %xmm13, %xmm15
- por %xmm14, %xmm15
- movaps %xmm12, %xmm14
- mulpd %xmm12, %xmm14
- por %xmm15, %xmm4
- movaps %xmm14, %xmm15
- mulpd %xmm14, %xmm15
- movmskps %xmm4, %ecx
- movups %xmm10, (%rsp)
- movups dA19+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm10
- movups dA18+__svml_datan2_data_internal(%rip), %xmm13
- movups dA17+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA15+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA14+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA13+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA11+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA10+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA09+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA07+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA06+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA05+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA03+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm12, %xmm10
- addpd dA02+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA01+__svml_datan2_data_internal(%rip), %xmm11
- addpd %xmm10, %xmm13
- mulpd %xmm11, %xmm12
- mulpd %xmm13, %xmm14
- movups dA16+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA12+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA08+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA04+__svml_datan2_data_internal(%rip), %xmm2
-
-/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
- mulpd %xmm2, %xmm15
- addpd %xmm12, %xmm15
- addpd %xmm14, %xmm15
+
+/* P = A19*R2 + A18 */
+ movups dA19+__svml_datan2_data_internal(%rip), %xmm15
+ movaps %xmm11, %xmm2
+ mulpd %xmm12, %xmm15
+ addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A17 */
+ mulpd %xmm12, %xmm15
+ addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A16 */
+ mulpd %xmm12, %xmm15
+ addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A15 */
+ mulpd %xmm12, %xmm15
+ addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A14 */
+ mulpd %xmm12, %xmm15
+ addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A13 */
+ mulpd %xmm12, %xmm15
+ addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A12 */
+ mulpd %xmm12, %xmm15
+ addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A11 */
+ mulpd %xmm12, %xmm15
+ addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A10 */
+ mulpd %xmm12, %xmm15
+ addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A09 */
+ mulpd %xmm12, %xmm15
+ addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A08 */
+ mulpd %xmm12, %xmm15
+ addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A07 */
+ mulpd %xmm12, %xmm15
+ addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A06 */
+ mulpd %xmm12, %xmm15
+ addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A05 */
+ mulpd %xmm12, %xmm15
+ addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A04 */
+ mulpd %xmm12, %xmm15
+ addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A03 */
+ mulpd %xmm12, %xmm15
+ addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A02 */
+ mulpd %xmm12, %xmm15
+ addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A01 */
+ mulpd %xmm12, %xmm15
+ addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 */
+ mulpd %xmm15, %xmm12
/*
* Reconstruction.
* dP=(R+R*dP) + dPIO2
*/
- mulpd %xmm0, %xmm15
- addpd %xmm15, %xmm0
- addpd %xmm5, %xmm0
- andps __svml_datan2_data_internal(%rip), %xmm3
+ mulpd %xmm0, %xmm12
+ addpd %xmm12, %xmm0
+
+/* if x<0, dPI = Pi, else dPI =0 */
+ movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
+ por %xmm13, %xmm14
+ cmplepd %xmm3, %xmm2
+ addpd %xmm6, %xmm0
+ andps __svml_datan2_data_internal(%rip), %xmm2
+ orps %xmm8, %xmm0
+ addpd %xmm2, %xmm0
+ por %xmm14, %xmm4
orps %xmm7, %xmm0
- addpd %xmm3, %xmm0
+ movmskps %xmm4, %ecx
/* Special branch for fast (vector) processing of zero arguments */
- movups 16(%rsp), %xmm11
- orps %xmm6, %xmm0
testb $3, %cl
/* Go to auxilary branch */
jne L(AUX_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
/* Return from auxilary branch
* for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
/* Restore registers
* and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
*/
L(SPECIAL_VALUES_BRANCH):
- movups %xmm8, 32(%rsp)
- movups %xmm9, 48(%rsp)
+ movups %xmm10, 32(%rsp)
+ movups %xmm11, 48(%rsp)
movups %xmm0, 64(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
*/
L(AUX_BRANCH):
-/* Check if at least on of Y or Y is zero: iAXAYZERO */
- movups dZERO+__svml_datan2_data_internal(%rip), %xmm2
-
/* Check if both X & Y are not NaNs: iXYnotNAN */
- movaps %xmm9, %xmm12
- movaps %xmm8, %xmm10
- cmpordpd %xmm9, %xmm12
- cmpordpd %xmm8, %xmm10
- cmpeqpd %xmm2, %xmm1
- cmpeqpd %xmm2, %xmm11
- andps %xmm10, %xmm12
- orps %xmm11, %xmm1
- pshufd $221, %xmm1, %xmm1
- pshufd $221, %xmm12, %xmm11
+ movaps %xmm11, %xmm13
+ movaps %xmm10, %xmm12
+ cmpordpd %xmm11, %xmm13
+ cmpordpd %xmm10, %xmm12
-/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
- pand %xmm11, %xmm1
-
-/* Exclude from previous callout mask zero (and not NaN) arguments */
- movdqa %xmm1, %xmm13
- pandn %xmm4, %xmm13
+/* Check if at least on of Y or Y is zero: iAXAYZERO */
+ cmpeqpd %xmm3, %xmm9
+ cmpeqpd %xmm3, %xmm1
/*
* Path for zero arguments (at least one of both)
* Check if both args are zeros (den. is zero)
*/
- movups (%rsp), %xmm4
- cmpeqpd %xmm2, %xmm4
+ cmpeqpd %xmm3, %xmm5
+ andps %xmm12, %xmm13
+ orps %xmm1, %xmm9
+ pshufd $221, %xmm9, %xmm1
+ pshufd $221, %xmm13, %xmm9
-/* Go to callout */
- movmskps %xmm13, %edx
+/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
+ pand %xmm9, %xmm1
+
+/* Exclude from previous callout mask zero (and not NaN) arguments */
+ movdqa %xmm1, %xmm14
+ pandn %xmm4, %xmm14
/* Set sPIO2 to zero if den. is zero */
- movaps %xmm4, %xmm15
- andps %xmm2, %xmm4
- andnps %xmm5, %xmm15
- andl $3, %edx
- orps %xmm4, %xmm15
- pshufd $221, %xmm9, %xmm5
- orps %xmm7, %xmm15
+ movaps %xmm5, %xmm4
+ andnps %xmm6, %xmm4
+ andps %xmm3, %xmm5
/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
- pshufd $221, %xmm2, %xmm7
- pcmpgtd %xmm5, %xmm7
- pshufd $80, %xmm7, %xmm14
- andps %xmm3, %xmm14
- addpd %xmm14, %xmm15
+ pshufd $221, %xmm3, %xmm3
+ orps %xmm5, %xmm4
+ pshufd $221, %xmm11, %xmm5
+ orps %xmm8, %xmm4
+ pcmpgtd %xmm5, %xmm3
+ pshufd $80, %xmm3, %xmm6
+ andps %xmm2, %xmm6
+ addpd %xmm6, %xmm4
+
+/* Go to callout */
+ movmskps %xmm14, %edx
/* Merge results from main and spec path */
- pshufd $80, %xmm1, %xmm3
- orps %xmm6, %xmm15
- movdqa %xmm3, %xmm6
- andps %xmm3, %xmm15
- andnps %xmm0, %xmm6
- movaps %xmm6, %xmm0
- orps %xmm15, %xmm0
+ pshufd $80, %xmm1, %xmm2
+ orps %xmm7, %xmm4
+ movdqa %xmm2, %xmm7
+ andps %xmm2, %xmm4
+ andnps %xmm0, %xmm7
+ andl $3, %edx
+ movaps %xmm7, %xmm0
+ orps %xmm4, %xmm0
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
END(_ZGVbN2vv_atan2_sse4)
.section .rodata, "a"
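
For readers who do not want to trace the xmm register shuffling above: the visible
algorithmic change in this patch is that the degree-19 odd atan polynomial is now
evaluated with a single Horner chain in r2 = r*r, where the old code split it into
several interleaved partial chains; the reduction and reconstruction steps are
otherwise the same. The C sketch below mirrors that main path under stated
assumptions: the coefficients dA01..dA19 live in __svml_datan2_data_internal (not
shown in this hunk) and are therefore taken as a parameter, and the zero/NaN/
out-of-range inputs that go through the callout and auxiliary branches are omitted.

/* Structural sketch of the main path above, not a drop-in replacement:
   coefficient values and special-case handling are omitted.
   A[0] = dA01 ... A[18] = dA19.  */
#include <math.h>

static double
atan2_main_path (double y, double x, const double A[19])
{
  double ay = fabs (y), ax = fabs (x);
  double a, b, pio2;

  /* 1) If |y| <  |x| then a =  |y|, b = |x|, PIO2 = 0
     2) If |y| >= |x| then a = -|x|, b = |y|, PIO2 = Pi/2  */
  if (ay < ax)
    {
      a = ay;  b = ax;  pio2 = 0.0;
    }
  else
    {
      a = -ax; b = ay;  pio2 = M_PI_2;
    }

  double r = a / b;          /* reduced argument, |r| <= 1 */
  double r2 = r * r;

  /* Horner evaluation: P = ((A19*r2 + A18)*r2 + ...)*r2 + A01  */
  double p = A[18];
  for (int i = 17; i >= 0; i--)
    p = p * r2 + A[i];

  /* Reconstruction: atan(r) ~= r + r*(P*r2) (A00 == 1.0 is implicit),
     add PIO2, fold in Pi for x < 0, then apply the sign of y.  */
  double res = pio2 + (r + r * (p * r2));
  if (x < 0.0)
    res = M_PI - res;
  return copysign (res, y);
}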