aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoger Sayle <roger@nextmovesoftware.com>2022-03-17 21:56:32 +0000
committerRoger Sayle <roger@nextmovesoftware.com>2022-03-17 21:56:32 +0000
commitc482c28ba4c549006deb70dead90fe8ab34dcbcf (patch)
tree5b73315f9fd67649dd05da3b7fcc3b57ea8dd085
parent8db155ddf8cec9e31f0a4b8d80cc67db2c7a26f9 (diff)
downloadgcc-c482c28ba4c549006deb70dead90fe8ab34dcbcf.zip
gcc-c482c28ba4c549006deb70dead90fe8ab34dcbcf.tar.gz
gcc-c482c28ba4c549006deb70dead90fe8ab34dcbcf.tar.bz2
PR 90356: Use xor to load const_double 0.0 on SSE (always)
Implementations of the x87 floating point instruction set have always had some pretty strange characteristics. For example on the original Intel Pentium the FLDPI instruction (to load 3.14159... into a register) took 5 cycles, and the FLDZ instruction (to load 0.0) took 2 cycles, when a regular FLD (load from memory) took just 1 cycle!? Given that back then memory latencies were much lower (relatively) than they are today, these instructions were all but useless except when optimizing for size (impressively FLDZ/FLDPI require only two bytes). Such was the world back in 2006 when Uros Bizjak first added support for fldz https://gcc.gnu.org/pipermail/gcc-patches/2006-November/202589.html and then shortly after sensibly disabled them for !optimize_size with https://gcc.gnu.org/pipermail/gcc-patches/2006-November/204405.html Alas this vestigial logic still persists in the compiler today, so for example on x86_64 for the following function: double foo(double x) { return x + 0.0; } generates with -O2 foo: addsd .LC0(%rip), %xmm0 ret .LC0: .long 0 .long 0 preferring to read the constant 0.0 from memory [the constant pool], except when optimizing for size. With -Os we get: foo: xorps %xmm1, %xmm1 addsd %xmm1, %xmm0 ret Which is not only smaller (the two instructions require seven bytes vs. eight for the original addsd from mem, even without considering the constant pool) but is also faster on modern hardware. The latter code sequence is generated by both clang and msvc with -O2. Indeed Agner Fogg documents the set of floating point/SSE constants that it's cheaper to materialize than to load from memory. This patch shuffles the conditions on the i386 backend's *movtf_internal, *movdf_internal and *movsf_internal define_insns to untangle the newer TARGET_SSE_MATH clauses from the historical standard_80387_constant_p conditions. Amongst the benefits of this are that it improves the code generated for PR tree-optimization/90356 and resolves PR target/86722. 2022-03-17 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog PR target/86722 PR tree-optimization/90356 * config/i386/i386.md (*movtf_internal): Don't guard standard_sse_constant_p clause by optimize_function_for_size_p. (*movdf_internal): Likewise. (*movsf_internal): Likewise. gcc/testsuite/ChangeLog PR target/86722 PR tree-optimization/90356 * gcc.target/i386/pr86722.c: New test case. * gcc.target/i386/pr90356.c: New test case.
-rw-r--r--gcc/config/i386/i386.md21
-rw-r--r--gcc/testsuite/gcc.target/i386/pr86722.c11
-rw-r--r--gcc/testsuite/gcc.target/i386/pr90356.c9
3 files changed, 30 insertions, 11 deletions
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 46a2663..5b44c65 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3455,9 +3455,7 @@
&& !(MEM_P (operands[0]) && MEM_P (operands[1]))
&& (lra_in_progress || reload_completed
|| !CONST_DOUBLE_P (operands[1])
- || ((optimize_function_for_size_p (cfun)
- || (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC))
- && standard_sse_constant_p (operands[1], TFmode) == 1
+ || (standard_sse_constant_p (operands[1], TFmode) == 1
&& !memory_operand (operands[0], TFmode))
|| (!TARGET_MEMORY_MISMATCH_STALL
&& memory_operand (operands[0], TFmode)))"
@@ -3590,10 +3588,11 @@
|| !CONST_DOUBLE_P (operands[1])
|| ((optimize_function_for_size_p (cfun)
|| (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC))
- && ((IS_STACK_MODE (DFmode)
- && standard_80387_constant_p (operands[1]) > 0)
- || (TARGET_SSE2 && TARGET_SSE_MATH
- && standard_sse_constant_p (operands[1], DFmode) == 1))
+ && IS_STACK_MODE (DFmode)
+ && standard_80387_constant_p (operands[1]) > 0
+ && !memory_operand (operands[0], DFmode))
+ || (TARGET_SSE2 && TARGET_SSE_MATH
+ && standard_sse_constant_p (operands[1], DFmode) == 1
&& !memory_operand (operands[0], DFmode))
|| ((TARGET_64BIT || !TARGET_MEMORY_MISMATCH_STALL)
&& memory_operand (operands[0], DFmode))
@@ -3762,10 +3761,10 @@
|| !CONST_DOUBLE_P (operands[1])
|| ((optimize_function_for_size_p (cfun)
|| (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC))
- && ((IS_STACK_MODE (SFmode)
- && standard_80387_constant_p (operands[1]) > 0)
- || (TARGET_SSE && TARGET_SSE_MATH
- && standard_sse_constant_p (operands[1], SFmode) == 1)))
+ && IS_STACK_MODE (SFmode)
+ && standard_80387_constant_p (operands[1]) > 0)
+ || (TARGET_SSE && TARGET_SSE_MATH
+ && standard_sse_constant_p (operands[1], SFmode) == 1)
|| memory_operand (operands[0], SFmode)
|| !TARGET_HARD_SF_REGS)"
{
diff --git a/gcc/testsuite/gcc.target/i386/pr86722.c b/gcc/testsuite/gcc.target/i386/pr86722.c
new file mode 100644
index 0000000..1092c4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr86722.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse" } */
+
+void f(double*d,double*e){
+ for(;d<e;++d)
+ *d=(*d<.5)?.7:0;
+}
+
+/* { dg-final { scan-assembler-not "andnpd" } } */
+/* { dg-final { scan-assembler-not "orpd" } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/pr90356.c b/gcc/testsuite/gcc.target/i386/pr90356.c
new file mode 100644
index 0000000..6f852b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90356.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mfpmath=sse" } */
+float doit(float k){
+ float c[2]={0.0};
+ c[1]+=k;
+ return c[0]+c[1];
+}
+
+/* { dg-final { scan-assembler "pxor" } } */