author    Kyrylo Tkachov <ktkachov@nvidia.com>  2024-08-02 06:21:16 -0700
committer Kyrylo Tkachov <ktkachov@nvidia.com>  2024-08-05 16:37:47 +0530
commit    44da85f4455ea11296667434172810ea76a62add (patch)
tree      f6ce6bc8350cb7e5114c48074b8756e0880e219c
parent    8e2c9360c2df4b16582d3b9eb34e8c448798a1f3 (diff)
tree-reassoc.cc: PR tree-optimization/116139 Don't assert when forming fully-pipelined FMAs on wide MULT targets

The code in get_reassociation_width that forms FMAs aggressively when they
are fully pipelined expects the FMUL reassociation width in the target to be
less than the width for FMAs.  This doesn't hold for all target tunings.
This code shouldn't ICE, just avoid forming these FMAs here.  This patch
does that.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

	PR tree-optimization/116139

gcc/ChangeLog:

	* tree-ssa-reassoc.cc (get_reassociation_width): Move width_mult
	<= width comparison to if condition rather than assert.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/pr116139.c: New test.
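To make the behavioural change concrete, here is a small standalone C sketch of the control flow around the removed assert. The function pick_width and its parameters are illustrative stand-ins, not GCC internals: when a tuning reports a wider FMUL reassociation width than the ADD/FMA width, the FMA-based width computation is now simply skipped instead of tripping gcc_checking_assert.

#include <stdio.h>

/* Standalone model of the changed guard in get_reassociation_width; the
   function name and parameters are made up for illustration only.  */
static int
pick_width (int width, int width_mult, int fully_pipelined_fma, int mult_num)
{
  /* Before the patch an assert required width_mult <= width here; after
     the patch the FMA-based narrowing is only attempted when that holds.  */
  if (width > 1 && mult_num && fully_pipelined_fma && width_mult <= width)
    return width_mult;  /* Stand-in for the FMA-aware width computation.  */
  return width;
}

int
main (void)
{
  /* Tuning where FMUL is wider than FMA/FADD: no assert, generic width kept.  */
  printf ("%d\n", pick_width (2, 3, 1, 4));  /* prints 2 */
  /* Tuning where FMUL is narrower: the FMA path is still taken.  */
  printf ("%d\n", pick_width (4, 2, 1, 4));  /* prints 2 */
  return 0;
}

The real function derives a potentially smaller width from FMA latencies; this sketch only models the guard that replaces the assert.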
 gcc/testsuite/gcc.target/aarch64/pr116139.c | 35 +++++++++++++++++++++++++++++++++++
 gcc/tree-ssa-reassoc.cc                     | 17 ++++++++---------
 2 files changed, 43 insertions(+), 9 deletions(-)
diff --git a/gcc/testsuite/gcc.target/aarch64/pr116139.c b/gcc/testsuite/gcc.target/aarch64/pr116139.c
new file mode 100644
index 0000000..78a2132
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr116139.c
@@ -0,0 +1,35 @@
+/* PR tree-optimization/116139 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast --param fully-pipelined-fma=1 -mcpu=neoverse-n3" } */
+
+#define LOOP_COUNT 800000000
+typedef double data_e;
+
+data_e
+foo (data_e in)
+{
+  data_e a1, a2, a3, a4;
+  data_e tmp, result = 0;
+  a1 = in + 0.1;
+  a2 = in * 0.1;
+  a3 = in + 0.01;
+  a4 = in * 0.59;
+
+  data_e result2 = 0;
+
+  for (int ic = 0; ic < LOOP_COUNT; ic++)
+    {
+      tmp = a1 + a2 * a2 + a3 * a3 + a4 * a4 ;
+      result += tmp - ic;
+      result2 = result2 / 2 - tmp;
+
+      a1 += 0.91;
+      a2 += 0.1;
+      a3 -= 0.01;
+      a4 -= 0.89;
+
+    }
+
+  return result + result2;
+}
+
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index d743522..70c810c 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -5509,16 +5509,15 @@ get_reassociation_width (vec<operand_entry *> *ops, int mult_num, tree lhs,
      , it is latency(MULT)*2 + latency(ADD)*2. Assuming latency(MULT) >=
      latency(ADD), the first variant is preferred.
 
-     Find out if we can get a smaller width considering FMA. */
-  if (width > 1 && mult_num && param_fully_pipelined_fma)
+     Find out if we can get a smaller width considering FMA.
+     Assume FMUL and FMA use the same units that can also do FADD.
+     For other scenarios, such as when FMUL and FADD are using separated units,
+     the following code may not apply. */
+
+  int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode);
+  if (width > 1 && mult_num && param_fully_pipelined_fma
+      && width_mult <= width)
     {
-      /* When param_fully_pipelined_fma is set, assume FMUL and FMA use the
-         same units that can also do FADD. For other scenarios, such as when
-         FMUL and FADD are using separated units, the following code may not
-         appy. */
-      int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode);
-      gcc_checking_assert (width_mult <= width);
-
       /* Latency of MULT_EXPRs. */
       int lat_mul
        = get_mult_latency_consider_fma (ops_num, mult_num, width_mult);