-rw-r--r--  gcc/doc/invoke.texi                 6
-rw-r--r--  gcc/params.opt                      7
-rw-r--r--  gcc/testsuite/gcc.dg/pr110279-2.c  41
-rw-r--r--  gcc/tree-ssa-reassoc.cc           150
4 files changed, 177 insertions, 27 deletions
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index f89f926..c671bec 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16583,6 +16583,12 @@ Maximum number of basic blocks for VRP to use a basic cache vector.
@item avoid-fma-max-bits
Maximum number of bits for which we avoid creating FMAs.
+@item fully-pipelined-fma
+Whether the target fully pipelines FMA instructions. If non-zero,
+reassociation considers the benefit of parallelizing FMA's multiplication
+part and addition part, assuming FMUL and FMA use the same units that can
+also do FADD.
+
@item sms-loop-average-count-threshold
A threshold on the average loop count considered by the swing modulo scheduler.
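For illustration, the new knob is passed like any other --param; an invocation matching the options used by the new test below (the source file name is a placeholder) would be:

    gcc -Ofast --param tree-reassoc-width=4 --param fully-pipelined-fma=1 foo.c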
diff --git a/gcc/params.opt b/gcc/params.opt
index f187659..63a4c30 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -134,6 +134,13 @@ Maximal estimated growth of function body caused by early inlining of single cal
Common Joined UInteger Var(param_fsm_scale_path_stmts) Init(2) IntegerRange(1, 10) Param Optimization
Scale factor to apply to the number of statements in a threading path crossing a loop backedge when comparing to max-jump-thread-duplication-stmts.
+-param=fully-pipelined-fma=
+Common Joined UInteger Var(param_fully_pipelined_fma) Init(0) IntegerRange(0, 1) Param Optimization
+Whether the target fully pipelines FMA instructions. If non-zero,
+reassociation considers the benefit of parallelizing FMA's multiplication
+part and addition part, assuming FMUL and FMA use the same units that can
+also do FADD.
+
-param=gcse-after-reload-critical-fraction=
Common Joined UInteger Var(param_gcse_after_reload_critical_fraction) Init(10) Param Optimization
The threshold ratio of critical edges execution count that permit performing redundancy elimination after reload.
diff --git a/gcc/testsuite/gcc.dg/pr110279-2.c b/gcc/testsuite/gcc.dg/pr110279-2.c
new file mode 100644
index 0000000..0304a77
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr110279-2.c
@@ -0,0 +1,41 @@
+/* PR tree-optimization/110279 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast --param tree-reassoc-width=4 --param fully-pipelined-fma=1 -fdump-tree-reassoc2-details -fdump-tree-optimized" } */
+/* { dg-additional-options "-march=armv8.2-a" { target aarch64-*-* } } */
+
+#define LOOP_COUNT 800000000
+typedef double data_e;
+
+#include <stdio.h>
+
+__attribute__ ((noinline)) data_e
+foo (data_e in)
+{
+ data_e a1, a2, a3, a4;
+ data_e tmp, result = 0;
+ a1 = in + 0.1;
+ a2 = in * 0.1;
+ a3 = in + 0.01;
+ a4 = in * 0.59;
+
+ data_e result2 = 0;
+
+ for (int ic = 0; ic < LOOP_COUNT; ic++)
+ {
+ /* Test that a complete FMA chain with length=4 is not broken. */
+ tmp = a1 + a2 * a2 + a3 * a3 + a4 * a4;
+ result += tmp - ic;
+ result2 = result2 / 2 - tmp;
+
+ a1 += 0.91;
+ a2 += 0.1;
+ a3 -= 0.01;
+ a4 -= 0.89;
+
+ }
+
+ return result + result2;
+}
+
+/* { dg-final { scan-tree-dump-not "was chosen for reassociation" "reassoc2"} } */
+/* { dg-final { scan-tree-dump-times {\.FMA } 3 "optimized"} } */
\ No newline at end of file
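A sketch of the GIMPLE the test expects for tmp (SSA names are illustrative, not taken from an actual dump):

    _1 = .FMA (a2, a2, a1);
    _2 = .FMA (a3, a3, _1);
    _3 = .FMA (a4, a4, _2);

This accounts for the three .FMA instances counted in the optimized dump, while the scan-tree-dump-not pattern verifies that reassoc2 never reports choosing a wider width that would break the chain.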
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 07fc8e2..cdef9f7 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -5430,13 +5430,35 @@ get_required_cycles (int ops_num, int cpu_width)
return res;
}
+/* Given that the target fully pipelines FMA instructions, return the latency
+ of MULT_EXPRs that can't be hidden by the FMAs. WIDTH is the number of
+ pipes. */
+
+static inline int
+get_mult_latency_consider_fma (int ops_num, int mult_num, int width)
+{
+ gcc_checking_assert (mult_num && mult_num <= ops_num);
+
+ /* For each partition, if mult_num == ops_num, the critical path carries
+ latency(MULT)*2. E.g.:
+
+ A * B + C * D
+ =>
+ _1 = A * B;
+ _2 = .FMA (C, D, _1);
+
+ Otherwise only latency(MULT)*1 is exposed, in the first FMA. */
+ return CEIL (ops_num, width) == CEIL (mult_num, width) ? 2 : 1;
+}
+
/* Returns an optimal number of registers to use for computation of
given statements.
- LHS is the result ssa name of OPS. */
+ LHS is the result ssa name of OPS. MULT_NUM is the number of
+ sub-expressions in OPS that are MULT_EXPRs, when OPS are PLUS_EXPRs
+ or MINUS_EXPRs. */
static int
-get_reassociation_width (vec<operand_entry *> *ops, tree lhs,
+get_reassociation_width (vec<operand_entry *> *ops, int mult_num, tree lhs,
enum tree_code opc, machine_mode mode)
{
int param_width = param_tree_reassoc_width;
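A minimal standalone sketch of the partition comparison above (not GCC code; the helper name is invented for illustration):

    #include <stdio.h>

    /* Count how many latency(MULT) units stay exposed on the critical
       path, mirroring get_mult_latency_consider_fma.  */
    static int
    exposed_mult_latency (int ops_num, int mult_num, int width)
    {
      /* CEIL (a, b) in GCC expands to (a + b - 1) / b.  */
      int all_parts = (ops_num + width - 1) / width;
      int mult_parts = (mult_num + width - 1) / width;
      return all_parts == mult_parts ? 2 : 1;
    }

    int
    main (void)
    {
      /* A * B + C * D with width 2: both CEILs are 1, so two MULT
         latencies sit on the critical path.  Prints 2.  */
      printf ("%d\n", exposed_mult_latency (2, 2, 2));
      /* E + A * B + C * D with width 2: CEIL (3, 2) != CEIL (2, 2),
         so the first FMA hides all but one multiply.  Prints 1.  */
      printf ("%d\n", exposed_mult_latency (3, 2, 2));
      return 0;
    }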
@@ -5462,16 +5484,68 @@ get_reassociation_width (vec<operand_entry *> *ops, tree lhs,
so we can perform a binary search for the minimal width that still
results in the optimal cycle count. */
width_min = 1;
- while (width > width_min)
+
+ /* If the target fully pipelines FMA instructions, the multiply part can start
+ already if its operands are ready. Assuming symmetric pipes are used for
+ FMUL/FADD/FMA, then for a sequence of FMA like:
+
+ _8 = .FMA (_2, _3, _1);
+ _9 = .FMA (_5, _4, _8);
+ _10 = .FMA (_7, _6, _9);
+
+ , if width=1, the latency is latency(MULT) + latency(ADD)*3.
+ While with width=2:
+
+ _8 = _4 * _5;
+ _9 = .FMA (_2, _3, _1);
+ _10 = .FMA (_6, _7, _8);
+ _11 = _9 + _10;
+
+ , it is latency(MULT)*2 + latency(ADD)*2. Assuming latency(MULT) >=
+ latency(ADD), the first variant is preferred.
+
+ Find out if we can get a smaller width considering FMA. */
+ if (width > 1 && mult_num && param_fully_pipelined_fma)
{
- int width_mid = (width + width_min) / 2;
+ /* When param_fully_pipelined_fma is set, assume FMUL and FMA use the
+ same units that can also do FADD. For other scenarios, such as when
+ FMUL and FADD use separate units, the following code may not
+ apply. */
+ int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode);
+ gcc_checking_assert (width_mult <= width);
+
+ /* Latency of MULT_EXPRs. */
+ int lat_mul
+ = get_mult_latency_consider_fma (ops_num, mult_num, width_mult);
+
+ /* The quick binary search may not apply here, so start from 1. */
+ for (int i = 1; i < width_mult; i++)
+ {
+ int lat_mul_new
+ = get_mult_latency_consider_fma (ops_num, mult_num, i);
+ int lat_add_new = get_required_cycles (ops_num, i);
- if (get_required_cycles (ops_num, width_mid) == cycles_best)
- width = width_mid;
- else if (width_min < width_mid)
- width_min = width_mid;
- else
- break;
+ /* Assume latency(MULT) >= latency(ADD). */
+ if (lat_mul - lat_mul_new >= lat_add_new - cycles_best)
+ {
+ width = i;
+ break;
+ }
+ }
+ }
+ else
+ {
+ while (width > width_min)
+ {
+ int width_mid = (width + width_min) / 2;
+
+ if (get_required_cycles (ops_num, width_mid) == cycles_best)
+ width = width_mid;
+ else if (width_min < width_mid)
+ width_min = width_mid;
+ else
+ break;
+ }
}
/* If there's loop dependent FMA result, return width=2 to avoid it. This is
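To make the trade-off in the comment above concrete, take hypothetical latencies latency(MULT) = 4 and latency(ADD) = 2 (illustrative numbers, not from any real target). For the three-FMA chain shown there:

    width=1:  latency(MULT) + latency(ADD)*3   = 4 + 6 = 10 cycles
    width=2:  latency(MULT)*2 + latency(ADD)*2 = 8 + 4 = 12 cycles

The narrower width wins, and that is what the new loop encodes: a smaller width i is chosen only when the multiply latency saved (lat_mul - lat_mul_new) covers the extra addition cycles (lat_add_new - cycles_best).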
@@ -6841,8 +6915,10 @@ transform_stmt_to_multiply (gimple_stmt_iterator *gsi, gimple *stmt,
Rearrange ops to -> e + a * b + c * d generates:
_4 = .FMA (c_7(D), d_8(D), _3);
- _11 = .FMA (a_5(D), b_6(D), _4); */
-static bool
+ _11 = .FMA (a_5(D), b_6(D), _4);
+
+ Return the number of MULT_EXPRs in the chain. */
+static int
rank_ops_for_fma (vec<operand_entry *> *ops)
{
operand_entry *oe;
@@ -6856,9 +6932,26 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
if (TREE_CODE (oe->op) == SSA_NAME)
{
gimple *def_stmt = SSA_NAME_DEF_STMT (oe->op);
- if (is_gimple_assign (def_stmt)
- && gimple_assign_rhs_code (def_stmt) == MULT_EXPR)
- ops_mult.safe_push (oe);
+ if (is_gimple_assign (def_stmt))
+ {
+ if (gimple_assign_rhs_code (def_stmt) == MULT_EXPR)
+ ops_mult.safe_push (oe);
+ /* A negate on the multiplication leads to FNMA. */
+ else if (gimple_assign_rhs_code (def_stmt) == NEGATE_EXPR
+ && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME)
+ {
+ gimple *neg_def_stmt
+ = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (def_stmt));
+ if (is_gimple_assign (neg_def_stmt)
+ && gimple_bb (neg_def_stmt) == gimple_bb (def_stmt)
+ && gimple_assign_rhs_code (neg_def_stmt) == MULT_EXPR)
+ ops_mult.safe_push (oe);
+ else
+ ops_others.safe_push (oe);
+ }
+ else
+ ops_others.safe_push (oe);
+ }
else
ops_others.safe_push (oe);
}
@@ -6874,7 +6967,8 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
Putting ops that are not defined by a mult in front can generate more FMAs.
2. If all ops are defined by a mult, we don't need to rearrange them. */
- if (ops_mult.length () >= 2 && ops_mult.length () != ops_length)
+ unsigned mult_num = ops_mult.length ();
+ if (mult_num >= 2 && mult_num != ops_length)
{
/* Put no-mult ops and mult ops alternately at the end of the
queue, which is conducive to generating more FMA and reducing the
@@ -6890,9 +6984,8 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
if (opindex > 0)
opindex--;
}
- return true;
}
- return false;
+ return mult_num;
}
/* Reassociate expressions in basic block BB and its post-dominator as
children.
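The NEGATE_EXPR case added to rank_ops_for_fma above means a chain like the following hand-written sketch (not from the testsuite) now also counts toward mult_num:

    double
    f (double a, double b, double c, double d, double e)
    {
      double t = -(c * d);   /* NEGATE_EXPR whose rhs1 is a MULT_EXPR */
      return e + a * b + t;  /* t ranks as a mult and can fuse, e.g. to .FNMA */
    }

provided the negate and the multiply sit in the same basic block, as the gimple_bb check requires.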
@@ -7057,8 +7150,8 @@ reassociate_bb (basic_block bb)
{
machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
int ops_num = ops.length ();
- int width;
- bool has_fma = false;
+ int width = 0;
+ int mult_num = 0;
/* For binary bit operations, if there are at least 3
operands and the last operand in OPS is a constant,
@@ -7081,16 +7174,17 @@ reassociate_bb (basic_block bb)
opt_type)
&& (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR))
{
- has_fma = rank_ops_for_fma (&ops);
+ mult_num = rank_ops_for_fma (&ops);
}
/* Only rewrite the expression tree into parallel form in the
last reassoc pass, to avoid useless back-and-forth work with
the initial linearization. */
+ bool has_fma = mult_num >= 2 && mult_num != ops_num;
if (!reassoc_insert_powi_p
&& ops.length () > 3
- && (width
- = get_reassociation_width (&ops, lhs, rhs_code, mode))
+ && (width = get_reassociation_width (&ops, mult_num, lhs,
+ rhs_code, mode))
> 1)
{
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -7111,10 +7205,12 @@ reassociate_bb (basic_block bb)
if (len >= 3
&& (!has_fma
/* width > 1 means ranking ops results in better
- parallelism. */
- || get_reassociation_width (&ops, lhs, rhs_code,
- mode)
- > 1))
+ parallelism. Check current value to avoid
+ calling get_reassociation_width again. */
+ || (width != 1
+ && get_reassociation_width (
+ &ops, mult_num, lhs, rhs_code, mode)
+ > 1)))
swap_ops_for_binary_stmt (ops, len - 3);
new_lhs = rewrite_expr_tree (stmt, rhs_code, 0, ops,