aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-vectorizer.h
diff options
context:
space:
mode:
author Tamar Christina <tamar.christina@arm.com> 2021-11-10 15:59:26 +0000
committer Tamar Christina <tamar.christina@arm.com> 2021-11-10 16:03:18 +0000
commit 86ffc845b2d0bff59832dcf3cf6518f1358e30ac (patch)
tree 91fa2fc59feb6dbd0f95dfebe316595ce4d6f7b8 /gcc/tree-vectorizer.h
parent 8ed62c929c7c44627f41627e085e15d77b2e6ed4 (diff)
download gcc-86ffc845b2d0bff59832dcf3cf6518f1358e30ac.zip
gcc-86ffc845b2d0bff59832dcf3cf6518f1358e30ac.tar.gz
gcc-86ffc845b2d0bff59832dcf3cf6518f1358e30ac.tar.bz2
AArch64: do not keep negated mask and inverse mask live at the same time
The following example:

void f11(double * restrict z, double * restrict w, double * restrict x,
         double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? w[i] : y[i];
    }
}

Generates currently:

        ptrue   p2.b, all
        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p2/z, z0.d, #0.0
        bic     p3.b, p2/z, p0.b, p1.b
        ld1d    z1.d, p3/z, [x3, x2, lsl 3]

and after the previous patches generates:

        ptrue   p3.b, all
        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p0/z, z0.d, #0.0
        fcmgt   p2.d, p3/z, z0.d, #0.0
        not     p1.b, p0/z, p1.b
        ld1d    z1.d, p1/z, [x3, x2, lsl 3]

where a duplicate comparison is performed for w[i] > 0.

This is because in the vectorizer we're emitting a comparison for both a and ~a
where we just need to emit one of them and invert the other. After this patch
we generate:

        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p1.d, p0/z, z0.d, #0.0
        mov     p2.b, p1.b
        not     p1.b, p0/z, p1.b
        ld1d    z1.d, p1/z, [x3, x2, lsl 3]

In order to perform the check I have to fully expand the NOT stmts when
recording them as the SSA names for the top level expressions differ but
their arguments don't. e.g. in _31 = ~_34 the value of _34 differs but not
the operands in _34.

But we only do this when the operation is an ordered one because mixing
ordered and unordered expressions can lead to de-optimized code.

Note: This patch series is working incrementally towards generating the most
      efficient code for this and other loops in small steps. The mov is
      created by postreload when it does a late CSE.

gcc/ChangeLog:

        * tree-vectorizer.h (struct scalar_cond_masked_key): Add inverted_p.
        (default_hash_traits<scalar_cond_masked_key>): Likewise.
        * tree-vect-stmts.c (vectorizable_condition): Check if inverse of mask
        is live.
        * tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
        Register mask inverses.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve/pred-not-gen-1.c: Update testcase.
        * gcc.target/aarch64/sve/pred-not-gen-2.c: Update testcase.
        * gcc.target/aarch64/sve/pred-not-gen-3.c: Update testcase.
        * gcc.target/aarch64/sve/pred-not-gen-4.c: Update testcase.
Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r-- gcc/tree-vectorizer.h 10
1 file changed, 7 insertions, 3 deletions
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f8f3064..bd6f334 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -266,6 +266,7 @@ struct scalar_cond_masked_key
void get_cond_ops_from_tree (tree);
unsigned ncopies;
+ bool inverted_p;
tree_code code;
tree op0;
tree op1;
@@ -285,6 +286,7 @@ struct default_hash_traits<scalar_cond_masked_key>
inchash::add_expr (v.op0, h, 0);
inchash::add_expr (v.op1, h, 0);
h.add_int (v.ncopies);
+ h.add_flag (v.inverted_p);
return h.end ();
}
@@ -292,9 +294,10 @@ struct default_hash_traits<scalar_cond_masked_key>
equal (value_type existing, value_type candidate)
{
return (existing.ncopies == candidate.ncopies
- && existing.code == candidate.code
- && operand_equal_p (existing.op0, candidate.op0, 0)
- && operand_equal_p (existing.op1, candidate.op1, 0));
+ && existing.code == candidate.code
+ && existing.inverted_p == candidate.inverted_p
+ && operand_equal_p (existing.op0, candidate.op0, 0)
+ && operand_equal_p (existing.op1, candidate.op1, 0));
}
static const bool empty_zero_p = true;
@@ -303,6 +306,7 @@ struct default_hash_traits<scalar_cond_masked_key>
mark_empty (value_type &v)
{
v.ncopies = 0;
+ v.inverted_p = false;
}
static inline bool