author     Richard Biener <rguenther@suse.de>    2024-10-08 14:28:16 +0200
committer  Richard Biener <rguenth@gcc.gnu.org>  2024-10-09 09:54:42 +0200
commit     dc90578f0b3b766303eef6f1acce45d603dee2c6 (patch)
tree       9f664f0970de261528af2f0bdc3e268d73d109e9
parent     5977b746db3925aaba37722f5312419d5f2968a5 (diff)
download   gcc-dc90578f0b3b766303eef6f1acce45d603dee2c6.zip
           gcc-dc90578f0b3b766303eef6f1acce45d603dee2c6.tar.gz
           gcc-dc90578f0b3b766303eef6f1acce45d603dee2c6.tar.bz2
tree-optimization/116575 - handle SLP of permuted masked loads
The following handles SLP discovery of permuted masked loads, which was
prohibited (because wrongly handled) for PR114375.  In particular, with
single-lane SLP at the moment all masked group loads appear permuted and
we fail to use masked load-lanes as well.  The following addresses part
of the issue, starting with doing correct basic discovery - namely
discover an unpermuted mask load followed by a permute node.  Note that
groups with gaps do not support masking yet (and didn't before, without
SLP, IIRC).  There are still issues with how we represent masked
load/store-lanes, I think, but I first have to get my hands on a good
testcase.

	PR tree-optimization/116575
	PR tree-optimization/114375

	* tree-vect-slp.cc (vect_build_slp_tree_2): Do not reject
	permuted mask loads without gaps but instead discover a node
	for the full unpermuted load and permute that with a VEC_PERM
	node.

	* gcc.dg/vect/vect-pr114375.c: Expect vectorization now with
	avx2.
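As a concrete illustration (a hypothetical reduced example, not a
testcase from this commit), a conditional access whose group elements
are consumed in a different order than they are laid out in memory
gives rise to the kind of permuted masked load this change now
discovers:

/* Hypothetical example: after if-conversion the loads of b[2*i] and
   b[2*i+1] become masked loads of a two-element group, and the group
   is consumed with its lanes swapped, i.e. with a load permutation.  */
int a[512], b[1024], c[512];

void
foo (void)
{
  for (int i = 0; i < 512; ++i)
    if (c[i])
      a[i] = b[2 * i + 1] - b[2 * i];
}

Previously SLP discovery gave up on such loads; with this change it
builds an unpermuted masked load of the whole group and layers a
VEC_PERM node on top.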
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-pr114375.c  |  2
-rw-r--r--  gcc/tree-vect-slp.cc                       | 58
2 files changed, 56 insertions, 4 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/vect-pr114375.c b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c
index 1e1cb01..61e9bf1 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-pr114375.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c
@@ -30,6 +30,7 @@ int main()
 {
   check_vect ();
+#pragma GCC novector
   for (int i = 0; i < 512; ++i)
     a[i] = (i >> 1) & 1;
@@ -42,3 +43,4 @@ int main()
   return 0;
 }
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target avx2 } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 44ce9db..9bb765e 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -2029,16 +2029,66 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                         || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
                         || gimple_call_internal_p (stmt,
                                                    IFN_MASK_LEN_GATHER_LOAD));
-          load_permutation.release ();
-          /* We cannot handle permuted masked loads, see PR114375.  */
+          bool has_gaps = false;
+          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+            for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
+                 si; si = DR_GROUP_NEXT_ELEMENT (si))
+              if (DR_GROUP_GAP (si) != 1)
+                has_gaps = true;
+          /* We cannot handle permuted masked loads directly, see
+             PR114375.  We cannot handle strided masked loads or masked
+             loads with gaps.  */
+          if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
+               && (DR_GROUP_GAP (first_stmt_info) != 0 || has_gaps))
+              || STMT_VINFO_STRIDED_P (stmt_info))
+            {
+              load_permutation.release ();
+              matches[0] = false;
+              return NULL;
+            }
+
+          /* For permuted masked loads do an unpermuted masked load of
+             the whole group followed by a SLP permute node.  */
           if (any_permute
               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
-                  && DR_GROUP_SIZE (first_stmt_info) != group_size)
-              || STMT_VINFO_STRIDED_P (stmt_info))
+                  && DR_GROUP_SIZE (first_stmt_info) != group_size))
             {
+              /* Discover the whole unpermuted load.  */
+              vec<stmt_vec_info> stmts2;
+              stmts2.create (DR_GROUP_SIZE (first_stmt_info));
+              stmts2.quick_grow_cleared (DR_GROUP_SIZE (first_stmt_info));
+              unsigned i = 0;
+              for (stmt_vec_info si = first_stmt_info;
+                   si; si = DR_GROUP_NEXT_ELEMENT (si))
+                stmts2[i++] = si;
+              bool *matches2
+                = XALLOCAVEC (bool, DR_GROUP_SIZE (first_stmt_info));
+              slp_tree unperm_load
+                = vect_build_slp_tree (vinfo, stmts2,
+                                       DR_GROUP_SIZE (first_stmt_info),
+                                       &this_max_nunits, matches2, limit,
+                                       &this_tree_size, bst_map);
+              /* When we are able to do the full masked load emit that
+                 followed by 'node' being the desired final permutation.  */
+              if (unperm_load)
+                {
+                  lane_permutation_t lperm;
+                  lperm.create (group_size);
+                  for (unsigned j = 0; j < load_permutation.length (); ++j)
+                    lperm.quick_push
+                      (std::make_pair (0, load_permutation[j]));
+                  SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+                  SLP_TREE_CHILDREN (node).safe_push (unperm_load);
+                  SLP_TREE_LANE_PERMUTATION (node) = lperm;
+                  load_permutation.release ();
+                  return node;
+                }
+              stmts2.release ();
+              load_permutation.release ();
               matches[0] = false;
               return NULL;
             }
+          load_permutation.release ();
         }
       else
         {
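To make the representation concrete: the VEC_PERM node built above
records, for each output lane, a (child, lane) pair, and its single
child is the unpermuted masked load.  The following standalone sketch
(plain C, deliberately not the GCC internal types, so all names in it
are illustrative) models those semantics:

#include <stdio.h>

/* Illustrative model of an SLP lane permutation: each output lane of a
   VEC_PERM node selects one lane of one child node.  With a single
   child -- the unpermuted masked load -- the child index is always 0,
   mirroring the std::make_pair (0, load_permutation[j]) above.  */
struct lane_sel { unsigned child; unsigned lane; };

static void
apply_perm (const int children[][4], const struct lane_sel *lperm,
            unsigned nlanes, int *out)
{
  for (unsigned i = 0; i < nlanes; ++i)
    out[i] = children[lperm[i].child][lperm[i].lane];
}

int
main (void)
{
  /* One child: the full unpermuted load of a four-lane group...  */
  const int children[1][4] = { { 10, 11, 12, 13 } };
  /* ...consumed by the permute node in lane order 1, 0, 3, 2.  */
  const struct lane_sel lperm[4]
    = { { 0, 1 }, { 0, 0 }, { 0, 3 }, { 0, 2 } };
  int out[4];
  apply_perm (children, lperm, 4, out);
  for (unsigned i = 0; i < 4; ++i)
    printf ("%d ", out[i]);   /* prints: 11 10 13 12 */
  printf ("\n");
  return 0;
}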