aboutsummaryrefslogtreecommitdiff
path: root/gcc/omp-expand.c
diff options
context:
space:
mode:
authorJakub Jelinek <jakub@redhat.com>2020-09-26 10:10:09 +0200
committerJakub Jelinek <jakub@redhat.com>2020-09-26 10:10:09 +0200
commita29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d (patch)
tree596cff33e64350adf7347f302f330b5a6585f0ad /gcc/omp-expand.c
parentd00b1b023ecfc3ddc3fe952c0063dab7529d5f7a (diff)
downloadgcc-a29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d.zip
gcc-a29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d.tar.gz
gcc-a29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d.tar.bz2
openmp: Improve #pragma omp simd vectorization
As mentioned earlier, the vectorizer punts on vectorization of loops with non-constant steps. As for OpenMP loops it is by the language restriction always possible to compute the number of loop iterations before the loop, this change helps those cases by computing it and using an alternate IV that iterates from 0 to < niterations with step of 1 next to the normal IV which will be just linear in that. List of functions where we compared to current trunk vectorize some loops where we previously didn't (for c-c++-common only listing the C function names, both C and C++ are affected though): gcc/testsuite/gcc.dg/vect/vect-simd-17.c doit gcc/testsuite/gcc.dg/vect/vect-simd-18.c foo gcc/testsuite/gcc.dg/vect/vect-simd-19.c foo gcc/testsuite/gcc.dg/vect/vect-simd-20.c foo libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_auto libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_guided32 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_runtime libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_static libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_static32 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_simd_normal libgomp/testsuite/libgomp.c-c++-common/for-2.c f5_simd_normal libgomp/testsuite/libgomp.c-c++-common/for-2.c f6_simd_normal libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_ds_ds128_normal libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_ds_normal libgomp/testsuite/libgomp.c-c++-common/for-4.c f3_taskloop_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttds_ds128_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttds_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f5_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f6_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tds_ds128_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tds_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_ds_ds128_normal libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_ds_normal libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tds_ds128_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tds_normal._omp_fn.0 2020-09-26 Jakub Jelinek <jakub@redhat.com> * omp-expand.c (expand_omp_simd): Help vectorizer for the collapse == 1 and non-composite collapse > 1 case with non-constant innermost loop step by precomputing number of iterations before loop and using an alternate IV from 0 to number of iterations - 1 with step of 1. * gcc.dg/vect/vect-simd-17.c: Expect 11 or more vectorized loops. * gcc.dg/vect/vect-simd-18.c: New test. * gcc.dg/vect/vect-simd-19.c: New test. * gcc.dg/vect/vect-simd-20.c: New test.
Diffstat (limited to 'gcc/omp-expand.c')
-rw-r--r--gcc/omp-expand.c110
1 files changed, 106 insertions, 4 deletions
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 9160022..99cb4f9 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -6452,6 +6452,56 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
}
else
expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1));
+ tree altv = NULL_TREE, altn2 = NULL_TREE;
+ if (fd->collapse == 1
+ && !broken_loop
+ && TREE_CODE (fd->loops[0].step) != INTEGER_CST)
+ {
+ /* The vectorizer currently punts on loops with non-constant steps
+ for the main IV (can't compute number of iterations and gives up
+ because of that). As for OpenMP loops it is always possible to
+ compute the number of iterations upfront, use an alternate IV
+ as the loop iterator:
+ altn2 = n1 < n2 ? (n2 - n1 + step - 1) / step : 0;
+ for (i = n1, altv = 0; altv < altn2; altv++, i += step) */
+ altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v)));
+ expand_omp_build_assign (&gsi, altv, build_zero_cst (TREE_TYPE (altv)));
+ tree itype = TREE_TYPE (fd->loop.v);
+ if (POINTER_TYPE_P (itype))
+ itype = signed_type_for (itype);
+ t = build_int_cst (itype, (fd->loop.cond_code == LT_EXPR ? -1 : 1));
+ t = fold_build2 (PLUS_EXPR, itype,
+ fold_convert (itype, fd->loop.step), t);
+ t = fold_build2 (PLUS_EXPR, itype, t, fold_convert (itype, n2));
+ t = fold_build2 (MINUS_EXPR, itype, t,
+ fold_convert (itype, fd->loop.v));
+ if (TYPE_UNSIGNED (itype) && fd->loop.cond_code == GT_EXPR)
+ t = fold_build2 (TRUNC_DIV_EXPR, itype,
+ fold_build1 (NEGATE_EXPR, itype, t),
+ fold_build1 (NEGATE_EXPR, itype,
+ fold_convert (itype, fd->loop.step)));
+ else
+ t = fold_build2 (TRUNC_DIV_EXPR, itype, t,
+ fold_convert (itype, fd->loop.step));
+ t = fold_convert (TREE_TYPE (altv), t);
+ altn2 = create_tmp_var (TREE_TYPE (altv));
+ expand_omp_build_assign (&gsi, altn2, t);
+ tree t2 = fold_convert (TREE_TYPE (fd->loop.v), n2);
+ t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+ t2 = fold_build2 (fd->loop.cond_code, boolean_type_node, fd->loop.v, t2);
+ gassign *g = gimple_build_assign (altn2, COND_EXPR, t2, altn2,
+ build_zero_cst (TREE_TYPE (altv)));
+ gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+ }
+ else if (fd->collapse > 1
+ && !broken_loop
+ && !gimple_omp_for_combined_into_p (fd->for_stmt)
+ && TREE_CODE (fd->loops[fd->collapse - 1].step) != INTEGER_CST)
+ {
+ altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v)));
+ altn2 = create_tmp_var (TREE_TYPE (altv));
+ }
if (cond_var)
{
if (POINTER_TYPE_P (type)
@@ -6486,6 +6536,12 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
}
else if (TREE_CODE (n2) != INTEGER_CST)
expand_omp_build_assign (&gsi, fd->loop.v, build_one_cst (type));
+ if (altv)
+ {
+ t = fold_build2 (PLUS_EXPR, TREE_TYPE (altv), altv,
+ build_one_cst (TREE_TYPE (altv)));
+ expand_omp_build_assign (&gsi, altv, t);
+ }
if (fd->collapse > 1)
{
@@ -6525,9 +6581,11 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
/* Emit the condition in L1_BB. */
gsi = gsi_start_bb (l1_bb);
- if (fd->collapse > 1
- && !gimple_omp_for_combined_into_p (fd->for_stmt)
- && !broken_loop)
+ if (altv)
+ t = build2 (LT_EXPR, boolean_type_node, altv, altn2);
+ else if (fd->collapse > 1
+ && !gimple_omp_for_combined_into_p (fd->for_stmt)
+ && !broken_loop)
{
i = fd->collapse - 1;
tree itype = TREE_TYPE (fd->loops[i].v);
@@ -6704,7 +6762,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
expand_omp_build_assign (&gsi, fd->loops[i + 1].v, t);
if (fd->loops[i + 1].m2)
{
- if (i + 2 == fd->collapse && n2var)
+ if (i + 2 == fd->collapse && (n2var || altv))
{
gcc_assert (n2v == NULL_TREE);
n2v = create_tmp_var (TREE_TYPE (fd->loops[i + 1].v));
@@ -6761,6 +6819,50 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t);
expand_omp_build_assign (&gsi, n2var, t);
}
+ if (i + 2 == fd->collapse && altv)
+ {
+ /* The vectorizer currently punts on loops with non-constant
+ steps for the main IV (can't compute number of iterations
+ and gives up because of that). As for OpenMP loops it is
+ always possible to compute the number of iterations upfront,
+ use an alternate IV as the loop iterator. */
+ expand_omp_build_assign (&gsi, altv,
+ build_zero_cst (TREE_TYPE (altv)));
+ tree itype = TREE_TYPE (fd->loops[i + 1].v);
+ if (POINTER_TYPE_P (itype))
+ itype = signed_type_for (itype);
+ t = build_int_cst (itype, (fd->loops[i + 1].cond_code == LT_EXPR
+ ? -1 : 1));
+ t = fold_build2 (PLUS_EXPR, itype,
+ fold_convert (itype, fd->loops[i + 1].step), t);
+ t = fold_build2 (PLUS_EXPR, itype, t,
+ fold_convert (itype,
+ fd->loops[i + 1].m2
+ ? n2v : fd->loops[i + 1].n2));
+ t = fold_build2 (MINUS_EXPR, itype, t,
+ fold_convert (itype, fd->loops[i + 1].v));
+ tree step = fold_convert (itype, fd->loops[i + 1].step);
+ if (TYPE_UNSIGNED (itype)
+ && fd->loops[i + 1].cond_code == GT_EXPR)
+ t = fold_build2 (TRUNC_DIV_EXPR, itype,
+ fold_build1 (NEGATE_EXPR, itype, t),
+ fold_build1 (NEGATE_EXPR, itype, step));
+ else
+ t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+ t = fold_convert (TREE_TYPE (altv), t);
+ expand_omp_build_assign (&gsi, altn2, t);
+ tree t2 = fold_convert (TREE_TYPE (fd->loops[i + 1].v),
+ fd->loops[i + 1].m2
+ ? n2v : fd->loops[i + 1].n2);
+ t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+ t2 = fold_build2 (fd->loops[i + 1].cond_code, boolean_type_node,
+ fd->loops[i + 1].v, t2);
+ gassign *g
+ = gimple_build_assign (altn2, COND_EXPR, t2, altn2,
+ build_zero_cst (TREE_TYPE (altv)));
+ gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+ }
n2v = nextn2v;
make_edge (init_bb, last_bb, EDGE_FALLTHRU);