diff options
author | Jakub Jelinek <jakub@redhat.com> | 2020-09-26 10:10:09 +0200 |
---|---|---|
committer | Jakub Jelinek <jakub@redhat.com> | 2020-09-26 10:10:09 +0200 |
commit | a29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d (patch) | |
tree | 596cff33e64350adf7347f302f330b5a6585f0ad /gcc/omp-expand.c | |
parent | d00b1b023ecfc3ddc3fe952c0063dab7529d5f7a (diff) | |
download | gcc-a29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d.zip gcc-a29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d.tar.gz gcc-a29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d.tar.bz2 |
openmp: Improve #pragma omp simd vectorization
As mentioned earlier, the vectorizer punts on vectorization of loops with non-constant
steps. As for OpenMP loops it is by the language restriction always possible to compute
the number of loop iterations before the loop, this change helps those cases
by computing it and using an alternate IV that iterates from 0 to < niterations with
step of 1 next to the normal IV which will be just linear in that.
List of functions where we compared to current trunk vectorize some loops where we
previously didn't (for c-c++-common only listing the C function names, both C and C++
are affected though):
gcc/testsuite/gcc.dg/vect/vect-simd-17.c doit
gcc/testsuite/gcc.dg/vect/vect-simd-18.c foo
gcc/testsuite/gcc.dg/vect/vect-simd-19.c foo
gcc/testsuite/gcc.dg/vect/vect-simd-20.c foo
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_auto
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_guided32
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_runtime
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_static
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_static32
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_simd_normal
libgomp/testsuite/libgomp.c-c++-common/for-2.c f5_simd_normal
libgomp/testsuite/libgomp.c-c++-common/for-2.c f6_simd_normal
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_ds_ds128_normal
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_ds_normal
libgomp/testsuite/libgomp.c-c++-common/for-4.c f3_taskloop_simd_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_t_simd_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttds_ds128_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttds_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f5_t_simd_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f6_t_simd_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tds_ds128_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tds_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_ds_ds128_normal
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_ds_normal
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tds_ds128_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tds_normal._omp_fn.0
2020-09-26 Jakub Jelinek <jakub@redhat.com>
* omp-expand.c (expand_omp_simd): Help vectorizer for the collapse == 1
and non-composite collapse > 1 case with non-constant innermost loop
step by precomputing number of iterations before loop and using an
alternate IV from 0 to number of iterations - 1 with step of 1.
* gcc.dg/vect/vect-simd-17.c: Expect 11 or more vectorized loops.
* gcc.dg/vect/vect-simd-18.c: New test.
* gcc.dg/vect/vect-simd-19.c: New test.
* gcc.dg/vect/vect-simd-20.c: New test.
Diffstat (limited to 'gcc/omp-expand.c')
-rw-r--r-- | gcc/omp-expand.c | 110 |
1 files changed, 106 insertions, 4 deletions
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index 9160022..99cb4f9 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -6452,6 +6452,56 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) } else expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1)); + tree altv = NULL_TREE, altn2 = NULL_TREE; + if (fd->collapse == 1 + && !broken_loop + && TREE_CODE (fd->loops[0].step) != INTEGER_CST) + { + /* The vectorizer currently punts on loops with non-constant steps + for the main IV (can't compute number of iterations and gives up + because of that). As for OpenMP loops it is always possible to + compute the number of iterations upfront, use an alternate IV + as the loop iterator: + altn2 = n1 < n2 ? (n2 - n1 + step - 1) / step : 0; + for (i = n1, altv = 0; altv < altn2; altv++, i += step) */ + altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v))); + expand_omp_build_assign (&gsi, altv, build_zero_cst (TREE_TYPE (altv))); + tree itype = TREE_TYPE (fd->loop.v); + if (POINTER_TYPE_P (itype)) + itype = signed_type_for (itype); + t = build_int_cst (itype, (fd->loop.cond_code == LT_EXPR ? -1 : 1)); + t = fold_build2 (PLUS_EXPR, itype, + fold_convert (itype, fd->loop.step), t); + t = fold_build2 (PLUS_EXPR, itype, t, fold_convert (itype, n2)); + t = fold_build2 (MINUS_EXPR, itype, t, + fold_convert (itype, fd->loop.v)); + if (TYPE_UNSIGNED (itype) && fd->loop.cond_code == GT_EXPR) + t = fold_build2 (TRUNC_DIV_EXPR, itype, + fold_build1 (NEGATE_EXPR, itype, t), + fold_build1 (NEGATE_EXPR, itype, + fold_convert (itype, fd->loop.step))); + else + t = fold_build2 (TRUNC_DIV_EXPR, itype, t, + fold_convert (itype, fd->loop.step)); + t = fold_convert (TREE_TYPE (altv), t); + altn2 = create_tmp_var (TREE_TYPE (altv)); + expand_omp_build_assign (&gsi, altn2, t); + tree t2 = fold_convert (TREE_TYPE (fd->loop.v), n2); + t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE, + true, GSI_SAME_STMT); + t2 = fold_build2 (fd->loop.cond_code, boolean_type_node, fd->loop.v, t2); + gassign *g = gimple_build_assign (altn2, COND_EXPR, t2, altn2, + build_zero_cst (TREE_TYPE (altv))); + gsi_insert_before (&gsi, g, GSI_SAME_STMT); + } + else if (fd->collapse > 1 + && !broken_loop + && !gimple_omp_for_combined_into_p (fd->for_stmt) + && TREE_CODE (fd->loops[fd->collapse - 1].step) != INTEGER_CST) + { + altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v))); + altn2 = create_tmp_var (TREE_TYPE (altv)); + } if (cond_var) { if (POINTER_TYPE_P (type) @@ -6486,6 +6536,12 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) } else if (TREE_CODE (n2) != INTEGER_CST) expand_omp_build_assign (&gsi, fd->loop.v, build_one_cst (type)); + if (altv) + { + t = fold_build2 (PLUS_EXPR, TREE_TYPE (altv), altv, + build_one_cst (TREE_TYPE (altv))); + expand_omp_build_assign (&gsi, altv, t); + } if (fd->collapse > 1) { @@ -6525,9 +6581,11 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) /* Emit the condition in L1_BB. */ gsi = gsi_start_bb (l1_bb); - if (fd->collapse > 1 - && !gimple_omp_for_combined_into_p (fd->for_stmt) - && !broken_loop) + if (altv) + t = build2 (LT_EXPR, boolean_type_node, altv, altn2); + else if (fd->collapse > 1 + && !gimple_omp_for_combined_into_p (fd->for_stmt) + && !broken_loop) { i = fd->collapse - 1; tree itype = TREE_TYPE (fd->loops[i].v); @@ -6704,7 +6762,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) expand_omp_build_assign (&gsi, fd->loops[i + 1].v, t); if (fd->loops[i + 1].m2) { - if (i + 2 == fd->collapse && n2var) + if (i + 2 == fd->collapse && (n2var || altv)) { gcc_assert (n2v == NULL_TREE); n2v = create_tmp_var (TREE_TYPE (fd->loops[i + 1].v)); @@ -6761,6 +6819,50 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t); expand_omp_build_assign (&gsi, n2var, t); } + if (i + 2 == fd->collapse && altv) + { + /* The vectorizer currently punts on loops with non-constant + steps for the main IV (can't compute number of iterations + and gives up because of that). As for OpenMP loops it is + always possible to compute the number of iterations upfront, + use an alternate IV as the loop iterator. */ + expand_omp_build_assign (&gsi, altv, + build_zero_cst (TREE_TYPE (altv))); + tree itype = TREE_TYPE (fd->loops[i + 1].v); + if (POINTER_TYPE_P (itype)) + itype = signed_type_for (itype); + t = build_int_cst (itype, (fd->loops[i + 1].cond_code == LT_EXPR + ? -1 : 1)); + t = fold_build2 (PLUS_EXPR, itype, + fold_convert (itype, fd->loops[i + 1].step), t); + t = fold_build2 (PLUS_EXPR, itype, t, + fold_convert (itype, + fd->loops[i + 1].m2 + ? n2v : fd->loops[i + 1].n2)); + t = fold_build2 (MINUS_EXPR, itype, t, + fold_convert (itype, fd->loops[i + 1].v)); + tree step = fold_convert (itype, fd->loops[i + 1].step); + if (TYPE_UNSIGNED (itype) + && fd->loops[i + 1].cond_code == GT_EXPR) + t = fold_build2 (TRUNC_DIV_EXPR, itype, + fold_build1 (NEGATE_EXPR, itype, t), + fold_build1 (NEGATE_EXPR, itype, step)); + else + t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step); + t = fold_convert (TREE_TYPE (altv), t); + expand_omp_build_assign (&gsi, altn2, t); + tree t2 = fold_convert (TREE_TYPE (fd->loops[i + 1].v), + fd->loops[i + 1].m2 + ? n2v : fd->loops[i + 1].n2); + t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE, + true, GSI_SAME_STMT); + t2 = fold_build2 (fd->loops[i + 1].cond_code, boolean_type_node, + fd->loops[i + 1].v, t2); + gassign *g + = gimple_build_assign (altn2, COND_EXPR, t2, altn2, + build_zero_cst (TREE_TYPE (altv))); + gsi_insert_before (&gsi, g, GSI_SAME_STMT); + } n2v = nextn2v; make_edge (init_bb, last_bb, EDGE_FALLTHRU); |