diff options
author | Chung-Lin Tang <cltang@baylibre.com> | 2025-05-02 00:33:07 +0000 |
---|---|---|
committer | Sandra Loosemore <sloosemore@baylibre.com> | 2025-05-15 20:25:51 +0000 |
commit | a6682e7af891700ae67e7d0f5d3fcc462eb44609 (patch) | |
tree | 61fd0c61939ebbd3a5396fbd9b865f2f529abf61 | |
parent | 5fd60a678116773e99d5fd2d64a118f837e5d6f0 (diff) | |
download | gcc-a6682e7af891700ae67e7d0f5d3fcc462eb44609.zip gcc-a6682e7af891700ae67e7d0f5d3fcc462eb44609.tar.gz gcc-a6682e7af891700ae67e7d0f5d3fcc462eb44609.tar.bz2 |
OpenACC: array reductions bug fixes
This is a merge of the v4 to v5 diff patch from:
https://gcc.gnu.org/pipermail/gcc-patches/2025-March/679682.html
This patch fixes issues found for NVPTX sm_70 testing, and another issue
related to copying to reduction buffer for worker/vector mode.
gcc/ChangeLog:
* config/gcn/gcn-tree.cc (gcn_goacc_reduction_setup): Fix array case
copy source into reduction buffer.
* config/nvptx/nvptx.cc (nvptx_expand_shared_addr): Move default size
init setting place.
(enum nvptx_builtins): Add NVPTX_BUILTIN_BAR_WARPSYNC.
(nvptx_init_builtins): Add DEF() of nvptx_builtin_bar_warpsync.
(nvptx_expand_builtin): Expand NVPTX_BUILTIN_BAR_WARPSYNC.
(nvptx_goacc_reduction_setup): Fix array case copy source into reduction
buffer.
(nvptx_goacc_reduction_fini): Add bar.warpsync for at end of vector-mode
reductions for sm_70 and above.
libgomp/ChangeLog:
* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c: Adjust test.
* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c: Likewise.
6 files changed, 122 insertions, 9 deletions
diff --git a/gcc/config/gcn/gcn-tree.cc b/gcc/config/gcn/gcn-tree.cc index 87c4267..c3349d6 100644 --- a/gcc/config/gcn/gcn-tree.cc +++ b/gcc/config/gcn/gcn-tree.cc @@ -750,13 +750,14 @@ gcn_goacc_reduction_setup (gcall *call) tree offset = gimple_call_arg (call, 5); if (array_p) { + tree copy_src = !integer_zerop (ref_to_res) ? ref_to_res : array_addr; tree decl = gcn_goacc_get_worker_array_reduction_buffer (array_type, array_max_idx, &seq); tree ptr = make_ssa_name (TREE_TYPE (array_addr)); gimplify_assign (ptr, build_fold_addr_expr (decl), &seq); /* Store incoming value to worker reduction buffer. */ - oacc_build_array_copy (ptr, array_addr, array_max_idx, &seq); + oacc_build_array_copy (ptr, copy_src, array_max_idx, &seq); } else { diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index ba40a84..115d34f 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -6516,16 +6516,16 @@ nvptx_expand_shared_addr (tree exp, rtx target, if (TREE_CONSTANT (size_expr)) size = TREE_INT_CST_LOW (size_expr); + /* Default size for unknown size expression. */ + if (size == 0) + size = 256; + if (vector) { offload_attrs oa; populate_offload_attrs (&oa); - /* Default size for unknown size expression. */ - if (size == 0) - size = 256; - unsigned int psize = ROUND_UP (size + offset, align); unsigned int pnum = nvptx_mach_max_workers (); vector_red_partition = MAX (vector_red_partition, psize); @@ -6621,6 +6621,7 @@ enum nvptx_builtins NVPTX_BUILTIN_BAR_RED_AND, NVPTX_BUILTIN_BAR_RED_OR, NVPTX_BUILTIN_BAR_RED_POPC, + NVPTX_BUILTIN_BAR_WARPSYNC, NVPTX_BUILTIN_BREV, NVPTX_BUILTIN_BREVLL, NVPTX_BUILTIN_COND_UNI, @@ -6753,6 +6754,8 @@ nvptx_init_builtins (void) DEF (BAR_RED_POPC, "bar_red_popc", (UINT, UINT, UINT, UINT, UINT, NULL_TREE)); + DEF (BAR_WARPSYNC, "bar_warpsync", (VOID, VOID, NULL_TREE)); + DEF (BREV, "brev", (UINT, UINT, NULL_TREE)); DEF (BREVLL, "brevll", (LLUINT, LLUINT, NULL_TREE)); @@ -6803,6 +6806,10 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), case NVPTX_BUILTIN_BAR_RED_POPC: return nvptx_expand_bar_red (exp, target, mode, ignore); + case NVPTX_BUILTIN_BAR_WARPSYNC: + emit_insn (gen_nvptx_warpsync ()); + return NULL_RTX; + case NVPTX_BUILTIN_BREV: case NVPTX_BUILTIN_BREVLL: return nvptx_expand_brev (exp, target, mode, ignore); @@ -7774,11 +7781,11 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa) push_gimplify_context (true); + /* Copy the receiver object. */ + tree ref_to_res = gimple_call_arg (call, 1); + if (level != GOMP_DIM_GANG) { - /* Copy the receiver object. */ - tree ref_to_res = gimple_call_arg (call, 1); - if (!integer_zerop (ref_to_res) && !array_p) { ref_to_res = nvptx_adjust_reduction_type (ref_to_res, @@ -7798,13 +7805,14 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa) tree call, ptr; if (array_p) { + tree copy_src = !integer_zerop (ref_to_res) ? ref_to_res : array_addr; tree array_elem_type = TREE_TYPE (array_type); call = nvptx_get_shared_red_addr (array_elem_type, array_max_idx, offset, level == GOMP_DIM_VECTOR); ptr = make_ssa_name (TREE_TYPE (call)); gimplify_assign (ptr, call, &seq); oacc_build_array_copy (fold_convert (TREE_TYPE (array_addr), ptr), - array_addr, array_max_idx, &seq); + copy_src, array_max_idx, &seq); } else { @@ -8038,6 +8046,14 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) else r = nvptx_reduction_update (gimple_location (call), &gsi, accum, var, op, level); + + if (TARGET_SM70 && level == GOMP_DIM_VECTOR) + { + /* After SM70, with Independent Thread Scheduling introduced, + place a warpsync after vector-mode update of accum buffer. */ + tree fn = nvptx_builtin_decl (NVPTX_BUILTIN_BAR_WARPSYNC, true); + gimple_seq_add_stmt (&seq, gimple_build_call (fn, 0)); + } } } diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c index 43e139f..db8b374 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c @@ -24,6 +24,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel + #pragma acc loop gang reduction(+:a[1:2]) + ARRAY_BODY (a, 1, 2) + ARRAY_BODY (o, 1, 2) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a[3:2]) #pragma acc loop reduction(+:a[3:2]) ARRAY_BODY (a, 3, 2) @@ -32,6 +40,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a[3:2]) + #pragma acc loop worker reduction(+:a[3:2]) + ARRAY_BODY (a, 3, 2) + ARRAY_BODY (o, 3, 2) + for (int i = 0; i < 6; i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a) #pragma acc loop reduction(+:a[0:5]) ARRAY_BODY (a, 0, 5) @@ -40,6 +56,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a) + #pragma acc loop vector reduction(+:a[0:5]) + ARRAY_BODY (a, 0, 5) + ARRAY_BODY (o, 0, 5) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel #pragma acc loop reduction(+:a) ARRAY_BODY (a, 4, 1) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c index aeae2e0..0f023b7 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c @@ -30,6 +30,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel + #pragma acc loop gang reduction(+:a[one:2]) + ARRAY_BODY (a, one, 2) + ARRAY_BODY (o, one, 2) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a[three:2]) #pragma acc loop reduction(+:a[three:2]) ARRAY_BODY (a, three, 2) @@ -38,6 +46,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a[three:2]) + #pragma acc loop worker reduction(+:a[three:2]) + ARRAY_BODY (a, three, 2) + ARRAY_BODY (o, three, 2) + for (int i = 0; i < 6; i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a) #pragma acc loop reduction(+:a[zero:5]) ARRAY_BODY (a, zero, 5) @@ -46,6 +62,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a) + #pragma acc loop vector reduction(+:a[zero:5]) + ARRAY_BODY (a, zero, 5) + ARRAY_BODY (o, zero, 5) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel #pragma acc loop reduction(+:a) ARRAY_BODY (a, four, 1) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c index c095284..94dd4c4 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c @@ -31,6 +31,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel + #pragma acc loop gang reduction(+:a[one:two]) + ARRAY_BODY (a, one, two) + ARRAY_BODY (o, one, two) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a[three:two]) #pragma acc loop reduction(+:a[three:two]) ARRAY_BODY (a, three, two) @@ -39,6 +47,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a[three:two]) + #pragma acc loop worker reduction(+:a[three:two]) + ARRAY_BODY (a, three, two) + ARRAY_BODY (o, three, two) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a) #pragma acc loop reduction(+:a[zero:five]) ARRAY_BODY (a, zero, five) @@ -47,6 +63,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a) + #pragma acc loop vector reduction(+:a[zero:five]) + ARRAY_BODY (a, zero, five) + ARRAY_BODY (o, zero, five) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel #pragma acc loop reduction(+:a) ARRAY_BODY (a, four, one) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c index 4794350..56ae020 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c @@ -32,6 +32,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel + #pragma acc loop gang reduction(+:a[one:two]) + ARRAY_BODY (a, one, two) + ARRAY_BODY (o, one, two) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a[three:two]) #pragma acc loop reduction(+:a[three:two]) ARRAY_BODY (a, three, two) @@ -40,6 +48,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a[three:two]) + #pragma acc loop worker reduction(+:a[three:two]) + ARRAY_BODY (a, three, two) + ARRAY_BODY (o, three, two) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a) #pragma acc loop reduction(+:a[zero:five]) ARRAY_BODY (a, zero, five) @@ -48,6 +64,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a) + #pragma acc loop vector reduction(+:a[zero:five]) + ARRAY_BODY (a, zero, five) + ARRAY_BODY (o, zero, five) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel #pragma acc loop reduction(+:a) ARRAY_BODY (a, four, one) |