aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGergö Barany <gergo@codesourcery.com>2019-01-21 12:50:14 -0800
committerThomas Schwinge <thomas@codesourcery.com>2020-03-03 12:16:57 +0100
commit66d2ab316aa1302b21cd84f1d53286da7c6bd4b4 (patch)
tree13e8cbcd6174eee537574e9de52535ef6832007d
parente2fde505cdca35e4d67160fe84bdb857efe97258 (diff)
downloadgcc-66d2ab316aa1302b21cd84f1d53286da7c6bd4b4.zip
gcc-66d2ab316aa1302b21cd84f1d53286da7c6bd4b4.tar.gz
gcc-66d2ab316aa1302b21cd84f1d53286da7c6bd4b4.tar.bz2
Launch kernels asynchronously in OpenACC kernels regions
Kernels regions are decomposed into one or more smaller regions that are to be executed in sequence. With this patch, all of these regions are launched asynchronously, and a wait directive is added after them. This means that the host only waits once for the kernels to complete, not once per kernel. If the original kernels region was marked async, that asynchronous behavior is preserved, and no wait is added. gcc/ * omp-oacc-kernels.c (add_async_clauses_and_wait): New function... (decompose_kernels_region_body): ... called from here. gcc/testsuite/ * c-c++-common/goacc/kernels-conversion.c: Test automatically generated async clauses. * gfortran.dg/goacc/kernels-conversion.f95: Likewise. (cherry picked from openacc-gcc-9-branch commit 14a66effcef4707c2ba6592814405f652f58329e)
-rw-r--r--gcc/ChangeLog.omp5
-rw-r--r--gcc/omp-oacc-kernels.c56
-rw-r--r--gcc/testsuite/ChangeLog.omp6
-rw-r--r--gcc/testsuite/c-c++-common/goacc/kernels-conversion.c5
-rw-r--r--gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f955
5 files changed, 74 insertions, 3 deletions
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index df1e08f..01c5f61 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,8 @@
+2019-01-21 Gergö Barany <gergo@codesourcery.com>
+
+ * omp-oacc-kernels.c (add_async_clauses_and_wait): New function...
+ (decompose_kernels_region_body): ... called from here.
+
2019-01-23 Gergö Barany <gergo@codesourcery.com>
* omp-oacc-kernels.c (add_parent_or_loop_num_clause): New function.
diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
index c334502..f8553f7 100644
--- a/gcc/omp-oacc-kernels.c
+++ b/gcc/omp-oacc-kernels.c
@@ -66,7 +66,13 @@ along with GCC; see the file COPYING3. If not see
gang-parallelizable loop inside an if statement is "gang-serialized" by
the transformation.
The transformation visits loops inside such new gang-single-regions and
- removes and warns about any gang annotations. */
+ removes and warns about any gang annotations.
+ - In order to make the host wait only once for the whole region instead
+ of once per kernel launch, the new parallel and serial regions are
+ annotated async. Unless the original kernels region was marked async,
+ the entire region ends with a wait construct. If the original kernels
+ region was marked async, the generated async statements use the async
+ queue the kernels region was annotated with (possibly implicitly). */
/* Helper function for decompose_kernels_region_body. If STMT contains a
"top-level" OMP_FOR statement, returns a pointer to that statement;
@@ -671,6 +677,38 @@ maybe_build_inner_data_region (location_t loc, gimple *body,
return body;
}
+/* Helper function of decompose_kernels_region_body. The statements in
+ REGION_BODY are expected to be decomposed parallel regions; add an
+ "async" clause to each. Also add a "wait" pragma at the end of the
+ sequence. */
+
+static void
+add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
+{
+ tree default_async_queue
+ = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+ for (gimple_stmt_iterator gsi = gsi_start (*region_body);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ tree target_clauses = gimple_omp_target_clauses (stmt);
+ tree new_async_clause = build_omp_clause (loc, OMP_CLAUSE_ASYNC);
+ OMP_CLAUSE_OPERAND (new_async_clause, 0) = default_async_queue;
+ OMP_CLAUSE_CHAIN (new_async_clause) = target_clauses;
+ target_clauses = new_async_clause;
+ gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
+ target_clauses);
+ }
+ /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0). */
+ tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+ tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+ gimple *wait_call = gimple_build_call (wait_fn, 2,
+ sync_arg, integer_zero_node);
+ gimple_set_location (wait_call, loc);
+ gimple_seq_add_stmt (region_body, wait_call);
+}
+
/* Auxiliary analysis of the body of a kernels region, to determine for each
OpenACC loop whether it is control-dependent (i.e., not necessarily
executed every time the kernels region is entered) or not.
@@ -885,10 +923,12 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
except that the num_gangs, num_workers, and vector_length clauses will
only be added to loop regions. The other regions are "gang-single" and
get an explicit num_gangs(1) clause. So separate out the num_gangs,
- num_workers, and vector_length clauses here. */
+ num_workers, and vector_length clauses here.
+ Also check for the presence of an async clause but do not remove it
+ from the kernels clauses. */
tree num_gangs_clause = NULL, num_workers_clause = NULL,
vector_length_clause = NULL;
- tree prev_clause = NULL, next_clause = NULL;
+ tree prev_clause = NULL, next_clause = NULL, async_clause = NULL;
tree parallel_clauses = kernels_clauses;
for (tree c = parallel_clauses; c; c = next_clause)
{
@@ -922,6 +962,8 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
}
else
prev_clause = c;
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+ async_clause = c;
}
gimple *kernels_body = gimple_omp_body (kernels_region);
@@ -1085,6 +1127,14 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
gimple_seq_add_stmt (&region_body, single_region);
}
+ /* We want to launch these kernels asynchronously. If the original
+ kernels region had an async clause, this is done automatically because
+ that async clause was copied to the individual regions we created.
+ Otherwise, add an async clause to each newly created region, as well as
+ a wait directive at the end. */
+ if (async_clause == NULL)
+ add_async_clauses_and_wait (loc, &region_body);
+
tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
gimple *body = gimple_build_bind (kernels_locals, region_body,
make_node (BLOCK));
diff --git a/gcc/testsuite/ChangeLog.omp b/gcc/testsuite/ChangeLog.omp
index 5d64bd2..41df80a 100644
--- a/gcc/testsuite/ChangeLog.omp
+++ b/gcc/testsuite/ChangeLog.omp
@@ -1,3 +1,9 @@
+2019-01-21 Gergö Barany <gergo@codesourcery.com>
+
+ * c-c++-common/goacc/kernels-conversion.c: Test automatically generated
+ async clauses.
+ * gfortran.dg/goacc/kernels-conversion.f95: Likewise.
+
2019-01-23 Gergö Barany <gergo@codesourcery.com>
* c-c++-common/goacc/kernels-conversion.c: Add test for conditionally
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
index ed4d642..3e52ec4 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
@@ -49,5 +49,10 @@ main (void)
/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3 "convert_oacc_kernels" } } */
/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2 "convert_oacc_kernels" } } */
+/* Each of the parallel regions is async, and there is a final call to
+ __builtin_GOACC_wait. */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels.* async\(-1\)" 5 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "convert_oacc_kernels" } } */
+
/* Check that the original kernels region is removed. */
/* { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
index f89e46b..559916c 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
@@ -47,5 +47,10 @@ end program main
! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3 "convert_oacc_kernels" } }
! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2 "convert_oacc_kernels" } }
+! Each of the parallel regions is async, and there is a final call to
+! __builtin_GOACC_wait.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels.* async\(-1\)" 5 "convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "convert_oacc_kernels" } }
+
! Check that the original kernels region is removed.
! { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } }