author     Nathan Sidwell <nathan@codesourcery.com>   2015-10-28 03:00:50 +0000
committer  Nathan Sidwell <nathan@gcc.gnu.org>        2015-10-28 03:00:50 +0000
commit     e4834818d22f5c663b24940fd5b04da266e11fe8 (patch)
tree       cf0f5b4ce7a31924dfd29c43d294f7b3153f4496 /gcc/omp-low.c
parent     a1c1908bbd8b298e601f85d76ffa19a487c54ea2 (diff)
omp-low.c (struct omp_context): Remove gwv_below, gwv_this fields.
	* omp-low.c (struct omp_context): Remove gwv_below, gwv_this fields.
	(is_oacc_parallel, is_oacc_kernels): New.
	(enclosing_target_ctx): May return NULL.
	(ctx_in_oacc_kernels_region): New.
	(check_oacc_kernel_gwv): New.
	(oacc_loop_or_target_p): Delete.
	(scan_omp_for): Don't calculate gwv mask.  Check parallel clause
	operands.  Strip reductions from kernels.
	(scan_omp_target): Don't calculate gwv mask.
	(lower_oacc_head_mark, lower_oacc_loop_marker,
	lower_oacc_head_tail): New.
	(struct oacc_collapse): New.
	(expand_oacc_collapse_init, expand_oacc_collapse_vars): New.
	(expand_omp_for_static_nochunk, expand_omp_for_static_chunk):
	Remove OpenACC handling.
	(expand_oacc_for): New.
	(expand_omp_for): Call expand_oacc_for.
	(lower_omp_for): Call lower_oacc_head_tail.

From-SVN: r229472
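For context, a minimal sketch of the kind of OpenACC source this machinery processes (hypothetical user code, not part of the patch):

    /* Hypothetical OpenACC input, for illustration only.  Each explicit
       gang/worker/vector clause selects a partitioning dimension; the
       new lowering encodes these as OLF_* flags on head/tail markers.  */
    #pragma acc parallel loop gang vector
    for (int i = 0; i < n; i++)
      a[i] = b[i] + c[i];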
Diffstat (limited to 'gcc/omp-low.c')
-rw-r--r--	gcc/omp-low.c	993
1 file changed, 887 insertions, 106 deletions
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 7547835..c441166 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -200,14 +200,6 @@ struct omp_context
/* True if this construct can be cancelled. */
bool cancellable;
-
- /* For OpenACC loops, a mask of gang, worker and vector used at
- levels below this one. */
- int gwv_below;
- /* For OpenACC loops, a mask of gang, worker and vector used at
- this level and above. For parallel and kernels clauses, a mask
- indicating which of num_gangs/num_workers/num_vectors was used. */
- int gwv_this;
};
/* A structure holding the elements of:
@@ -299,6 +291,28 @@ static gphi *find_phi_with_arg_on_edge (tree, edge);
*handled_ops_p = false; \
break;
+/* Return true if CTX corresponds to an oacc parallel region. */
+
+static bool
+is_oacc_parallel (omp_context *ctx)
+{
+ enum gimple_code outer_type = gimple_code (ctx->stmt);
+ return ((outer_type == GIMPLE_OMP_TARGET)
+ && (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_PARALLEL));
+}
+
+/* Return true if CTX corresponds to an oacc kernels region. */
+
+static bool
+is_oacc_kernels (omp_context *ctx)
+{
+ enum gimple_code outer_type = gimple_code (ctx->stmt);
+ return ((outer_type == GIMPLE_OMP_TARGET)
+ && (gimple_omp_target_kind (ctx->stmt)
+ == GF_OMP_TARGET_KIND_OACC_KERNELS));
+}
+
/* Helper function to get the name of the array containing the partial
reductions for OpenACC reductions. */
static const char *
@@ -2933,28 +2947,95 @@ finish_taskreg_scan (omp_context *ctx)
}
}
+/* Find the enclosing offload context. */
static omp_context *
enclosing_target_ctx (omp_context *ctx)
{
- while (ctx != NULL
- && gimple_code (ctx->stmt) != GIMPLE_OMP_TARGET)
- ctx = ctx->outer;
- gcc_assert (ctx != NULL);
+ for (; ctx; ctx = ctx->outer)
+ if (gimple_code (ctx->stmt) == GIMPLE_OMP_TARGET)
+ break;
+
return ctx;
}
+/* Return true if CTX is part of an oacc kernels region.  */
+
static bool
-oacc_loop_or_target_p (gimple *stmt)
+ctx_in_oacc_kernels_region (omp_context *ctx)
{
- enum gimple_code outer_type = gimple_code (stmt);
- return ((outer_type == GIMPLE_OMP_TARGET
- && ((gimple_omp_target_kind (stmt)
- == GF_OMP_TARGET_KIND_OACC_PARALLEL)
- || (gimple_omp_target_kind (stmt)
- == GF_OMP_TARGET_KIND_OACC_KERNELS)))
- || (outer_type == GIMPLE_OMP_FOR
- && gimple_omp_for_kind (stmt) == GF_OMP_FOR_KIND_OACC_LOOP));
+  for (; ctx != NULL; ctx = ctx->outer)
+ {
+ gimple *stmt = ctx->stmt;
+ if (gimple_code (stmt) == GIMPLE_OMP_TARGET
+ && gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
+ return true;
+ }
+
+ return false;
+}
+
+/* Check the parallelism clauses inside a kernels region.
+ Until kernels handling moves to use the same loop indirection
+ scheme as parallel, we need to do this checking early. */
+
+static unsigned
+check_oacc_kernel_gwv (gomp_for *stmt, omp_context *ctx)
+{
+ bool checking = true;
+ unsigned outer_mask = 0;
+ unsigned this_mask = 0;
+ bool has_seq = false, has_auto = false;
+
+ if (ctx->outer)
+ outer_mask = check_oacc_kernel_gwv (NULL, ctx->outer);
+ if (!stmt)
+ {
+ checking = false;
+ if (gimple_code (ctx->stmt) != GIMPLE_OMP_FOR)
+ return outer_mask;
+ stmt = as_a <gomp_for *> (ctx->stmt);
+ }
+
+ for (tree c = gimple_omp_for_clauses (stmt); c; c = OMP_CLAUSE_CHAIN (c))
+ {
+ switch (OMP_CLAUSE_CODE (c))
+ {
+ case OMP_CLAUSE_GANG:
+ this_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
+ break;
+ case OMP_CLAUSE_WORKER:
+ this_mask |= GOMP_DIM_MASK (GOMP_DIM_WORKER);
+ break;
+ case OMP_CLAUSE_VECTOR:
+ this_mask |= GOMP_DIM_MASK (GOMP_DIM_VECTOR);
+ break;
+ case OMP_CLAUSE_SEQ:
+ has_seq = true;
+ break;
+ case OMP_CLAUSE_AUTO:
+ has_auto = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (checking)
+ {
+ if (has_seq && (this_mask || has_auto))
+ error_at (gimple_location (stmt), "%<seq%> overrides other"
+ " OpenACC loop specifiers");
+ else if (has_auto && this_mask)
+ error_at (gimple_location (stmt), "%<auto%> conflicts with other"
+ " OpenACC loop specifiers");
+
+ if (this_mask & outer_mask)
+ error_at (gimple_location (stmt), "inner loop uses same"
+ " OpenACC parallelism as containing loop");
+ }
+
+ return outer_mask | this_mask;
}
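The recursion above walks outward through the enclosing contexts, OR-ing each level's mask into outer_mask, so a dimension reused by an inner loop is caught. A sketch of kernels-region code it would reject (hypothetical example; the diagnostic text is the one emitted above):

    #pragma acc kernels
    {
    #pragma acc loop gang
      for (int i = 0; i < n; i++)
    #pragma acc loop gang  /* "inner loop uses same OpenACC parallelism
                              as containing loop" */
        for (int j = 0; j < m; j++)
          a[i][j] = 0.0;
    }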
/* Scan a GIMPLE_OMP_FOR. */
@@ -2962,52 +3043,62 @@ oacc_loop_or_target_p (gimple *stmt)
static void
scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
{
- enum gimple_code outer_type = GIMPLE_ERROR_MARK;
omp_context *ctx;
size_t i;
tree clauses = gimple_omp_for_clauses (stmt);
- if (outer_ctx)
- outer_type = gimple_code (outer_ctx->stmt);
-
ctx = new_omp_context (stmt, outer_ctx);
if (is_gimple_omp_oacc (stmt))
{
- if (outer_ctx && outer_type == GIMPLE_OMP_FOR)
- ctx->gwv_this = outer_ctx->gwv_this;
- for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ omp_context *tgt = enclosing_target_ctx (outer_ctx);
+
+ if (!tgt || is_oacc_parallel (tgt))
+ for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ {
+ char const *check = NULL;
+
+ switch (OMP_CLAUSE_CODE (c))
+ {
+ case OMP_CLAUSE_GANG:
+ check = "gang";
+ break;
+
+ case OMP_CLAUSE_WORKER:
+ check = "worker";
+ break;
+
+ case OMP_CLAUSE_VECTOR:
+ check = "vector";
+ break;
+
+ default:
+ break;
+ }
+
+ if (check && OMP_CLAUSE_OPERAND (c, 0))
+ error_at (gimple_location (stmt),
+ "argument not permitted on %qs clause in"
+ " OpenACC %<parallel%>", check);
+ }
+
+ if (tgt && is_oacc_kernels (tgt))
{
- int val;
- if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_GANG)
- val = MASK_GANG;
- else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_WORKER)
- val = MASK_WORKER;
- else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR)
- val = MASK_VECTOR;
- else
- continue;
- ctx->gwv_this |= val;
- if (!outer_ctx)
- {
- /* Skip; not nested inside a region. */
- continue;
- }
- if (!oacc_loop_or_target_p (outer_ctx->stmt))
- {
- /* Skip; not nested inside an OpenACC region. */
- continue;
- }
- if (outer_type == GIMPLE_OMP_FOR)
- outer_ctx->gwv_below |= val;
- if (OMP_CLAUSE_OPERAND (c, 0) != NULL_TREE)
+ /* Strip out reductions, as they are not handled yet. */
+ tree *prev_ptr = &clauses;
+
+ while (tree probe = *prev_ptr)
{
- omp_context *enclosing = enclosing_target_ctx (outer_ctx);
- if (gimple_omp_target_kind (enclosing->stmt)
- == GF_OMP_TARGET_KIND_OACC_PARALLEL)
- error_at (gimple_location (stmt),
- "no arguments allowed to gang, worker and vector clauses inside parallel");
+ tree *next_ptr = &OMP_CLAUSE_CHAIN (probe);
+
+ if (OMP_CLAUSE_CODE (probe) == OMP_CLAUSE_REDUCTION)
+ *prev_ptr = *next_ptr;
+ else
+ prev_ptr = next_ptr;
}
+
+ gimple_omp_for_set_clauses (stmt, clauses);
+ check_oacc_kernel_gwv (stmt, ctx);
}
}
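Inside an OpenACC parallel region the dimension sizes come from num_gangs/num_workers/vector_length on the enclosing construct, so the operand check above rejects sized loop clauses there. An illustrative rejected input (hypothetical example):

    #pragma acc parallel num_workers(4)
    {
    #pragma acc loop worker(2)  /* "argument not permitted on 'worker'
                                   clause in OpenACC 'parallel'" */
      for (int i = 0; i < n; i++)
        a[i] *= 2;
    }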
@@ -3022,19 +3113,6 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
scan_omp_op (gimple_omp_for_incr_ptr (stmt, i), ctx);
}
scan_omp (gimple_omp_body_ptr (stmt), ctx);
-
- if (is_gimple_omp_oacc (stmt))
- {
- if (ctx->gwv_this & ctx->gwv_below)
- error_at (gimple_location (stmt),
- "gang, worker and vector may occur only once in a loop nest");
- else if (ctx->gwv_below != 0
- && ctx->gwv_this > ctx->gwv_below)
- error_at (gimple_location (stmt),
- "gang, worker and vector must occur in this order in a loop nest");
- if (outer_ctx && outer_type == GIMPLE_OMP_FOR)
- outer_ctx->gwv_below |= ctx->gwv_below;
- }
}
/* Scan an OpenMP sections directive. */
@@ -3105,19 +3183,6 @@ scan_omp_target (gomp_target *stmt, omp_context *outer_ctx)
gimple_omp_target_set_child_fn (stmt, ctx->cb.dst_fn);
}
- if (is_gimple_omp_oacc (stmt))
- {
- for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
- {
- if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS)
- ctx->gwv_this |= MASK_GANG;
- else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS)
- ctx->gwv_this |= MASK_WORKER;
- else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH)
- ctx->gwv_this |= MASK_VECTOR;
- }
- }
-
scan_sharing_clauses (clauses, ctx);
scan_omp (gimple_omp_body_ptr (stmt), ctx);
@@ -5850,6 +5915,176 @@ lower_send_shared_vars (gimple_seq *ilist, gimple_seq *olist, omp_context *ctx)
}
}
+/* Emit an OpenACC head marker call, encapsulating the partitioning and
+ other information that must be processed by the target compiler.
+ Return the maximum number of dimensions the associated loop might
+ be partitioned over. */
+
+static unsigned
+lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
+ gimple_seq *seq, omp_context *ctx)
+{
+ unsigned levels = 0;
+ unsigned tag = 0;
+ tree gang_static = NULL_TREE;
+ auto_vec<tree, 5> args;
+
+ args.quick_push (build_int_cst
+ (integer_type_node, IFN_UNIQUE_OACC_HEAD_MARK));
+ args.quick_push (ddvar);
+ for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ {
+ switch (OMP_CLAUSE_CODE (c))
+ {
+ case OMP_CLAUSE_GANG:
+ tag |= OLF_DIM_GANG;
+ gang_static = OMP_CLAUSE_GANG_STATIC_EXPR (c);
+ /* static:* is represented by -1, and we can ignore it, as
+ scheduling is always static. */
+ if (gang_static && integer_minus_onep (gang_static))
+ gang_static = NULL_TREE;
+ levels++;
+ break;
+
+ case OMP_CLAUSE_WORKER:
+ tag |= OLF_DIM_WORKER;
+ levels++;
+ break;
+
+ case OMP_CLAUSE_VECTOR:
+ tag |= OLF_DIM_VECTOR;
+ levels++;
+ break;
+
+ case OMP_CLAUSE_SEQ:
+ tag |= OLF_SEQ;
+ break;
+
+ case OMP_CLAUSE_AUTO:
+ tag |= OLF_AUTO;
+ break;
+
+ case OMP_CLAUSE_INDEPENDENT:
+ tag |= OLF_INDEPENDENT;
+ break;
+
+ default:
+ continue;
+ }
+ }
+
+ if (gang_static)
+ {
+ if (DECL_P (gang_static))
+ gang_static = build_outer_var_ref (gang_static, ctx);
+ tag |= OLF_GANG_STATIC;
+ }
+
+ /* In a parallel region, loops are implicitly INDEPENDENT. */
+ omp_context *tgt = enclosing_target_ctx (ctx);
+ if (!tgt || is_oacc_parallel (tgt))
+ tag |= OLF_INDEPENDENT;
+
+ /* A loop lacking SEQ, GANG, WORKER and/or VECTOR is implicitly AUTO. */
+ if (!(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE)
+ | OLF_SEQ)))
+ tag |= OLF_AUTO;
+
+ /* Ensure at least one level. */
+ if (!levels)
+ levels++;
+
+ args.quick_push (build_int_cst (integer_type_node, levels));
+ args.quick_push (build_int_cst (integer_type_node, tag));
+ if (gang_static)
+ args.quick_push (gang_static);
+
+ gcall *call = gimple_build_call_internal_vec (IFN_UNIQUE, args);
+ gimple_set_location (call, loc);
+ gimple_set_lhs (call, ddvar);
+ gimple_seq_add_stmt (seq, call);
+
+ return levels;
+}
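The marker is a call to the IFN_UNIQUE internal function; going by the args vector assembled above, its shape is roughly (pseudo-GIMPLE sketch, not literal dump output):

    /* LEVELS counts the partitioned dimensions, TAG packs the OLF_*
       flags, and GANG_STATIC is only appended when a gang static
       argument is present.  */
    .data_dep = IFN_UNIQUE (OACC_HEAD_MARK, .data_dep, LEVELS, TAG [, GANG_STATIC]);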
+
+/* Emit an OpenACC loop head or tail marker to SEQ.  LEVEL is the
+ partitioning level of the enclosed region. */
+
+static void
+lower_oacc_loop_marker (location_t loc, tree ddvar, bool head,
+ tree tofollow, gimple_seq *seq)
+{
+ int marker_kind = (head ? IFN_UNIQUE_OACC_HEAD_MARK
+ : IFN_UNIQUE_OACC_TAIL_MARK);
+ tree marker = build_int_cst (integer_type_node, marker_kind);
+ int nargs = 2 + (tofollow != NULL_TREE);
+ gcall *call = gimple_build_call_internal (IFN_UNIQUE, nargs,
+ marker, ddvar, tofollow);
+ gimple_set_location (call, loc);
+ gimple_set_lhs (call, ddvar);
+ gimple_seq_add_stmt (seq, call);
+}
+
+/* Generate the before and after OpenACC loop sequences. CLAUSES are
+ the loop clauses, from which we extract reductions. Initialize
+ HEAD and TAIL. */
+
+static void
+lower_oacc_head_tail (location_t loc, tree clauses,
+ gimple_seq *head, gimple_seq *tail, omp_context *ctx)
+{
+ bool inner = false;
+ tree ddvar = create_tmp_var (integer_type_node, ".data_dep");
+ gimple_seq_add_stmt (head, gimple_build_assign (ddvar, integer_zero_node));
+
+ unsigned count = lower_oacc_head_mark (loc, ddvar, clauses, head, ctx);
+ if (!count)
+ lower_oacc_loop_marker (loc, ddvar, false, integer_zero_node, tail);
+
+ tree fork_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_FORK);
+ tree join_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_JOIN);
+
+ for (unsigned done = 1; count; count--, done++)
+ {
+ gimple_seq fork_seq = NULL;
+ gimple_seq join_seq = NULL;
+
+ tree place = build_int_cst (integer_type_node, -1);
+ gcall *fork = gimple_build_call_internal (IFN_UNIQUE, 3,
+ fork_kind, ddvar, place);
+ gimple_set_location (fork, loc);
+ gimple_set_lhs (fork, ddvar);
+
+ gcall *join = gimple_build_call_internal (IFN_UNIQUE, 3,
+ join_kind, ddvar, place);
+ gimple_set_location (join, loc);
+ gimple_set_lhs (join, ddvar);
+
+ /* Mark the beginning of this level sequence. */
+ if (inner)
+ lower_oacc_loop_marker (loc, ddvar, true,
+ build_int_cst (integer_type_node, count),
+ &fork_seq);
+ lower_oacc_loop_marker (loc, ddvar, false,
+ build_int_cst (integer_type_node, done),
+ &join_seq);
+
+ gimple_seq_add_stmt (&fork_seq, fork);
+ gimple_seq_add_stmt (&join_seq, join);
+
+ /* Append this level to head. */
+ gimple_seq_add_seq (head, fork_seq);
+ /* Prepend it to tail. */
+ gimple_seq_add_seq (&join_seq, *tail);
+ *tail = join_seq;
+
+ inner = true;
+ }
+
+ /* Mark the end of the sequence. */
+ lower_oacc_loop_marker (loc, ddvar, true, NULL_TREE, head);
+ lower_oacc_loop_marker (loc, ddvar, false, NULL_TREE, tail);
+}
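Tracing the loop above for a two-level partitioning (count == 2), the head sequence gains a fork per level and the tail is built in reverse, so forks and joins nest properly (schematic only; fork/join placement arguments elided):

    HEAD:  .data_dep = 0
           HEAD_MARK (levels=2, tag, ...)
           FORK                           outermost level
           HEAD_MARK (1)                  inner level
           FORK
           HEAD_MARK ()                   terminator
      ... loop body ...
    TAIL:  TAIL_MARK (2)                  innermost level joins first
           JOIN
           TAIL_MARK (1)
           JOIN
           TAIL_MARK ()                   terminator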
/* A convenience function to build an empty GIMPLE_COND with just the
condition. */
@@ -6760,6 +6995,149 @@ expand_omp_taskreg (struct omp_region *region)
update_ssa (TODO_update_ssa_only_virtuals);
}
+/* Information about members of an OpenACC collapsed loop nest. */
+
+struct oacc_collapse
+{
+ tree base; /* Base value. */
+ tree iters; /* Number of steps. */
+  tree step;  /* Step size.  */
+};
+
+/* Helper for expand_oacc_for. Determine collapsed loop information.
+ Fill in COUNTS array. Emit any initialization code before GSI.
+ Return the calculated outer loop bound of BOUND_TYPE. */
+
+static tree
+expand_oacc_collapse_init (const struct omp_for_data *fd,
+ gimple_stmt_iterator *gsi,
+ oacc_collapse *counts, tree bound_type)
+{
+ tree total = build_int_cst (bound_type, 1);
+ int ix;
+
+ gcc_assert (integer_onep (fd->loop.step));
+ gcc_assert (integer_zerop (fd->loop.n1));
+
+ for (ix = 0; ix != fd->collapse; ix++)
+ {
+ const omp_for_data_loop *loop = &fd->loops[ix];
+
+ tree iter_type = TREE_TYPE (loop->v);
+ tree diff_type = iter_type;
+ tree plus_type = iter_type;
+
+ gcc_assert (loop->cond_code == fd->loop.cond_code);
+
+ if (POINTER_TYPE_P (iter_type))
+ plus_type = sizetype;
+ if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
+ diff_type = signed_type_for (diff_type);
+
+ tree b = loop->n1;
+ tree e = loop->n2;
+ tree s = loop->step;
+ bool up = loop->cond_code == LT_EXPR;
+ tree dir = build_int_cst (diff_type, up ? +1 : -1);
+ bool negating;
+ tree expr;
+
+ b = force_gimple_operand_gsi (gsi, b, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+ e = force_gimple_operand_gsi (gsi, e, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+
+ /* Convert the step, avoiding possible unsigned->signed overflow. */
+ negating = !up && TYPE_UNSIGNED (TREE_TYPE (s));
+ if (negating)
+ s = fold_build1 (NEGATE_EXPR, TREE_TYPE (s), s);
+ s = fold_convert (diff_type, s);
+ if (negating)
+ s = fold_build1 (NEGATE_EXPR, diff_type, s);
+ s = force_gimple_operand_gsi (gsi, s, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+
+ /* Determine the range, avoiding possible unsigned->signed overflow. */
+ negating = !up && TYPE_UNSIGNED (iter_type);
+ expr = fold_build2 (MINUS_EXPR, plus_type,
+ fold_convert (plus_type, negating ? b : e),
+ fold_convert (plus_type, negating ? e : b));
+ expr = fold_convert (diff_type, expr);
+ if (negating)
+ expr = fold_build1 (NEGATE_EXPR, diff_type, expr);
+ tree range = force_gimple_operand_gsi
+ (gsi, expr, true, NULL_TREE, true, GSI_SAME_STMT);
+
+ /* Determine number of iterations. */
+ expr = fold_build2 (MINUS_EXPR, diff_type, range, dir);
+ expr = fold_build2 (PLUS_EXPR, diff_type, expr, s);
+ expr = fold_build2 (TRUNC_DIV_EXPR, diff_type, expr, s);
+
+ tree iters = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+
+ counts[ix].base = b;
+ counts[ix].iters = iters;
+ counts[ix].step = s;
+
+ total = fold_build2 (MULT_EXPR, bound_type, total,
+ fold_convert (bound_type, iters));
+ }
+
+ return total;
+}
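The count computation is the standard ceiling division; a worked example with a hypothetical member loop:

    /* Hypothetical member loop: for (i = 0; i < 10; i += 3)
       b = 0, e = 10, s = 3, dir = +1
       range = e - b = 10
       iters = (range - dir + s) / s = (10 - 1 + 3) / 3 = 4   (i = 0, 3, 6, 9)  */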
+
+/* Emit initializers for collapsed loop members. IVAR is the outer
+ loop iteration variable, from which collapsed loop iteration values
+ are calculated. COUNTS array has been initialized by
+   expand_oacc_collapse_init.  */
+
+static void
+expand_oacc_collapse_vars (const struct omp_for_data *fd,
+ gimple_stmt_iterator *gsi,
+ const oacc_collapse *counts, tree ivar)
+{
+ tree ivar_type = TREE_TYPE (ivar);
+
+ /* The most rapidly changing iteration variable is the innermost
+ one. */
+ for (int ix = fd->collapse; ix--;)
+ {
+ const omp_for_data_loop *loop = &fd->loops[ix];
+ const oacc_collapse *collapse = &counts[ix];
+ tree iter_type = TREE_TYPE (loop->v);
+ tree diff_type = TREE_TYPE (collapse->step);
+ tree plus_type = iter_type;
+ enum tree_code plus_code = PLUS_EXPR;
+ tree expr;
+
+ if (POINTER_TYPE_P (iter_type))
+ {
+ plus_code = POINTER_PLUS_EXPR;
+ plus_type = sizetype;
+ }
+
+ expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, ivar,
+ fold_convert (ivar_type, collapse->iters));
+ expr = fold_build2 (MULT_EXPR, diff_type, fold_convert (diff_type, expr),
+ collapse->step);
+ expr = fold_build2 (plus_code, iter_type, collapse->base,
+ fold_convert (plus_type, expr));
+ expr = force_gimple_operand_gsi (gsi, expr, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ gassign *ass = gimple_build_assign (loop->v, expr);
+ gsi_insert_before (gsi, ass, GSI_SAME_STMT);
+
+ if (ix)
+ {
+ expr = fold_build2 (TRUNC_DIV_EXPR, ivar_type, ivar,
+ fold_convert (ivar_type, collapse->iters));
+ ivar = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+ }
+ }
+}
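The div/mod chain recovers each member index from the single collapsed counter; a worked example for a hypothetical collapse(2) nest:

    /* Outer loop: 3 iterations, inner loop: 4 iterations, ivar = 7.
       inner index = 7 % 4 = 3;  ivar = 7 / 4 = 1;  outer index = 1 % 3 = 1
       Check: outer * 4 + inner = 1 * 4 + 3 = 7 == original ivar.  */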
+
/* Helper function for expand_omp_{for_*,simd}. If this is the outermost
of the combined collapse > 1 loop constructs, generate code like:
@@ -8406,10 +8784,6 @@ expand_omp_for_static_nochunk (struct omp_region *region,
tree *counts = NULL;
tree n1, n2, step;
- gcc_checking_assert ((gimple_omp_for_kind (fd->for_stmt)
- != GF_OMP_FOR_KIND_OACC_LOOP)
- || !inner_stmt);
-
itype = type = TREE_TYPE (fd->loop.v);
if (POINTER_TYPE_P (type))
itype = signed_type_for (type);
@@ -8502,10 +8876,6 @@ expand_omp_for_static_nochunk (struct omp_region *region,
nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_TEAMS);
threadid = builtin_decl_explicit (BUILT_IN_OMP_GET_TEAM_NUM);
break;
- case GF_OMP_FOR_KIND_OACC_LOOP:
- nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
- threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
- break;
default:
gcc_unreachable ();
}
@@ -8732,10 +9102,7 @@ expand_omp_for_static_nochunk (struct omp_region *region,
if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
{
t = gimple_omp_return_lhs (gsi_stmt (gsi));
- if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
- gcc_checking_assert (t == NULL_TREE);
- else
- gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
+ gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
}
gsi_remove (&gsi, true);
@@ -8873,10 +9240,6 @@ expand_omp_for_static_chunk (struct omp_region *region,
tree *counts = NULL;
tree n1, n2, step;
- gcc_checking_assert ((gimple_omp_for_kind (fd->for_stmt)
- != GF_OMP_FOR_KIND_OACC_LOOP)
- || !inner_stmt);
-
itype = type = TREE_TYPE (fd->loop.v);
if (POINTER_TYPE_P (type))
itype = signed_type_for (type);
@@ -8973,10 +9336,6 @@ expand_omp_for_static_chunk (struct omp_region *region,
nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_TEAMS);
threadid = builtin_decl_explicit (BUILT_IN_OMP_GET_TEAM_NUM);
break;
- case GF_OMP_FOR_KIND_OACC_LOOP:
- nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
- threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
- break;
default:
gcc_unreachable ();
}
@@ -9236,10 +9595,7 @@ expand_omp_for_static_chunk (struct omp_region *region,
if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
{
t = gimple_omp_return_lhs (gsi_stmt (gsi));
- if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
- gcc_checking_assert (t == NULL_TREE);
- else
- gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
+ gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
}
gsi_remove (&gsi, true);
@@ -10289,6 +10645,410 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
}
}
+/* A subroutine of expand_omp_for. Generate code for an OpenACC
+ partitioned loop. The lowering here is abstracted, in that the
+ loop parameters are passed through internal functions, which are
+ further lowered by oacc_device_lower, once we get to the target
+ compiler. The loop is of the form:
+
+ for (V = B; V LTGT E; V += S) {BODY}
+
+ where LTGT is < or >. We may have a specified chunking size, CHUNKING
+ (constant 0 for no chunking) and we will have a GWV partitioning
+ mask, specifying dimensions over which the loop is to be
+ partitioned (see note below). We generate code that looks like:
+
+ <entry_bb> [incoming FALL->body, BRANCH->exit]
+ typedef signedintify (typeof (V)) T; // underlying signed integral type
+ T range = E - B;
+ T chunk_no = 0;
+ T DIR = LTGT == '<' ? +1 : -1;
+ T chunk_max = GOACC_LOOP_CHUNK (dir, range, S, CHUNK_SIZE, GWV);
+ T step = GOACC_LOOP_STEP (dir, range, S, CHUNK_SIZE, GWV);
+
+ <head_bb> [created by splitting end of entry_bb]
+ T offset = GOACC_LOOP_OFFSET (dir, range, S, CHUNK_SIZE, GWV, chunk_no);
+ T bound = GOACC_LOOP_BOUND (dir, range, S, CHUNK_SIZE, GWV, offset);
+ if (!(offset LTGT bound)) goto bottom_bb;
+
+ <body_bb> [incoming]
+ V = B + offset;
+ {BODY}
+
+ <cont_bb> [incoming, may == body_bb FALL->exit_bb, BRANCH->body_bb]
+ offset += step;
+ if (offset LTGT bound) goto body_bb; [*]
+
+ <bottom_bb> [created by splitting start of exit_bb] insert BRANCH->head_bb
+ chunk_no++;
+	if (chunk_no < chunk_max) goto head_bb;
+
+ <exit_bb> [incoming]
+ V = B + ((range -/+ 1) / S +/- 1) * S [*]
+
+ [*] Needed if V live at end of loop
+
+ Note: CHUNKING & GWV mask are specified explicitly here. This is a
+ transition, and will be specified by a more general mechanism shortly.
+ */
+
+static void
+expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
+{
+ tree v = fd->loop.v;
+ enum tree_code cond_code = fd->loop.cond_code;
+ enum tree_code plus_code = PLUS_EXPR;
+
+ tree chunk_size = integer_minus_one_node;
+ tree gwv = integer_zero_node;
+ tree iter_type = TREE_TYPE (v);
+ tree diff_type = iter_type;
+ tree plus_type = iter_type;
+ struct oacc_collapse *counts = NULL;
+
+ gcc_checking_assert (gimple_omp_for_kind (fd->for_stmt)
+ == GF_OMP_FOR_KIND_OACC_LOOP);
+ gcc_assert (!gimple_omp_for_combined_into_p (fd->for_stmt));
+ gcc_assert (cond_code == LT_EXPR || cond_code == GT_EXPR);
+
+ if (POINTER_TYPE_P (iter_type))
+ {
+ plus_code = POINTER_PLUS_EXPR;
+ plus_type = sizetype;
+ }
+ if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
+ diff_type = signed_type_for (diff_type);
+
+ basic_block entry_bb = region->entry; /* BB ending in OMP_FOR */
+ basic_block exit_bb = region->exit; /* BB ending in OMP_RETURN */
+ basic_block cont_bb = region->cont; /* BB ending in OMP_CONTINUE */
+ basic_block bottom_bb = NULL;
+
+  /* entry_bb has two successors; the branch edge is to the exit
+ block, fallthrough edge to body. */
+ gcc_assert (EDGE_COUNT (entry_bb->succs) == 2
+ && BRANCH_EDGE (entry_bb)->dest == exit_bb);
+
+ /* If cont_bb non-NULL, it has 2 successors. The branch successor is
+ body_bb, or to a block whose only successor is the body_bb. Its
+ fallthrough successor is the final block (same as the branch
+ successor of the entry_bb). */
+ if (cont_bb)
+ {
+ basic_block body_bb = FALLTHRU_EDGE (entry_bb)->dest;
+ basic_block bed = BRANCH_EDGE (cont_bb)->dest;
+
+ gcc_assert (FALLTHRU_EDGE (cont_bb)->dest == exit_bb);
+ gcc_assert (bed == body_bb || single_succ_edge (bed)->dest == body_bb);
+ }
+ else
+ gcc_assert (!gimple_in_ssa_p (cfun));
+
+ /* The exit block only has entry_bb and cont_bb as predecessors. */
+ gcc_assert (EDGE_COUNT (exit_bb->preds) == 1 + (cont_bb != NULL));
+
+ tree chunk_no;
+ tree chunk_max = NULL_TREE;
+ tree bound, offset;
+ tree step = create_tmp_var (diff_type, ".step");
+ bool up = cond_code == LT_EXPR;
+ tree dir = build_int_cst (diff_type, up ? +1 : -1);
+  bool chunking = !gimple_in_ssa_p (cfun);
+ bool negating;
+
+ /* SSA instances. */
+ tree offset_incr = NULL_TREE;
+ tree offset_init = NULL_TREE;
+
+ gimple_stmt_iterator gsi;
+ gassign *ass;
+ gcall *call;
+ gimple *stmt;
+ tree expr;
+ location_t loc;
+ edge split, be, fte;
+
+ /* Split the end of entry_bb to create head_bb. */
+ split = split_block (entry_bb, last_stmt (entry_bb));
+ basic_block head_bb = split->dest;
+ entry_bb = split->src;
+
+ /* Chunk setup goes at end of entry_bb, replacing the omp_for. */
+ gsi = gsi_last_bb (entry_bb);
+ gomp_for *for_stmt = as_a <gomp_for *> (gsi_stmt (gsi));
+ loc = gimple_location (for_stmt);
+
+ if (gimple_in_ssa_p (cfun))
+ {
+ offset_init = gimple_omp_for_index (for_stmt, 0);
+ gcc_assert (integer_zerop (fd->loop.n1));
+ /* The SSA parallelizer does gang parallelism. */
+ gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG));
+ }
+
+ if (fd->collapse > 1)
+ {
+ counts = XALLOCAVEC (struct oacc_collapse, fd->collapse);
+ tree total = expand_oacc_collapse_init (fd, &gsi, counts,
+ TREE_TYPE (fd->loop.n2));
+
+ if (SSA_VAR_P (fd->loop.n2))
+ {
+ total = force_gimple_operand_gsi (&gsi, total, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ ass = gimple_build_assign (fd->loop.n2, total);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+ }
+
+ }
+
+ tree b = fd->loop.n1;
+ tree e = fd->loop.n2;
+ tree s = fd->loop.step;
+
+ b = force_gimple_operand_gsi (&gsi, b, true, NULL_TREE, true, GSI_SAME_STMT);
+ e = force_gimple_operand_gsi (&gsi, e, true, NULL_TREE, true, GSI_SAME_STMT);
+
+ /* Convert the step, avoiding possible unsigned->signed overflow. */
+ negating = !up && TYPE_UNSIGNED (TREE_TYPE (s));
+ if (negating)
+ s = fold_build1 (NEGATE_EXPR, TREE_TYPE (s), s);
+ s = fold_convert (diff_type, s);
+ if (negating)
+ s = fold_build1 (NEGATE_EXPR, diff_type, s);
+ s = force_gimple_operand_gsi (&gsi, s, true, NULL_TREE, true, GSI_SAME_STMT);
+
+ if (!chunking)
+ chunk_size = integer_zero_node;
+ expr = fold_convert (diff_type, chunk_size);
+ chunk_size = force_gimple_operand_gsi (&gsi, expr, true,
+ NULL_TREE, true, GSI_SAME_STMT);
+ /* Determine the range, avoiding possible unsigned->signed overflow. */
+ negating = !up && TYPE_UNSIGNED (iter_type);
+ expr = fold_build2 (MINUS_EXPR, plus_type,
+ fold_convert (plus_type, negating ? b : e),
+ fold_convert (plus_type, negating ? e : b));
+ expr = fold_convert (diff_type, expr);
+ if (negating)
+ expr = fold_build1 (NEGATE_EXPR, diff_type, expr);
+ tree range = force_gimple_operand_gsi (&gsi, expr, true,
+ NULL_TREE, true, GSI_SAME_STMT);
+
+ chunk_no = build_int_cst (diff_type, 0);
+ if (chunking)
+ {
+ gcc_assert (!gimple_in_ssa_p (cfun));
+
+ expr = chunk_no;
+ chunk_max = create_tmp_var (diff_type, ".chunk_max");
+ chunk_no = create_tmp_var (diff_type, ".chunk_no");
+
+ ass = gimple_build_assign (chunk_no, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ build_int_cst (integer_type_node,
+ IFN_GOACC_LOOP_CHUNKS),
+ dir, range, s, chunk_size, gwv);
+ gimple_call_set_lhs (call, chunk_max);
+ gimple_set_location (call, loc);
+ gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+ }
+ else
+ chunk_size = chunk_no;
+
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ build_int_cst (integer_type_node,
+ IFN_GOACC_LOOP_STEP),
+ dir, range, s, chunk_size, gwv);
+ gimple_call_set_lhs (call, step);
+ gimple_set_location (call, loc);
+ gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+ /* Remove the GIMPLE_OMP_FOR. */
+ gsi_remove (&gsi, true);
+
+  /* Fixup edges from head_bb.  */
+ be = BRANCH_EDGE (head_bb);
+ fte = FALLTHRU_EDGE (head_bb);
+ be->flags |= EDGE_FALSE_VALUE;
+ fte->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+
+ basic_block body_bb = fte->dest;
+
+ if (gimple_in_ssa_p (cfun))
+ {
+ gsi = gsi_last_bb (cont_bb);
+ gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
+
+ offset = gimple_omp_continue_control_use (cont_stmt);
+ offset_incr = gimple_omp_continue_control_def (cont_stmt);
+ }
+ else
+ {
+ offset = create_tmp_var (diff_type, ".offset");
+ offset_init = offset_incr = offset;
+ }
+ bound = create_tmp_var (TREE_TYPE (offset), ".bound");
+
+ /* Loop offset & bound go into head_bb. */
+ gsi = gsi_start_bb (head_bb);
+
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ build_int_cst (integer_type_node,
+ IFN_GOACC_LOOP_OFFSET),
+ dir, range, s,
+ chunk_size, gwv, chunk_no);
+ gimple_call_set_lhs (call, offset_init);
+ gimple_set_location (call, loc);
+ gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
+
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ build_int_cst (integer_type_node,
+ IFN_GOACC_LOOP_BOUND),
+ dir, range, s,
+ chunk_size, gwv, offset_init);
+ gimple_call_set_lhs (call, bound);
+ gimple_set_location (call, loc);
+ gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
+
+ expr = build2 (cond_code, boolean_type_node, offset_init, bound);
+ gsi_insert_after (&gsi, gimple_build_cond_empty (expr),
+ GSI_CONTINUE_LINKING);
+
+ /* V assignment goes into body_bb. */
+ if (!gimple_in_ssa_p (cfun))
+ {
+ gsi = gsi_start_bb (body_bb);
+
+ expr = build2 (plus_code, iter_type, b,
+ fold_convert (plus_type, offset));
+ expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ ass = gimple_build_assign (v, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+ if (fd->collapse > 1)
+ expand_oacc_collapse_vars (fd, &gsi, counts, v);
+ }
+
+ /* Loop increment goes into cont_bb. If this is not a loop, we
+ will have spawned threads as if it was, and each one will
+ execute one iteration. The specification is not explicit about
+ whether such constructs are ill-formed or not, and they can
+ occur, especially when noreturn routines are involved. */
+ if (cont_bb)
+ {
+ gsi = gsi_last_bb (cont_bb);
+ gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
+ loc = gimple_location (cont_stmt);
+
+ /* Increment offset. */
+ if (gimple_in_ssa_p (cfun))
+	expr = build2 (plus_code, iter_type, offset,
+ fold_convert (plus_type, step));
+ else
+ expr = build2 (PLUS_EXPR, diff_type, offset, step);
+ expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ ass = gimple_build_assign (offset_incr, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+ expr = build2 (cond_code, boolean_type_node, offset_incr, bound);
+ gsi_insert_before (&gsi, gimple_build_cond_empty (expr), GSI_SAME_STMT);
+
+ /* Remove the GIMPLE_OMP_CONTINUE. */
+ gsi_remove (&gsi, true);
+
+      /* Fixup edges from cont_bb.  */
+ be = BRANCH_EDGE (cont_bb);
+ fte = FALLTHRU_EDGE (cont_bb);
+ be->flags |= EDGE_TRUE_VALUE;
+ fte->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+
+ if (chunking)
+ {
+ /* Split the beginning of exit_bb to make bottom_bb. We
+ need to insert a nop at the start, because splitting is
+ after a stmt, not before. */
+ gsi = gsi_start_bb (exit_bb);
+ stmt = gimple_build_nop ();
+ gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+ split = split_block (exit_bb, stmt);
+ bottom_bb = split->src;
+ exit_bb = split->dest;
+ gsi = gsi_last_bb (bottom_bb);
+
+ /* Chunk increment and test goes into bottom_bb. */
+ expr = build2 (PLUS_EXPR, diff_type, chunk_no,
+ build_int_cst (diff_type, 1));
+ ass = gimple_build_assign (chunk_no, expr);
+ gsi_insert_after (&gsi, ass, GSI_CONTINUE_LINKING);
+
+ /* Chunk test at end of bottom_bb. */
+ expr = build2 (LT_EXPR, boolean_type_node, chunk_no, chunk_max);
+ gsi_insert_after (&gsi, gimple_build_cond_empty (expr),
+ GSI_CONTINUE_LINKING);
+
+ /* Fixup edges from bottom_bb. */
+ split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+ make_edge (bottom_bb, head_bb, EDGE_TRUE_VALUE);
+ }
+ }
+
+ gsi = gsi_last_bb (exit_bb);
+ gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
+ loc = gimple_location (gsi_stmt (gsi));
+
+ if (!gimple_in_ssa_p (cfun))
+ {
+ /* Insert the final value of V, in case it is live. This is the
+ value for the only thread that survives past the join. */
+ expr = fold_build2 (MINUS_EXPR, diff_type, range, dir);
+ expr = fold_build2 (PLUS_EXPR, diff_type, expr, s);
+ expr = fold_build2 (TRUNC_DIV_EXPR, diff_type, expr, s);
+ expr = fold_build2 (MULT_EXPR, diff_type, expr, s);
+ expr = build2 (plus_code, iter_type, b, fold_convert (plus_type, expr));
+ expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ ass = gimple_build_assign (v, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+ }
+
+ /* Remove the OMP_RETURN. */
+ gsi_remove (&gsi, true);
+
+ if (cont_bb)
+ {
+ /* We now have one or two nested loops. Update the loop
+ structures. */
+ struct loop *parent = entry_bb->loop_father;
+ struct loop *body = body_bb->loop_father;
+
+ if (chunking)
+ {
+ struct loop *chunk_loop = alloc_loop ();
+ chunk_loop->header = head_bb;
+ chunk_loop->latch = bottom_bb;
+ add_loop (chunk_loop, parent);
+ parent = chunk_loop;
+ }
+ else if (parent != body)
+ {
+ gcc_assert (body->header == body_bb);
+ gcc_assert (body->latch == cont_bb
+ || single_pred (body->latch) == cont_bb);
+ parent = NULL;
+ }
+
+ if (parent)
+ {
+ struct loop *body_loop = alloc_loop ();
+ body_loop->header = body_bb;
+ body_loop->latch = cont_bb;
+ add_loop (body_loop, parent);
+ }
+ }
+}
+
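The expression built in the exit block reproduces the value V would hold after the equivalent sequential loop; a worked example with hypothetical bounds:

    /* For (v = 0; v < 10; v += 3):  b = 0, range = 10, dir = +1, s = 3
       iters = (10 - 1 + 3) / 3 = 4
       final V = b + iters * s = 0 + 4 * 3 = 12,
       the same value a sequential execution leaves in v.  */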
/* Expand the OMP loop defined by REGION. */
static void
@@ -10324,6 +11084,11 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
expand_omp_simd (region, &fd);
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_CILKFOR)
expand_cilk_for (region, &fd);
+ else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
+ {
+ gcc_assert (!inner_stmt);
+ expand_oacc_for (region, &fd);
+ }
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_TASKLOOP)
{
if (gimple_omp_for_combined_into_p (fd.for_stmt))
@@ -13521,6 +14286,7 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
gomp_for *stmt = as_a <gomp_for *> (gsi_stmt (*gsi_p));
gbind *new_stmt;
gimple_seq omp_for_body, body, dlist;
+ gimple_seq oacc_head = NULL, oacc_tail = NULL;
size_t i;
push_gimplify_context ();
@@ -13629,6 +14395,16 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
/* Once lowered, extract the bounds and clauses. */
extract_omp_for_data (stmt, &fd, NULL);
+ if (is_gimple_omp_oacc (ctx->stmt)
+ && !ctx_in_oacc_kernels_region (ctx))
+ lower_oacc_head_tail (gimple_location (stmt),
+ gimple_omp_for_clauses (stmt),
+ &oacc_head, &oacc_tail, ctx);
+
+  /* Add OpenACC partitioning markers just before the loop.  */
+ if (oacc_head)
+ gimple_seq_add_seq (&body, oacc_head);
+
lower_omp_for_lastprivate (&fd, &body, &dlist, ctx);
if (gimple_omp_for_kind (stmt) == GF_OMP_FOR_KIND_FOR)
@@ -13662,6 +14438,11 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx)
/* Region exit marker goes at the end of the loop body. */
gimple_seq_add_stmt (&body, gimple_build_omp_return (fd.have_nowait));
maybe_add_implicit_barrier_cancel (ctx, &body);
+
+ /* Add OpenACC joining and reduction markers just after the loop. */
+ if (oacc_tail)
+ gimple_seq_add_seq (&body, oacc_tail);
+
pop_gimplify_context (new_stmt);
gimple_bind_append_vars (new_stmt, ctx->block_vars);