Decompose OpenACC 'kernels' constructs into parts, a sequence of compute constructs

Not yet enabled by default: for now, the current mode of OpenACC 'kernels' constructs handling still remains '-fopenacc-kernels=parloops', but that is to change later. gcc/ * omp-oacc-kernels-decompose.cc: New. * Makefile.in (OBJS): Add it. * passes.def: Instantiate it. * tree-pass.h (make_pass_omp_oacc_kernels_decompose): Declare. * flag-types.h (enum openacc_kernels): Add. * doc/invoke.texi (-fopenacc-kernels): Document. * gimple.h (enum gf_mask): Add 'GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED', 'GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE', 'GF_OMP_TARGET_KIND_OACC_DATA_KERNELS'. (is_gimple_omp_oacc, is_gimple_omp_offloaded): Handle these. * gimple-pretty-print.c (dump_gimple_omp_target): Likewise. * omp-expand.c (expand_omp_target, build_omp_regions_1) (omp_make_gimple_edges): Likewise. * omp-low.c (scan_sharing_clauses, scan_omp_for) (check_omp_nesting_restrictions, lower_oacc_reductions) (lower_oacc_head_mark, lower_omp_target): Likewise. * omp-offload.c (execute_oacc_device_lower): Likewise. gcc/c-family/ * c.opt (fopenacc-kernels): Add. gcc/fortran/ * lang.opt (fopenacc-kernels): Add. gcc/testsuite/ * c-c++-common/goacc/kernels-decompose-1.c: New. * c-c++-common/goacc/kernels-decompose-2.c: New. * c-c++-common/goacc/kernels-decompose-ice-1.c: New. * c-c++-common/goacc/kernels-decompose-ice-2.c: New. * gfortran.dg/goacc/kernels-decompose-1.f95: New. * gfortran.dg/goacc/kernels-decompose-2.f95: New. * c-c++-common/goacc/if-clause-2.c: Adjust. * gfortran.dg/goacc/kernels-tree.f95: Likewise. libgomp/ * testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose-ice-1.c: New. * testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/declare-vla.c: Adjust. * testsuite/libgomp.oacc-fortran/pr94358-1.f90: Likewise. Co-authored-by: Thomas Schwinge <thomas@codesourcery.com>
author: Gergö Barany <gergo@codesourcery.com> 2019-02-01 00:59:30 +0100
committer: Thomas Schwinge <thomas@codesourcery.com> 2020-11-13 22:58:57 +0100
commit: e898ce7997733c29dcab9c3c62ca102c7f9fa6eb (patch)
tree: 332bb0438f3383bc22d777c3b0008c269dba4d8a
parent: bd7885755405bc9947ebe805a53d6100c78c8e82 (diff)
download: gcc-e898ce7997733c29dcab9c3c62ca102c7f9fa6eb.zip
gcc-e898ce7997733c29dcab9c3c62ca102c7f9fa6eb.tar.gz
gcc-e898ce7997733c29dcab9c3c62ca102c7f9fa6eb.tar.bz2
26 files changed, 2355 insertions, 16 deletions
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 978a08f..273654c 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1480,6 +1480,7 @@ OBJS = \
 	omp-expand.o \
 	omp-general.o \
 	omp-low.o \
+	omp-oacc-kernels-decompose.o \
 	omp-simd-clone.o \
 	opt-problem.o \
 	optabs.o \
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index a008363..0532cb7 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -1796,6 +1796,19 @@ fopenacc-dim=
 C ObjC C++ ObjC++ LTO Joined Var(flag_openacc_dims)
 Specify default OpenACC compute dimensions.
 
+fopenacc-kernels=
+C ObjC C++ ObjC++ RejectNegative Joined Enum(openacc_kernels) Var(flag_openacc_kernels) Init(OPENACC_KERNELS_PARLOOPS)
+-fopenacc-kernels=[decompose|parloops]	Specify mode of OpenACC 'kernels' constructs handling.
+
+Enum
+Name(openacc_kernels) Type(enum openacc_kernels)
+
+EnumValue
+Enum(openacc_kernels) String(decompose) Value(OPENACC_KERNELS_DECOMPOSE)
+
+EnumValue
+Enum(openacc_kernels) String(parloops) Value(OPENACC_KERNELS_PARLOOPS)
+
 fopenmp
 C ObjC C++ ObjC++ LTO Var(flag_openmp)
 Enable OpenMP (implies -frecursive in Fortran).
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 85f7969..8a164ef 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -201,7 +201,7 @@ in the following sections.
 -aux-info @var{filename}  -fallow-parameterless-variadic-functions @gol
 -fno-asm  -fno-builtin  -fno-builtin-@var{function}  -fgimple@gol
 -fhosted  -ffreestanding @gol
--fopenacc  -fopenacc-dim=@var{geom} @gol
+-fopenacc  -fopenacc-dim=@var{geom}  -fopenacc-kernels=@var{mode} @gol
 -fopenmp  -fopenmp-simd @gol
 -fms-extensions  -fplan9-extensions  -fsso-struct=@var{endianness} @gol
 -fallow-single-precision  -fcond-mismatch  -flax-vector-conversions @gol
@@ -2589,6 +2589,18 @@ not explicitly specify.  The @var{geom} value is a triple of
 ':'-separated sizes, in order 'gang', 'worker' and, 'vector'.  A size
 can be omitted, to use a target-specific default value.
 
+@item -fopenacc-kernels=@var{mode}
+@opindex fopenacc-kernels
+@cindex OpenACC accelerator programming
+Specify mode of OpenACC `kernels' constructs handling.
+With @option{-fopenacc-kernels=decompose}, OpenACC `kernels'
+constructs are decomposed into parts, a sequence of compute
+constructs, each then handled individually.
+This is work in progress.
+With @option{-fopenacc-kernels=parloops}, OpenACC `kernels' constructs
+are handled by the @samp{parloops} pass, en bloc.
+This is the current default.
+
 @item -fopenmp
 @opindex fopenmp
 @cindex OpenMP parallel
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index a887c75..648ed09 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -415,6 +415,13 @@ enum evrp_mode
   EVRP_MODE_RVRP_DEBUG = EVRP_MODE_RVRP_ONLY | EVRP_MODE_DEBUG
 };
 
+/* Modes of OpenACC 'kernels' constructs handling.  */
+enum openacc_kernels
+{
+  OPENACC_KERNELS_DECOMPOSE,
+  OPENACC_KERNELS_PARLOOPS
+};
+
 #endif
 
 #endif /* ! GCC_FLAG_TYPES_H */
diff --git a/gcc/fortran/lang.opt b/gcc/fortran/lang.opt
index da4b1aa..96ed208 100644
--- a/gcc/fortran/lang.opt
+++ b/gcc/fortran/lang.opt
@@ -687,6 +687,10 @@ fopenacc-dim=
 Fortran LTO Joined Var(flag_openacc_dims)
 ; Documented in C
 
+fopenacc-kernels=
+Fortran RejectNegative Joined Enum(openacc_kernels) Var(flag_openacc_kernels) Init(OPENACC_KERNELS_PARLOOPS)
+; Documented in C
+
 fopenmp
 Fortran LTO
 ; Documented in C
diff --git a/gcc/gimple-pretty-print.c b/gcc/gimple-pretty-print.c
index a01bf90..d97a231 100644
--- a/gcc/gimple-pretty-print.c
+++ b/gcc/gimple-pretty-print.c
@@ -1700,6 +1700,15 @@ dump_gimple_omp_target (pretty_printer *buffer, const gomp_target *gs,
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
       kind = " oacc_host_data";
       break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+      kind = " oacc_parallel_kernels_parallelized";
+      break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+      kind = " oacc_parallel_kernels_gang_single";
+      break;
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
+      kind = " oacc_data_kernels";
+      break;
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/gimple.h b/gcc/gimple.h
index b935ad4..8a1db3c 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -175,6 +175,15 @@ enum gf_mask {
     GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA = 10,
     GF_OMP_TARGET_KIND_OACC_DECLARE = 11,
     GF_OMP_TARGET_KIND_OACC_HOST_DATA = 12,
+    /* A 'GF_OMP_TARGET_KIND_OACC_PARALLEL' representing an OpenACC 'kernels'
+       decomposed part, parallelized.  */
+    GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED = 13,
+    /* A 'GF_OMP_TARGET_KIND_OACC_PARALLEL' representing an OpenACC 'kernels'
+       decomposed part, "gang-single".  */
+    GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE = 14,
+    /* A 'GF_OMP_TARGET_KIND_OACC_DATA' representing an OpenACC 'kernels'
+       decomposed parts' 'data' construct.  */
+    GF_OMP_TARGET_KIND_OACC_DATA_KERNELS = 15,
     GF_OMP_TEAMS_HOST		= 1 << 0,
 
     /* True on an GIMPLE_OMP_RETURN statement if the return does not require
@@ -6511,6 +6520,9 @@ is_gimple_omp_oacc (const gimple *stmt)
 	case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
 	case GF_OMP_TARGET_KIND_OACC_DECLARE:
 	case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+	case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
 	  return true;
 	default:
 	  return false;
@@ -6536,6 +6548,8 @@ is_gimple_omp_offloaded (const gimple *stmt)
 	case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 	case GF_OMP_TARGET_KIND_OACC_KERNELS:
 	case GF_OMP_TARGET_KIND_OACC_SERIAL:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
 	  return true;
 	default:
 	  return false;
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index c6ee3eb..b731fd6 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -9257,11 +9257,14 @@ expand_omp_target (struct omp_region *region)
     case GF_OMP_TARGET_KIND_OACC_UPDATE:
     case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
     case GF_OMP_TARGET_KIND_OACC_DECLARE:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
       data_region = false;
       break;
     case GF_OMP_TARGET_KIND_DATA:
     case GF_OMP_TARGET_KIND_OACC_DATA:
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       data_region = true;
       break;
     default:
@@ -9307,6 +9310,16 @@ expand_omp_target (struct omp_region *region)
 	= tree_cons (get_identifier ("oacc serial"),
 		     NULL_TREE, DECL_ATTRIBUTES (child_fn));
       break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+      DECL_ATTRIBUTES (child_fn)
+	= tree_cons (get_identifier ("oacc parallel_kernels_parallelized"),
+		     NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+      DECL_ATTRIBUTES (child_fn)
+	= tree_cons (get_identifier ("oacc parallel_kernels_gang_single"),
+		     NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
     default:
       /* Make sure we don't miss any.  */
       gcc_checking_assert (!(is_gimple_omp_oacc (entry_stmt)
@@ -9517,10 +9530,13 @@ expand_omp_target (struct omp_region *region)
     case GF_OMP_TARGET_KIND_OACC_PARALLEL:
     case GF_OMP_TARGET_KIND_OACC_KERNELS:
     case GF_OMP_TARGET_KIND_OACC_SERIAL:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
       start_ix = BUILT_IN_GOACC_PARALLEL;
       break;
     case GF_OMP_TARGET_KIND_OACC_DATA:
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       start_ix = BUILT_IN_GOACC_DATA_START;
       break;
     case GF_OMP_TARGET_KIND_OACC_UPDATE:
@@ -9993,6 +10009,9 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
 		case GF_OMP_TARGET_KIND_OACC_SERIAL:
 		case GF_OMP_TARGET_KIND_OACC_DATA:
 		case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+		case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+		case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+		case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
 		  break;
 		case GF_OMP_TARGET_KIND_UPDATE:
 		case GF_OMP_TARGET_KIND_ENTER_DATA:
@@ -10247,6 +10266,9 @@ omp_make_gimple_edges (basic_block bb, struct omp_region **region,
 	case GF_OMP_TARGET_KIND_OACC_SERIAL:
 	case GF_OMP_TARGET_KIND_OACC_DATA:
 	case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+	case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
 	  break;
 	case GF_OMP_TARGET_KIND_UPDATE:
 	case GF_OMP_TARGET_KIND_ENTER_DATA:
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 2602189..a1604e0 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -193,8 +193,8 @@ static tree scan_omp_1_op (tree *, int *, void *);
       *handled_ops_p = false; \
       break;
 
-/* Return true if CTX corresponds to an OpenACC 'parallel' or 'serial'
-   region.  */
+/* Return whether CTX represents an OpenACC 'parallel' or 'serial' construct.
+   (This doesn't include OpenACC 'kernels' decomposed parts.)  */
 
 static bool
 is_oacc_parallel_or_serial (omp_context *ctx)
@@ -207,7 +207,8 @@ is_oacc_parallel_or_serial (omp_context *ctx)
 		  == GF_OMP_TARGET_KIND_OACC_SERIAL)));
 }
 
-/* Return true if CTX corresponds to an oacc kernels region.  */
+/* Return whether CTX represents an OpenACC 'kernels' construct.
+   (This doesn't include OpenACC 'kernels' decomposed parts.)  */
 
 static bool
 is_oacc_kernels (omp_context *ctx)
@@ -218,6 +219,21 @@ is_oacc_kernels (omp_context *ctx)
 	      == GF_OMP_TARGET_KIND_OACC_KERNELS));
 }
 
+/* Return whether CTX represents an OpenACC 'kernels' decomposed part.  */
+
+static bool
+is_oacc_kernels_decomposed_part (omp_context *ctx)
+{
+  enum gimple_code outer_type = gimple_code (ctx->stmt);
+  return ((outer_type == GIMPLE_OMP_TARGET)
+	  && ((gimple_omp_target_kind (ctx->stmt)
+	       == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+	      || (gimple_omp_target_kind (ctx->stmt)
+		  == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+	      || (gimple_omp_target_kind (ctx->stmt)
+		  == GF_OMP_TARGET_KIND_OACC_DATA_KERNELS)));
+}
+
 /* Return true if STMT corresponds to an OpenMP target region.  */
 static bool
 is_omp_target (gimple *stmt)
@@ -1200,6 +1216,8 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 	    {
 	      /* No 'reduction' clauses on OpenACC 'kernels'.  */
 	      gcc_checking_assert (!is_oacc_kernels (ctx));
+	      /* Likewise, on OpenACC 'kernels' decomposed parts.  */
+	      gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
 
 	      ctx->local_reduction_clauses
 		= tree_cons (NULL, c, ctx->local_reduction_clauses);
@@ -2415,7 +2433,9 @@ enclosing_target_ctx (omp_context *ctx)
   return ctx;
 }
 
-/* Return true if ctx is part of an oacc kernels region.  */
+/* Return whether CTX's parent compute construct is an OpenACC 'kernels'
+   construct.
+   (This doesn't include OpenACC 'kernels' decomposed parts.)  */
 
 static bool
 ctx_in_oacc_kernels_region (omp_context *ctx)
@@ -2431,7 +2451,8 @@ ctx_in_oacc_kernels_region (omp_context *ctx)
   return false;
 }
 
-/* Check the parallelism clauses inside a kernels regions.
+/* Check the parallelism clauses inside a OpenACC 'kernels' region.
+   (This doesn't include OpenACC 'kernels' decomposed parts.)
    Until kernels handling moves to use the same loop indirection
    scheme as parallel, we need to do this checking early.  */
 
@@ -2533,6 +2554,10 @@ scan_omp_for (gomp_for *stmt, omp_context *outer_ctx)
 
 	    if (c_op0)
 	      {
+		/* By construction, this is impossible for OpenACC 'kernels'
+		   decomposed parts.  */
+		gcc_assert (!(tgt && is_oacc_kernels_decomposed_part (tgt)));
+
 		error_at (OMP_CLAUSE_LOCATION (c),
 			  "argument not permitted on %qs clause",
 			  omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
@@ -3070,6 +3095,8 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
 		  case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 		  case GF_OMP_TARGET_KIND_OACC_KERNELS:
 		  case GF_OMP_TARGET_KIND_OACC_SERIAL:
+		  case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+		  case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
 		    ok = true;
 		    break;
 
@@ -3526,6 +3553,11 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
 	    case GF_OMP_TARGET_KIND_OACC_DECLARE: stmt_name = "declare"; break;
 	    case GF_OMP_TARGET_KIND_OACC_HOST_DATA: stmt_name = "host_data";
 	      break;
+	    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+	    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
+	      /* OpenACC 'kernels' decomposed parts.  */
+	      stmt_name = "kernels"; break;
 	    default: gcc_unreachable ();
 	    }
 	  switch (gimple_omp_target_kind (ctx->stmt))
@@ -3541,6 +3573,11 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx)
 	    case GF_OMP_TARGET_KIND_OACC_DATA: ctx_stmt_name = "data"; break;
 	    case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
 	      ctx_stmt_name = "host_data"; break;
+	    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+	    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
+	      /* OpenACC 'kernels' decomposed parts.  */
+	      ctx_stmt_name = "kernels"; break;
 	    default: gcc_unreachable ();
 	    }
 
@@ -6930,6 +6967,8 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
       {
 	/* No 'reduction' clauses on OpenACC 'kernels'.  */
 	gcc_checking_assert (!is_oacc_kernels (ctx));
+	/* Likewise, on OpenACC 'kernels' decomposed parts.  */
+	gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
 
 	tree orig = OMP_CLAUSE_DECL (c);
 	tree var = maybe_lookup_decl (orig, ctx);
@@ -7785,6 +7824,8 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
   else if (is_oacc_kernels (tgt))
     /* Not using this loops handling inside OpenACC 'kernels' regions.  */
     gcc_unreachable ();
+  else if (is_oacc_kernels_decomposed_part (tgt))
+    ;
   else
     gcc_unreachable ();
 
@@ -7792,6 +7833,14 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
   if (!tgt || is_oacc_parallel_or_serial (tgt))
     tag |= OLF_INDEPENDENT;
 
+  /* Loops inside OpenACC 'kernels' decomposed parts' regions are expected to
+     have an explicit 'seq' or 'independent' clause, and no 'auto' clause.  */
+  if (tgt && is_oacc_kernels_decomposed_part (tgt))
+    {
+      gcc_assert (tag & (OLF_SEQ | OLF_INDEPENDENT));
+      gcc_assert (!(tag & OLF_AUTO));
+    }
+
   if (tag & OLF_TILE)
     /* Tiling could use all 3 levels.  */ 
     levels = 3;
@@ -11639,11 +11688,14 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
     case GF_OMP_TARGET_KIND_OACC_UPDATE:
     case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
     case GF_OMP_TARGET_KIND_OACC_DECLARE:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
       data_region = false;
       break;
     case GF_OMP_TARGET_KIND_DATA:
     case GF_OMP_TARGET_KIND_OACC_DATA:
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       data_region = true;
       break;
     default:
@@ -11829,6 +11881,8 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 	  {
 	    /* No 'firstprivate' clauses on OpenACC 'kernels'.  */
 	    gcc_checking_assert (!is_oacc_kernels (ctx));
+	    /* Likewise, on OpenACC 'kernels' decomposed parts.  */
+	    gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
 
 	    goto oacc_firstprivate;
 	  }
@@ -11861,6 +11915,8 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 	  {
 	    /* No 'private' clauses on OpenACC 'kernels'.  */
 	    gcc_checking_assert (!is_oacc_kernels (ctx));
+	    /* Likewise, on OpenACC 'kernels' decomposed parts.  */
+	    gcc_checking_assert (!is_oacc_kernels_decomposed_part (ctx));
 
 	    break;
 	  }
diff --git a/gcc/omp-oacc-kernels-decompose.cc b/gcc/omp-oacc-kernels-decompose.cc
new file mode 100644
index 0000000..c585e5d
--- /dev/null
+++ b/gcc/omp-oacc-kernels-decompose.cc
@@ -0,0 +1,1531 @@
+/* Decompose OpenACC 'kernels' constructs into parts, a sequence of compute
+   constructs
+
+   Copyright (C) 2020 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "tree.h"
+#include "cp/cp-tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "cgraph.h"
+#include "fold-const.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "gomp-constants.h"
+#include "omp-general.h"
+#include "diagnostic-core.h"
+
+
+/* This preprocessing pass is run immediately before lower_omp.  It decomposes
+   OpenACC 'kernels' constructs into parts, a sequence of compute constructs.
+
+   The translation is as follows:
+     - The entire 'kernels' region is turned into a 'data' region with clauses
+       taken from the 'kernels' region.  New 'create' clauses are added for all
+       variables declared at the top level in the kernels region.
+     - Any loop nests annotated with an OpenACC 'loop' directive are wrapped in
+       a new compute construct.
+	 - 'loop' directives without an explicit 'independent' or 'seq' clause
+	   get an 'auto' clause added; other clauses are preserved on the loop
+	   or moved to the new surrounding compute construct, as applicable.
+     - Any sequences of other code (non-loops, non-OpenACC 'loop's) are wrapped
+       in new "gang-single" compute construct: 'worker'/'vector' parallelism is
+       preserved, but 'num_gangs (1)' is enforced.
+     - Both points above only apply at the topmost level in the region, that
+       is, the transformation does not introduce new compute constructs inside
+       nested statement bodies.  In particular, this means that a
+       gang-parallelizable loop inside an 'if' statement is made "gang-single".
+     - In order to make the host wait only once for the whole region instead
+       of once per device kernel launch, the new compute constructs are
+       annotated 'async'.  Unless the original 'kernels' construct already was
+       marked 'async', the entire region ends with a 'wait' directive.  If the
+       original 'kernels' construct was marked 'async', the synthesized 'async'
+       clauses use the original 'kernels' construct's 'async' argument
+       (possibly implicit).
+*/
+
+
+/*TODO Things are conceptually wrong here: 'loop' clauses may be hidden behind
+  'device_type', so we have to defer a lot of processing until we're in the
+  offloading compilation.  "Fortunately", GCC doesn't support the OpenACC
+  'device_type' clause yet, so we get away that.  */
+
+
+/* Helper function for decompose_kernels_region_body.  If STMT contains a
+   "top-level" OMP_FOR statement, returns a pointer to that statement;
+   returns NULL otherwise.
+
+   A "top-level" OMP_FOR statement is one that is possibly accompanied by
+   small snippets of setup code.  Specifically, this function accepts an
+   OMP_FOR possibly wrapped in a singleton bind and a singleton try
+   statement to allow for a local loop variable, but not an OMP_FOR
+   statement nested in any other constructs.  Alternatively, it accepts a
+   non-singleton bind containing only assignments and then an OMP_FOR
+   statement at the very end.  The former style can be generated by the C
+   frontend, the latter by the Fortran frontend.  */
+
+static gimple *
+top_level_omp_for_in_stmt (gimple *stmt)
+{
+  if (gimple_code (stmt) == GIMPLE_OMP_FOR)
+    return stmt;
+
+  if (gimple_code (stmt) == GIMPLE_BIND)
+    {
+      gimple_seq body = gimple_bind_body (as_a <gbind *> (stmt));
+      if (gimple_seq_singleton_p (body))
+	{
+	  /* Accept an OMP_FOR statement, or a try statement containing only
+	     a single OMP_FOR.  */
+	  gimple *maybe_for_or_try = gimple_seq_first_stmt (body);
+	  if (gimple_code (maybe_for_or_try) == GIMPLE_OMP_FOR)
+	    return maybe_for_or_try;
+	  else if (gimple_code (maybe_for_or_try) == GIMPLE_TRY)
+	    {
+	      gimple_seq try_body = gimple_try_eval (maybe_for_or_try);
+	      if (!gimple_seq_singleton_p (try_body))
+		return NULL;
+	      gimple *maybe_omp_for_stmt = gimple_seq_first_stmt (try_body);
+	      if (gimple_code (maybe_omp_for_stmt) == GIMPLE_OMP_FOR)
+		return maybe_omp_for_stmt;
+	    }
+	}
+      else
+	{
+	  gimple_stmt_iterator gsi;
+	  /* Accept only a block of optional assignments followed by an
+	     OMP_FOR at the end.  No other kinds of statements allowed.  */
+	  for (gsi = gsi_start (body); !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      gimple *body_stmt = gsi_stmt (gsi);
+	      if (gimple_code (body_stmt) == GIMPLE_ASSIGN)
+		continue;
+	      else if (gimple_code (body_stmt) == GIMPLE_OMP_FOR
+		       && gsi_one_before_end_p (gsi))
+		return body_stmt;
+	      else
+		return NULL;
+	    }
+	}
+    }
+
+  return NULL;
+}
+
+/* Helper for adjust_region_code: evaluate the statement at GSI_P.  */
+
+static tree
+adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
+				 bool *handled_ops_p,
+				 struct walk_stmt_info *wi)
+{
+  int *region_code = (int *) wi->info;
+
+  gimple *stmt = gsi_stmt (*gsi_p);
+  switch (gimple_code (stmt))
+    {
+    case GIMPLE_OMP_FOR:
+      {
+	tree clauses = gimple_omp_for_clauses (stmt);
+	if (omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT))
+	  {
+	    /* Explicit 'independent' clause.  */
+	    /* Keep going; recurse into loop body.  */
+	    break;
+	  }
+	else if (omp_find_clause (clauses, OMP_CLAUSE_SEQ))
+	  {
+	    /* Explicit 'seq' clause.  */
+	    /* We'll "parallelize" if at some level a loop construct has been
+	       marked up by the user as unparallelizable ('seq' clause; we'll
+	       respect that in the later processing).  Given that the user has
+	       explicitly marked it up, this loop construct cannot be
+	       performance-critical, and in this case it's also fine to
+	       "parallelize" instead of "gang-single", because any outer or
+	       inner loops may still exploit the available parallelism.  */
+	    /* Keep going; recurse into loop body.  */
+	    break;
+	  }
+	else
+	  {
+	    /* Explicit or implicit 'auto' clause.  */
+	    /* The user would like this loop analyzed ('auto' clause) and
+	       typically parallelized, but we don't have available yet the
+	       compiler logic to analyze this, so can't parallelize it here, so
+	       we'd very likely be running into a performance problem if we
+	       were to execute this unparallelized, thus forward the whole loop
+	       nest to 'parloops'.  */
+	    *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+	    /* Terminate: final decision for this region.  */
+	    *handled_ops_p = true;
+	    return integer_zero_node;
+	  }
+	gcc_unreachable ();
+      }
+
+    case GIMPLE_COND:
+    case GIMPLE_GOTO:
+    case GIMPLE_SWITCH:
+    case GIMPLE_ASM:
+    case GIMPLE_TRANSACTION:
+    case GIMPLE_RETURN:
+      /* Statement that might constitute some looping/control flow pattern.  */
+      /* The user would like this code analyzed (implicit inside a 'kernels'
+	 region) and typically parallelized, but we don't have available yet
+	 the compiler logic to analyze this, so can't parallelize it here, so
+	 we'd very likely be running into a performance problem if we were to
+	 execute this unparallelized, thus forward the whole thing to
+	 'parloops'.  */
+      *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+      /* Terminate: final decision for this region.  */
+      *handled_ops_p = true;
+      return integer_zero_node;
+
+    default:
+      /* Keep going.  */
+      break;
+    }
+
+  return NULL;
+}
+
+/* Adjust the REGION_CODE for the region in GS.  */
+
+static void
+adjust_region_code (gimple_seq gs, int *region_code)
+{
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  wi.info = region_code;
+  walk_gimple_seq (gs, adjust_region_code_walk_stmt_fn, NULL, &wi);
+}
+
+/* Helper function for make_loops_gang_single for walking the tree.  If the
+   statement indicated by GSI_P is an OpenACC for loop with a gang clause,
+   issue a warning and remove the clause.  */
+
+static tree
+visit_loops_in_gang_single_region (gimple_stmt_iterator *gsi_p,
+				   bool *handled_ops_p,
+				   struct walk_stmt_info *)
+{
+  *handled_ops_p = false;
+
+  gimple *stmt = gsi_stmt (*gsi_p);
+  switch (gimple_code (stmt))
+    {
+    case GIMPLE_OMP_FOR:
+      /*TODO Given the current 'adjust_region_code' algorithm, this is
+	actually...  */
+      gcc_unreachable ();
+
+      {
+	tree clauses = gimple_omp_for_clauses (stmt);
+	tree prev_clause = NULL;
+	for (tree clause = clauses; clause; clause = OMP_CLAUSE_CHAIN (clause))
+	  {
+	    if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_GANG)
+	      {
+		/* It makes no sense to have a 'gang' clause in a "gang-single"
+		   region, so warn and remove it.  */
+		warning_at (gimple_location (stmt), 0,
+			    "conditionally executed loop in %<kernels%> region"
+			    " will be executed by a single gang;"
+			    " ignoring %<gang%> clause");
+		if (prev_clause != NULL)
+		  OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (clause);
+		else
+		  clauses = OMP_CLAUSE_CHAIN (clause);
+
+		break;
+	      }
+	    prev_clause = clause;
+	  }
+	gimple_omp_for_set_clauses (stmt, clauses);
+      }
+      /* No need to recurse into nested statements; no loop nested inside
+	 this loop can be gang-partitioned.  */
+      sorry ("%<gang%> loop in %<gang-single%> region");
+      *handled_ops_p = true;
+      break;
+
+    default:
+      break;
+    }
+
+  return NULL;
+}
+
+/* Visit all nested OpenACC loops in the sequence indicated by GS.  This
+   statement is expected to be inside a gang-single region.  Issue a warning
+   for any loops inside it that have gang clauses and remove the clauses.  */
+
+static void
+make_loops_gang_single (gimple_seq gs)
+{
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  walk_gimple_seq (gs, visit_loops_in_gang_single_region, NULL, &wi);
+}
+
+/* Construct a "gang-single" compute construct at LOC containing the STMTS.
+   Annotate with CLAUSES, which must not contain a 'num_gangs' clause, and an
+   additional 'num_gangs (1)' clause to force "gang-single" execution.  */
+
+static gimple *
+make_region_seq (location_t loc, gimple_seq stmts,
+		 tree num_gangs_clause,
+		 tree num_workers_clause,
+		 tree vector_length_clause,
+		 tree clauses)
+{
+  /* This correctly unshares the entire clause chain rooted here.  */
+  clauses = unshare_expr (clauses);
+
+  dump_user_location_t loc_stmts_first = gimple_seq_first (stmts);
+
+  /* Figure out the region code for this region.  */
+  /* Optimistic default: assume "setup code", no looping; thus not
+     performance-critical.  */
+  int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE;
+  adjust_region_code (stmts, &region_code);
+
+  if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+    {
+      if (dump_enabled_p ())
+	/*TODO MSG_MISSED_OPTIMIZATION? */
+	dump_printf_loc (MSG_NOTE, loc_stmts_first,
+			 "beginning %<gang-single%> part"
+			 " in OpenACC %<kernels%> region\n");
+
+      /* Synthesize a 'num_gangs (1)' clause.  */
+      tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
+      OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node;
+      OMP_CLAUSE_CHAIN (gang_single_clause) = clauses;
+      clauses = gang_single_clause;
+
+      /* Remove and issue warnings about gang clauses on any OpenACC
+	 loops nested inside this sequentially executed statement.  */
+      make_loops_gang_single (stmts);
+    }
+  else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, loc_stmts_first,
+			 "beginning %<parloops%> part"
+			 " in OpenACC %<kernels%> region\n");
+
+      /* As we're transforming a 'GF_OMP_TARGET_KIND_OACC_KERNELS' into another
+	 'GF_OMP_TARGET_KIND_OACC_KERNELS', this isn't doing any of the clauses
+	 mangling that 'make_region_loop_nest' is doing.  */
+      /* Re-assemble the clauses stripped off earlier.  */
+      if (num_gangs_clause != NULL)
+	{
+	  tree c = unshare_expr (num_gangs_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+      if (num_workers_clause != NULL)
+	{
+	  tree c = unshare_expr (num_workers_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+      if (vector_length_clause != NULL)
+	{
+	  tree c = unshare_expr (vector_length_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+    }
+  else
+    gcc_unreachable ();
+
+  /* Build the gang-single region.  */
+  gimple *single_region = gimple_build_omp_target (NULL, region_code, clauses);
+  gimple_set_location (single_region, loc);
+  gbind *single_body = gimple_build_bind (NULL, stmts, make_node (BLOCK));
+  gimple_omp_set_body (single_region, single_body);
+
+  return single_region;
+}
+
+/* Helper function for make_region_loop_nest.  Adds a 'num_gangs'
+   ('num_workers', 'vector_length') clause to the given CLAUSES, either the one
+   from the parent compute construct (PARENT_CLAUSE) or a new one based on the
+   loop's own LOOP_CLAUSE ('gang (num: N)' or similar for 'worker' or 'vector'
+   clauses) with the given CLAUSE_CODE.  Does nothing if neither PARENT_CLAUSE
+   nor LOOP_CLAUSE exist.  Returns the new clauses.  */
+
+static tree
+add_parent_or_loop_num_clause (tree parent_clause, tree loop_clause,
+			       omp_clause_code clause_code, tree clauses)
+{
+  if (parent_clause != NULL)
+    {
+      tree num_clause = unshare_expr (parent_clause);
+      OMP_CLAUSE_CHAIN (num_clause) = clauses;
+      clauses = num_clause;
+    }
+  else if (loop_clause != NULL)
+    {
+      /* The kernels region does not have a 'num_gangs' clause, but the loop
+	 itself had a 'gang (num: N)' clause.  Honor it by adding a
+	 'num_gangs (N)' clause on the compute construct.  */
+      tree num = OMP_CLAUSE_OPERAND (loop_clause, 0);
+      tree new_num_clause
+	= build_omp_clause (OMP_CLAUSE_LOCATION (loop_clause), clause_code);
+      OMP_CLAUSE_OPERAND (new_num_clause, 0) = num;
+      OMP_CLAUSE_CHAIN (new_num_clause) = clauses;
+      clauses = new_num_clause;
+    }
+  return clauses;
+}
+
+/* Helper for make_region_loop_nest, looking for 'worker (num: N)' or 'vector
+   (length: N)' clauses in nested loops.  Removes the argument, transferring it
+   to the enclosing compute construct (via WI->INFO).  If arguments within the
+   same loop nest conflict, emits a warning.
+
+   This function also decides whether to add an 'auto' clause on each of these
+   nested loops.  */
+
+struct adjust_nested_loop_clauses_wi_info
+{
+  tree *loop_gang_clause_ptr;
+  tree *loop_worker_clause_ptr;
+  tree *loop_vector_clause_ptr;
+};
+
+static tree
+adjust_nested_loop_clauses (gimple_stmt_iterator *gsi_p, bool *,
+			    struct walk_stmt_info *wi)
+{
+  struct adjust_nested_loop_clauses_wi_info *wi_info
+    = (struct adjust_nested_loop_clauses_wi_info *) wi->info;
+  gimple *stmt = gsi_stmt (*gsi_p);
+
+  if (gimple_code (stmt) == GIMPLE_OMP_FOR)
+    {
+      bool add_auto_clause = true;
+      tree loop_clauses = gimple_omp_for_clauses (stmt);
+      tree loop_clause = loop_clauses;
+      for (; loop_clause; loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
+	{
+	  tree *outer_clause_ptr = NULL;
+	  switch (OMP_CLAUSE_CODE (loop_clause))
+	    {
+	    case OMP_CLAUSE_GANG:
+	      outer_clause_ptr = wi_info->loop_gang_clause_ptr;
+	      break;
+	    case OMP_CLAUSE_WORKER:
+	      outer_clause_ptr = wi_info->loop_worker_clause_ptr;
+	      break;
+	    case OMP_CLAUSE_VECTOR:
+	      outer_clause_ptr = wi_info->loop_vector_clause_ptr;
+	      break;
+	    case OMP_CLAUSE_SEQ:
+	    case OMP_CLAUSE_INDEPENDENT:
+	    case OMP_CLAUSE_AUTO:
+	      add_auto_clause = false;
+	    default:
+	      break;
+	    }
+	  if (outer_clause_ptr != NULL)
+	    {
+	      if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL
+		  && *outer_clause_ptr == NULL)
+		{
+		  /* Transfer the clause to the enclosing compute construct and
+		     remove the numerical argument from the 'loop'.  */
+		  *outer_clause_ptr = unshare_expr (loop_clause);
+		  OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
+		}
+	      else if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL &&
+		       OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0) != NULL)
+		{
+		  /* See if both of these are the same constant.  If they
+		     aren't, emit a warning.  */
+		  tree old_op = OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0);
+		  tree new_op = OMP_CLAUSE_OPERAND (loop_clause, 0);
+		  if (!(cst_and_fits_in_hwi (old_op) &&
+			cst_and_fits_in_hwi (new_op) &&
+			int_cst_value (old_op) == int_cst_value (new_op)))
+		    {
+		      const char *clause_name
+			= omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)];
+		      error_at (gimple_location (stmt),
+				"cannot honor conflicting %qs clause",
+				clause_name);
+		      inform (OMP_CLAUSE_LOCATION (*outer_clause_ptr),
+			      "location of the previous clause"
+			      " in the same loop nest");
+		    }
+		  OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
+		}
+	    }
+	}
+      if (add_auto_clause)
+	{
+	  tree auto_clause
+	    = build_omp_clause (gimple_location (stmt), OMP_CLAUSE_AUTO);
+	  OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
+	  gimple_omp_for_set_clauses (stmt, auto_clause);
+	}
+    }
+
+  return NULL;
+}
+
+/* Helper for make_region_loop_nest.  Transform OpenACC 'kernels'/'loop'
+   construct clauses into OpenACC 'parallel'/'loop' construct ones.  */
+
+static tree
+transform_kernels_loop_clauses (gimple *omp_for,
+				tree num_gangs_clause,
+				tree num_workers_clause,
+				tree vector_length_clause,
+				tree clauses)
+{
+  /* If this loop in a kernels region does not have an explicit 'seq',
+     'independent', or 'auto' clause, we must give it an explicit 'auto'
+     clause.
+     We also check for 'gang (num: N)' clauses.  These must not appear in
+     kernels regions that have their own 'num_gangs' clause.  Otherwise, they
+     must be converted and put on the region; similarly for 'worker' and
+     'vector' clauses.  */
+  bool add_auto_clause = true;
+  tree loop_gang_clause = NULL, loop_worker_clause = NULL,
+       loop_vector_clause = NULL;
+  tree loop_clauses = gimple_omp_for_clauses (omp_for);
+  for (tree loop_clause = loop_clauses;
+       loop_clause;
+       loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
+    {
+      bool found_num_clause = false;
+      tree *clause_ptr, clause_to_check;
+      switch (OMP_CLAUSE_CODE (loop_clause))
+	{
+	case OMP_CLAUSE_GANG:
+	  found_num_clause = true;
+	  clause_ptr = &loop_gang_clause;
+	  clause_to_check = num_gangs_clause;
+	  break;
+	case OMP_CLAUSE_WORKER:
+	  found_num_clause = true;
+	  clause_ptr = &loop_worker_clause;
+	  clause_to_check = num_workers_clause;
+	  break;
+	case OMP_CLAUSE_VECTOR:
+	  found_num_clause = true;
+	  clause_ptr = &loop_vector_clause;
+	  clause_to_check = vector_length_clause;
+	  break;
+	case OMP_CLAUSE_INDEPENDENT:
+	case OMP_CLAUSE_SEQ:
+	case OMP_CLAUSE_AUTO:
+	  add_auto_clause = false;
+	default:
+	  break;
+	}
+      if (found_num_clause && OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL)
+	{
+	  if (clause_to_check)
+	    {
+	      const char *clause_name
+		= omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)];
+	      const char *parent_clause_name
+		= omp_clause_code_name[OMP_CLAUSE_CODE (clause_to_check)];
+	      error_at (OMP_CLAUSE_LOCATION (loop_clause),
+			"argument not permitted on %qs clause"
+			" in OpenACC %<kernels%> region with a %qs clause",
+			clause_name, parent_clause_name);
+	      inform (OMP_CLAUSE_LOCATION (clause_to_check),
+		      "location of OpenACC %<kernels%>");
+	    }
+	  /* Copy the 'gang (N)'/'worker (N)'/'vector (N)' clause to the
+	     enclosing compute construct.  */
+	  *clause_ptr = unshare_expr (loop_clause);
+	  OMP_CLAUSE_CHAIN (*clause_ptr) = NULL;
+	  /* Leave a 'gang'/'worker'/'vector' clause on the 'loop', but without
+	     argument.  */
+	  OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
+	}
+    }
+  if (add_auto_clause)
+    {
+      tree auto_clause = build_omp_clause (gimple_location (omp_for),
+					   OMP_CLAUSE_AUTO);
+      OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
+      loop_clauses = auto_clause;
+    }
+  gimple_omp_for_set_clauses (omp_for, loop_clauses);
+  /* We must also recurse into the loop; it might contain nested loops having
+     their own 'worker (num: W)' or 'vector (length: V)' clauses.  Turn these
+     into 'worker'/'vector' clauses on the compute construct.  */
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  struct adjust_nested_loop_clauses_wi_info wi_info;
+  wi_info.loop_gang_clause_ptr = &loop_gang_clause;
+  wi_info.loop_worker_clause_ptr = &loop_worker_clause;
+  wi_info.loop_vector_clause_ptr = &loop_vector_clause;
+  wi.info = &wi_info;
+  gimple *body = gimple_omp_body (omp_for);
+  walk_gimple_seq (body, adjust_nested_loop_clauses, NULL, &wi);
+  /* Check if there were conflicting numbers of workers or vector length.  */
+  if (loop_gang_clause != NULL &&
+      OMP_CLAUSE_OPERAND (loop_gang_clause, 0) == NULL)
+    loop_gang_clause = NULL;
+  if (loop_worker_clause != NULL &&
+      OMP_CLAUSE_OPERAND (loop_worker_clause, 0) == NULL)
+    loop_worker_clause = NULL;
+  if (loop_vector_clause != NULL &&
+      OMP_CLAUSE_OPERAND (loop_vector_clause, 0) == NULL)
+    vector_length_clause = NULL;
+
+  /* If the kernels region had 'num_gangs', 'num_worker', 'vector_length'
+     clauses, add these to this new compute construct.  */
+  clauses
+    = add_parent_or_loop_num_clause (num_gangs_clause, loop_gang_clause,
+				     OMP_CLAUSE_NUM_GANGS, clauses);
+  clauses
+    = add_parent_or_loop_num_clause (num_workers_clause, loop_worker_clause,
+				     OMP_CLAUSE_NUM_WORKERS, clauses);
+  clauses
+    = add_parent_or_loop_num_clause (vector_length_clause, loop_vector_clause,
+				     OMP_CLAUSE_VECTOR_LENGTH, clauses);
+
+  return clauses;
+}
+
+/* Construct a possibly gang-parallel compute construct containing the STMT,
+   which must be identical to, or a bind containing, the loop OMP_FOR.
+
+   The NUM_GANGS_CLAUSE, NUM_WORKERS_CLAUSE, and VECTOR_LENGTH_CLAUSE are
+   optional clauses from the original kernels region and must not be contained
+   in the other CLAUSES. The newly created compute construct is annotated with
+   the optional NUM_GANGS_CLAUSE as well as the other CLAUSES.  If there is no
+   NUM_GANGS_CLAUSE but the loop has a 'gang (num: N)' clause, that is
+   converted to a 'num_gangs (N)' clause on the new compute construct, and
+   similarly for 'worker' and 'vector' clauses.
+
+   The outermost loop gets an 'auto' clause unless there already is an
+   'seq'/'independent'/'auto' clause.  Nested loops inside OMP_FOR are treated
+   similarly by the adjust_nested_loop_clauses function.  */
+
+static gimple *
+make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
+		       tree num_gangs_clause,
+		       tree num_workers_clause,
+		       tree vector_length_clause,
+		       tree clauses)
+{
+  /* This correctly unshares the entire clause chain rooted here.  */
+  clauses = unshare_expr (clauses);
+
+  /* Figure out the region code for this region.  */
+  /* Optimistic default: assume that the loop nest is parallelizable
+     (essentially, no GIMPLE_OMP_FOR with (explicit or implicit) 'auto' clause,
+     and no un-annotated loops).  */
+  int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED;
+  adjust_region_code (stmts, &region_code);
+
+  if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+    {
+      if (dump_enabled_p ())
+	/* This is not MSG_OPTIMIZED_LOCATIONS, as we're just doing what the
+	   user asked us to.  */
+	dump_printf_loc (MSG_NOTE, omp_for,
+			 "parallelized loop nest"
+			 " in OpenACC %<kernels%> region\n");
+
+      clauses = transform_kernels_loop_clauses (omp_for,
+						num_gangs_clause,
+						num_workers_clause,
+						vector_length_clause,
+						clauses);
+    }
+  else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, omp_for,
+			 "forwarded loop nest"
+			 " in OpenACC %<kernels%> region"
+			 " to %<parloops%> for analysis\n");
+
+      /* We're transforming one 'GF_OMP_TARGET_KIND_OACC_KERNELS' into another
+	 'GF_OMP_TARGET_KIND_OACC_KERNELS', so don't have to
+	 'transform_kernels_loop_clauses'.  */
+      /* Re-assemble the clauses stripped off earlier.  */
+      clauses
+	= add_parent_or_loop_num_clause (num_gangs_clause, NULL,
+					 OMP_CLAUSE_NUM_GANGS, clauses);
+      clauses
+	= add_parent_or_loop_num_clause (num_workers_clause, NULL,
+					 OMP_CLAUSE_NUM_WORKERS, clauses);
+      clauses
+	= add_parent_or_loop_num_clause (vector_length_clause, NULL,
+					 OMP_CLAUSE_VECTOR_LENGTH, clauses);
+    }
+  else
+    gcc_unreachable ();
+
+  gimple *parallel_body_bind
+    = gimple_build_bind (NULL, stmts, make_node (BLOCK));
+  gimple *parallel_region
+    = gimple_build_omp_target (parallel_body_bind, region_code, clauses);
+  gimple_set_location (parallel_region, gimple_location (omp_for));
+
+  return parallel_region;
+}
+
+/* Eliminate any binds directly inside BIND by adding their statements to
+   BIND (i.e., modifying it in place), excluding binds that hold only an
+   OMP_FOR loop and associated setup/cleanup code.  Recurse into binds but
+   not other statements.  Return a chain of the local variables of eliminated
+   binds, i.e., the local variables found in nested binds.  If
+   INCLUDE_TOPLEVEL_VARS is true, this also includes the variables belonging
+   to BIND itself. */
+
+static tree
+flatten_binds (gbind *bind, bool include_toplevel_vars = false)
+{
+  tree vars = NULL, last_var = NULL;
+
+  if (include_toplevel_vars)
+    {
+      vars = gimple_bind_vars (bind);
+      last_var = vars;
+    }
+
+  gimple_seq new_body = NULL;
+  gimple_seq body_sequence = gimple_bind_body (bind);
+  gimple_stmt_iterator gsi, gsi_n;
+  for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n)
+    {
+      /* Advance the iterator here because otherwise it would be invalidated
+	 by moving statements below.  */
+      gsi_n = gsi;
+      gsi_next (&gsi_n);
+
+      gimple *stmt = gsi_stmt (gsi);
+      /* Flatten bind statements, except the ones that contain only an
+	 OpenACC for loop.  */
+      if (gimple_code (stmt) == GIMPLE_BIND
+	  && !top_level_omp_for_in_stmt (stmt))
+	{
+	  gbind *inner_bind = as_a <gbind *> (stmt);
+	  /* Flatten recursively, and collect all variables.  */
+	  tree inner_vars = flatten_binds (inner_bind, true);
+	  gimple_seq inner_sequence = gimple_bind_body (inner_bind);
+	  gcc_assert (gimple_code (inner_sequence) != GIMPLE_BIND
+		      || top_level_omp_for_in_stmt (inner_sequence));
+	  gimple_seq_add_seq (&new_body, inner_sequence);
+	  /* Find the last variable; we will append others to it.  */
+	  while (last_var != NULL && TREE_CHAIN (last_var) != NULL)
+	    last_var = TREE_CHAIN (last_var);
+	  if (last_var != NULL)
+	    {
+	      TREE_CHAIN (last_var) = inner_vars;
+	      last_var = inner_vars;
+	    }
+	  else
+	    {
+	      vars = inner_vars;
+	      last_var = vars;
+	    }
+	}
+      else
+	gimple_seq_add_stmt (&new_body, stmt);
+    }
+
+  /* Put the possibly transformed body back into the bind.  */
+  gimple_bind_set_body (bind, new_body);
+  return vars;
+}
+
+/* Helper function for places where we construct data regions.  Wraps the BODY
+   inside a try-finally construct at LOC that calls __builtin_GOACC_data_end
+   in its cleanup block.  Returns this try statement.  */
+
+static gimple *
+make_data_region_try_statement (location_t loc, gimple *body)
+{
+  tree data_end_fn = builtin_decl_explicit (BUILT_IN_GOACC_DATA_END);
+  gimple *call = gimple_build_call (data_end_fn, 0);
+  gimple_seq cleanup = NULL;
+  gimple_seq_add_stmt (&cleanup, call);
+  gimple *try_stmt = gimple_build_try (body, cleanup, GIMPLE_TRY_FINALLY);
+  gimple_set_location (body, loc);
+  return try_stmt;
+}
+
+/* If INNER_BIND_VARS holds variables, build an OpenACC data region with
+   location LOC containing BODY and having 'create (var)' clauses for each
+   variable.  If INNER_CLEANUP is present, add a try-finally statement with
+   this cleanup code in the finally block.  Return the new data region, or
+   the original BODY if no data region was needed.  */
+
+static gimple *
+maybe_build_inner_data_region (location_t loc, gimple *body,
+			       tree inner_bind_vars, gimple *inner_cleanup)
+{
+  /* Build data 'create (var)' clauses for these local variables.
+     Below we will add these to a data region enclosing the entire body
+     of the decomposed kernels region.  */
+  tree prev_mapped_var = NULL, next = NULL, artificial_vars = NULL,
+       inner_data_clauses = NULL;
+  for (tree v = inner_bind_vars; v; v = next)
+    {
+      next = TREE_CHAIN (v);
+      if (DECL_ARTIFICIAL (v)
+	  || TREE_CODE (v) == CONST_DECL
+	  || (DECL_LANG_SPECIFIC (current_function_decl)
+	      && DECL_TEMPLATE_INSTANTIATION (current_function_decl)))
+	{
+	  /* If this is an artificial temporary, it need not be mapped.  We
+	     move its declaration into the bind inside the data region.
+	     Also avoid mapping variables if we are inside a template
+	     instantiation; the code does not contain all the copies to
+	     temporaries that would make this legal.  */
+	  TREE_CHAIN (v) = artificial_vars;
+	  artificial_vars = v;
+	  if (prev_mapped_var != NULL)
+	    TREE_CHAIN (prev_mapped_var) = next;
+	  else
+	    inner_bind_vars = next;
+	}
+      else
+	{
+	  /* Otherwise, build the map clause.  */
+	  tree new_clause = build_omp_clause (loc, OMP_CLAUSE_MAP);
+	  OMP_CLAUSE_SET_MAP_KIND (new_clause, GOMP_MAP_ALLOC);
+	  OMP_CLAUSE_DECL (new_clause) = v;
+	  OMP_CLAUSE_SIZE (new_clause) = DECL_SIZE_UNIT (v);
+	  OMP_CLAUSE_CHAIN (new_clause) = inner_data_clauses;
+	  inner_data_clauses = new_clause;
+
+	  prev_mapped_var = v;
+	}
+    }
+
+  if (artificial_vars)
+    body = gimple_build_bind (artificial_vars, body, make_node (BLOCK));
+
+  /* If we determined above that there are variables that need to be created
+     on the device, construct a data region for them and wrap the body
+     inside that.  */
+  if (inner_data_clauses != NULL)
+    {
+      gcc_assert (inner_bind_vars != NULL);
+      gimple *inner_data_region
+	= gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS,
+				   inner_data_clauses);
+      gimple_set_location (inner_data_region, loc);
+      /* Make sure __builtin_GOACC_data_end is called at the end.  */
+      gimple *try_stmt = make_data_region_try_statement (loc, body);
+      gimple_omp_set_body (inner_data_region, try_stmt);
+      gimple *bind_body;
+      if (inner_cleanup != NULL)
+	/* Clobber all the inner variables that need to be clobbered.  */
+	bind_body = gimple_build_try (inner_data_region, inner_cleanup,
+				      GIMPLE_TRY_FINALLY);
+      else
+	bind_body = inner_data_region;
+      body = gimple_build_bind (inner_bind_vars, bind_body, make_node (BLOCK));
+    }
+
+  return body;
+}
+
+/* Helper function of decompose_kernels_region_body.  The statements in
+   REGION_BODY are expected to be decomposed parts; add an 'async' clause to
+   each.  Also add a 'wait' directive at the end of the sequence.  */
+
+static void
+add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
+{
+  tree default_async_queue
+    = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+  for (gimple_stmt_iterator gsi = gsi_start (*region_body);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      tree target_clauses = gimple_omp_target_clauses (stmt);
+      tree new_async_clause = build_omp_clause (loc, OMP_CLAUSE_ASYNC);
+      OMP_CLAUSE_OPERAND (new_async_clause, 0) = default_async_queue;
+      OMP_CLAUSE_CHAIN (new_async_clause) = target_clauses;
+      target_clauses = new_async_clause;
+      gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
+				     target_clauses);
+    }
+  /* A '#pragma acc wait' is just a call 'GOACC_wait (acc_async_sync, 0)'.  */
+  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+  gimple *wait_call = gimple_build_call (wait_fn, 2,
+					 sync_arg, integer_zero_node);
+  gimple_set_location (wait_call, loc);
+  gimple_seq_add_stmt (region_body, wait_call);
+}
+
+/* Auxiliary analysis of the body of a kernels region, to determine for each
+   OpenACC loop whether it is control-dependent (i.e., not necessarily
+   executed every time the kernels region is entered) or not.
+   We say that a loop is control-dependent if there is some cond, switch, or
+   goto statement that jumps over it, forwards or backwards.  For example,
+   if the loop is controlled by an if statement, then a jump to the true
+   block, the false block, or from one of those blocks to the control flow
+   join point will necessarily jump over the loop.
+   This analysis implements an ad-hoc union-find data structure classifying
+   statements into "control-flow regions" as follows: Most statements are in
+   the same region as their predecessor, except that each OpenACC loop is in
+   a region of its own, and each OpenACC loop's successor starts a new
+   region.  We then unite the regions of any statements linked by jumps,
+   placing any cond, switch, or goto statement in the same region as its
+   target label(s).
+   In the end, control dependence of OpenACC loops can be determined by
+   comparing their immediate predecessor and successor statements' regions.
+   A jump crosses the loop if and only if the predecessor and successor are
+   in the same region.  (If there is no predecessor or successor, the loop
+   is executed unconditionally.)
+   The methods in this class identify statements by their index in the
+   kernels region's body.  */
+
+class control_flow_regions
+{
+  public:
+    /* Initialize an instance and pre-compute the control-flow region
+       information for the statement sequence SEQ.  */
+    control_flow_regions (gimple_seq seq);
+
+    /* Return true if the statement with the given index IDX in the analyzed
+       statement sequence is an unconditionally executed OpenACC loop.  */
+    bool is_unconditional_oacc_for_loop (size_t idx);
+
+  private:
+    /* Find the region representative for the statement identified by index
+       STMT_IDX.  */
+    size_t find_rep (size_t stmt_idx);
+
+    /* Union the regions containing the statements represented by
+       representatives A and B.  */
+    void union_reps (size_t a, size_t b);
+
+    /* Helper for the constructor.  Performs the actual computation of the
+       control-flow regions in the statement sequence SEQ.  */
+    void compute_regions (gimple_seq seq);
+
+    /* The mapping from statement indices to region representatives.  */
+    vec <size_t> representatives;
+
+    /* A cache mapping statement indices to a flag indicating whether the
+       statement is a top level OpenACC for loop.  */
+    vec <bool> omp_for_loops;
+};
+
+control_flow_regions::control_flow_regions (gimple_seq seq)
+{
+  representatives.create (1);
+  omp_for_loops.create (1);
+  compute_regions (seq);
+}
+
+bool
+control_flow_regions::is_unconditional_oacc_for_loop (size_t idx)
+{
+  if (idx == 0 || idx == representatives.length () - 1)
+    /* The first or last statement in the kernels region.  This means that
+       there is no room before or after it for a jump or a label.  Thus
+       there cannot be a jump across it, so it is unconditional.  */
+    return true;
+  /* Otherwise, the loop is unconditional if the statements before and after
+     it are in different control flow regions.  Scan forward and backward,
+     skipping over neighboring OpenACC for loops, to find these preceding
+     statements.  */
+  size_t prev_index = idx - 1;
+  while (prev_index > 0 && omp_for_loops [prev_index] == true)
+    prev_index--;
+  /* If all preceding statements are also OpenACC loops, all of these are
+     unconditional.  */
+  if (prev_index == 0)
+    return true;
+  size_t succ_index = idx + 1;
+  while (succ_index < omp_for_loops.length ()
+	 && omp_for_loops [succ_index] == true)
+    succ_index++;
+  /* If all following statements are also OpenACC loops, all of these are
+     unconditional.  */
+  if (succ_index == omp_for_loops.length ())
+    return true;
+  return (find_rep (prev_index) != find_rep (succ_index));
+}
+
+size_t
+control_flow_regions::find_rep (size_t stmt_idx)
+{
+  size_t rep = stmt_idx, aux = stmt_idx;
+  /* Find the root representative of this statement.  */
+  while (representatives[rep] != rep)
+    rep = representatives[rep];
+  /* Compress the path from the original statement to the representative.  */
+  while (representatives[aux] != rep)
+    {
+      size_t tmp = representatives[aux];
+      representatives[aux] = rep;
+      aux = tmp;
+    }
+  return rep;
+}
+
+void
+control_flow_regions::union_reps (size_t a, size_t b)
+{
+  a = find_rep (a);
+  b = find_rep (b);
+  representatives[b] = a;
+}
+
+void
+control_flow_regions::compute_regions (gimple_seq seq)
+{
+  hash_map <gimple *, size_t> control_flow_reps;
+  hash_map <tree, size_t> label_reps;
+  size_t current_region = 0, idx = 0;
+
+  /* In a first pass, assign an initial region to each statement.  Except in
+     the case of OpenACC loops, each statement simply gets the same region
+     representative as its predecessor.  */
+  for (gimple_stmt_iterator gsi = gsi_start (seq);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      gimple *omp_for = top_level_omp_for_in_stmt (stmt);
+      omp_for_loops.safe_push (omp_for != NULL);
+      if (omp_for != NULL)
+	{
+	  /* Assign a new region to this loop and to its successor.  */
+	  current_region = idx;
+	  representatives.safe_push (current_region);
+	  current_region++;
+	}
+      else
+	{
+	  representatives.safe_push (current_region);
+	  /* Remember any jumps and labels for the second pass below.  */
+	  if (gimple_code (stmt) == GIMPLE_COND
+	      || gimple_code (stmt) == GIMPLE_SWITCH
+	      || gimple_code (stmt) == GIMPLE_GOTO)
+	    control_flow_reps.put (stmt, current_region);
+	  else if (gimple_code (stmt) == GIMPLE_LABEL)
+	    label_reps.put (gimple_label_label (as_a <glabel *> (stmt)),
+			    current_region);
+	}
+      idx++;
+    }
+  gcc_assert (representatives.length () == omp_for_loops.length ());
+
+  /* Revisit all the control flow statements and union the region of each
+     cond, switch, or goto statement with the target labels' regions.  */
+  for (hash_map <gimple *, size_t>::iterator it = control_flow_reps.begin ();
+       it != control_flow_reps.end ();
+       ++it)
+    {
+      gimple *stmt = (*it).first;
+      size_t stmt_rep = (*it).second;
+      switch (gimple_code (stmt))
+	{
+	  tree label;
+	  unsigned int n;
+
+	case GIMPLE_COND:
+	  label = gimple_cond_true_label (as_a <gcond *> (stmt));
+	  union_reps (stmt_rep, *label_reps.get (label));
+	  label = gimple_cond_false_label (as_a <gcond *> (stmt));
+	  union_reps (stmt_rep, *label_reps.get (label));
+	  break;
+
+	case GIMPLE_SWITCH:
+	  n = gimple_switch_num_labels (as_a <gswitch *> (stmt));
+	  for (unsigned int i = 0; i < n; i++)
+	    {
+	      tree switch_case
+		= gimple_switch_label (as_a <gswitch *> (stmt), i);
+	      label = CASE_LABEL (switch_case);
+	      union_reps (stmt_rep, *label_reps.get (label));
+	    }
+	  break;
+
+	case GIMPLE_GOTO:
+	  label = gimple_goto_dest (stmt);
+	  union_reps (stmt_rep, *label_reps.get (label));
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+}
+
+/* Decompose the body of the KERNELS_REGION, which was originally annotated
+   with the KERNELS_CLAUSES, into a series of compute constructs.  */
+
+static gimple *
+decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
+{
+  location_t loc = gimple_location (kernels_region);
+
+  /* The kernels clauses will be propagated to the child clauses unmodified,
+     except that the 'num_gangs', 'num_workers', and 'vector_length' clauses
+     will only be added to loop regions.  The other regions are "gang-single"
+     and get an explicit 'num_gangs (1)' clause.  So separate out the
+     'num_gangs', 'num_workers', and 'vector_length' clauses here.
+     Also check for the presence of an 'async' clause but do not remove it from
+     the 'kernels' clauses.  */
+  tree num_gangs_clause = NULL, num_workers_clause = NULL,
+       vector_length_clause = NULL;
+  tree async_clause = NULL;
+  tree prev_clause = NULL, next_clause = NULL;
+  tree parallel_clauses = kernels_clauses;
+  for (tree c = parallel_clauses; c; c = next_clause)
+    {
+      /* Preserve this here, as we might NULL it later.  */
+      next_clause = OMP_CLAUSE_CHAIN (c);
+
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS
+	  || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS
+	  || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH)
+	{
+	  /* Cut this clause out of the chain.  */
+	  if (prev_clause != NULL)
+	    OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (c);
+	  else
+	    kernels_clauses = OMP_CLAUSE_CHAIN (c);
+	  OMP_CLAUSE_CHAIN (c) = NULL;
+	  switch (OMP_CLAUSE_CODE (c))
+	    {
+	    case OMP_CLAUSE_NUM_GANGS:
+	      num_gangs_clause = c;
+	      break;
+	    case OMP_CLAUSE_NUM_WORKERS:
+	      num_workers_clause = c;
+	      break;
+	    case OMP_CLAUSE_VECTOR_LENGTH:
+	      vector_length_clause = c;
+	      break;
+	    default:
+	      gcc_unreachable ();
+	    }
+	}
+      else
+	prev_clause = c;
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+	async_clause = c;
+    }
+
+  gimple *kernels_body = gimple_omp_body (kernels_region);
+  gbind *kernels_bind = as_a <gbind *> (kernels_body);
+
+  /* The body of the region may contain other nested binds declaring inner
+     local variables.  Collapse all these binds into one to ensure that we
+     have a single sequence of statements to iterate over; also, collect all
+     inner variables.  */
+  tree inner_bind_vars = flatten_binds (kernels_bind);
+  gimple_seq body_sequence = gimple_bind_body (kernels_bind);
+
+  /* All these inner variables will get allocated on the device (below, by
+     calling maybe_build_inner_data_region).  Here we create 'present'
+     clauses for them and add these clauses to the list of clauses to be
+     attached to each inner compute construct.  */
+  tree present_clauses = kernels_clauses;
+  for (tree var = inner_bind_vars; var; var = TREE_CHAIN (var))
+    {
+      if (!DECL_ARTIFICIAL (var) && TREE_CODE (var) != CONST_DECL)
+	{
+	  tree present_clause = build_omp_clause (loc, OMP_CLAUSE_MAP);
+	  OMP_CLAUSE_SET_MAP_KIND (present_clause, GOMP_MAP_FORCE_PRESENT);
+	  OMP_CLAUSE_DECL (present_clause) = var;
+	  OMP_CLAUSE_SIZE (present_clause) = DECL_SIZE_UNIT (var);
+	  OMP_CLAUSE_CHAIN (present_clause) = present_clauses;
+	  present_clauses = present_clause;
+	}
+    }
+  kernels_clauses = present_clauses;
+
+  /* In addition to nested binds, the "real" body of the region may be
+     nested inside a try-finally block.  Find its cleanup block, which
+     contains code to clobber the local variables that must be clobbered.  */
+  gimple *inner_cleanup = NULL;
+  if (body_sequence != NULL && gimple_code (body_sequence) == GIMPLE_TRY)
+    {
+      if (gimple_seq_singleton_p (body_sequence))
+	{
+	  /* The try statement is the only thing inside the bind.  */
+	  inner_cleanup = gimple_try_cleanup (body_sequence);
+	  body_sequence = gimple_try_eval (body_sequence);
+	}
+      else
+	{
+	  /* The bind's body starts with a try statement, but it is followed
+	     by other things.  */
+	  gimple_stmt_iterator gsi = gsi_start (body_sequence);
+	  gimple *try_stmt = gsi_stmt (gsi);
+	  inner_cleanup = gimple_try_cleanup (try_stmt);
+	  gimple *try_body = gimple_try_eval (try_stmt);
+
+	  gsi_remove (&gsi, false);
+	  /* Now gsi indicates the sequence of statements after the try
+	     statement in the bind.  Append the statement in the try body and
+	     the trailing statements from gsi.  */
+	  gsi_insert_seq_before (&gsi, try_body, GSI_CONTINUE_LINKING);
+	  body_sequence = gsi_stmt (gsi);
+	}
+    }
+
+  /* This sequence will collect all the top-level statements in the body of
+     the data region we are about to construct.  */
+  gimple_seq region_body = NULL;
+  /* This sequence will collect consecutive statements to be put into a
+     gang-single region.  */
+  gimple_seq gang_single_seq = NULL;
+  /* Flag recording whether the gang_single_seq only contains copies to
+     local variables.  These may be loop setup code that should not be
+     separated from the loop.  */
+  bool only_simple_assignments = true;
+
+  /* Precompute the control flow region information to determine whether an
+     OpenACC loop is executed conditionally or unconditionally.  */
+  control_flow_regions cf_regions (body_sequence);
+
+  /* Iterate over the statements in the kernels region's body.  */
+  size_t idx = 0;
+  gimple_stmt_iterator gsi, gsi_n;
+  for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n, idx++)
+    {
+      /* Advance the iterator here because otherwise it would be invalidated
+	 by moving statements below.  */
+      gsi_n = gsi;
+      gsi_next (&gsi_n);
+
+      gimple *stmt = gsi_stmt (gsi);
+      gimple *omp_for = top_level_omp_for_in_stmt (stmt);
+      bool is_unconditional_oacc_for_loop = false;
+      if (omp_for != NULL)
+	is_unconditional_oacc_for_loop
+	  = cf_regions.is_unconditional_oacc_for_loop (idx);
+      if (omp_for != NULL
+	  && is_unconditional_oacc_for_loop)
+	{
+	  /* This is an OMP for statement, put it into a separate region.
+	     But first, construct a gang-single region containing any
+	     complex sequential statements we may have seen.  */
+	  if (gang_single_seq != NULL && !only_simple_assignments)
+	    {
+	      gimple *single_region
+		= make_region_seq (loc, gang_single_seq,
+				   num_gangs_clause,
+				   num_workers_clause,
+				   vector_length_clause,
+				   kernels_clauses);
+	      gimple_seq_add_stmt (&region_body, single_region);
+	    }
+	  else if (gang_single_seq != NULL && only_simple_assignments)
+	    {
+	      /* There is a sequence of sequential statements preceding this
+		 loop, but they are all simple assignments.  This is
+		 probably setup code for the loop; in particular, Fortran DO
+		 loops are preceded by code to copy the loop limit variable
+		 to a temporary.  Group this code together with the loop
+		 itself.  */
+	      gimple_seq_add_stmt (&gang_single_seq, stmt);
+	      stmt = gimple_build_bind (NULL, gang_single_seq,
+					make_node (BLOCK));
+	    }
+	  gang_single_seq = NULL;
+	  only_simple_assignments = true;
+
+	  gimple_seq parallel_seq = NULL;
+	  gimple_seq_add_stmt (&parallel_seq, stmt);
+	  gimple *parallel_region
+	    = make_region_loop_nest (omp_for, parallel_seq,
+				     num_gangs_clause,
+				     num_workers_clause,
+				     vector_length_clause,
+				     kernels_clauses);
+	  gimple_seq_add_stmt (&region_body, parallel_region);
+	}
+      else
+	{
+	  if (omp_for != NULL)
+	    {
+	      gcc_checking_assert (!is_unconditional_oacc_for_loop);
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, omp_for,
+				 "unparallelized loop nest"
+				 " in OpenACC %<kernels%> region:"
+				 " it's executed conditionally\n");
+	    }
+
+	  /* This is not an unconditional OMP for statement, so it will be
+	     put into a gang-single region.  */
+	  gimple_seq_add_stmt (&gang_single_seq, stmt);
+	  /* Is this a simple assignment? We call it simple if it is an
+	     assignment to an artificial local variable.  This captures
+	     Fortran loop setup code computing loop bounds and offsets.  */
+	  bool is_simple_assignment
+	    = (gimple_code (stmt) == GIMPLE_ASSIGN
+	       && TREE_CODE (gimple_assign_lhs (stmt)) == VAR_DECL
+	       && DECL_ARTIFICIAL (gimple_assign_lhs (stmt)));
+	  if (!is_simple_assignment)
+	    only_simple_assignments = false;
+	}
+    }
+
+  /* If we did not emit a new region, and are not going to emit one now
+     (that is, the original region was empty), prepare to emit a dummy so as
+     to preserve the original construct, which other processing (at least
+     test cases) depend on.  */
+  if (region_body == NULL && gang_single_seq == NULL)
+    {
+      gimple *stmt = gimple_build_nop ();
+      gimple_set_location (stmt, loc);
+      gimple_seq_add_stmt (&gang_single_seq, stmt);
+    }
+
+  /* Gather up any remaining gang-single statements.  */
+  if (gang_single_seq != NULL)
+    {
+      gimple *single_region
+	= make_region_seq (loc, gang_single_seq,
+			   num_gangs_clause,
+			   num_workers_clause,
+			   vector_length_clause,
+			   kernels_clauses);
+      gimple_seq_add_stmt (&region_body, single_region);
+    }
+
+  /* We want to launch these kernels asynchronously.  If the original
+     kernels region had an async clause, this is done automatically because
+     that async clause was copied to the individual regions we created.
+     Otherwise, add an async clause to each newly created region, as well as
+     a wait directive at the end.  */
+  if (async_clause == NULL)
+    add_async_clauses_and_wait (loc, &region_body);
+
+  tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
+  gimple *body = gimple_build_bind (kernels_locals, region_body,
+				    make_node (BLOCK));
+
+  /* If we found variables declared in nested scopes, build a data region to
+     map them to the device.  */
+  body = maybe_build_inner_data_region (loc, body, inner_bind_vars,
+					inner_cleanup);
+
+  return body;
+}
+
+/* Decompose one OpenACC 'kernels' construct into an OpenACC 'data' construct
+   containing the original OpenACC 'kernels' construct's region cut up into a
+   sequence of compute constructs.  */
+
+static gimple *
+omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
+{
+  gcc_checking_assert (gimple_omp_target_kind (kernels_stmt)
+		       == GF_OMP_TARGET_KIND_OACC_KERNELS);
+  location_t loc = gimple_location (kernels_stmt);
+
+  /* Collect the data clauses of the OpenACC 'kernels' directive and create a
+     new OpenACC 'data' construct with those clauses.  */
+  tree kernels_clauses = gimple_omp_target_clauses (kernels_stmt);
+  tree data_clauses = NULL;
+  for (tree c = kernels_clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    {
+      /* Certain clauses are copied to the enclosing OpenACC 'data'.  Other
+	 clauses remain on the OpenACC 'kernels'.  */
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP)
+	{
+	  tree decl = OMP_CLAUSE_DECL (c);
+	  HOST_WIDE_INT map_kind = OMP_CLAUSE_MAP_KIND (c);
+	  switch (map_kind)
+	    {
+	    default:
+	      if (map_kind == GOMP_MAP_ALLOC
+		  && integer_zerop (OMP_CLAUSE_SIZE (c)))
+		/* ??? This is an alloc clause for mapping a pointer whose
+		   target is already mapped.  We leave these on the inner
+		   compute constructs because moving them to the outer data
+		   region causes runtime errors.  */
+		break;
+
+	      /* For non-artificial variables, and for non-declaration
+		 expressions like A[0:n], copy the clause to the data
+		 region.  */
+	      if ((DECL_P (decl) && !DECL_ARTIFICIAL (decl))
+		  || !DECL_P (decl))
+		{
+		  tree new_clause = build_omp_clause (OMP_CLAUSE_LOCATION (c),
+						      OMP_CLAUSE_MAP);
+		  OMP_CLAUSE_SET_MAP_KIND (new_clause, map_kind);
+		  /* This must be unshared here to avoid "incorrect sharing
+		     of tree nodes" errors from verify_gimple.  */
+		  OMP_CLAUSE_DECL (new_clause) = unshare_expr (decl);
+		  OMP_CLAUSE_SIZE (new_clause) = OMP_CLAUSE_SIZE (c);
+		  OMP_CLAUSE_CHAIN (new_clause) = data_clauses;
+		  data_clauses = new_clause;
+
+		  /* Now that this data is mapped, turn the data clause on the
+		     inner OpenACC 'kernels' into a 'present' clause.  */
+		  OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_PRESENT);
+		}
+	      break;
+
+	    case GOMP_MAP_POINTER:
+	    case GOMP_MAP_TO_PSET:
+	    case GOMP_MAP_FORCE_TOFROM:
+	    case GOMP_MAP_FIRSTPRIVATE_POINTER:
+	    case GOMP_MAP_FIRSTPRIVATE_REFERENCE:
+	      /* ??? Copying these map kinds leads to internal compiler
+		 errors in later passes.  */
+	      break;
+	    }
+	}
+      else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_IF)
+	{
+	  /* If there is an 'if' clause, it must be duplicated to the
+	     enclosing data region.  Temporarily remove the if clause's
+	     chain to avoid copying it.  */
+	  tree saved_chain = OMP_CLAUSE_CHAIN (c);
+	  OMP_CLAUSE_CHAIN (c) = NULL;
+	  tree new_if_clause = unshare_expr (c);
+	  OMP_CLAUSE_CHAIN (c) = saved_chain;
+	  OMP_CLAUSE_CHAIN (new_if_clause) = data_clauses;
+	  data_clauses = new_if_clause;
+	}
+    }
+  /* Restore the original order of the clauses.  */
+  data_clauses = nreverse (data_clauses);
+
+  gimple *data_region
+    = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS,
+			       data_clauses);
+  gimple_set_location (data_region, loc);
+
+  /* Transform the body of the kernels region into a sequence of compute
+     constructs.  */
+  gimple *body = decompose_kernels_region_body (kernels_stmt,
+						kernels_clauses);
+
+  /* Put the transformed pieces together.  The entire body of the region is
+     wrapped in a try-finally statement that calls __builtin_GOACC_data_end
+     for cleanup.  */
+  gimple *try_stmt = make_data_region_try_statement (loc, body);
+  gimple_omp_set_body (data_region, try_stmt);
+
+  return data_region;
+}
+
+
+/* Decompose OpenACC 'kernels' constructs in the current function.  */
+
+static tree
+omp_oacc_kernels_decompose_callback_stmt (gimple_stmt_iterator *gsi_p,
+					  bool *handled_ops_p,
+					  struct walk_stmt_info *)
+{
+  gimple *stmt = gsi_stmt (*gsi_p);
+
+  if ((gimple_code (stmt) == GIMPLE_OMP_TARGET)
+      && gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
+    {
+      gimple *stmt_new = omp_oacc_kernels_decompose_1 (stmt);
+      gsi_replace (gsi_p, stmt_new, false);
+      *handled_ops_p = true;
+    }
+  else
+    *handled_ops_p = false;
+
+  return NULL;
+}
+
+static unsigned int
+omp_oacc_kernels_decompose (void)
+{
+  gimple_seq body = gimple_body (current_function_decl);
+
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  walk_gimple_seq_mod (&body, omp_oacc_kernels_decompose_callback_stmt, NULL,
+		       &wi);
+
+  gimple_set_body (current_function_decl, body);
+
+  return 0;
+}
+
+
+namespace {
+
+const pass_data pass_data_omp_oacc_kernels_decompose =
+{
+  GIMPLE_PASS, /* type */
+  "omp_oacc_kernels_decompose", /* name */
+  OPTGROUP_OMP, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  PROP_gimple_any, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_omp_oacc_kernels_decompose : public gimple_opt_pass
+{
+public:
+  pass_omp_oacc_kernels_decompose (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_omp_oacc_kernels_decompose, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+  {
+    return (flag_openacc
+	    && flag_openacc_kernels == OPENACC_KERNELS_DECOMPOSE);
+  }
+  virtual unsigned int execute (function *)
+  {
+    return omp_oacc_kernels_decompose ();
+  }
+
+}; // class pass_omp_oacc_kernels_decompose
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_omp_oacc_kernels_decompose (gcc::context *ctxt)
+{
+  return new pass_omp_oacc_kernels_decompose (ctxt);
+}
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 2158343..9013961 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -1771,11 +1771,19 @@ execute_oacc_device_lower ()
   bool is_oacc_serial
     = (lookup_attribute ("oacc serial",
 			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
+  bool is_oacc_parallel_kernels_parallelized
+    = (lookup_attribute ("oacc parallel_kernels_parallelized",
+			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
+  bool is_oacc_parallel_kernels_gang_single
+    = (lookup_attribute ("oacc parallel_kernels_gang_single",
+			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
   int fn_level = oacc_fn_attrib_level (attrs);
   bool is_oacc_routine = (fn_level >= 0);
   gcc_checking_assert (is_oacc_parallel
 		       + is_oacc_kernels
 		       + is_oacc_serial
+		       + is_oacc_parallel_kernels_parallelized
+		       + is_oacc_parallel_kernels_gang_single
 		       + is_oacc_routine
 		       == 1);
 
@@ -1795,6 +1803,12 @@ execute_oacc_device_lower ()
 		  ? "parallelized" : "unparallelized"));
       else if (is_oacc_serial)
 	fprintf (dump_file, "Function is OpenACC serial offload\n");
+      else if (is_oacc_parallel_kernels_parallelized)
+	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
+		 "parallel_kernels_parallelized");
+      else if (is_oacc_parallel_kernels_gang_single)
+	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
+		 "parallel_kernels_gang_single");
       else if (is_oacc_routine)
 	fprintf (dump_file, "Function is OpenACC routine level %d\n",
 		 fn_level);
@@ -1838,6 +1852,11 @@ execute_oacc_device_lower ()
       fprintf (dump_file, "]\n");
     }
 
+  /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
+     a single gang only.  */
+  if (is_oacc_parallel_kernels_gang_single)
+    gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
+
   oacc_loop_process (loops);
   if (dump_file)
     {
diff --git a/gcc/passes.def b/gcc/passes.def
index c682312..fc56e69 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_warn_unused_result);
   NEXT_PASS (pass_diagnose_omp_blocks);
   NEXT_PASS (pass_diagnose_tm_blocks);
+  NEXT_PASS (pass_omp_oacc_kernels_decompose);
   NEXT_PASS (pass_lower_omp);
   NEXT_PASS (pass_lower_cf);
   NEXT_PASS (pass_lower_tm);
diff --git a/gcc/testsuite/c-c++-common/goacc/if-clause-2.c b/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
index 5ab8459..7bb1153 100644
--- a/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
@@ -1,11 +1,21 @@
+/* { dg-additional-options "-fdump-tree-gimple" } */
+/* { dg-additional-options "-fopenacc-kernels=decompose" }
+   { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" } */
+
 void
 f (short c)
 {
-#pragma acc parallel if(c)
-  ;
-#pragma acc kernels if(c)
-  ;
-#pragma acc data if(c)
-  ;
-#pragma acc update device(c) if(c)
+#pragma acc parallel if(c) copy(c)
+  ++c;
+
+#pragma acc kernels if(c) copy(c)
+  /* { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels map\(tofrom:c \[len: [0-9]+\]\) if\(_[0-9]+\)$} 1 "gimple" } } */
+  /* { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_data_kernels map\(tofrom:c \[len: [0-9]+\]\) if\(_[0-9]+\)$} 1 "omp_oacc_kernels_decompose" } }
+     { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_gang_single async\(-1\) num_gangs\(1\) map\(force_present:c \[len: [0-9]+\]\) if\(_[0-9]+\)$} 1 "omp_oacc_kernels_decompose" } } */
+  ++c;
+
+#pragma acc data if(c) copy(c)
+  ++c;
+
+#pragma acc update if(c) device(c)
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c
new file mode 100644
index 0000000..92db332
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c
@@ -0,0 +1,83 @@
+/* Test OpenACC 'kernels' construct decomposition.  */
+
+/* { dg-additional-options "-fopt-info-omp-all" } */
+/* { dg-additional-options "-fdump-tree-gimple" } */
+/* { dg-additional-options "-fopenacc-kernels=decompose" }
+   { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" } */
+
+/* See also '../../gfortran.dg/goacc/kernels-decompose-1.f95'.  */
+
+#define N 1024
+
+unsigned int a[N];
+
+int
+main (void)
+{
+  int i;
+  unsigned int sum = 1;
+
+#pragma acc kernels copyin(a[0:N]) copy(sum)
+  /* { dg-bogus "optimized: assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } .-1 }
+     TODO Is this maybe the report that belongs to the XFAILed report further down?  */
+  {
+    #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+
+    sum++; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
+    a[0]++;
+
+    #pragma acc loop independent /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-optimized "assigned OpenACC gang vector loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+
+    if (sum > 10) /* { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } */
+      { 
+        #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
+	/* { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_i$c_loop_i } */
+	/*TODO { dg-optimized "assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } l_loop_i$c_loop_i } */
+        for (i = 0; i < N; ++i)
+          sum += a[i];
+      }
+
+    #pragma acc loop auto /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+  }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels map\(tofrom:sum \[len: [0-9]+\]\) map\(to:a\[0\] \[len: [0-9]+\]\) map\(firstprivate:a \[pointer assign, bias: 0\]\)$} 1 "gimple" } }
+
+   { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\)$} 2 "gimple" } }
+   { dg-final { scan-tree-dump-times {(?n)#pragma acc loop independent private\(i\)$} 1 "gimple" } }
+   { dg-final { scan-tree-dump-times {(?n)#pragma acc loop auto private\(i\)$} 1 "gimple" } }
+   { dg-final { scan-tree-dump-times {(?n)#pragma acc loop} 4 "gimple" } } */
+
+/* Check that the OpenACC 'kernels' got decomposed into 'data' and an enclosed
+   sequence of compute constructs.
+   { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_data_kernels map\(tofrom:sum \[len: [0-9]+\]\) map\(to:a\[0\] \[len: [0-9]+\]\)$} 1 "omp_oacc_kernels_decompose" } }
+   As noted above, we get three "old-style" kernel regions, one gang-single region, and one parallelized loop region.
+   { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels async\(-1\) map\(force_present:sum \[len: [0-9]+\]\) map\(force_present:a\[0\] \[len: [0-9]+\]\) map\(firstprivate:a \[pointer assign, bias: 0\]\)$} 3 "omp_oacc_kernels_decompose" } }
+   { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_parallelized async\(-1\) map\(force_present:sum \[len: [0-9]+\]\) map\(force_present:a\[0\] \[len: [0-9]+\]\) map\(firstprivate:a \[pointer assign, bias: 0\]\)$} 1 "omp_oacc_kernels_decompose" } }
+   { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_gang_single async\(-1\) num_gangs\(1\) map\(force_present:sum \[len: [0-9]+\]\) map\(force_present:a\[0\] \[len: [0-9]+\]\) map\(firstprivate:a \[pointer assign, bias: 0\]\)$} 1 "omp_oacc_kernels_decompose" } }
+
+   'data' plus five CCs.
+   { dg-final { scan-tree-dump-times {(?n)#pragma omp target } 6 "omp_oacc_kernels_decompose" } }
+
+   { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\)$} 2 "omp_oacc_kernels_decompose" } }
+   { dg-final { scan-tree-dump-times {(?n)#pragma acc loop independent private\(i\)$} 1 "omp_oacc_kernels_decompose" } }
+   { dg-final { scan-tree-dump-times {(?n)#pragma acc loop auto private\(i\)$} 1 "omp_oacc_kernels_decompose" } }
+   { dg-final { scan-tree-dump-times {(?n)#pragma acc loop} 4 "omp_oacc_kernels_decompose" } }
+
+   Each of the parallel regions is async, and there is a final call to
+   __builtin_GOACC_wait.
+   { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "omp_oacc_kernels_decompose" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-decompose-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-2.c
new file mode 100644
index 0000000..ec6c4af
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-2.c
@@ -0,0 +1,141 @@
+/* Test OpenACC 'kernels' construct decomposition.  */
+
+/* { dg-additional-options "-fopt-info-omp-all" } */
+/* { dg-additional-options "-fopenacc-kernels=decompose" }
+/* { dg-additional-options "-O2" } for 'parloops'.  */
+
+/* See also '../../gfortran.dg/goacc/kernels-decompose-2.f95'.  */
+
+#pragma acc routine gang
+extern int
+f_g (int);
+
+#pragma acc routine worker
+extern int
+f_w (int);
+
+#pragma acc routine vector
+extern int
+f_v (int);
+
+#pragma acc routine seq
+extern int
+f_s (int);
+
+int
+main ()
+{
+  int x, y, z;
+#define N 10
+  int a[N], b[N], c[N];
+
+#pragma acc kernels
+  {
+    x = 0; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
+    y = x < 10;
+    z = x++;
+    ;
+  }
+
+  { /*TODO Instead of using 'for (int i = 0; [...])', move 'int i' outside, to work around for ICE detailed in 'kernels-decompose-ice-1.c'.  */
+    int i;
+#pragma acc kernels /* { dg-optimized "assigned OpenACC gang loop parallelism" } */
+  for (i = 0; i < N; i++) /* { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } */
+    a[i] = 0;
+  }
+
+#pragma acc kernels loop /* { dg-line l_loop_i[incr c_loop_i] } */
+  /* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
+  /* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+  for (int i = 0; i < N; i++)
+    b[i] = a[N - i - 1];
+
+#pragma acc kernels
+  {
+#pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (int i = 0; i < N; i++)
+      b[i] = a[N - i - 1];
+
+#pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (int i = 0; i < N; i++)
+      c[i] = a[i] * b[i];
+
+    a[z] = 0; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
+
+#pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (int i = 0; i < N; i++)
+      c[i] += a[i];
+
+#pragma acc loop seq /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (int i = 0 + 1; i < N; i++)
+      c[i] += c[i - 1];
+  }
+
+#pragma acc kernels
+  /*TODO What does this mean?
+    TODO { dg-optimized "assigned OpenACC worker vector loop parallelism" "" { target *-*-* } .-2 } */
+  {
+#pragma acc loop independent /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (int i = 0; i < N; ++i)
+#pragma acc loop independent /* { dg-line l_loop_j[incr c_loop_j] } */
+      /* { dg-optimized "assigned OpenACC worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } */
+      for (int j = 0; j < N; ++j)
+#pragma acc loop independent /* { dg-line l_loop_k[incr c_loop_k] } */
+	/* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } l_loop_k$c_loop_k } */
+	/* { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_k$c_loop_k } */
+	for (int k = 0; k < N; ++k)
+	  a[(i + j + k) % N]
+	    = b[j]
+	    + f_v (c[k]); /* { dg-optimized "assigned OpenACC vector loop parallelism" } */
+
+    /*TODO Should the following turn into "gang-single" instead of "parloops"?
+      TODO The problem is that the first STMT is 'if (y <= 4) goto <D.2547>; else goto <D.2548>;', thus "parloops".  */
+    if (y < 5) /* { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } */
+#pragma acc loop independent /* { dg-line l_loop_j[incr c_loop_j] } */
+      /* { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_j$c_loop_j } */
+      for (int j = 0; j < N; ++j)
+	b[j] = f_w (c[j]);
+  }
+
+#pragma acc kernels
+  {
+    y = f_g (a[5]); /* { dg-line l_part[incr c_part] } */
+    /*TODO If such a construct is placed in its own part (like it is, here), can't this actually use gang paralelism, instead of "gang-single"?
+      { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" "" { target *-*-* } l_part$c_part } */
+    /* { dg-optimized "assigned OpenACC gang worker vector loop parallelism" "" { target *-*-* } l_part$c_part } */
+
+#pragma acc loop independent /* { dg-line l_loop_j[incr c_loop_j] } */
+    /* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j } */
+    /* { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } */
+    for (int j = 0; j < N; ++j)
+      b[j] = y + f_w (c[j]); /* { dg-optimized "assigned OpenACC worker vector loop parallelism" } */
+  }
+
+#pragma acc kernels
+  {
+    y = 3; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
+
+#pragma acc loop independent /* { dg-line l_loop_j[incr c_loop_j] } */
+    /* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j } */
+    /* { dg-optimized "assigned OpenACC gang worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j } */
+    for (int j = 0; j < N; ++j)
+      b[j] = y + f_v (c[j]); /* { dg-optimized "assigned OpenACC vector loop parallelism" } */
+
+    z = 2; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
+  }
+
+#pragma acc kernels /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
+  ;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-1.c b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-1.c
new file mode 100644
index 0000000..9e27d1fb
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-1.c
@@ -0,0 +1,108 @@
+/* Test OpenACC 'kernels' construct decomposition.  */
+
+/* { dg-additional-options "-fopt-info-omp-all" } */
+/* { dg-additional-options "-fopenacc-kernels=decompose" } */
+/* { dg-ice "TODO" }
+   { dg-prune-output "during GIMPLE pass: omplower" } */
+
+/* Reduced from 'kernels-decompose-2.c'.
+   (Hopefully) similar instances:
+     - 'libgomp.oacc-c-c++-common/declare-vla-kernels-decompose-ice-1.c'
+     - 'libgomp.oacc-c-c++-common/kernels-decompose-1.c'
+*/
+
+int
+main ()
+{
+#define N 10
+
+#pragma acc kernels
+  for (int i = 0; i < N; i++) /* { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" } */
+    ;
+
+  return 0;
+}
+
+/*
+  In 'gimple' we've got:
+
+      main ()
+      {
+        int D.2087;
+      
+        {
+          int a[10];
+      
+          try
+            {
+              #pragma omp target oacc_kernels map(tofrom:a [len: 40])
+                {
+                  {
+                    int i;
+      
+                    i = 0;
+                    goto <D.2085>;
+      [...]
+
+  ..., which in 'omp_oacc_kernels_decompose' we turn into:
+
+      main ()
+      {
+        int D.2087;
+      
+        {
+          int a[10];
+      
+          try
+            {
+              #pragma omp target oacc_data_kernels map(tofrom:a [len: 40])
+                {
+                  try
+                    {
+                      {
+                        int i;
+      
+                        #pragma omp target oacc_data_kernels map(alloc:i [len: 4])
+                          {
+                            try
+                              {
+                                {
+                                  #pragma omp target oacc_kernels async(-1) map(force_present:i [len: 4]) map(force_present:a [len: 40])
+                                    {
+                                      i = 0;
+                                      goto <D.2085>;
+      [...]
+
+  ..., which results in ICE in:
+
+    #1  0x0000000000d2247b in lower_omp_target (gsi_p=gsi_p@entry=0x7fffffffbc90, ctx=ctx@entry=0x2c994c0) at [...]/gcc/omp-low.c:11981
+    11981                       gcc_assert (offloaded);
+    (gdb) list
+    11976                         talign = TYPE_ALIGN_UNIT (TREE_TYPE (TREE_TYPE (ovar)));
+    11977                       gimplify_assign (x, var, &ilist);
+    11978                     }
+    11979                   else if (is_gimple_reg (var))
+    11980                     {
+    11981                       gcc_assert (offloaded);
+    11982                       tree avar = create_tmp_var (TREE_TYPE (var));
+    11983                       mark_addressable (avar);
+    11984                       enum gomp_map_kind map_kind = OMP_CLAUSE_MAP_KIND (c);
+    11985                       if (GOMP_MAP_COPY_TO_P (map_kind)
+    (gdb) call debug_tree(var)
+     <var_decl 0x7ffff7feebd0 i
+        type <integer_type 0x7ffff67be5e8 int sizes-gimplified public SI
+            size <integer_cst 0x7ffff67a5f18 constant 32>
+            unit-size <integer_cst 0x7ffff67a5f30 constant 4>
+            align:32 warn_if_not_align:0 symtab:0 alias-set -1 canonical-type 0x7ffff67be5e8 precision:32 min <integer_cst 0x7ffff67a5ed0 -2147483648> max <integer_cst 0x7ffff67a5ee8 2147483647>
+            pointer_to_this <pointer_type 0x7ffff67c69d8>>
+        used read SI [...]:15:12 size <integer_cst 0x7ffff67a5f18 32> unit-size <integer_cst 0x7ffff67a5f30 4>
+        align:32 warn_if_not_align:0 context <function_decl 0x7ffff68eea00 main>>
+
+  Just defusing the 'assert' is not sufficient:
+
+      libgomp: present clause: !acc_is_present (0x7ffe29cba3ec, 4 (0x4))
+
+  TODO Can't the 'omp_oacc_kernels_decompose' transformation be much simpler, such that we avoid the intermediate 'data' if we've got just one compute construct inside it?
+  TODO But it's not clear if that'd just resolve one simple instance of the general problem?
+
+*/
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-2.c
new file mode 100644
index 0000000..839e680
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-2.c
@@ -0,0 +1,16 @@
+/* Test OpenACC 'kernels' construct decomposition.  */
+
+/* { dg-additional-options "-fopenacc-kernels=decompose" } */
+/* { dg-ice "TODO" }
+   { dg-prune-output "during GIMPLE pass: omp_oacc_kernels_decompose" } */
+
+/* Reduced from 'kernels-decompose-ice-1.c'.  */
+
+int
+main ()
+{
+#pragma acc kernels
+  {
+    int i;
+  }
+}
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95
new file mode 100644
index 0000000..95a7862
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95
@@ -0,0 +1,81 @@
+! Test OpenACC 'kernels' construct decomposition.
+
+! { dg-additional-options "-fopt-info-omp-all" }
+! { dg-additional-options "-fdump-tree-gimple" }
+! { dg-additional-options "-fopenacc-kernels=decompose" }
+! { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" }
+
+! See also '../../c-c++-common/goacc/kernels-decompose-1.c'.
+
+program main
+  implicit none
+  integer, parameter         :: N = 1024
+  integer, dimension (1:N)   :: a
+  integer                    :: i, sum
+
+  !$acc kernels copyin(a(1:N)) copy(sum)
+  ! { dg-bogus "optimized: assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } .-1 }
+  !TODO Is this maybe the report that belongs to the XFAILed report further down?  */
+
+  !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  sum = sum + 1 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
+  a(1) = a(1) + 1
+
+  !$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang vector loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  if (sum .gt. 10) then ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" }
+    !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+    ! { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_i$c_loop_i }
+    !TODO { dg-optimized "assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } l_loop_i$c_loop_i }
+    do i = 1, N
+      sum = sum + a(i)
+    end do
+  end if
+
+  !$acc loop auto ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  !$acc end kernels
+end program main
+
+! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels map\(to:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(alloc:a \[pointer assign, bias: _[0-9]+\]\) map\(tofrom:sum \[len: [0-9]+\]\)$} 1 "gimple" } }
+
+! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\)$} 2 "gimple" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\) independent$} 1 "gimple" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\) auto$} 1 "gimple" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop} 4 "gimple" } }
+
+! Check that the OpenACC 'kernels' got decomposed into 'data' and an enclosed
+! sequence of compute constructs.
+! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_data_kernels map\(to:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(tofrom:sum \[len: [0-9]+\]\)$} 1 "omp_oacc_kernels_decompose" } }
+! As noted above, we get three "old-style" kernel regions, one gang-single region, and one parallelized loop region.
+! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_kernels async\(-1\) map\(force_present:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(alloc:a \[pointer assign, bias: _[0-9]+\]\) map\(force_present:sum \[len: [0-9]+\]\)$} 3 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_parallelized async\(-1\) map\(force_present:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(alloc:a \[pointer assign, bias: _[0-9]+\]\) map\(force_present:sum \[len: [0-9]+\]\)$} 1 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_gang_single async\(-1\) num_gangs\(1\) map\(force_present:a\[_[0-9]+\] \[len: _[0-9]+\]\) map\(alloc:a \[pointer assign, bias: _[0-9]+\]\) map\(force_present:sum \[len: [0-9]+\]\)$} 1 "omp_oacc_kernels_decompose" } }
+!
+! 'data' plus five CCs.
+! { dg-final { scan-tree-dump-times {(?n)#pragma omp target } 6 "omp_oacc_kernels_decompose" } }
+
+! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\)$} 2 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\) independent$} 1 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop private\(i\) auto} 1 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma acc loop} 4 "omp_oacc_kernels_decompose" } }
+
+! Each of the parallel regions is async, and there is a final call to
+! __builtin_GOACC_wait.
+! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "omp_oacc_kernels_decompose" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-2.f95
new file mode 100644
index 0000000..58d687d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-2.f95
@@ -0,0 +1,142 @@
+! Test OpenACC 'kernels' construct decomposition.
+
+! { dg-additional-options "-fopt-info-omp-all" }
+! { dg-additional-options "-fopenacc-kernels=decompose" }
+! { dg-additional-options "-O2" } for 'parloops'.
+
+! See also '../../c-c++-common/goacc/kernels-decompose-2.c'.
+
+program main
+  implicit none
+
+  integer, external :: f_g
+  !$acc routine (f_g) gang
+  integer, external :: f_w
+  !$acc routine (f_w) worker
+  integer, external :: f_v
+  !$acc routine (f_v) vector
+  integer, external :: f_s
+  !$acc routine (f_s) seq
+
+  integer :: i, j, k
+  integer :: x, y, z
+  logical :: y_l
+  integer, parameter :: N = 10
+  integer :: a(N), b(N), c(N)
+
+  !$acc kernels
+  x = 0 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
+  y = 0
+  y_l = x < 10
+  z = x
+  x = x + 1
+  ;
+  !$acc end kernels
+
+  !$acc kernels ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+  do i = 1, N ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" }
+     a(i) = 0
+  end do
+  !$acc end kernels
+
+  !$acc kernels loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc kernels
+  !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     c(i) = a(i) * b(i)
+  end do
+
+  a(z) = 0 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
+
+  !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     c(i) = c(i) + a(i)
+  end do
+
+  !$acc loop seq ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1 + 1, N
+     c(i) = c(i) + c(i - 1)
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  !TODO What does this mean?
+  !TODO { dg-optimized "assigned OpenACC worker vector loop parallelism" "" { target *-*-* } .-2 }
+  !$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+     ! { dg-optimized "assigned OpenACC worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+     do j = 1, N
+        !$acc loop independent ! { dg-line l_loop_k[incr c_loop_k] }
+        ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } l_loop_k$c_loop_k }
+        ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_k$c_loop_k }
+        do k = 1, N
+           a(1 + mod(i + j + k, N)) &
+                = b(j) &
+                + f_v (c(k)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+        end do
+     end do
+  end do
+
+  !TODO Should the following turn into "gang-single" instead of "parloops"?
+  !TODO The problem is that the first STMT is 'if (y <= 4) goto <D.2547>; else goto <D.2548>;', thus "parloops".
+  if (y < 5) then ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" }
+     !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+     ! { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_j$c_loop_j }
+     do j = 1, N
+        b(j) = f_w (c(j))
+     end do
+  end if
+  !$acc end kernels
+
+  !$acc kernels
+  y = f_g (a(5)) ! { dg-line l_part[incr c_part] }
+  !TODO If such a construct is placed in its own part (like it is, here), can't this actually use gang paralelism, instead of "gang-single"?
+  ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" "" { target *-*-* } l_part$c_part }
+  ! { dg-optimized "assigned OpenACC gang worker vector loop parallelism" "" { target *-*-* } l_part$c_part }
+
+  !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j }
+  ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+  do j = 1, N
+     b(j) = y + f_w (c(j)) ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  y = 3 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
+
+  !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j }
+  ! { dg-optimized "assigned OpenACC gang worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+  do j = 1, N
+     b(j) = y + f_v (c(j)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+  end do
+
+  z = 2 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
+  !$acc end kernels
+
+  !$acc kernels ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
+  !$acc end kernels  
+end program main
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
index 5583ffb..d01eee2 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
@@ -1,5 +1,7 @@
 ! { dg-do compile } 
 ! { dg-additional-options "-fdump-tree-original" } 
+! { dg-additional-options "-fopenacc-kernels=decompose" }
+! { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" }
 
 program test
   implicit none
@@ -34,3 +36,6 @@ end program test
 ! { dg-final { scan-tree-dump-times "map\\(alloc:t\\)" 1 "original" } } 
 
 ! { dg-final { scan-tree-dump-times "map\\(force_deviceptr:u\\)" 1 "original" } } 
+
+! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_data_kernels if\(D\.[0-9]+\)$} 1 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times {(?n)#pragma omp target oacc_parallel_kernels_gang_single num_gangs\(1\) if\(D\.[0-9]+\) async\(-1\)$} 1 "omp_oacc_kernels_decompose" } }
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 9cb22ac..cc4870e 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -416,6 +416,7 @@ extern gimple_opt_pass *make_pass_lower_switch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_switch_O0 (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_vector (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_vector_ssa (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_omp_oacc_kernels_decompose (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_omp (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_diagnose_omp_blocks (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose-ice-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose-ice-1.c
new file mode 100644
index 0000000..c7eae12
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose-ice-1.c
@@ -0,0 +1,8 @@
+/* { dg-additional-options "-fopenacc-kernels=decompose" } */
+/* Hopefully, this is the same issue as '../../../gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-1.c'.
+   { dg-ice "TODO" }
+   TODO { dg-prune-output "during GIMPLE pass: omplower" }
+   TODO { dg-do link } */
+
+#undef KERNELS_DECOMPOSE_ICE_HACK
+#include "declare-vla.c"
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose.c
new file mode 100644
index 0000000..dd8a1c1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla-kernels-decompose.c
@@ -0,0 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=decompose" } */
+
+/* See also 'declare-vla-kernels-decompose-ice-1.c'.  */
+
+#define KERNELS_DECOMPOSE_ICE_HACK
+#include "declare-vla.c"
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla.c
index 7149357..3bd6331 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-vla.c
@@ -38,6 +38,12 @@ f_data (void)
     for (i = 0; i < N; i++)
       A[i] = -i;
 
+    /* See 'declare-vla-kernels-decompose.c'.  */
+#ifdef KERNELS_DECOMPOSE_ICE_HACK
+    (volatile int *) &i;
+    (volatile int *) &N;
+#endif
+
 # pragma acc kernels
     for (i = 0; i < N; i++)
       A[i] = i;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c
new file mode 100644
index 0000000..fa8ae6c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c
@@ -0,0 +1,38 @@
+/* Test OpenACC 'kernels' construct decomposition.  */
+
+/* { dg-additional-options "-fopt-info-omp-all" } */
+/* { dg-additional-options "-fopenacc-kernels=decompose" } */
+
+#undef NDEBUG
+#include <assert.h>
+
+int main()
+{
+  int a = 0;
+  /*TODO Without making 'a' addressable, for GCN offloading we will not see the expected value copied out.  (But it does work for nvptx offloading, strange...)  */
+  (volatile int *) &a;
+#define N 123
+  int b[N] = { 0 };
+
+#pragma acc kernels
+  {
+    int c = 234; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
+
+    /*TODO Hopefully, this is the same issue as '../../../gcc/testsuite/c-c++-common/goacc/kernels-decompose-ice-1.c'.  */
+    (volatile int *) &c;
+
+#pragma acc loop independent gang /* { dg-line l_loop_i[incr c_loop_i] } */
+    /* { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i } */
+    /* { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i } */
+    for (int i = 0; i < N; ++i)
+      b[i] = c;
+
+    a = c; /* { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" } */
+  }
+
+  for (int i = 0; i < N; ++i)
+    assert (b[i] == 234);
+  assert (a == 234);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr94358-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr94358-1.f90
index 5013c5ba..82d8351 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/pr94358-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr94358-1.f90
@@ -1,17 +1,22 @@
 ! { dg-do run }
 ! { dg-additional-options "-fopt-info-omp-all" }
+! { dg-additional-options "-fopenacc-kernels=decompose" }
 
 subroutine kernel(lo, hi, a, b, c)
   implicit none
   integer :: lo, hi, i
   real, dimension(lo:hi) :: a, b, c
 
-  !$acc kernels copyin(lo, hi) ! { dg-optimized "assigned OpenACC seq loop parallelism" }
-  !$acc loop independent
+  !$acc kernels copyin(lo, hi)
+  !$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang vector loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
   do i = lo, hi
      b(i) = a(i)
   end do
-  !$acc loop independent
+  !$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang vector loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
   do i = lo, hi
      c(i) = b(i)
   end do
author	Gergö Barany <gergo@codesourcery.com>	2019-02-01 00:59:30 +0100
committer	Thomas Schwinge <thomas@codesourcery.com>	2020-11-13 22:58:57 +0100
commit	e898ce7997733c29dcab9c3c62ca102c7f9fa6eb (patch)
tree	332bb0438f3383bc22d777c3b0008c269dba4d8a
parent	bd7885755405bc9947ebe805a53d6100c78c8e82 (diff)
download	gcc-e898ce7997733c29dcab9c3c62ca102c7f9fa6eb.zip gcc-e898ce7997733c29dcab9c3c62ca102c7f9fa6eb.tar.gz gcc-e898ce7997733c29dcab9c3c62ca102c7f9fa6eb.tar.bz2