From b2b40051500c944e882c274727cea7231eefaaf5 Mon Sep 17 00:00:00 2001 From: Martin Jambor Date: Tue, 19 Jan 2016 11:35:10 +0100 Subject: Merge of HSA 2016-01-19 Martin Jambor Martin Liska Michael Matz libgomp/ * plugin/Makefrag.am: Add HSA plugin requirements. * plugin/configfrag.ac (HSA_RUNTIME_INCLUDE): New variable. (HSA_RUNTIME_LIB): Likewise. (HSA_RUNTIME_CPPFLAGS): Likewise. (HSA_RUNTIME_INCLUDE): New substitution. (HSA_RUNTIME_LIB): Likewise. (HSA_RUNTIME_LDFLAGS): Likewise. (hsa-runtime): New configure option. (hsa-runtime-include): Likewise. (hsa-runtime-lib): Likewise. (PLUGIN_HSA): New substitution variable. Fill HSA_RUNTIME_INCLUDE and HSA_RUNTIME_LIB according to the new configure options. (PLUGIN_HSA_CPPFLAGS): Likewise. (PLUGIN_HSA_LDFLAGS): Likewise. (PLUGIN_HSA_LIBS): Likewise. Check that we have access to HSA run-time. * libgomp-plugin.h (offload_target_type): New element OFFLOAD_TARGET_TYPE_HSA. * libgomp.h (gomp_target_task): New fields firstprivate_copies and args. (bool gomp_create_target_task): Updated. (gomp_device_descr): Extra parameter of run_func and async_run_func, new field can_run_func. * libgomp_g.h (GOMP_target_ext): Update prototype. * oacc-host.c (host_run): Added a new parameter args. * target.c (calculate_firstprivate_requirements): New function. (copy_firstprivate_data): Likewise. (gomp_target_fallback_firstprivate): Use them. (gomp_target_unshare_firstprivate): New function. (gomp_get_target_fn_addr): Allow returning NULL for shared memory devices. (GOMP_target): Do host fallback for all shared memory devices. Do not pass any args to plugins. (GOMP_target_ext): Introduce device-specific argument parameter args. Allow host fallback if device shares memory. Do not remap data if device has shared memory. (gomp_target_task_fn): Likewise. Also treat shared memory devices like host fallback for mappings. (GOMP_target_data): Treat shared memory devices like host fallback. (GOMP_target_data_ext): Likewise. (GOMP_target_update): Likewise. (GOMP_target_update_ext): Likewise. Also pass NULL as args to gomp_create_target_task. (GOMP_target_enter_exit_data): Likewise. (omp_target_alloc): Treat shared memory devices like host fallback. (omp_target_free): Likewise. (omp_target_is_present): Likewise. (omp_target_memcpy): Likewise. (omp_target_memcpy_rect): Likewise. (omp_target_associate_ptr): Likewise. (gomp_load_plugin_for_device): Also load can_run. * task.c (GOMP_PLUGIN_target_task_completion): Free firstprivate_copies. (gomp_create_target_task): Accept new argument args and store it to ttask. * plugin/plugin-hsa.c: New file. gcc/ * Makefile.in (OBJS): Add new source files. (GTFILES): Add hsa.c. * common.opt (disable_hsa): New variable. (-Whsa): New warning. * config.in (ENABLE_HSA): New. * configure.ac: Treat hsa differently from other accelerators. (OFFLOAD_TARGETS): Define ENABLE_OFFLOADING according to $enable_offloading. (ENABLE_HSA): Define ENABLE_HSA according to $enable_hsa. * doc/install.texi (Configuration): Document --with-hsa-runtime, --with-hsa-runtime-include, --with-hsa-runtime-lib and --with-hsa-kmt-lib. * doc/invoke.texi (-Whsa): Document. (hsa-gen-debug-stores): Likewise. * lto-wrapper.c (compile_images_for_offload_targets): Do not attempt to invoke offload compiler for the hsa accelerator. * opts.c (common_handle_option): Determine whether HSA offloading should be performed. * params.def (PARAM_HSA_GEN_DEBUG_STORES): New parameter. * builtin-types.def (BT_FN_VOID_UINT_PTR_INT_PTR): New.
(BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT): Removed. (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR): New. * gimple-low.c (lower_stmt): Also handle GIMPLE_OMP_GRID_BODY. * gimple-pretty-print.c (dump_gimple_omp_for): Also handle GF_OMP_FOR_KIND_GRID_LOOP. (dump_gimple_omp_block): Also handle GIMPLE_OMP_GRID_BODY. (pp_gimple_stmt_1): Likewise. * gimple-walk.c (walk_gimple_stmt): Likewise. * gimple.c (gimple_build_omp_grid_body): New function. (gimple_copy): Also handle GIMPLE_OMP_GRID_BODY. * gimple.def (GIMPLE_OMP_GRID_BODY): New. * gimple.h (enum gf_mask): Added GF_OMP_PARALLEL_GRID_PHONY, GF_OMP_FOR_KIND_GRID_LOOP, GF_OMP_FOR_GRID_PHONY and GF_OMP_TEAMS_GRID_PHONY. (gimple_statement_omp_single_layout): Updated comments. (gimple_build_omp_grid_body): New function. (gimple_has_substatements): Also handle GIMPLE_OMP_GRID_BODY. (gimple_omp_for_grid_phony): New function. (gimple_omp_for_set_grid_phony): Likewise. (gimple_omp_parallel_grid_phony): Likewise. (gimple_omp_parallel_set_grid_phony): Likewise. (gimple_omp_teams_grid_phony): Likewise. (gimple_omp_teams_set_grid_phony): Likewise. (gimple_return_set_retbnd): Also handle GIMPLE_OMP_GRID_BODY. * omp-builtins.def (BUILT_IN_GOMP_OFFLOAD_REGISTER): New. (BUILT_IN_GOMP_OFFLOAD_UNREGISTER): Likewise. (BUILT_IN_GOMP_TARGET): Updated type. * omp-low.c: Include symbol-summary.h, hsa.h and params.h. (adjust_for_condition): New function. (get_omp_for_step_from_incr): Likewise. (extract_omp_for_data): Moved parts to adjust_for_condition and get_omp_for_step_from_incr. (build_outer_var_ref): Handle GIMPLE_OMP_GRID_BODY. (fixup_child_record_type): Bail out if receiver_decl is NULL. (scan_sharing_clauses): Handle OMP_CLAUSE__GRIDDIM_. (scan_omp_parallel): Do not create child functions for phony constructs. (check_omp_nesting_restrictions): Handle GIMPLE_OMP_GRID_BODY. (scan_omp_1_op): Add a checking assert that we are not remapping to ERROR_MARK. Also handle GIMPLE_OMP_GRID_BODY. (parallel_needs_hsa_kernel_p): New function. (expand_parallel_call): Register appropriate parallel child functions as HSA kernels. (grid_launch_attributes_trees): New type. (grid_attr_trees): New variable. (grid_create_kernel_launch_attr_types): New function. (grid_insert_store_range_dim): Likewise. (grid_get_kernel_launch_attributes): Likewise. (get_target_argument_identifier_1): Likewise. (get_target_argument_identifier): Likewise. (get_target_argument_value): Likewise. (push_target_argument_according_to_value): Likewise. (get_target_arguments): Likewise. (expand_omp_target): Call get_target_arguments instead of looking up teams and thread limit. (grid_expand_omp_for_loop): New function. (grid_arg_decl_map): New type. (grid_remap_kernel_arg_accesses): New function. (grid_expand_target_kernel_body): New function. (expand_omp): Call it. (lower_omp_for): Do not emit phony constructs. (lower_omp_taskreg): Do not emit phony constructs but create for them a temporary variable receiver_decl. (lower_omp_taskreg): Do not emit phony constructs. (lower_omp_teams): Likewise. (lower_omp_grid_body): New function. (lower_omp_1): Call it. (grid_reg_assignment_to_local_var_p): New function. (grid_seq_only_contains_local_assignments): Likewise. (grid_find_single_omp_among_assignments_1): Likewise. (grid_find_single_omp_among_assignments): Likewise. (grid_find_ungridifiable_statement): Likewise. (grid_target_follows_gridifiable_pattern): Likewise. (grid_remap_prebody_decls): Likewise. (grid_copy_leading_local_assignments): Likewise. (grid_process_kernel_body_copy): Likewise.
(grid_attempt_target_gridification): Likewise. (grid_gridify_all_targets_stmt): Likewise. (grid_gridify_all_targets): Likewise. (execute_lower_omp): Call grid_gridify_all_targets. (make_gimple_omp_edges): Handle GIMPLE_OMP_GRID_BODY. * tree-core.h (omp_clause_code): Added OMP_CLAUSE__GRIDDIM_. (tree_omp_clause): Added union field dimension. * tree-pretty-print.c (dump_omp_clause): Handle OMP_CLAUSE__GRIDDIM_. * tree.c (omp_clause_num_ops): Added number of arguments of OMP_CLAUSE__GRIDDIM_. (omp_clause_code_name): Added name of OMP_CLAUSE__GRIDDIM_. (walk_tree_1): Handle OMP_CLAUSE__GRIDDIM_. * tree.h (OMP_CLAUSE_GRIDDIM_DIMENSION): New. (OMP_CLAUSE_SET_GRIDDIM_DIMENSION): Likewise. (OMP_CLAUSE_GRIDDIM_SIZE): Likewise. (OMP_CLAUSE_GRIDDIM_GROUP): Likewise. * passes.def: Schedule pass_ipa_hsa and pass_gen_hsail. * tree-pass.h (make_pass_gen_hsail): Declare. (make_pass_ipa_hsa): Likewise. * ipa-hsa.c: New file. * lto-section-in.c (lto_section_name): Add hsa section name. * lto-streamer.h (lto_section_type): Add hsa section. * timevar.def (TV_IPA_HSA): New. * hsa-brig-format.h: New file. * hsa-brig.c: New file. * hsa-dump.c: Likewise. * hsa-gen.c: Likewise. * hsa.c: Likewise. * hsa.h: Likewise. * toplev.c (compile_file): Call hsa_output_brig. * hsa-regalloc.c: New file. gcc/fortran/ * types.def (BT_FN_VOID_UINT_PTR_INT_PTR): New. (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT): Removed. (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR): New. gcc/lto/ * lto-partition.c: Include "hsa.h" (add_symbol_to_partition_1): Put hsa implementations into the same partition as host implementations. liboffloadmic/ * plugin/libgomp-plugin-intelmic.cpp (GOMP_OFFLOAD_async_run): New unused parameter. (GOMP_OFFLOAD_run): Likewise. include/ * gomp-constants.h (GOMP_DEVICE_HSA): New macro. (GOMP_VERSION_HSA): Likewise. (GOMP_TARGET_ARG_DEVICE_MASK): Likewise. (GOMP_TARGET_ARG_DEVICE_ALL): Likewise. (GOMP_TARGET_ARG_SUBSEQUENT_PARAM): Likewise. (GOMP_TARGET_ARG_ID_MASK): Likewise. (GOMP_TARGET_ARG_NUM_TEAMS): Likewise. (GOMP_TARGET_ARG_THREAD_LIMIT): Likewise. (GOMP_TARGET_ARG_VALUE_SHIFT): Likewise. (GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES): Likewise. From-SVN: r232549 --- gcc/omp-low.c | 1594 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 1430 insertions(+), 164 deletions(-) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index d7df3db..673dee3 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -80,6 +80,9 @@ along with GCC; see the file COPYING3. If not see #include "lto-section-names.h" #include "gomp-constants.h" #include "gimple-pretty-print.h" +#include "symbol-summary.h" +#include "hsa.h" +#include "params.h" /* Lowering of OMP parallel and workshare constructs proceeds in two phases. The first phase scans the function looking for OMP statements @@ -450,6 +453,63 @@ is_combined_parallel (struct omp_region *region) return region->is_combined_parallel; } +/* Adjust *COND_CODE and *N2 so that the former is either LT_EXPR or GT_EXPR.
*/ + +static void +adjust_for_condition (location_t loc, enum tree_code *cond_code, tree *n2) +{ + switch (*cond_code) + { + case LT_EXPR: + case GT_EXPR: + case NE_EXPR: + break; + case LE_EXPR: + if (POINTER_TYPE_P (TREE_TYPE (*n2))) + *n2 = fold_build_pointer_plus_hwi_loc (loc, *n2, 1); + else + *n2 = fold_build2_loc (loc, PLUS_EXPR, TREE_TYPE (*n2), *n2, + build_int_cst (TREE_TYPE (*n2), 1)); + *cond_code = LT_EXPR; + break; + case GE_EXPR: + if (POINTER_TYPE_P (TREE_TYPE (*n2))) + *n2 = fold_build_pointer_plus_hwi_loc (loc, *n2, -1); + else + *n2 = fold_build2_loc (loc, MINUS_EXPR, TREE_TYPE (*n2), *n2, + build_int_cst (TREE_TYPE (*n2), 1)); + *cond_code = GT_EXPR; + break; + default: + gcc_unreachable (); + } +} + +/* Return the looping step from INCR, extracted from the step of a gimple omp + for statement. */ + +static tree +get_omp_for_step_from_incr (location_t loc, tree incr) +{ + tree step; + switch (TREE_CODE (incr)) + { + case PLUS_EXPR: + step = TREE_OPERAND (incr, 1); + break; + case POINTER_PLUS_EXPR: + step = fold_convert (ssizetype, TREE_OPERAND (incr, 1)); + break; + case MINUS_EXPR: + step = TREE_OPERAND (incr, 1); + step = fold_build1_loc (loc, NEGATE_EXPR, TREE_TYPE (step), step); + break; + default: + gcc_unreachable (); + } + return step; +} /* Extract the header elements of parallel loop FOR_STMT and store them into *FD. */ @@ -579,58 +639,14 @@ extract_omp_for_data (gomp_for *for_stmt, struct omp_for_data *fd, loop->cond_code = gimple_omp_for_cond (for_stmt, i); loop->n2 = gimple_omp_for_final (for_stmt, i); - switch (loop->cond_code) - { - case LT_EXPR: - case GT_EXPR: - break; - case NE_EXPR: - gcc_assert (gimple_omp_for_kind (for_stmt) - == GF_OMP_FOR_KIND_CILKSIMD - || (gimple_omp_for_kind (for_stmt) - == GF_OMP_FOR_KIND_CILKFOR)); - break; - case LE_EXPR: - if (POINTER_TYPE_P (TREE_TYPE (loop->n2))) - loop->n2 = fold_build_pointer_plus_hwi_loc (loc, loop->n2, 1); - else - loop->n2 = fold_build2_loc (loc, - PLUS_EXPR, TREE_TYPE (loop->n2), loop->n2, - build_int_cst (TREE_TYPE (loop->n2), 1)); - loop->cond_code = LT_EXPR; - break; - case GE_EXPR: - if (POINTER_TYPE_P (TREE_TYPE (loop->n2))) - loop->n2 = fold_build_pointer_plus_hwi_loc (loc, loop->n2, -1); - else - loop->n2 = fold_build2_loc (loc, - MINUS_EXPR, TREE_TYPE (loop->n2), loop->n2, - build_int_cst (TREE_TYPE (loop->n2), 1)); - loop->cond_code = GT_EXPR; - break; - default: - gcc_unreachable (); - } + gcc_assert (loop->cond_code != NE_EXPR + || gimple_omp_for_kind (for_stmt) == GF_OMP_FOR_KIND_CILKSIMD + || gimple_omp_for_kind (for_stmt) == GF_OMP_FOR_KIND_CILKFOR); + adjust_for_condition (loc, &loop->cond_code, &loop->n2); t = gimple_omp_for_incr (for_stmt, i); gcc_assert (TREE_OPERAND (t, 0) == var); - switch (TREE_CODE (t)) - { - case PLUS_EXPR: - loop->step = TREE_OPERAND (t, 1); - break; - case POINTER_PLUS_EXPR: - loop->step = fold_convert (ssizetype, TREE_OPERAND (t, 1)); - break; - case MINUS_EXPR: - loop->step = TREE_OPERAND (t, 1); - loop->step = fold_build1_loc (loc, - NEGATE_EXPR, TREE_TYPE (loop->step), - loop->step); - break; - default: - gcc_unreachable (); - } + loop->step = get_omp_for_step_from_incr (loc, t); if (simd || (fd->sched_kind == OMP_CLAUSE_SCHEDULE_STATIC @@ -1321,7 +1337,16 @@ build_outer_var_ref (tree var, omp_context *ctx, bool lastprivate = false) } } else if (ctx->outer) - x = lookup_decl (var, ctx->outer); + { + omp_context *outer = ctx->outer; + if (gimple_code (outer->stmt) == GIMPLE_OMP_GRID_BODY) + { + outer = outer->outer; + gcc_assert (outer + && gimple_code 
(outer->stmt) != GIMPLE_OMP_GRID_BODY); + } + x = lookup_decl (var, outer); + } else if (is_reference (var)) /* This can happen with orphaned constructs. If var is reference, it is possible it is shared and as such valid. */ @@ -1774,6 +1799,8 @@ fixup_child_record_type (omp_context *ctx) { tree f, type = ctx->record_type; + if (!ctx->receiver_decl) + return; /* ??? It isn't sufficient to just call remap_type here, because variably_modified_type_p doesn't work the way we expect for record types. Testing each field for whether it needs remapping @@ -2132,6 +2159,14 @@ scan_sharing_clauses (tree clauses, omp_context *ctx, } break; + case OMP_CLAUSE__GRIDDIM_: + if (ctx->outer) + { + scan_omp_op (&OMP_CLAUSE__GRIDDIM__SIZE (c), ctx->outer); + scan_omp_op (&OMP_CLAUSE__GRIDDIM__GROUP (c), ctx->outer); + } + break; + case OMP_CLAUSE_NOWAIT: case OMP_CLAUSE_ORDERED: case OMP_CLAUSE_COLLAPSE: @@ -2327,6 +2362,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx, case OMP_CLAUSE_INDEPENDENT: case OMP_CLAUSE_AUTO: case OMP_CLAUSE_SEQ: + case OMP_CLAUSE__GRIDDIM_: break; case OMP_CLAUSE_DEVICE_RESIDENT: @@ -2648,8 +2684,11 @@ scan_omp_parallel (gimple_stmt_iterator *gsi, omp_context *outer_ctx) DECL_NAMELESS (name) = 1; TYPE_NAME (ctx->record_type) = name; TYPE_ARTIFICIAL (ctx->record_type) = 1; - create_omp_child_function (ctx, false); - gimple_omp_parallel_set_child_fn (stmt, ctx->cb.dst_fn); + if (!gimple_omp_parallel_grid_phony (stmt)) + { + create_omp_child_function (ctx, false); + gimple_omp_parallel_set_child_fn (stmt, ctx->cb.dst_fn); + } scan_sharing_clauses (gimple_omp_parallel_clauses (stmt), ctx); scan_omp (gimple_omp_body_ptr (stmt), ctx); @@ -3189,6 +3228,11 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx) { tree c; + if (ctx && gimple_code (ctx->stmt) == GIMPLE_OMP_GRID_BODY) + /* GRID_BODY is an artificial construct, nesting rules will be checked in + the original copy of its contents. */ + return true; + /* No nesting of non-OpenACC STMT (that is, an OpenMP one, or a GOMP builtin) inside an OpenACC CTX. */ if (!(is_gimple_omp (stmt) @@ -3777,7 +3821,11 @@ scan_omp_1_op (tree *tp, int *walk_subtrees, void *data) case LABEL_DECL: case RESULT_DECL: if (ctx) - *tp = remap_decl (t, &ctx->cb); + { + tree repl = remap_decl (t, &ctx->cb); + gcc_checking_assert (TREE_CODE (repl) != ERROR_MARK); + *tp = repl; + } break; default: @@ -3911,6 +3959,7 @@ scan_omp_1_stmt (gimple_stmt_iterator *gsi, bool *handled_ops_p, case GIMPLE_OMP_TASKGROUP: case GIMPLE_OMP_ORDERED: case GIMPLE_OMP_CRITICAL: + case GIMPLE_OMP_GRID_BODY: ctx = new_omp_context (stmt, ctx); scan_omp (gimple_omp_body_ptr (stmt), ctx); break; @@ -6343,6 +6392,37 @@ gimple_build_cond_empty (tree cond) return gimple_build_cond (pred_code, lhs, rhs, NULL_TREE, NULL_TREE); } +/* Return true if a parallel REGION is within a declare target function or + within a target region and is not a part of a gridified target. 
*/ + +static bool +parallel_needs_hsa_kernel_p (struct omp_region *region) +{ + bool indirect = false; + for (region = region->outer; region; region = region->outer) + { + if (region->type == GIMPLE_OMP_PARALLEL) + indirect = true; + else if (region->type == GIMPLE_OMP_TARGET) + { + gomp_target *tgt_stmt + = as_a <gomp_target *> (last_stmt (region->entry)); + + if (find_omp_clause (gimple_omp_target_clauses (tgt_stmt), + OMP_CLAUSE__GRIDDIM_)) + return indirect; + else + return true; + } + } + + if (lookup_attribute ("omp declare target", + DECL_ATTRIBUTES (current_function_decl))) + return true; + + return false; +} + static void expand_omp_build_assign (gimple_stmt_iterator *, tree, tree, bool = false); @@ -6512,7 +6592,8 @@ expand_parallel_call (struct omp_region *region, basic_block bb, t1 = null_pointer_node; else t1 = build_fold_addr_expr (t); - t2 = build_fold_addr_expr (gimple_omp_parallel_child_fn (entry_stmt)); + tree child_fndecl = gimple_omp_parallel_child_fn (entry_stmt); + t2 = build_fold_addr_expr (child_fndecl); vec_alloc (args, 4 + vec_safe_length (ws_args)); args->quick_push (t2); @@ -6527,6 +6608,13 @@ expand_parallel_call (struct omp_region *region, basic_block bb, force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, false, GSI_CONTINUE_LINKING); + + if (hsa_gen_requested_p () + && parallel_needs_hsa_kernel_p (region)) + { + cgraph_node *child_cnode = cgraph_node::get (child_fndecl); + hsa_register_kernel (child_cnode); + } } /* Insert a function call whose name is FUNC_NAME with the information from @@ -12570,6 +12658,236 @@ mark_loops_in_oacc_kernels_region (basic_block region_entry, loop->in_oacc_kernels_region = true; } +/* Types used to pass grid and workgroup sizes to kernel invocation. */ + +struct GTY(()) grid_launch_attributes_trees +{ + tree kernel_dim_array_type; + tree kernel_lattrs_dimnum_decl; + tree kernel_lattrs_grid_decl; + tree kernel_lattrs_group_decl; + tree kernel_launch_attributes_type; +}; + +static GTY(()) struct grid_launch_attributes_trees *grid_attr_trees; + +/* Create types used to pass kernel launch attributes to target. */ + +static void +grid_create_kernel_launch_attr_types (void) +{ + if (grid_attr_trees) + return; + grid_attr_trees = ggc_alloc <grid_launch_attributes_trees> (); + + tree dim_arr_index_type + = build_index_type (build_int_cst (integer_type_node, 2)); + grid_attr_trees->kernel_dim_array_type + = build_array_type (uint32_type_node, dim_arr_index_type); + + grid_attr_trees->kernel_launch_attributes_type = make_node (RECORD_TYPE); + grid_attr_trees->kernel_lattrs_dimnum_decl + = build_decl (BUILTINS_LOCATION, FIELD_DECL, get_identifier ("ndim"), + uint32_type_node); + DECL_CHAIN (grid_attr_trees->kernel_lattrs_dimnum_decl) = NULL_TREE; + + grid_attr_trees->kernel_lattrs_grid_decl + = build_decl (BUILTINS_LOCATION, FIELD_DECL, get_identifier ("grid_size"), + grid_attr_trees->kernel_dim_array_type); + DECL_CHAIN (grid_attr_trees->kernel_lattrs_grid_decl) + = grid_attr_trees->kernel_lattrs_dimnum_decl; + grid_attr_trees->kernel_lattrs_group_decl + = build_decl (BUILTINS_LOCATION, FIELD_DECL, get_identifier ("group_size"), + grid_attr_trees->kernel_dim_array_type); + DECL_CHAIN (grid_attr_trees->kernel_lattrs_group_decl) + = grid_attr_trees->kernel_lattrs_grid_decl; + finish_builtin_struct (grid_attr_trees->kernel_launch_attributes_type, + "__gomp_kernel_launch_attributes", + grid_attr_trees->kernel_lattrs_group_decl, NULL_TREE); +}
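
For orientation, the GENERIC record assembled above corresponds to the plain C layout sketched below. The sketch is illustrative only (the struct tag and field names merely mirror the tree decls built here); the authoritative run-time counterpart lives in the new libgomp HSA plugin.

/* Illustrative C equivalent of the "__gomp_kernel_launch_attributes"
   record built by grid_create_kernel_launch_attr_types; a sketch, not
   code from this patch.  */
#include <stdint.h>

struct gomp_kernel_launch_attributes_sketch
{
  uint32_t ndim;          /* Number of dimensions actually used.  */
  uint32_t grid_size[3];  /* Grid size, in work items, per dimension.  */
  uint32_t group_size[3]; /* Work-group size per dimension.  */
};
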
+ +/* Insert before the current statement in GSI a store of VALUE to INDEX of array (of type kernel_dim_array_type) FLD_DECL of RANGE_VAR. VALUE must be of type uint32_type_node. */ + +static void +grid_insert_store_range_dim (gimple_stmt_iterator *gsi, tree range_var, + tree fld_decl, int index, tree value) +{ + tree ref = build4 (ARRAY_REF, uint32_type_node, + build3 (COMPONENT_REF, + grid_attr_trees->kernel_dim_array_type, + range_var, fld_decl, NULL_TREE), + build_int_cst (integer_type_node, index), + NULL_TREE, NULL_TREE); + gsi_insert_before (gsi, gimple_build_assign (ref, value), GSI_SAME_STMT); +} + +/* Return a tree representation of a pointer to a structure with grid and + work-group size information. Statements filling that information will be + inserted before GSI, TGT_STMT is the target statement which has the + necessary information in it. */ + +static tree +grid_get_kernel_launch_attributes (gimple_stmt_iterator *gsi, + gomp_target *tgt_stmt) +{ + grid_create_kernel_launch_attr_types (); + tree u32_one = build_one_cst (uint32_type_node); + tree lattrs = create_tmp_var (grid_attr_trees->kernel_launch_attributes_type, + "__kernel_launch_attrs"); + + unsigned max_dim = 0; + for (tree clause = gimple_omp_target_clauses (tgt_stmt); + clause; + clause = OMP_CLAUSE_CHAIN (clause)) + { + if (OMP_CLAUSE_CODE (clause) != OMP_CLAUSE__GRIDDIM_) + continue; + + unsigned dim = OMP_CLAUSE__GRIDDIM__DIMENSION (clause); + max_dim = MAX (dim, max_dim); + + grid_insert_store_range_dim (gsi, lattrs, + grid_attr_trees->kernel_lattrs_grid_decl, + dim, OMP_CLAUSE__GRIDDIM__SIZE (clause)); + grid_insert_store_range_dim (gsi, lattrs, + grid_attr_trees->kernel_lattrs_group_decl, + dim, OMP_CLAUSE__GRIDDIM__GROUP (clause)); + } + + tree dimref = build3 (COMPONENT_REF, uint32_type_node, lattrs, + grid_attr_trees->kernel_lattrs_dimnum_decl, NULL_TREE); + /* At this moment we cannot gridify a loop with a collapse clause. */ + /* TODO: Adjust when we support bigger collapse. */ + gcc_assert (max_dim == 0); + gsi_insert_before (gsi, gimple_build_assign (dimref, u32_one), GSI_SAME_STMT); + TREE_ADDRESSABLE (lattrs) = 1; + return build_fold_addr_expr (lattrs); +} + +/* Build target argument identifier from the DEVICE identifier, value + identifier ID and whether the element also has a SUBSEQUENT_PARAM. */ + +static tree +get_target_argument_identifier_1 (int device, bool subsequent_param, int id) +{ + tree t = build_int_cst (integer_type_node, device); + if (subsequent_param) + t = fold_build2 (BIT_IOR_EXPR, integer_type_node, t, + build_int_cst (integer_type_node, + GOMP_TARGET_ARG_SUBSEQUENT_PARAM)); + t = fold_build2 (BIT_IOR_EXPR, integer_type_node, t, + build_int_cst (integer_type_node, id)); + return t; +} + +/* Like above but return it in type that can be directly stored as an element + of the argument array. */ + +static tree +get_target_argument_identifier (int device, bool subsequent_param, int id) +{ + tree t = get_target_argument_identifier_1 (device, subsequent_param, id); + return fold_convert (ptr_type_node, t); +}
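
These identifiers pack several facts into one pointer-sized element: the device number in the low bits, GOMP_TARGET_ARG_SUBSEQUENT_PARAM to flag that the payload arrives in the next array element, and the argument id above that; small values are instead embedded above GOMP_TARGET_ARG_VALUE_SHIFT (see get_target_argument_value below). As a hedged sketch of the consuming side (the macros are the ones this patch adds to include/gomp-constants.h; the decoder function itself is hypothetical):

/* Hypothetical decoder for one element of the GOMP_target_ext args array;
   returns the embedded value, or 0 when the payload is the next element.  */
#include <stdint.h>
#include "gomp-constants.h"

static int
sketch_decode_target_arg (uintptr_t a, int *device, int *id)
{
  *device = a & GOMP_TARGET_ARG_DEVICE_MASK;
  *id = a & GOMP_TARGET_ARG_ID_MASK;
  if (a & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
    return 0;		/* Payload is in the next array element.  */
  return (int) a >> GOMP_TARGET_ARG_VALUE_SHIFT; /* Embedded payload.  */
}
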
+ +/* Return a target argument consisting of DEVICE identifier, value identifier ID, and the actual VALUE. */ + +static tree +get_target_argument_value (gimple_stmt_iterator *gsi, int device, int id, + tree value) +{ + tree t = fold_build2 (LSHIFT_EXPR, integer_type_node, + fold_convert (integer_type_node, value), + build_int_cst (unsigned_type_node, + GOMP_TARGET_ARG_VALUE_SHIFT)); + t = fold_build2 (BIT_IOR_EXPR, integer_type_node, t, + get_target_argument_identifier_1 (device, false, id)); + t = fold_convert (ptr_type_node, t); + return force_gimple_operand_gsi (gsi, t, true, NULL, true, GSI_SAME_STMT); +} + +/* If VALUE is an integer constant greater than -2^15 and smaller than 2^15, + push one argument to ARGS with the DEVICE, ID and VALUE embedded in it, + otherwise push an identifier (with DEVICE and ID) and the VALUE in two + arguments. */ + +static void +push_target_argument_according_to_value (gimple_stmt_iterator *gsi, int device, + int id, tree value, vec <tree> *args) +{ + if (tree_fits_shwi_p (value) + && tree_to_shwi (value) > -(1 << 15) + && tree_to_shwi (value) < (1 << 15)) + args->quick_push (get_target_argument_value (gsi, device, id, value)); + else + { + args->quick_push (get_target_argument_identifier (device, true, id)); + value = fold_convert (ptr_type_node, value); + value = force_gimple_operand_gsi (gsi, value, true, NULL, true, + GSI_SAME_STMT); + args->quick_push (value); + } +} + +/* Create an array of arguments that is then passed to GOMP_target. */ + +static tree +get_target_arguments (gimple_stmt_iterator *gsi, gomp_target *tgt_stmt) +{ + auto_vec <tree, 6> args; + tree clauses = gimple_omp_target_clauses (tgt_stmt); + tree t, c = find_omp_clause (clauses, OMP_CLAUSE_NUM_TEAMS); + if (c) + t = OMP_CLAUSE_NUM_TEAMS_EXPR (c); + else + t = integer_minus_one_node; + push_target_argument_according_to_value (gsi, GOMP_TARGET_ARG_DEVICE_ALL, + GOMP_TARGET_ARG_NUM_TEAMS, t, &args); + + c = find_omp_clause (clauses, OMP_CLAUSE_THREAD_LIMIT); + if (c) + t = OMP_CLAUSE_THREAD_LIMIT_EXPR (c); + else + t = integer_minus_one_node; + push_target_argument_according_to_value (gsi, GOMP_TARGET_ARG_DEVICE_ALL, + GOMP_TARGET_ARG_THREAD_LIMIT, t, + &args); + + /* Add HSA-specific grid sizes, if available. */ + if (find_omp_clause (gimple_omp_target_clauses (tgt_stmt), + OMP_CLAUSE__GRIDDIM_)) + { + t = get_target_argument_identifier (GOMP_DEVICE_HSA, true, + GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES); + args.quick_push (t); + args.quick_push (grid_get_kernel_launch_attributes (gsi, tgt_stmt)); + } + + /* Produce more, perhaps device specific, arguments here. */ + + tree argarray = create_tmp_var (build_array_type_nelts (ptr_type_node, + args.length () + 1), + ".omp_target_args"); + for (unsigned i = 0; i < args.length (); i++) + { + tree ref = build4 (ARRAY_REF, ptr_type_node, argarray, + build_int_cst (integer_type_node, i), + NULL_TREE, NULL_TREE); + gsi_insert_before (gsi, gimple_build_assign (ref, args[i]), + GSI_SAME_STMT); + } + tree ref = build4 (ARRAY_REF, ptr_type_node, argarray, + build_int_cst (integer_type_node, args.length ()), + NULL_TREE, NULL_TREE); + gsi_insert_before (gsi, gimple_build_assign (ref, null_pointer_node), + GSI_SAME_STMT); + TREE_ADDRESSABLE (argarray) = 1; + return build_fold_addr_expr (argarray); +} + /* Expand the GIMPLE_OMP_TARGET starting at REGION.
*/ static void @@ -12982,30 +13300,7 @@ expand_omp_target (struct omp_region *region) depend = build_int_cst (ptr_type_node, 0); args.quick_push (depend); if (start_ix == BUILT_IN_GOMP_TARGET) - { - c = find_omp_clause (clauses, OMP_CLAUSE_NUM_TEAMS); - if (c) - { - t = fold_convert (integer_type_node, - OMP_CLAUSE_NUM_TEAMS_EXPR (c)); - t = force_gimple_operand_gsi (&gsi, t, true, NULL, - true, GSI_SAME_STMT); - } - else - t = integer_minus_one_node; - args.quick_push (t); - c = find_omp_clause (clauses, OMP_CLAUSE_THREAD_LIMIT); - if (c) - { - t = fold_convert (integer_type_node, - OMP_CLAUSE_THREAD_LIMIT_EXPR (c)); - t = force_gimple_operand_gsi (&gsi, t, true, NULL, - true, GSI_SAME_STMT); - } - else - t = integer_minus_one_node; - args.quick_push (t); - } + args.quick_push (get_target_arguments (&gsi, entry_stmt)); break; case BUILT_IN_GOACC_PARALLEL: { @@ -13109,90 +13404,343 @@ expand_omp_target (struct omp_region *region) } } - -/* Expand the parallel region tree rooted at REGION. Expansion - proceeds in depth-first order. Innermost regions are expanded - first. This way, parallel regions that require a new function to - be created (e.g., GIMPLE_OMP_PARALLEL) can be expanded without having any - internal dependencies in their body. */ +/* Expand KFOR loop as a GPGPU kernel, i.e. as a body only with iteration + variable derived from the thread number. */ static void -expand_omp (struct omp_region *region) +grid_expand_omp_for_loop (struct omp_region *kfor) { - omp_any_child_fn_dumped = false; - while (region) - { - location_t saved_location; - gimple *inner_stmt = NULL; + tree t, threadid; + tree type, itype; + gimple_stmt_iterator gsi; + tree n1, step; + struct omp_for_data fd; - /* First, determine whether this is a combined parallel+workshare - region. 
*/ - if (region->type == GIMPLE_OMP_PARALLEL) - determine_parallel_type (region); + gomp_for *for_stmt = as_a <gomp_for *> (last_stmt (kfor->entry)); + gcc_checking_assert (gimple_omp_for_kind (for_stmt) + == GF_OMP_FOR_KIND_GRID_LOOP); + basic_block body_bb = FALLTHRU_EDGE (kfor->entry)->dest; - if (region->type == GIMPLE_OMP_FOR - && gimple_omp_for_combined_p (last_stmt (region->entry))) - inner_stmt = last_stmt (region->inner->entry); + gcc_assert (gimple_omp_for_collapse (for_stmt) == 1); + gcc_assert (kfor->cont); + extract_omp_for_data (for_stmt, &fd, NULL); - if (region->inner) - expand_omp (region->inner); + itype = type = TREE_TYPE (fd.loop.v); + if (POINTER_TYPE_P (type)) + itype = signed_type_for (type); - saved_location = input_location; - if (gimple_has_location (last_stmt (region->entry))) - input_location = gimple_location (last_stmt (region->entry)); + gsi = gsi_start_bb (body_bb); - switch (region->type) - { - case GIMPLE_OMP_PARALLEL: - case GIMPLE_OMP_TASK: - expand_omp_taskreg (region); - break; + n1 = fd.loop.n1; + step = fd.loop.step; + n1 = force_gimple_operand_gsi (&gsi, fold_convert (type, n1), + true, NULL_TREE, true, GSI_SAME_STMT); + step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step), + true, NULL_TREE, true, GSI_SAME_STMT); + threadid = build_call_expr (builtin_decl_explicit + (BUILT_IN_OMP_GET_THREAD_NUM), 0); + threadid = fold_convert (itype, threadid); + threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE, + true, GSI_SAME_STMT); - case GIMPLE_OMP_FOR: - expand_omp_for (region, inner_stmt); - break; + tree startvar = fd.loop.v; + t = fold_build2 (MULT_EXPR, itype, threadid, step); + if (POINTER_TYPE_P (type)) + t = fold_build_pointer_plus (n1, t); + else + t = fold_build2 (PLUS_EXPR, type, t, n1); + t = fold_convert (type, t); + t = force_gimple_operand_gsi (&gsi, t, + DECL_P (startvar) + && TREE_ADDRESSABLE (startvar), + NULL_TREE, true, GSI_SAME_STMT); + gassign *assign_stmt = gimple_build_assign (startvar, t); + gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); - case GIMPLE_OMP_SECTIONS: - expand_omp_sections (region); - break; + /* Remove the omp for statement */ + gsi = gsi_last_bb (kfor->entry); + gsi_remove (&gsi, true); - case GIMPLE_OMP_SECTION: - /* Individual omp sections are handled together with their - parent GIMPLE_OMP_SECTIONS region. */ - break; + /* Remove the GIMPLE_OMP_CONTINUE statement. */ + gsi = gsi_last_bb (kfor->cont); + gcc_assert (!gsi_end_p (gsi) + && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_CONTINUE); + gsi_remove (&gsi, true); - case GIMPLE_OMP_SINGLE: - expand_omp_single (region); - break; + /* Replace the GIMPLE_OMP_RETURN with a real return. */ + gsi = gsi_last_bb (kfor->exit); + gcc_assert (!gsi_end_p (gsi) + && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN); + gsi_remove (&gsi, true); - case GIMPLE_OMP_ORDERED: - { - gomp_ordered *ord_stmt - = as_a <gomp_ordered *> (last_stmt (region->entry)); - if (find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), - OMP_CLAUSE_DEPEND)) - { - /* We'll expand these when expanding corresponding - worksharing region with ordered(n) clause. */ - gcc_assert (region->outer - && region->outer->type == GIMPLE_OMP_FOR); - region->ord_stmt = ord_stmt; - break; - } - } - /* FALLTHRU */ - case GIMPLE_OMP_MASTER: - case GIMPLE_OMP_TASKGROUP: - case GIMPLE_OMP_CRITICAL: - case GIMPLE_OMP_TEAMS: - expand_omp_synch (region); - break; + /* Fixup the much simpler CFG.
*/ + remove_edge (find_edge (kfor->cont, body_bb)); - case GIMPLE_OMP_ATOMIC_LOAD: - expand_omp_atomic (region); - break; + if (kfor->cont != body_bb) + set_immediate_dominator (CDI_DOMINATORS, kfor->cont, body_bb); + set_immediate_dominator (CDI_DOMINATORS, kfor->exit, kfor->cont); +} - case GIMPLE_OMP_TARGET: +/* Structure passed to grid_remap_kernel_arg_accesses so that it can remap + argument_decls. */ + +struct grid_arg_decl_map +{ + tree old_arg; + tree new_arg; +}; + +/* Invoked through walk_gimple_op, will remap all PARM_DECLs to the ones + pertaining to kernel function. */ + +static tree +grid_remap_kernel_arg_accesses (tree *tp, int *walk_subtrees, void *data) +{ + struct walk_stmt_info *wi = (struct walk_stmt_info *) data; + struct grid_arg_decl_map *adm = (struct grid_arg_decl_map *) wi->info; + tree t = *tp; + + if (t == adm->old_arg) + *tp = adm->new_arg; + *walk_subtrees = !TYPE_P (t) && !DECL_P (t); + return NULL_TREE; +} + +static void expand_omp (struct omp_region *region); + +/* If TARGET region contains a kernel body for loop, remove its region from the + TARGET and expand it in GPGPU kernel fashion. */ + +static void +grid_expand_target_grid_body (struct omp_region *target) +{ + if (!hsa_gen_requested_p ()) + return; + + gomp_target *tgt_stmt = as_a <gomp_target *> (last_stmt (target->entry)); + struct omp_region **pp; + + for (pp = &target->inner; *pp; pp = &(*pp)->next) + if ((*pp)->type == GIMPLE_OMP_GRID_BODY) + break; + + struct omp_region *gpukernel = *pp; + + tree orig_child_fndecl = gimple_omp_target_child_fn (tgt_stmt); + if (!gpukernel) + { + /* HSA cannot handle OACC stuff. */ + if (gimple_omp_target_kind (tgt_stmt) != GF_OMP_TARGET_KIND_REGION) + return; + gcc_checking_assert (orig_child_fndecl); + gcc_assert (!find_omp_clause (gimple_omp_target_clauses (tgt_stmt), + OMP_CLAUSE__GRIDDIM_)); + cgraph_node *n = cgraph_node::get (orig_child_fndecl); + + hsa_register_kernel (n); + return; + } + + gcc_assert (find_omp_clause (gimple_omp_target_clauses (tgt_stmt), + OMP_CLAUSE__GRIDDIM_)); + tree inside_block = gimple_block (first_stmt (single_succ (gpukernel->entry))); + *pp = gpukernel->next; + for (pp = &gpukernel->inner; *pp; pp = &(*pp)->next) + if ((*pp)->type == GIMPLE_OMP_FOR) + break; + + struct omp_region *kfor = *pp; + gcc_assert (kfor); + gcc_assert (gimple_omp_for_kind (last_stmt ((kfor)->entry)) + == GF_OMP_FOR_KIND_GRID_LOOP); + *pp = kfor->next; + if (kfor->inner) + expand_omp (kfor->inner); + if (gpukernel->inner) + expand_omp (gpukernel->inner); + + tree kern_fndecl = copy_node (orig_child_fndecl); + DECL_NAME (kern_fndecl) = clone_function_name (kern_fndecl, "kernel"); + SET_DECL_ASSEMBLER_NAME (kern_fndecl, DECL_NAME (kern_fndecl)); + tree tgtblock = gimple_block (tgt_stmt); + tree fniniblock = make_node (BLOCK); + BLOCK_ABSTRACT_ORIGIN (fniniblock) = tgtblock; + BLOCK_SOURCE_LOCATION (fniniblock) = BLOCK_SOURCE_LOCATION (tgtblock); + BLOCK_SOURCE_END_LOCATION (fniniblock) = BLOCK_SOURCE_END_LOCATION (tgtblock); + DECL_INITIAL (kern_fndecl) = fniniblock; + push_struct_function (kern_fndecl); + cfun->function_end_locus = gimple_location (tgt_stmt); + pop_cfun (); + + tree old_parm_decl = DECL_ARGUMENTS (kern_fndecl); + gcc_assert (!DECL_CHAIN (old_parm_decl)); + tree new_parm_decl = copy_node (DECL_ARGUMENTS (kern_fndecl)); + DECL_CONTEXT (new_parm_decl) = kern_fndecl; + DECL_ARGUMENTS (kern_fndecl) = new_parm_decl; + struct function *kern_cfun = DECL_STRUCT_FUNCTION (kern_fndecl); + kern_cfun->curr_properties = cfun->curr_properties; + + remove_edge
(BRANCH_EDGE (kfor->entry)); + grid_expand_omp_for_loop (kfor); + + /* Remove the omp for statement */ + gimple_stmt_iterator gsi = gsi_last_bb (gpukernel->entry); + gsi_remove (&gsi, true); + /* Replace the GIMPLE_OMP_RETURN at the end of the kernel region with a real + return. */ + gsi = gsi_last_bb (gpukernel->exit); + gcc_assert (!gsi_end_p (gsi) + && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN); + gimple *ret_stmt = gimple_build_return (NULL); + gsi_insert_after (&gsi, ret_stmt, GSI_SAME_STMT); + gsi_remove (&gsi, true); + + /* Statements in the first BB in the target construct have been produced by + target lowering and must be copied inside the GPUKERNEL, with the two + exceptions of the first OMP statement and the OMP_DATA assignment + statement. */ + gsi = gsi_start_bb (single_succ (gpukernel->entry)); + tree data_arg = gimple_omp_target_data_arg (tgt_stmt); + tree sender = data_arg ? TREE_VEC_ELT (data_arg, 0) : NULL; + for (gimple_stmt_iterator tsi = gsi_start_bb (single_succ (target->entry)); + !gsi_end_p (tsi); gsi_next (&tsi)) + { + gimple *stmt = gsi_stmt (tsi); + if (is_gimple_omp (stmt)) + break; + if (sender + && is_gimple_assign (stmt) + && TREE_CODE (gimple_assign_rhs1 (stmt)) == ADDR_EXPR + && TREE_OPERAND (gimple_assign_rhs1 (stmt), 0) == sender) + continue; + gimple *copy = gimple_copy (stmt); + gsi_insert_before (&gsi, copy, GSI_SAME_STMT); + gimple_set_block (copy, fniniblock); + } + + move_sese_region_to_fn (kern_cfun, single_succ (gpukernel->entry), + gpukernel->exit, inside_block); + + cgraph_node *kcn = cgraph_node::get_create (kern_fndecl); + kcn->mark_force_output (); + cgraph_node *orig_child = cgraph_node::get (orig_child_fndecl); + + hsa_register_kernel (kcn, orig_child); + + cgraph_node::add_new_function (kern_fndecl, true); + push_cfun (kern_cfun); + cgraph_edge::rebuild_edges (); + + /* Re-map any mention of the PARM_DECL of the original function to the + PARM_DECL of the new one. + + TODO: It would be great if lowering produced references into the GPU + kernel decl straight away and we did not have to do this. */ + struct grid_arg_decl_map adm; + adm.old_arg = old_parm_decl; + adm.new_arg = new_parm_decl; + basic_block bb; + FOR_EACH_BB_FN (bb, kern_cfun) + { + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + struct walk_stmt_info wi; + memset (&wi, 0, sizeof (wi)); + wi.info = &adm; + walk_gimple_op (stmt, grid_remap_kernel_arg_accesses, &wi); + } + } + pop_cfun (); + + return; +} + +/* Expand the parallel region tree rooted at REGION. Expansion + proceeds in depth-first order. Innermost regions are expanded + first. This way, parallel regions that require a new function to + be created (e.g., GIMPLE_OMP_PARALLEL) can be expanded without having any + internal dependencies in their body. */ + +static void +expand_omp (struct omp_region *region) +{ + omp_any_child_fn_dumped = false; + while (region) + { + location_t saved_location; + gimple *inner_stmt = NULL; + + /* First, determine whether this is a combined parallel+workshare + region. 
*/ + if (region->type == GIMPLE_OMP_PARALLEL) + determine_parallel_type (region); + else if (region->type == GIMPLE_OMP_TARGET) + grid_expand_target_grid_body (region); + + if (region->type == GIMPLE_OMP_FOR + && gimple_omp_for_combined_p (last_stmt (region->entry))) + inner_stmt = last_stmt (region->inner->entry); + + if (region->inner) + expand_omp (region->inner); + + saved_location = input_location; + if (gimple_has_location (last_stmt (region->entry))) + input_location = gimple_location (last_stmt (region->entry)); + + switch (region->type) + { + case GIMPLE_OMP_PARALLEL: + case GIMPLE_OMP_TASK: + expand_omp_taskreg (region); + break; + + case GIMPLE_OMP_FOR: + expand_omp_for (region, inner_stmt); + break; + + case GIMPLE_OMP_SECTIONS: + expand_omp_sections (region); + break; + + case GIMPLE_OMP_SECTION: + /* Individual omp sections are handled together with their + parent GIMPLE_OMP_SECTIONS region. */ + break; + + case GIMPLE_OMP_SINGLE: + expand_omp_single (region); + break; + + case GIMPLE_OMP_ORDERED: + { + gomp_ordered *ord_stmt + = as_a <gomp_ordered *> (last_stmt (region->entry)); + if (find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), + OMP_CLAUSE_DEPEND)) + { + /* We'll expand these when expanding corresponding + worksharing region with ordered(n) clause. */ + gcc_assert (region->outer + && region->outer->type == GIMPLE_OMP_FOR); + region->ord_stmt = ord_stmt; + break; + } + } + /* FALLTHRU */ + case GIMPLE_OMP_MASTER: + case GIMPLE_OMP_TASKGROUP: + case GIMPLE_OMP_CRITICAL: + case GIMPLE_OMP_TEAMS: + expand_omp_synch (region); + break; + + case GIMPLE_OMP_ATOMIC_LOAD: + expand_omp_atomic (region); + break; + + case GIMPLE_OMP_TARGET: expand_omp_target (region); break; @@ -14507,11 +15055,13 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) ctx); } - gimple_seq_add_stmt (&body, stmt); + if (!gimple_omp_for_grid_phony (stmt)) + gimple_seq_add_stmt (&body, stmt); gimple_seq_add_seq (&body, gimple_omp_body (stmt)); - gimple_seq_add_stmt (&body, gimple_build_omp_continue (fd.loop.v, - fd.loop.v)); + if (!gimple_omp_for_grid_phony (stmt)) + gimple_seq_add_stmt (&body, gimple_build_omp_continue (fd.loop.v, + fd.loop.v)); /* After the loop, add exit clauses. */ lower_reduction_clauses (gimple_omp_for_clauses (stmt), &body, ctx); @@ -14523,9 +15073,12 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) body = maybe_catch_exception (body); - /* Region exit marker goes at the end of the loop body. */ - gimple_seq_add_stmt (&body, gimple_build_omp_return (fd.have_nowait)); - maybe_add_implicit_barrier_cancel (ctx, &body); + if (!gimple_omp_for_grid_phony (stmt)) + { + /* Region exit marker goes at the end of the loop body. */ + gimple_seq_add_stmt (&body, gimple_build_omp_return (fd.have_nowait)); + maybe_add_implicit_barrier_cancel (ctx, &body); + } /* Add OpenACC joining and reduction markers just after the loop.
*/ if (oacc_tail) @@ -14968,6 +15521,14 @@ lower_omp_taskreg (gimple_stmt_iterator *gsi_p, omp_context *ctx) par_olist = NULL; par_ilist = NULL; par_rlist = NULL; + bool phony_construct = gimple_code (stmt) == GIMPLE_OMP_PARALLEL + && gimple_omp_parallel_grid_phony (as_a <gomp_parallel *> (stmt)); + if (phony_construct && ctx->record_type) + { + gcc_checking_assert (!ctx->receiver_decl); + ctx->receiver_decl = create_tmp_var + (build_reference_type (ctx->record_type), ".omp_rec"); + } lower_rec_input_clauses (clauses, &par_ilist, &par_olist, ctx, NULL); lower_omp (&par_body, ctx); if (gimple_code (stmt) == GIMPLE_OMP_PARALLEL) @@ -15026,13 +15587,19 @@ lower_omp_taskreg (gimple_stmt_iterator *gsi_p, omp_context *ctx) gimple_seq_add_stmt (&new_body, gimple_build_omp_continue (integer_zero_node, integer_zero_node)); - gimple_seq_add_stmt (&new_body, gimple_build_omp_return (false)); - gimple_omp_set_body (stmt, new_body); + if (!phony_construct) + { + gimple_seq_add_stmt (&new_body, gimple_build_omp_return (false)); + gimple_omp_set_body (stmt, new_body); + } bind = gimple_build_bind (NULL, NULL, gimple_bind_block (par_bind)); gsi_replace (gsi_p, dep_bind ? dep_bind : bind, true); gimple_bind_add_seq (bind, ilist); - gimple_bind_add_stmt (bind, stmt); + if (!phony_construct) + gimple_bind_add_stmt (bind, stmt); + else + gimple_bind_add_seq (bind, new_body); gimple_bind_add_seq (bind, olist); pop_gimplify_context (NULL); @@ -16165,19 +16732,22 @@ lower_omp_teams (gimple_stmt_iterator *gsi_p, omp_context *ctx) &bind_body, &dlist, ctx, NULL); lower_omp (gimple_omp_body_ptr (teams_stmt), ctx); lower_reduction_clauses (gimple_omp_teams_clauses (teams_stmt), &olist, ctx); - gimple_seq_add_stmt (&bind_body, teams_stmt); - - location_t loc = gimple_location (teams_stmt); - tree decl = builtin_decl_explicit (BUILT_IN_GOMP_TEAMS); - gimple *call = gimple_build_call (decl, 2, num_teams, thread_limit); - gimple_set_location (call, loc); - gimple_seq_add_stmt (&bind_body, call); + if (!gimple_omp_teams_grid_phony (teams_stmt)) + { + gimple_seq_add_stmt (&bind_body, teams_stmt); + location_t loc = gimple_location (teams_stmt); + tree decl = builtin_decl_explicit (BUILT_IN_GOMP_TEAMS); + gimple *call = gimple_build_call (decl, 2, num_teams, thread_limit); + gimple_set_location (call, loc); + gimple_seq_add_stmt (&bind_body, call); + } gimple_seq_add_seq (&bind_body, gimple_omp_body (teams_stmt)); gimple_omp_set_body (teams_stmt, NULL); gimple_seq_add_seq (&bind_body, olist); gimple_seq_add_seq (&bind_body, dlist); - gimple_seq_add_stmt (&bind_body, gimple_build_omp_return (true)); + if (!gimple_omp_teams_grid_phony (teams_stmt)) + gimple_seq_add_stmt (&bind_body, gimple_build_omp_return (true)); gimple_bind_set_body (bind, bind_body); pop_gimplify_context (bind); @@ -16188,6 +16758,17 @@ lower_omp_teams (gimple_stmt_iterator *gsi_p, omp_context *ctx) TREE_USED (block) = 1; } +/* Expand code within an artificial GIMPLE_OMP_GRID_BODY OMP construct. */ + +static void +lower_omp_grid_body (gimple_stmt_iterator *gsi_p, omp_context *ctx) +{ + gimple *stmt = gsi_stmt (*gsi_p); + lower_omp (gimple_omp_body_ptr (stmt), ctx); + gimple_seq_add_stmt (gimple_omp_body_ptr (stmt), + gimple_build_omp_return (false)); +}
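
Taken together with the lower_omp_for, lower_omp_taskreg and lower_omp_teams changes above, a gridified target at this point carries two copies of its body; roughly (a pseudo-GIMPLE sketch, not authoritative dump output):

/* Sketch of a gridified target after omp-lowering (pseudo-GIMPLE):

   #pragma omp target ... _griddim_(0:size,group)
     #pragma omp teams             <- original, lowered/expanded as usual
       #pragma omp distribute
         #pragma omp parallel
           #pragma omp for
     #pragma omp grid_body         <- copy that becomes the HSA kernel
       teams (grid_phony)          <- lowered without a GOMP_teams call
         distribute (grid_phony)   <- no continue/return markers emitted
           parallel (grid_phony)   <- no child function created
             for (kind GRID_LOOP)  <- iteration derived from the thread id  */
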
/* Callback for lower_omp_1. Return non-NULL if *tp needs to be regimplified. If DATA is non-NULL, lower_omp_1 is outside @@ -16399,6 +16980,11 @@ lower_omp_1 (gimple_stmt_iterator *gsi_p, omp_context *ctx) gcc_assert (ctx); lower_omp_teams (gsi_p, ctx); break; + case GIMPLE_OMP_GRID_BODY: + ctx = maybe_lookup_ctx (stmt); + gcc_assert (ctx); + lower_omp_grid_body (gsi_p, ctx); + break; case GIMPLE_CALL: tree fndecl; call_stmt = as_a <gcall *> (stmt); @@ -16488,7 +17074,682 @@ lower_omp (gimple_seq *body, omp_context *ctx) fold_stmt (&gsi); input_location = saved_location; } + +/* Return true if STMT is an assignment of a register-type into a local + VAR_DECL. */ + +static bool +grid_reg_assignment_to_local_var_p (gimple *stmt) +{ + gassign *assign = dyn_cast <gassign *> (stmt); + if (!assign) + return false; + tree lhs = gimple_assign_lhs (assign); + if (TREE_CODE (lhs) != VAR_DECL + || !is_gimple_reg_type (TREE_TYPE (lhs)) + || is_global_var (lhs)) + return false; + return true; +} + +/* Return true if all statements in SEQ are assignments to local register-type + variables. */ + +static bool +grid_seq_only_contains_local_assignments (gimple_seq seq) +{ + if (!seq) + return true; + + gimple_stmt_iterator gsi; + for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi)) + if (!grid_reg_assignment_to_local_var_p (gsi_stmt (gsi))) + return false; + return true; +} + +/* Scan statements in SEQ and call itself recursively on any bind. If during + the whole search only assignments to register-type local variables and a + single OMP statement are encountered, return true, otherwise return false. + RET is where we store any OMP statement encountered. TARGET_LOC and NAME + are used for dumping a note about a failure. */ + +static bool +grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc, + const char *name, gimple **ret) +{ + gimple_stmt_iterator gsi; + for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (grid_reg_assignment_to_local_var_p (stmt)) + continue; + if (gbind *bind = dyn_cast <gbind *> (stmt)) + { + if (!grid_find_single_omp_among_assignments_1 (gimple_bind_body (bind), + target_loc, name, ret)) + return false; + } + else if (is_gimple_omp (stmt)) + { + if (*ret) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, target_loc, + "Will not turn target construct into a simple " + "GPGPU kernel because %s construct contains " + "multiple OpenMP constructs\n", name); + return false; + } + *ret = stmt; + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, target_loc, + "Will not turn target construct into a simple " + "GPGPU kernel because %s construct contains " + "a complex statement\n", name); + return false; + } + } + return true; +}
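
As a concrete (hypothetical) illustration, a teams body of the following shape satisfies this helper: the scalar initialization gimplifies to register-type assignments to locals, and the distribute is the single OMP statement returned:

/* Hypothetical input accepted by the check above.  */
#pragma omp teams
  {
    int half = n / 2;			/* local register-type assignment  */
  #pragma omp distribute parallel for	/* the single OMP construct	   */
    for (int i = 0; i < half; i++)
      a[i] = b[i];
  }
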
+ +/* Scan statements in SEQ and make sure that it and any binds in it contain only assignments to local register-type variables and one OMP construct. If so, return that construct, otherwise return NULL. If dumping is enabled and the function fails, use TARGET_LOC and NAME to dump a note with the reason for failure. */ + +static gimple * +grid_find_single_omp_among_assignments (gimple_seq seq, location_t target_loc, + const char *name) +{ + if (!seq) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, target_loc, + "Will not turn target construct into a simple " + "GPGPU kernel because %s construct has empty " + "body\n", + name); + return NULL; + } + + gimple *ret = NULL; + if (grid_find_single_omp_among_assignments_1 (seq, target_loc, name, &ret)) + { + if (!ret && dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, target_loc, + "Will not turn target construct into a simple " + "GPGPU kernel because %s construct does not contain " + "any other OpenMP construct\n", name); + return ret; + } + else + return NULL; +} + +/* Walker function looking for statements for which there is no point in + gridifying (and for noreturn function calls which we cannot handle). Return + non-NULL if such a statement is found. */ + +static tree +grid_find_ungridifiable_statement (gimple_stmt_iterator *gsi, + bool *handled_ops_p, + struct walk_stmt_info *) +{ + *handled_ops_p = false; + gimple *stmt = gsi_stmt (*gsi); + switch (gimple_code (stmt)) + { + case GIMPLE_CALL: + if (gimple_call_noreturn_p (as_a <gcall *> (stmt))) + { + *handled_ops_p = true; + return error_mark_node; + } + break; + + /* We may reduce the following list if we find a way to implement the + clauses, but for now there is no point in trying further. */ + case GIMPLE_OMP_CRITICAL: + case GIMPLE_OMP_TASKGROUP: + case GIMPLE_OMP_TASK: + case GIMPLE_OMP_SECTION: + case GIMPLE_OMP_SECTIONS: + case GIMPLE_OMP_SECTIONS_SWITCH: + case GIMPLE_OMP_TARGET: + case GIMPLE_OMP_ORDERED: + *handled_ops_p = true; + return error_mark_node; + + default: + break; + } + return NULL; +}
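
In source terms, the shape the following predicate accepts is the fully combined pattern below (a hypothetical example; a reduction, lastprivate, num_threads, or non-auto schedule clause on the respective constructs would make one of its checks bail out):

/* Hypothetical user code that grid_target_follows_gridifiable_pattern
   accepts; thread_limit, if present, becomes the requested group size.  */
void
vec_add (int *a, int *b, int *c, int n)
{
#pragma omp target teams thread_limit (64)
#pragma omp distribute parallel for
  for (int i = 0; i < n; i++)
    a[i] = b[i] + c[i];
}
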
+ + +/* If TARGET follows a pattern that can be turned into a gridified GPGPU kernel, return true, otherwise return false. In the case of success, also fill in GROUP_SIZE_P with the requested group size or NULL if there is none. */ + +static bool +grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p) +{ + if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION) + return false; + + location_t tloc = gimple_location (target); + gimple *stmt + = grid_find_single_omp_among_assignments (gimple_omp_body (target), + tloc, "target"); + if (!stmt) + return false; + gomp_teams *teams = dyn_cast <gomp_teams *> (stmt); + tree group_size = NULL; + if (!teams) + { + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a simple " + "GPGPU kernel because it does not have a sole teams " + "construct in it.\n"); + return false; + } + + tree clauses = gimple_omp_teams_clauses (teams); + while (clauses) + { + switch (OMP_CLAUSE_CODE (clauses)) + { + case OMP_CLAUSE_NUM_TEAMS: + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because we cannot " + "handle num_teams clause of teams " + "construct\n "); + return false; + + case OMP_CLAUSE_REDUCTION: + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because a reduction " + "clause is present\n "); + return false; + + case OMP_CLAUSE_LASTPRIVATE: + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because a lastprivate " + "clause is present\n "); + return false; + + case OMP_CLAUSE_THREAD_LIMIT: + group_size = OMP_CLAUSE_OPERAND (clauses, 0); + break; + + default: + break; + } + clauses = OMP_CLAUSE_CHAIN (clauses); + } + + stmt = grid_find_single_omp_among_assignments (gimple_omp_body (teams), tloc, + "teams"); + if (!stmt) + return false; + gomp_for *dist = dyn_cast <gomp_for *> (stmt); + if (!dist) + { + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a simple " + "GPGPU kernel because the teams construct does not have " + "a sole distribute construct in it.\n"); + return false; + } + + gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE); + if (!gimple_omp_for_combined_p (dist)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a gridified GPGPU " + "kernel because we cannot handle a standalone " + "distribute construct\n "); + return false; + } + if (dist->collapse > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a gridified GPGPU " + "kernel because the distribute construct contains " + "a collapse clause\n"); + return false; + } + struct omp_for_data fd; + extract_omp_for_data (dist, &fd, NULL); + if (fd.chunk_size) + { + if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because the teams " + "thread limit is different from distribute " + "schedule chunk\n"); + return false; + } + group_size = fd.chunk_size; + } + stmt = grid_find_single_omp_among_assignments (gimple_omp_body (dist), tloc, + "distribute"); + gomp_parallel *par; + if (!stmt || !(par = dyn_cast <gomp_parallel *> (stmt))) + return false; + + clauses = gimple_omp_parallel_clauses (par); + while (clauses) + { + switch (OMP_CLAUSE_CODE (clauses)) + { + case OMP_CLAUSE_NUM_THREADS: + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a gridified " + "GPGPU kernel because there is a num_threads " + "clause of the parallel construct\n"); +
return false; + + case OMP_CLAUSE_REDUCTION: + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because a reduction " + "clause is present\n "); + return false; + + case OMP_CLAUSE_LASTPRIVATE: + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because a lastprivate " + "clause is present\n "); + return false; + + default: + break; + } + clauses = OMP_CLAUSE_CHAIN (clauses); + } + + stmt = grid_find_single_omp_among_assignments (gimple_omp_body (par), tloc, + "parallel"); + gomp_for *gfor; + if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt))) + return false; + + if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a gridified GPGPU " + "kernel because the inner loop is not a simple for " + "loop\n"); + return false; + } + if (gfor->collapse > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a gridified GPGPU " + "kernel because the inner loop contains a collapse " + "clause\n"); + return false; + } + + if (!grid_seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a gridified GPGPU " + "kernel because the inner loop pre_body contains " + "a complex instruction\n"); + return false; + } + + clauses = gimple_omp_for_clauses (gfor); + while (clauses) + { + switch (OMP_CLAUSE_CODE (clauses)) + { + case OMP_CLAUSE_SCHEDULE: + if (OMP_CLAUSE_SCHEDULE_KIND (clauses) != OMP_CLAUSE_SCHEDULE_AUTO) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because the inner " + "loop has a non-automatic scheduling clause\n"); + return false; + } + break; + + case OMP_CLAUSE_REDUCTION: + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because a reduction " + "clause is present\n "); + return false; + + case OMP_CLAUSE_LASTPRIVATE: + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a " + "gridified GPGPU kernel because a lastprivate " + "clause is present\n "); + return false; + + default: + break; + } + clauses = OMP_CLAUSE_CHAIN (clauses); + } + + struct walk_stmt_info wi; + memset (&wi, 0, sizeof (wi)); + if (gimple *bad = walk_gimple_seq (gimple_omp_body (gfor), + grid_find_ungridifiable_statement, + NULL, &wi)) + { + if (dump_enabled_p ()) + { + if (is_gimple_call (bad)) + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a gridified " + "GPGPU kernel because the inner loop contains " + "call to a noreturn function\n"); + else + dump_printf_loc (MSG_NOTE, tloc, + "Will not turn target construct into a gridified " + "GPGPU kernel because the inner loop contains " + "statement %s which cannot be transformed\n", + gimple_code_name[(int) gimple_code (bad)]); + } + return false; + } + + *group_size_p = group_size; + return true; +} + +/* Operand walker, used to remap pre-body declarations according to a hash map + provided in DATA.
*/ + +static tree +grid_remap_prebody_decls (tree *tp, int *walk_subtrees, void *data) +{ + tree t = *tp; + + if (DECL_P (t) || TYPE_P (t)) + *walk_subtrees = 0; + else + *walk_subtrees = 1; + + if (TREE_CODE (t) == VAR_DECL) + { + struct walk_stmt_info *wi = (struct walk_stmt_info *) data; + hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info; + tree *repl = declmap->get (t); + if (repl) + *tp = *repl; + } + return NULL_TREE; +} + +/* Copy leading register-type assignments to local variables in SRC to just + before DST, creating temporaries, adjusting mapping of operands in WI and + remapping operands as necessary. Add any new temporaries to TGT_BIND. + Return the first statement that does not conform to + grid_reg_assignment_to_local_var_p or NULL. */ + +static gimple * +grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst, + gbind *tgt_bind, struct walk_stmt_info *wi) +{ + hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info; + gimple_stmt_iterator gsi; + for (gsi = gsi_start (src); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (gbind *bind = dyn_cast <gbind *> (stmt)) + { + gimple *r = grid_copy_leading_local_assignments + (gimple_bind_body (bind), dst, tgt_bind, wi); + if (r) + return r; + else + continue; + } + if (!grid_reg_assignment_to_local_var_p (stmt)) + return stmt; + tree lhs = gimple_assign_lhs (as_a <gassign *> (stmt)); + tree repl = copy_var_decl (lhs, create_tmp_var_name (NULL), + TREE_TYPE (lhs)); + DECL_CONTEXT (repl) = current_function_decl; + gimple_bind_append_vars (tgt_bind, repl); + + declmap->put (lhs, repl); + gassign *copy = as_a <gassign *> (gimple_copy (stmt)); + walk_gimple_op (copy, grid_remap_prebody_decls, wi); + gsi_insert_before (dst, copy, GSI_SAME_STMT); + } + return NULL; +} + +/* Given freshly copied top level kernel SEQ, identify the individual OMP + components, mark them as part of the kernel and return the inner loop, and + copy assignments leading to them just before DST, remapping them using WI + and adding new temporaries to TGT_BIND. */ + +static gomp_for * +grid_process_kernel_body_copy (gimple_seq seq, gimple_stmt_iterator *dst, + gbind *tgt_bind, struct walk_stmt_info *wi) +{ + gimple *stmt = grid_copy_leading_local_assignments (seq, dst, tgt_bind, wi); + gomp_teams *teams = dyn_cast <gomp_teams *> (stmt); + gcc_assert (teams); + gimple_omp_teams_set_grid_phony (teams, true); + stmt = grid_copy_leading_local_assignments (gimple_omp_body (teams), dst, + tgt_bind, wi); + gcc_checking_assert (stmt); + gomp_for *dist = dyn_cast <gomp_for *> (stmt); + gcc_assert (dist); + gimple_seq prebody = gimple_omp_for_pre_body (dist); + if (prebody) + grid_copy_leading_local_assignments (prebody, dst, tgt_bind, wi); + gimple_omp_for_set_grid_phony (dist, true); + stmt = grid_copy_leading_local_assignments (gimple_omp_body (dist), dst, + tgt_bind, wi); + gcc_checking_assert (stmt); + + gomp_parallel *parallel = as_a <gomp_parallel *> (stmt); + gimple_omp_parallel_set_grid_phony (parallel, true); + stmt = grid_copy_leading_local_assignments (gimple_omp_body (parallel), dst, + tgt_bind, wi); + gomp_for *inner_loop = as_a <gomp_for *> (stmt); + gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_GRID_LOOP); + prebody = gimple_omp_for_pre_body (inner_loop); + if (prebody) + grid_copy_leading_local_assignments (prebody, dst, tgt_bind, wi); + + return inner_loop; +}
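
The per-dimension grid size computed inside the gridification loop below is the usual trip count ceil((n2 - n1) / step) of the normalized loop; a worked instance:

/* Worked example of the grid-size formula below (cond_code == LT_EXPR):
   for (i = 0; i < 100; i += 3)
     gs = (step + (-1) + n2 - n1) / step = (3 - 1 + 100 - 0) / 3 = 34,
   i.e. one work item per loop iteration, since ceil (100 / 3) == 34.  */
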
+/* If TARGET points to a GOMP_TARGET which follows a gridifiable pattern,
+   create a GPU kernel for it.  GSI must point to the same statement and
+   TGT_BIND is the bind into which temporaries inserted before TARGET should
+   be added.  */
+
+static void
+grid_attempt_target_gridification (gomp_target *target,
+				   gimple_stmt_iterator *gsi,
+				   gbind *tgt_bind)
+{
+  tree group_size;
+  if (!target
+      || !grid_target_follows_gridifiable_pattern (target, &group_size))
+    return;
+
+  location_t loc = gimple_location (target);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
+		     "Target construct will be turned into a gridified GPGPU "
+		     "kernel\n");
+
+  /* Copy the target body to a GPUKERNEL construct.  */
+  gimple_seq kernel_seq = copy_gimple_seq_and_replace_locals
+    (gimple_omp_body (target));
+
+  hash_map<tree, tree> *declmap = new hash_map<tree, tree>;
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (struct walk_stmt_info));
+  wi.info = declmap;
+
+  /* Copy assignments in between OMP statements before target, mark OMP
+     statements within the copy appropriately.  */
+  gomp_for *inner_loop = grid_process_kernel_body_copy (kernel_seq, gsi,
+							tgt_bind, &wi);
+
+  gbind *old_bind
+    = as_a <gbind *> (gimple_seq_first (gimple_omp_body (target)));
+  gbind *new_bind = as_a <gbind *> (gimple_seq_first (kernel_seq));
+  tree new_block = gimple_bind_block (new_bind);
+  tree enc_block = BLOCK_SUPERCONTEXT (gimple_bind_block (old_bind));
+  BLOCK_CHAIN (new_block) = BLOCK_SUBBLOCKS (enc_block);
+  BLOCK_SUBBLOCKS (enc_block) = new_block;
+  BLOCK_SUPERCONTEXT (new_block) = enc_block;
+  gimple *gpukernel = gimple_build_omp_grid_body (kernel_seq);
+  gimple_seq_add_stmt
+    (gimple_bind_body_ptr (as_a <gbind *> (gimple_omp_body (target))),
+     gpukernel);
+
+  walk_tree (&group_size, grid_remap_prebody_decls, &wi, NULL);
+  push_gimplify_context ();
+  size_t collapse = gimple_omp_for_collapse (inner_loop);
+  for (size_t i = 0; i < collapse; i++)
+    {
+      tree itype, type = TREE_TYPE (gimple_omp_for_index (inner_loop, i));
+      if (POINTER_TYPE_P (type))
+	itype = signed_type_for (type);
+      else
+	itype = type;
+
+      enum tree_code cond_code = gimple_omp_for_cond (inner_loop, i);
+      tree n1 = unshare_expr (gimple_omp_for_initial (inner_loop, i));
+      walk_tree (&n1, grid_remap_prebody_decls, &wi, NULL);
+      tree n2 = unshare_expr (gimple_omp_for_final (inner_loop, i));
+      walk_tree (&n2, grid_remap_prebody_decls, &wi, NULL);
+      adjust_for_condition (loc, &cond_code, &n2);
+      tree step;
+      step = get_omp_for_step_from_incr (loc,
+					 gimple_omp_for_incr (inner_loop, i));
+      gimple_seq tmpseq = NULL;
+      n1 = fold_convert (itype, n1);
+      n2 = fold_convert (itype, n2);
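+      /* Compute the iteration count of this loop dimension as a ceiling
+	 division, (n2 - n1 + step - 1) / step for increasing loops and the
+	 mirrored form for decreasing ones; for unsigned types and GT_EXPR
+	 both operands are negated so that the truncating division rounds
+	 in the right direction.  This count becomes the grid size of
+	 dimension I.  */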
+      tree t = build_int_cst (itype, (cond_code == LT_EXPR ? -1 : 1));
+      t = fold_build2 (PLUS_EXPR, itype, step, t);
+      t = fold_build2 (PLUS_EXPR, itype, t, n2);
+      t = fold_build2 (MINUS_EXPR, itype, t, n1);
+      if (TYPE_UNSIGNED (itype) && cond_code == GT_EXPR)
+	t = fold_build2 (TRUNC_DIV_EXPR, itype,
+			 fold_build1 (NEGATE_EXPR, itype, t),
+			 fold_build1 (NEGATE_EXPR, itype, step));
+      else
+	t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+      tree gs = fold_convert (uint32_type_node, t);
+      gimplify_expr (&gs, &tmpseq, NULL, is_gimple_val, fb_rvalue);
+      if (!gimple_seq_empty_p (tmpseq))
+	gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);
+
+      tree ws;
+      if (i == 0 && group_size)
+	{
+	  ws = fold_convert (uint32_type_node, group_size);
+	  tmpseq = NULL;
+	  gimplify_expr (&ws, &tmpseq, NULL, is_gimple_val, fb_rvalue);
+	  if (!gimple_seq_empty_p (tmpseq))
+	    gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);
+	}
+      else
+	ws = build_zero_cst (uint32_type_node);
+
+      tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__GRIDDIM_);
+      OMP_CLAUSE__GRIDDIM__DIMENSION (c) = i;
+      OMP_CLAUSE__GRIDDIM__SIZE (c) = gs;
+      OMP_CLAUSE__GRIDDIM__GROUP (c) = ws;
+      OMP_CLAUSE_CHAIN (c) = gimple_omp_target_clauses (target);
+      gimple_omp_target_set_clauses (target, c);
+    }
+  pop_gimplify_context (tgt_bind);
+  delete declmap;
+  return;
+}
+
+/* Walker function doing all the work for grid_gridify_all_targets.  */
+
+static tree
+grid_gridify_all_targets_stmt (gimple_stmt_iterator *gsi,
+			       bool *handled_ops_p,
+			       struct walk_stmt_info *incoming)
+{
+  *handled_ops_p = false;
+
+  gimple *stmt = gsi_stmt (*gsi);
+  gomp_target *target = dyn_cast <gomp_target *> (stmt);
+  if (target)
+    {
+      gbind *tgt_bind = (gbind *) incoming->info;
+      gcc_checking_assert (tgt_bind);
+      grid_attempt_target_gridification (target, gsi, tgt_bind);
+      return NULL_TREE;
+    }
+  gbind *bind = dyn_cast <gbind *> (stmt);
+  if (bind)
+    {
+      *handled_ops_p = true;
+      struct walk_stmt_info wi;
+      memset (&wi, 0, sizeof (wi));
+      wi.info = bind;
+      walk_gimple_seq_mod (gimple_bind_body_ptr (bind),
+			   grid_gridify_all_targets_stmt, NULL, &wi);
+    }
+  return NULL_TREE;
+}
+
+/* Attempt to gridify all target constructs in BODY_P.  All such targets will
+   have their bodies duplicated, with the new copy being put into a
+   gimple_omp_grid_body statement.  All kernel-related constructs within the
+   grid_body will be marked with phony flags or kernel kinds.  Moreover, some
+   re-structuring is often needed, such as copying pre-bodies before the
+   target construct so that kernel grid sizes can be computed.  */
+
+static void
+grid_gridify_all_targets (gimple_seq *body_p)
+{
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  walk_gimple_seq_mod (body_p, grid_gridify_all_targets_stmt, NULL, &wi);
+}
+
 /* Main entry point.  */
 
 static unsigned int
@@ -16508,6 +17769,10 @@ execute_lower_omp (void)
 			     delete_omp_context);
 
   body = gimple_body (current_function_decl);
+
+  if (hsa_gen_requested_p ())
+    grid_gridify_all_targets (&body);
+
   scan_omp (&body, NULL);
   gcc_assert (taskreg_nesting_level == 0);
   FOR_EACH_VEC_ELT (taskreg_contexts, i, ctx)
@@ -16845,6 +18110,7 @@ make_gimple_omp_edges (basic_block bb, struct omp_region **region,
     case GIMPLE_OMP_TASKGROUP:
     case GIMPLE_OMP_CRITICAL:
     case GIMPLE_OMP_SECTION:
+    case GIMPLE_OMP_GRID_BODY:
       cur_region = new_omp_region (bb, code, cur_region);
       fallthru = true;
       break;
-- 
cgit v1.1