diff options
author | Martin Jambor <mjambor@suse.cz> | 2016-12-14 23:30:41 +0100 |
---|---|---|
committer | Martin Jambor <jamborm@gcc.gnu.org> | 2016-12-14 23:30:41 +0100 |
commit | 629b3d75c8c5a244d891a9c292bca6912d4b0dd9 (patch) | |
tree | 21a84ad4210bfa6213a06a75a3311f6c824b1b65 /gcc/omp-offload.c | |
parent | cfce1a4a42a9f76477e732fbe7408459742a92a2 (diff) | |
download | gcc-629b3d75c8c5a244d891a9c292bca6912d4b0dd9.zip gcc-629b3d75c8c5a244d891a9c292bca6912d4b0dd9.tar.gz gcc-629b3d75c8c5a244d891a9c292bca6912d4b0dd9.tar.bz2 |
Split omp-low into multiple files
2016-12-14 Martin Jambor <mjambor@suse.cz>
* omp-general.h: New file.
* omp-general.c: New file.
* omp-expand.h: Likewise.
* omp-expand.c: Likewise.
* omp-offload.h: Likewise.
* omp-offload.c: Likewise.
* omp-grid.h: Likewise.
* omp-grid.c: Likewise.
* omp-low.h: Include omp-general.h and omp-grid.h. Removed includes
of params.h, symbol-summary.h, lto-section-names.h, cilk.h, tree-eh.h,
ipa-prop.h, tree-cfgcleanup.h, cfgloop.h, except.h, expr.h, stmt.h,
varasm.h, calls.h, explow.h, dojump.h, flags.h, tree-into-ssa.h,
tree-cfg.h, cfganal.h, alias.h, emit-rtl.h, optabs.h, expmed.h,
alloc-pool.h, cfghooks.h, rtl.h and memmodel.h.
(omp_find_combined_for): Declare.
(find_omp_clause): Renamed to omp_find_clause and moved to
omp-general.h.
(free_omp_regions): Renamed to omp_free_regions and moved to
omp-expand.h.
(replace_oacc_fn_attrib): Renamed to oacc_replace_fn_attrib and moved
to omp-general.h.
(set_oacc_fn_attrib): Renamed to oacc_set_fn_attrib and moved to
omp-general.h.
(build_oacc_routine_dims): Renamed to oacc_build_routine_dims and
moved to omp-general.h.
(get_oacc_fn_attrib): Renamed to oacc_get_fn_attrib and moved to
omp-general.h.
(oacc_fn_attrib_kernels_p): Moved to omp-general.h.
(get_oacc_fn_dim_size): Renamed to oacc_get_fn_dim_size and moved to
omp-general.c.
(omp_expand_local): Moved to omp-expand.h.
(make_gimple_omp_edges): Renamed to omp_make_gimple_edges and moved to
omp-expand.h.
(omp_finish_file): Moved to omp-offload.h.
(default_goacc_validate_dims): Renamed to
oacc_default_goacc_validate_dims and moved to omp-offload.h.
(offload_funcs, offload_vars): Moved to omp-offload.h.
* omp-low.c: Include omp-general.h, omp-offload.h and omp-grid.h.
(omp_region): Moved to omp-expand.c.
(omp_for_data_loop): Moved to omp-general.h.
(omp_for_data): Likewise.
(oacc_loop): Moved to omp-offload.c.
(oacc_loop_flags): Moved to omp-general.h.
(offload_funcs, offload_vars): Moved to omp-offload.c.
(root_omp_region): Moved to omp-expand.c.
(omp_any_child_fn_dumped): Likewise.
(find_omp_clause): Renamed to omp_find_clause and moved to
omp-general.c.
(is_combined_parallel): Moved to omp-expand.c.
(is_reference): Renamed to omp_is_reference and moved to
omp-general.c.
(adjust_for_condition): Renamed to omp_adjust_for_condition and moved
to omp-general.c.
(get_omp_for_step_from_incr): Renamed to omp_get_for_step_from_incr
and moved to omp-general.c.
(extract_omp_for_data): Renamed to omp_extract_for_data and moved to
omp-general.c.
(workshare_safe_to_combine_p): Moved to omp-expand.c.
(omp_adjust_chunk_size): Likewise.
(get_ws_args_for): Likewise.
(get_base_type): Removed.
(dump_omp_region): Moved to omp-expand.c.
(debug_omp_region): Likewise.
(debug_all_omp_regions): Likewise.
(new_omp_region): Likewise.
(free_omp_region_1): Likewise.
(free_omp_regions): Renamed to omp_free_regions and moved to
omp-expand.c.
(find_combined_for): Renamed to omp_find_combined_for, made global.
(build_omp_barrier): Renamed to omp_build_barrier and moved to
omp-general.c.
(omp_max_vf): Moved to omp-general.c.
(omp_max_simt_vf): Likewise.
(gimple_build_cond_empty): Moved to omp-expand.c.
(parallel_needs_hsa_kernel_p): Likewise.
(expand_omp_build_assign): Moved declaration to omp-expand.c.
(expand_parallel_call): Moved to omp-expand.c.
(expand_cilk_for_call): Likewise.
(expand_task_call): Likewise.
(vec2chain): Likewise.
(remove_exit_barrier): Likewise.
(remove_exit_barriers): Likewise.
(optimize_omp_library_calls): Likewise.
(expand_omp_regimplify_p): Likewise.
(expand_omp_build_assign): Likewise.
(expand_omp_taskreg): Likewise.
(oacc_collapse): Likewise.
(expand_oacc_collapse_init): Likewise.
(expand_oacc_collapse_vars): Likewise.
(expand_omp_for_init_counts): Likewise.
(expand_omp_for_init_vars): Likewise.
(extract_omp_for_update_vars): Likewise.
(expand_omp_ordered_source): Likewise.
(expand_omp_ordered_sink): Likewise.
(expand_omp_ordered_source_sink): Likewise.
(expand_omp_for_ordered_loops): Likewise.
(expand_omp_for_generic): Likewise.
(expand_omp_for_static_nochunk): Likewise.
(find_phi_with_arg_on_edge): Likewise.
(expand_omp_for_static_chunk): Likewise.
(expand_cilk_for): Likewise.
(expand_omp_simd): Likewise.
(expand_omp_taskloop_for_outer): Likewise.
(expand_omp_taskloop_for_inner): Likewise.
(expand_oacc_for): Likewise.
(expand_omp_for): Likewise.
(expand_omp_sections): Likewise.
(expand_omp_single): Likewise.
(expand_omp_synch): Likewise.
(expand_omp_atomic_load): Likewise.
(expand_omp_atomic_store): Likewise.
(expand_omp_atomic_fetch_op): Likewise.
(expand_omp_atomic_pipeline): Likewise.
(expand_omp_atomic_mutex): Likewise.
(expand_omp_atomic): Likewise.
(oacc_launch_pack): Moved to omp-general.c, made public.
(OACC_FN_ATTRIB): Likewise.
(replace_oacc_fn_attrib): Renamed to oacc_replace_fn_attrib and moved
to omp-general.c.
(set_oacc_fn_attrib): Renamed to oacc_set_fn_attrib and moved to
omp-general.c.
(build_oacc_routine_dims): Renamed to oacc_build_routine_dims and
moved to omp-general.c.
(get_oacc_fn_attrib): Renamed to oacc_get_fn_attrib and moved to
omp-general.c.
(oacc_fn_attrib_kernels_p): Moved to omp-general.c.
(oacc_fn_attrib_level): Moved to omp-offload.c.
(get_oacc_fn_dim_size): Renamed to oacc_get_fn_dim_size and moved to
omp-general.c.
(get_oacc_ifn_dim_arg): Renamed to oacc_get_ifn_dim_arg and moved to
omp-general.c.
(mark_loops_in_oacc_kernels_region): Moved to omp-expand.c.
(grid_launch_attributes_trees): Likewise.
(grid_attr_trees): Likewise.
(grid_create_kernel_launch_attr_types): Likewise.
(grid_insert_store_range_dim): Likewise.
(grid_get_kernel_launch_attributes): Likewise.
(get_target_argument_identifier_1): Likewise.
(get_target_argument_identifier): Likewise.
(get_target_argument_value): Likewise.
(push_target_argument_according_to_value): Likewise.
(get_target_arguments): Likewise.
(expand_omp_target): Likewise.
(grid_expand_omp_for_loop): Moved to omp-grid.c.
(grid_arg_decl_map): Likewise.
(grid_remap_kernel_arg_accesses): Likewise.
(grid_expand_target_grid_body): Likewise.
(expand_omp): Renamed to omp_expand and moved to omp-expand.c.
(build_omp_regions_1): Moved to omp-expand.c.
(build_omp_regions_root): Likewise.
(omp_expand_local): Likewise.
(build_omp_regions): Likewise.
(execute_expand_omp): Likewise.
(pass_data_expand_omp): Likewise.
(pass_expand_omp): Likewise.
(make_pass_expand_omp): Likewise.
(pass_data_expand_omp_ssa): Likewise.
(pass_expand_omp_ssa): Likewise.
(make_pass_expand_omp_ssa): Likewise.
(grid_lastprivate_predicate): Renamed to
omp_grid_lastprivate_predicate and moved to omp-grid.c, made public.
(grid_prop): Moved to omp-grid.c.
(GRID_MISSED_MSG_PREFIX): Likewise.
(grid_safe_assignment_p): Likewise.
(grid_seq_only_contains_local_assignments): Likewise.
(grid_find_single_omp_among_assignments_1): Likewise.
(grid_find_single_omp_among_assignments): Likewise.
(grid_find_ungridifiable_statement): Likewise.
(grid_parallel_clauses_gridifiable): Likewise.
(grid_inner_loop_gridifiable_p): Likewise.
(grid_dist_follows_simple_pattern): Likewise.
(grid_gfor_follows_tiling_pattern): Likewise.
(grid_call_permissible_in_distribute_p): Likewise.
(grid_handle_call_in_distribute): Likewise.
(grid_dist_follows_tiling_pattern): Likewise.
(grid_target_follows_gridifiable_pattern): Likewise.
(grid_remap_prebody_decls): Likewise.
(grid_var_segment): Likewise.
(grid_mark_variable_segment): Likewise.
(grid_copy_leading_local_assignments): Likewise.
(grid_process_grid_body): Likewise.
(grid_eliminate_combined_simd_part): Likewise.
(grid_mark_tiling_loops): Likewise.
(grid_mark_tiling_parallels_and_loops): Likewise.
(grid_process_kernel_body_copy): Likewise.
(grid_attempt_target_gridification): Likewise.
(grid_gridify_all_targets_stmt): Likewise.
(grid_gridify_all_targets): Renamed to omp_grid_gridify_all_targets
and moved to omp-grid.c, made public.
(make_gimple_omp_edges): Renamed to omp_make_gimple_edges and moved to
omp-expand.c.
(add_decls_addresses_to_decl_constructor): Moved to omp-offload.c.
(omp_finish_file): Likewise.
(oacc_thread_numbers): Likewise.
(oacc_xform_loop): Likewise.
(oacc_default_dims, oacc_min_dims): Likewise.
(oacc_parse_default_dims): Likewise.
(oacc_validate_dims): Likewise.
(new_oacc_loop_raw): Likewise.
(new_oacc_loop_outer): Likewise.
(new_oacc_loop): Likewise.
(new_oacc_loop_routine): Likewise.
(finish_oacc_loop): Likewise.
(free_oacc_loop): Likewise.
(dump_oacc_loop_part): Likewise.
(dump_oacc_loop): Likewise.
(debug_oacc_loop): Likewise.
(oacc_loop_discover_walk): Likewise.
(oacc_loop_sibling_nreverse): Likewise.
(oacc_loop_discovery): Likewise.
(oacc_loop_xform_head_tail): Likewise.
(oacc_loop_xform_loop): Likewise.
(oacc_loop_process): Likewise.
(oacc_loop_fixed_partitions): Likewise.
(oacc_loop_auto_partitions): Likewise.
(oacc_loop_partition): Likewise.
(default_goacc_fork_join): Likewise.
(default_goacc_reduction): Likewise.
(execute_oacc_device_lower): Likewise.
(default_goacc_validate_dims): Likewise.
(default_goacc_dim_limit): Likewise.
(pass_data_oacc_device_lower): Likewise.
(pass_oacc_device_lower): Likewise.
(make_pass_oacc_device_lower): Likewise.
(execute_omp_device_lower): Likewise.
(pass_data_omp_device_lower): Likewise.
(pass_omp_device_lower): Likewise.
(make_pass_omp_device_lower): Likewise.
(pass_data_omp_target_link): Likewise.
(pass_omp_target_link): Likewise.
(find_link_var_op): Likewise.
(pass_omp_target_link::execute): Likewise.
(make_pass_omp_target_link): Likewise.
* Makefile.in (OBJS): Added omp-offload.o, omp-expand.o, omp-general.o
and omp-grid.o.
(GTFILES): Added omp-offload.h, omp-offload.c and omp-expand.c, removed
omp-low.h.
* gimple-fold.c: Include omp-general.h instead of omp-low.h.
(fold_internal_goacc_dim): Adjusted calls to
get_oacc_ifn_dim_arg and get_oacc_fn_dim_size to use their new names.
* gimplify.c: Include omp-low.h.
(omp_notice_variable): Adjust the call to get_oacc_fn_attrib to use
its new name.
(gimplify_omp_task): Adjusted calls to find_omp_clause to use its new
name.
(gimplify_omp_for): Likewise.
* lto-cgraph.c: Include omp-offload.h instead of omp-low.h.
* toplev.c: Include omp-offload.h instead of omp-low.h.
* tree-cfg.c: Include omp-general.h instead of omp-low.h. Also
include omp-expand.h.
(make_edges_bb): Adjusted the call to make_gimple_omp_edges to use its
new name.
(make_edges): Adjust the call to free_omp_regions to use its new name.
* tree-parloops.c: Include omp-general.h.
(create_parallel_loop): Adjusted the call to set_oacc_fn_attrib to use
its new name.
(parallelize_loops): Adjusted the call to get_oacc_fn_attrib to use
its new name.
* tree-ssa-loop.c: Include omp-general.h instead of omp-low.h.
(gate_oacc_kernels): Adjusted the call to get_oacc_fn_attrib to use
its new name.
* tree-vrp.c: Include omp-general.h instead of omp-low.h.
(extract_range_basic): Adjusted calls to get_oacc_ifn_dim_arg and
get_oacc_fn_dim_size to use their new names.
* varpool.c: Include omp-offload.h instead of omp-low.h.
* gengtype.c (open_base_files): Replace omp-low.h with omp-offload.h in
ifiles.
* config/nvptx/nvptx.c: Include omp-general.h.
(nvptx_expand_call): Adjusted the call to get_oacc_fn_attrib to use
its new name.
(nvptx_reorg): Likewise.
(nvptx_record_offload_symbol): Likewise.
gcc/c-family:
* c-omp.c: Include omp-general.h instead of omp-low.h.
(c_finish_oacc_wait): Adjusted call to find_omp_clause to use its new
name.
gcc/c/
* c-parser.c: Include omp-general.h and omp-offload.h instead of
omp-low.h.
(c_finish_oacc_routine): Adjusted call to
get_oacc_fn_attrib, build_oacc_routine_dims and replace_oacc_fn_attrib
to use their new names.
(c_parser_oacc_enter_exit_data): Adjusted call to find_omp_clause to
use its new name.
(c_parser_oacc_update): Likewise.
(c_parser_omp_simd): Likewise.
(c_parser_omp_target_update): Likewise.
* c-typeck.c: Include omp-general.h instead of omp-low.h.
(c_finish_omp_cancel): Adjusted call to find_omp_clause to use its new
name.
(c_finish_omp_cancellation_point): Likewise.
* gimple-parser.c: Do not include omp-low.h.
gcc/cp/
* parser.c: Include omp-general.h and omp-offload.h instead of
omp-low.h.
(cp_parser_omp_simd): Adjusted calls to find_omp_clause to use its new
name.
(cp_parser_omp_target_update): Likewise.
(cp_parser_oacc_declare): Likewise.
(cp_parser_oacc_enter_exit_data): Likewise.
(cp_parser_oacc_update): Likewise.
(cp_finalize_oacc_routine): Adjusted call to get_oacc_fn_attrib,
build_oacc_routine_dims and replace_oacc_fn_attrib to use their new
names.
* semantics.c: Include omp-general.h instead of omp-low.h.
(finish_omp_for): Adjusted calls to find_omp_clause to use its new
name.
(finish_omp_cancel): Likewise.
(finish_omp_cancellation_point): Likewise.
fortran/
* trans-openmp.c: Include omp-general.h.
From-SVN: r243673
Diffstat (limited to 'gcc/omp-offload.c')
-rw-r--r-- | gcc/omp-offload.c | 1718 |
1 files changed, 1718 insertions, 0 deletions
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c new file mode 100644 index 0000000..fabdf2d --- /dev/null +++ b/gcc/omp-offload.c @@ -0,0 +1,1718 @@ +/* Bits of OpenMP and OpenACC handling that is specific to device offloading + and a lowering pass for OpenACC device directives. + + Copyright (C) 2005-2016 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "tree.h" +#include "gimple.h" +#include "tree-pass.h" +#include "ssa.h" +#include "cgraph.h" +#include "pretty-print.h" +#include "diagnostic-core.h" +#include "fold-const.h" +#include "internal-fn.h" +#include "gimplify.h" +#include "gimple-iterator.h" +#include "gimplify-me.h" +#include "gimple-walk.h" +#include "tree-cfg.h" +#include "tree-into-ssa.h" +#include "common/common-target.h" +#include "omp-general.h" +#include "omp-offload.h" +#include "lto-section-names.h" +#include "gomp-constants.h" +#include "gimple-pretty-print.h" + +/* Describe the OpenACC looping structure of a function. The entire + function is held in a 'NULL' loop. */ + +struct oacc_loop +{ + oacc_loop *parent; /* Containing loop. */ + + oacc_loop *child; /* First inner loop. */ + + oacc_loop *sibling; /* Next loop within same parent. */ + + location_t loc; /* Location of the loop start. 
*/ + + gcall *marker; /* Initial head marker. */ + + gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */ + gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */ + + tree routine; /* Pseudo-loop enclosing a routine. */ + + unsigned mask; /* Partitioning mask. */ + unsigned inner; /* Partitioning of inner loops. */ + unsigned flags; /* Partitioning flags. */ + unsigned ifns; /* Contained loop abstraction functions. */ + tree chunk_size; /* Chunk size. */ + gcall *head_end; /* Final marker of head sequence. */ +}; + +/* Holds offload tables with decls. */ +vec<tree, va_gc> *offload_funcs, *offload_vars; + +/* Return level at which oacc routine may spawn a partitioned loop, or + -1 if it is not a routine (i.e. is an offload fn). */ + +static int +oacc_fn_attrib_level (tree attr) +{ + tree pos = TREE_VALUE (attr); + + if (!TREE_PURPOSE (pos)) + return -1; + + int ix = 0; + for (ix = 0; ix != GOMP_DIM_MAX; + ix++, pos = TREE_CHAIN (pos)) + if (!integer_zerop (TREE_PURPOSE (pos))) + break; + + return ix; +} + +/* Helper function for omp_finish_file routine. Takes decls from V_DECLS and + adds their addresses and sizes to constructor-vector V_CTOR. */ + +static void +add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls, + vec<constructor_elt, va_gc> *v_ctor) +{ + unsigned len = vec_safe_length (v_decls); + for (unsigned i = 0; i < len; i++) + { + tree it = (*v_decls)[i]; + bool is_var = VAR_P (it); + bool is_link_var + = is_var +#ifdef ACCEL_COMPILER + && DECL_HAS_VALUE_EXPR_P (it) +#endif + && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it)); + + tree size = NULL_TREE; + if (is_var) + size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it)); + + tree addr; + if (!is_link_var) + addr = build_fold_addr_expr (it); + else + { +#ifdef ACCEL_COMPILER + /* For "omp declare target link" vars add address of the pointer to + the target table, instead of address of the var. 
*/ + tree value_expr = DECL_VALUE_EXPR (it); + tree link_ptr_decl = TREE_OPERAND (value_expr, 0); + varpool_node::finalize_decl (link_ptr_decl); + addr = build_fold_addr_expr (link_ptr_decl); +#else + addr = build_fold_addr_expr (it); +#endif + + /* Most significant bit of the size marks "omp declare target link" + vars in host and target tables. */ + unsigned HOST_WIDE_INT isize = tree_to_uhwi (size); + isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node) + * BITS_PER_UNIT - 1); + size = wide_int_to_tree (const_ptr_type_node, isize); + } + + CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr); + if (is_var) + CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size); + } +} + +/* Create new symbols containing (address, size) pairs for global variables, + marked with "omp declare target" attribute, as well as addresses for the + functions, which are outlined offloading regions. */ +void +omp_finish_file (void) +{ + unsigned num_funcs = vec_safe_length (offload_funcs); + unsigned num_vars = vec_safe_length (offload_vars); + + if (num_funcs == 0 && num_vars == 0) + return; + + if (targetm_common.have_named_sections) + { + vec<constructor_elt, va_gc> *v_f, *v_v; + vec_alloc (v_f, num_funcs); + vec_alloc (v_v, num_vars * 2); + + add_decls_addresses_to_decl_constructor (offload_funcs, v_f); + add_decls_addresses_to_decl_constructor (offload_vars, v_v); + + tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node, + num_vars * 2); + tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node, + num_funcs); + SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node)); + SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node)); + tree ctor_v = build_constructor (vars_decl_type, v_v); + tree ctor_f = build_constructor (funcs_decl_type, v_f); + TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1; + TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1; + tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL, + get_identifier 
(".offload_func_table"), + funcs_decl_type); + tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL, + get_identifier (".offload_var_table"), + vars_decl_type); + TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1; + /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node), + otherwise a joint table in a binary will contain padding between + tables from multiple object files. */ + DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1; + SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type)); + SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type)); + DECL_INITIAL (funcs_decl) = ctor_f; + DECL_INITIAL (vars_decl) = ctor_v; + set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME); + set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME); + + varpool_node::finalize_decl (vars_decl); + varpool_node::finalize_decl (funcs_decl); + } + else + { + for (unsigned i = 0; i < num_funcs; i++) + { + tree it = (*offload_funcs)[i]; + targetm.record_offload_symbol (it); + } + for (unsigned i = 0; i < num_vars; i++) + { + tree it = (*offload_vars)[i]; + targetm.record_offload_symbol (it); + } + } +} + +/* Find the number of threads (POS = false), or thread number (POS = + true) for an OpenACC region partitioned as MASK. Setup code + required for the calculation is added to SEQ. */ + +static tree +oacc_thread_numbers (bool pos, int mask, gimple_seq *seq) +{ + tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1); + unsigned ix; + + /* Start at gang level, and examine relevant dimension indices. */ + for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++) + if (GOMP_DIM_MASK (ix) & mask) + { + tree arg = build_int_cst (unsigned_type_node, ix); + + if (res) + { + /* We had an outer index, so scale that by the size of + this dimension. 
*/ + tree n = create_tmp_var (integer_type_node); + gimple *call + = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, arg); + + gimple_call_set_lhs (call, n); + gimple_seq_add_stmt (seq, call); + res = fold_build2 (MULT_EXPR, integer_type_node, res, n); + } + if (pos) + { + /* Determine index in this dimension. */ + tree id = create_tmp_var (integer_type_node); + gimple *call = gimple_build_call_internal + (IFN_GOACC_DIM_POS, 1, arg); + + gimple_call_set_lhs (call, id); + gimple_seq_add_stmt (seq, call); + if (res) + res = fold_build2 (PLUS_EXPR, integer_type_node, res, id); + else + res = id; + } + } + + if (res == NULL_TREE) + res = integer_zero_node; + + return res; +} + +/* Transform IFN_GOACC_LOOP calls to actual code. See + expand_oacc_for for where these are generated. At the vector + level, we stride loops, such that each member of a warp will + operate on adjacent iterations. At the worker and gang level, + each gang/warp executes a set of contiguous iterations. Chunking + can override this such that each iteration engine executes a + contiguous chunk, and then moves on to stride to the next chunk. 
*/ + +static void +oacc_xform_loop (gcall *call) +{ + gimple_stmt_iterator gsi = gsi_for_stmt (call); + enum ifn_goacc_loop_kind code + = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0)); + tree dir = gimple_call_arg (call, 1); + tree range = gimple_call_arg (call, 2); + tree step = gimple_call_arg (call, 3); + tree chunk_size = NULL_TREE; + unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5)); + tree lhs = gimple_call_lhs (call); + tree type = TREE_TYPE (lhs); + tree diff_type = TREE_TYPE (range); + tree r = NULL_TREE; + gimple_seq seq = NULL; + bool chunking = false, striding = true; + unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning + unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any) + +#ifdef ACCEL_COMPILER + chunk_size = gimple_call_arg (call, 4); + if (integer_minus_onep (chunk_size) /* Force static allocation. */ + || integer_zerop (chunk_size)) /* Default (also static). */ + { + /* If we're at the gang level, we want each to execute a + contiguous run of iterations. Otherwise we want each element + to stride. */ + striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG)); + chunking = false; + } + else + { + /* Chunk of size 1 is striding. */ + striding = integer_onep (chunk_size); + chunking = !striding; + } +#endif + + /* striding=true, chunking=true + -> invalid. 
+ striding=true, chunking=false + -> chunks=1 + striding=false,chunking=true + -> chunks=ceil (range/(chunksize*threads*step)) + striding=false,chunking=false + -> chunk_size=ceil(range/(threads*step)),chunks=1 */ + push_gimplify_context (true); + + switch (code) + { + default: gcc_unreachable (); + + case IFN_GOACC_LOOP_CHUNKS: + if (!chunking) + r = build_int_cst (type, 1); + else + { + /* chunk_max + = (range - dir) / (chunks * step * num_threads) + dir */ + tree per = oacc_thread_numbers (false, mask, &seq); + per = fold_convert (type, per); + chunk_size = fold_convert (type, chunk_size); + per = fold_build2 (MULT_EXPR, type, per, chunk_size); + per = fold_build2 (MULT_EXPR, type, per, step); + r = build2 (MINUS_EXPR, type, range, dir); + r = build2 (PLUS_EXPR, type, r, per); + r = build2 (TRUNC_DIV_EXPR, type, r, per); + } + break; + + case IFN_GOACC_LOOP_STEP: + { + /* If striding, step by the entire compute volume, otherwise + step by the inner volume. */ + unsigned volume = striding ? 
mask : inner_mask; + + r = oacc_thread_numbers (false, volume, &seq); + r = build2 (MULT_EXPR, type, fold_convert (type, r), step); + } + break; + + case IFN_GOACC_LOOP_OFFSET: + if (striding) + { + r = oacc_thread_numbers (true, mask, &seq); + r = fold_convert (diff_type, r); + } + else + { + tree inner_size = oacc_thread_numbers (false, inner_mask, &seq); + tree outer_size = oacc_thread_numbers (false, outer_mask, &seq); + tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size), + inner_size, outer_size); + + volume = fold_convert (diff_type, volume); + if (chunking) + chunk_size = fold_convert (diff_type, chunk_size); + else + { + tree per = fold_build2 (MULT_EXPR, diff_type, volume, step); + + chunk_size = build2 (MINUS_EXPR, diff_type, range, dir); + chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per); + chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per); + } + + tree span = build2 (MULT_EXPR, diff_type, chunk_size, + fold_convert (diff_type, inner_size)); + r = oacc_thread_numbers (true, outer_mask, &seq); + r = fold_convert (diff_type, r); + r = build2 (MULT_EXPR, diff_type, r, span); + + tree inner = oacc_thread_numbers (true, inner_mask, &seq); + inner = fold_convert (diff_type, inner); + r = fold_build2 (PLUS_EXPR, diff_type, r, inner); + + if (chunking) + { + tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6)); + tree per + = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size); + per = build2 (MULT_EXPR, diff_type, per, chunk); + + r = build2 (PLUS_EXPR, diff_type, r, per); + } + } + r = fold_build2 (MULT_EXPR, diff_type, r, step); + if (type != diff_type) + r = fold_convert (type, r); + break; + + case IFN_GOACC_LOOP_BOUND: + if (striding) + r = range; + else + { + tree inner_size = oacc_thread_numbers (false, inner_mask, &seq); + tree outer_size = oacc_thread_numbers (false, outer_mask, &seq); + tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size), + inner_size, outer_size); + + volume = fold_convert 
(diff_type, volume); + if (chunking) + chunk_size = fold_convert (diff_type, chunk_size); + else + { + tree per = fold_build2 (MULT_EXPR, diff_type, volume, step); + + chunk_size = build2 (MINUS_EXPR, diff_type, range, dir); + chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per); + chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per); + } + + tree span = build2 (MULT_EXPR, diff_type, chunk_size, + fold_convert (diff_type, inner_size)); + + r = fold_build2 (MULT_EXPR, diff_type, span, step); + + tree offset = gimple_call_arg (call, 6); + r = build2 (PLUS_EXPR, diff_type, r, + fold_convert (diff_type, offset)); + r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, + diff_type, r, range); + } + if (diff_type != type) + r = fold_convert (type, r); + break; + } + + gimplify_assign (lhs, r, &seq); + + pop_gimplify_context (NULL); + + gsi_replace_with_seq (&gsi, seq, true); +} + +/* Default partitioned and minimum partitioned dimensions. */ + +static int oacc_default_dims[GOMP_DIM_MAX]; +static int oacc_min_dims[GOMP_DIM_MAX]; + +/* Parse the default dimension parameter. This is a set of + :-separated optional compute dimensions. Each specified dimension + is a positive integer. When device type support is added, it is + planned to be a comma separated list of such compute dimensions, + with all but the first prefixed by the colon-terminated device + type. */ + +static void +oacc_parse_default_dims (const char *dims) +{ + int ix; + + for (ix = GOMP_DIM_MAX; ix--;) + { + oacc_default_dims[ix] = -1; + oacc_min_dims[ix] = 1; + } + +#ifndef ACCEL_COMPILER + /* Cannot be overridden on the host. 
*/ + dims = NULL; +#endif + if (dims) + { + const char *pos = dims; + + for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++) + { + if (ix) + { + if (*pos != ':') + goto malformed; + pos++; + } + + if (*pos != ':') + { + long val; + const char *eptr; + + errno = 0; + val = strtol (pos, CONST_CAST (char **, &eptr), 10); + if (errno || val <= 0 || (int) val != val) + goto malformed; + pos = eptr; + oacc_default_dims[ix] = (int) val; + } + } + if (*pos) + { + malformed: + error_at (UNKNOWN_LOCATION, + "-fopenacc-dim operand is malformed at '%s'", pos); + } + } + + /* Allow the backend to validate the dimensions. */ + targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1); + targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2); +} + +/* Validate and update the dimensions for offloaded FN. ATTRS is the + raw attribute. DIMS is an array of dimensions, which is filled in. + LEVEL is the partitioning level of a routine, or -1 for an offload + region itself. USED is the mask of partitioned execution in the + function. */ + +static void +oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used) +{ + tree purpose[GOMP_DIM_MAX]; + unsigned ix; + tree pos = TREE_VALUE (attrs); + bool is_kernel = oacc_fn_attrib_kernels_p (attrs); + + /* Make sure the attribute creator attached the dimension + information. */ + gcc_assert (pos); + + for (ix = 0; ix != GOMP_DIM_MAX; ix++) + { + purpose[ix] = TREE_PURPOSE (pos); + tree val = TREE_VALUE (pos); + dims[ix] = val ? TREE_INT_CST_LOW (val) : -1; + pos = TREE_CHAIN (pos); + } + + bool changed = targetm.goacc.validate_dims (fn, dims, level); + + /* Default anything left to 1 or a partitioned default. */ + for (ix = 0; ix != GOMP_DIM_MAX; ix++) + if (dims[ix] < 0) + { + /* The OpenACC spec says 'If the [num_gangs] clause is not + specified, an implementation-defined default will be used; + the default may depend on the code within the construct.' + (2.5.6). 
Thus an implementation is free to choose + non-unity default for a parallel region that doesn't have + any gang-partitioned loops. However, it appears that there + is a sufficient body of user code that expects non-gang + partitioned regions to not execute in gang-redundant mode. + So we (a) don't warn about the non-portability and (b) pick + the minimum permissible dimension size when there is no + partitioned execution. Otherwise we pick the global + default for the dimension, which the user can control. The + same wording and logic applies to num_workers and + vector_length, however the worker- or vector- single + execution doesn't have the same impact as gang-redundant + execution. (If the minimum gang-level partioning is not 1, + the target is probably too confusing.) */ + dims[ix] = (used & GOMP_DIM_MASK (ix) + ? oacc_default_dims[ix] : oacc_min_dims[ix]); + changed = true; + } + + if (changed) + { + /* Replace the attribute with new values. */ + pos = NULL_TREE; + for (ix = GOMP_DIM_MAX; ix--;) + { + pos = tree_cons (purpose[ix], + build_int_cst (integer_type_node, dims[ix]), + pos); + if (is_kernel) + TREE_PUBLIC (pos) = 1; + } + oacc_replace_fn_attrib (fn, pos); + } +} + +/* Create an empty OpenACC loop structure at LOC. */ + +static oacc_loop * +new_oacc_loop_raw (oacc_loop *parent, location_t loc) +{ + oacc_loop *loop = XCNEW (oacc_loop); + + loop->parent = parent; + loop->child = loop->sibling = NULL; + + if (parent) + { + loop->sibling = parent->child; + parent->child = loop; + } + + loop->loc = loc; + loop->marker = NULL; + memset (loop->heads, 0, sizeof (loop->heads)); + memset (loop->tails, 0, sizeof (loop->tails)); + loop->routine = NULL_TREE; + + loop->mask = loop->flags = loop->inner = 0; + loop->ifns = 0; + loop->chunk_size = 0; + loop->head_end = NULL; + + return loop; +} + +/* Create an outermost, dummy OpenACC loop for offloaded function + DECL. 
*/ + +static oacc_loop * +new_oacc_loop_outer (tree decl) +{ + return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl)); +} + +/* Start a new OpenACC loop structure beginning at head marker HEAD. + Link into PARENT loop. Return the new loop. */ + +static oacc_loop * +new_oacc_loop (oacc_loop *parent, gcall *marker) +{ + oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker)); + + loop->marker = marker; + + /* TODO: This is where device_type flattening would occur for the loop + flags. */ + + loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3)); + + tree chunk_size = integer_zero_node; + if (loop->flags & OLF_GANG_STATIC) + chunk_size = gimple_call_arg (marker, 4); + loop->chunk_size = chunk_size; + + return loop; +} + +/* Create a dummy loop encompassing a call to a openACC routine. + Extract the routine's partitioning requirements. */ + +static void +new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs) +{ + oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call)); + int level = oacc_fn_attrib_level (attrs); + + gcc_assert (level >= 0); + + loop->marker = call; + loop->routine = decl; + loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) + ^ (GOMP_DIM_MASK (level) - 1)); +} + +/* Finish off the current OpenACC loop ending at tail marker TAIL. + Return the parent loop. */ + +static oacc_loop * +finish_oacc_loop (oacc_loop *loop) +{ + /* If the loop has been collapsed, don't partition it. */ + if (!loop->ifns) + loop->mask = loop->flags = 0; + return loop->parent; +} + +/* Free all OpenACC loop structures within LOOP (inclusive). */ + +static void +free_oacc_loop (oacc_loop *loop) +{ + if (loop->sibling) + free_oacc_loop (loop->sibling); + if (loop->child) + free_oacc_loop (loop->child); + + free (loop); +} + +/* Dump out the OpenACC loop head or tail beginning at FROM. 
*/ + +static void +dump_oacc_loop_part (FILE *file, gcall *from, int depth, + const char *title, int level) +{ + enum ifn_unique_kind kind + = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0)); + + fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level); + for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;) + { + gimple *stmt = gsi_stmt (gsi); + + if (gimple_call_internal_p (stmt, IFN_UNIQUE)) + { + enum ifn_unique_kind k + = ((enum ifn_unique_kind) TREE_INT_CST_LOW + (gimple_call_arg (stmt, 0))); + + if (k == kind && stmt != from) + break; + } + print_gimple_stmt (file, stmt, depth * 2 + 2, 0); + + gsi_next (&gsi); + while (gsi_end_p (gsi)) + gsi = gsi_start_bb (single_succ (gsi_bb (gsi))); + } +} + +/* Dump OpenACC loops LOOP, its siblings and its children. */ + +static void +dump_oacc_loop (FILE *file, oacc_loop *loop, int depth) +{ + int ix; + + fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "", + loop->flags, loop->mask, + LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc)); + + if (loop->marker) + print_gimple_stmt (file, loop->marker, depth * 2, 0); + + if (loop->routine) + fprintf (file, "%*sRoutine %s:%u:%s\n", + depth * 2, "", DECL_SOURCE_FILE (loop->routine), + DECL_SOURCE_LINE (loop->routine), + IDENTIFIER_POINTER (DECL_NAME (loop->routine))); + + for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++) + if (loop->heads[ix]) + dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix); + for (ix = GOMP_DIM_MAX; ix--;) + if (loop->tails[ix]) + dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix); + + if (loop->child) + dump_oacc_loop (file, loop->child, depth + 1); + if (loop->sibling) + dump_oacc_loop (file, loop->sibling, depth); +} + +void debug_oacc_loop (oacc_loop *); + +/* Dump loops to stderr. */ + +DEBUG_FUNCTION void +debug_oacc_loop (oacc_loop *loop) +{ + dump_oacc_loop (stderr, loop, 0); +} + +/* DFS walk of basic blocks BB onwards, creating OpenACC loop + structures as we go. 
By construction these loops are properly + nested. */ + +static void +oacc_loop_discover_walk (oacc_loop *loop, basic_block bb) +{ + int marker = 0; + int remaining = 0; + + if (bb->flags & BB_VISITED) + return; + + follow: + bb->flags |= BB_VISITED; + + /* Scan for loop markers. */ + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (!is_gimple_call (stmt)) + continue; + + gcall *call = as_a <gcall *> (stmt); + + /* If this is a routine, make a dummy loop for it. */ + if (tree decl = gimple_call_fndecl (call)) + if (tree attrs = oacc_get_fn_attrib (decl)) + { + gcc_assert (!marker); + new_oacc_loop_routine (loop, call, decl, attrs); + } + + if (!gimple_call_internal_p (call)) + continue; + + switch (gimple_call_internal_fn (call)) + { + default: + break; + + case IFN_GOACC_LOOP: + /* Count the goacc loop abstraction fns, to determine if the + loop was collapsed already. */ + loop->ifns++; + break; + + case IFN_UNIQUE: + enum ifn_unique_kind kind + = (enum ifn_unique_kind) (TREE_INT_CST_LOW + (gimple_call_arg (call, 0))); + if (kind == IFN_UNIQUE_OACC_HEAD_MARK + || kind == IFN_UNIQUE_OACC_TAIL_MARK) + { + if (gimple_call_num_args (call) == 2) + { + gcc_assert (marker && !remaining); + marker = 0; + if (kind == IFN_UNIQUE_OACC_TAIL_MARK) + loop = finish_oacc_loop (loop); + else + loop->head_end = call; + } + else + { + int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); + + if (!marker) + { + if (kind == IFN_UNIQUE_OACC_HEAD_MARK) + loop = new_oacc_loop (loop, call); + remaining = count; + } + gcc_assert (count == remaining); + if (remaining) + { + remaining--; + if (kind == IFN_UNIQUE_OACC_HEAD_MARK) + loop->heads[marker] = call; + else + loop->tails[remaining] = call; + } + marker++; + } + } + } + } + if (remaining || marker) + { + bb = single_succ (bb); + gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED)); + goto follow; + } + + /* Walk successor blocks. 
*/ + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, bb->succs) + oacc_loop_discover_walk (loop, e->dest); +} + +/* LOOP is the first sibling. Reverse the order in place and return + the new first sibling. Recurse to child loops. */ + +static oacc_loop * +oacc_loop_sibling_nreverse (oacc_loop *loop) +{ + oacc_loop *last = NULL; + do + { + if (loop->child) + loop->child = oacc_loop_sibling_nreverse (loop->child); + + oacc_loop *next = loop->sibling; + loop->sibling = last; + last = loop; + loop = next; + } + while (loop); + + return last; +} + +/* Discover the OpenACC loops marked up by HEAD and TAIL markers for + the current function. */ + +static oacc_loop * +oacc_loop_discovery () +{ + /* Clear basic block flags, in particular BB_VISITED which we're going to use + in the following. */ + clear_bb_flags (); + + oacc_loop *top = new_oacc_loop_outer (current_function_decl); + oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun)); + + /* The siblings were constructed in reverse order, reverse them so + that diagnostics come out in an unsurprising order. */ + top = oacc_loop_sibling_nreverse (top); + + return top; +} + +/* Transform the abstract internal function markers starting at FROM + to be for partitioning level LEVEL. Stop when we meet another HEAD + or TAIL marker. 
*/ + +static void +oacc_loop_xform_head_tail (gcall *from, int level) +{ + enum ifn_unique_kind kind + = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0)); + tree replacement = build_int_cst (unsigned_type_node, level); + + for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;) + { + gimple *stmt = gsi_stmt (gsi); + + if (gimple_call_internal_p (stmt, IFN_UNIQUE)) + { + enum ifn_unique_kind k + = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); + + if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN) + *gimple_call_arg_ptr (stmt, 2) = replacement; + else if (k == kind && stmt != from) + break; + } + else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION)) + *gimple_call_arg_ptr (stmt, 3) = replacement; + + gsi_next (&gsi); + while (gsi_end_p (gsi)) + gsi = gsi_start_bb (single_succ (gsi_bb (gsi))); + } +} + +/* Transform the IFN_GOACC_LOOP internal functions by providing the + determined partitioning mask and chunking argument. END_MARKER + points at the end IFN_HEAD_TAIL call intgroducing the loop. IFNS + is the number of IFN_GOACC_LOOP calls for the loop. MASK_ARG is + the replacement partitioning mask and CHUNK_ARG is the replacement + chunking arg. */ + +static void +oacc_loop_xform_loop (gcall *end_marker, unsigned ifns, + tree mask_arg, tree chunk_arg) +{ + gimple_stmt_iterator gsi = gsi_for_stmt (end_marker); + + gcc_checking_assert (ifns); + for (;;) + { + for (; !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (!is_gimple_call (stmt)) + continue; + + gcall *call = as_a <gcall *> (stmt); + + if (!gimple_call_internal_p (call)) + continue; + + if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP) + continue; + + *gimple_call_arg_ptr (call, 5) = mask_arg; + *gimple_call_arg_ptr (call, 4) = chunk_arg; + ifns--; + if (!ifns) + return; + } + + /* The LOOP_BOUND ifn could be in the single successor + block. 
*/ + basic_block bb = single_succ (gsi_bb (gsi)); + gsi = gsi_start_bb (bb); + } +} + +/* Process the discovered OpenACC loops, setting the correct + partitioning level etc. */ + +static void +oacc_loop_process (oacc_loop *loop) +{ + if (loop->child) + oacc_loop_process (loop->child); + + if (loop->mask && !loop->routine) + { + int ix; + unsigned mask = loop->mask; + unsigned dim = GOMP_DIM_GANG; + tree mask_arg = build_int_cst (unsigned_type_node, mask); + tree chunk_arg = loop->chunk_size; + + oacc_loop_xform_loop (loop->head_end, loop->ifns, mask_arg, chunk_arg); + + for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++) + { + while (!(GOMP_DIM_MASK (dim) & mask)) + dim++; + + oacc_loop_xform_head_tail (loop->heads[ix], dim); + oacc_loop_xform_head_tail (loop->tails[ix], dim); + + mask ^= GOMP_DIM_MASK (dim); + } + } + + if (loop->sibling) + oacc_loop_process (loop->sibling); +} + +/* Walk the OpenACC loop heirarchy checking and assigning the + programmer-specified partitionings. OUTER_MASK is the partitioning + this loop is contained within. Return mask of partitioning + encountered. If any auto loops are discovered, set GOMP_DIM_MAX + bit. */ + +static unsigned +oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask) +{ + unsigned this_mask = loop->mask; + unsigned mask_all = 0; + bool noisy = true; + +#ifdef ACCEL_COMPILER + /* When device_type is supported, we want the device compiler to be + noisy, if the loop parameters are device_type-specific. */ + noisy = false; +#endif + + if (!loop->routine) + { + bool auto_par = (loop->flags & OLF_AUTO) != 0; + bool seq_par = (loop->flags & OLF_SEQ) != 0; + + this_mask = ((loop->flags >> OLF_DIM_BASE) + & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)); + + if ((this_mask != 0) + auto_par + seq_par > 1) + { + if (noisy) + error_at (loop->loc, + seq_par + ? 
"%<seq%> overrides other OpenACC loop specifiers" + : "%<auto%> conflicts with other OpenACC loop " + "specifiers"); + auto_par = false; + loop->flags &= ~OLF_AUTO; + if (seq_par) + { + loop->flags &= + ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE); + this_mask = 0; + } + } + if (auto_par && (loop->flags & OLF_INDEPENDENT)) + mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX); + } + + if (this_mask & outer_mask) + { + const oacc_loop *outer; + for (outer = loop->parent; outer; outer = outer->parent) + if (outer->mask & this_mask) + break; + + if (noisy) + { + if (outer) + { + error_at (loop->loc, + "%s uses same OpenACC parallelism as containing loop", + loop->routine ? "routine call" : "inner loop"); + inform (outer->loc, "containing loop here"); + } + else + error_at (loop->loc, + "%s uses OpenACC parallelism disallowed by containing " + "routine", loop->routine ? "routine call" : "loop"); + + if (loop->routine) + inform (DECL_SOURCE_LOCATION (loop->routine), + "routine %qD declared here", loop->routine); + } + this_mask &= ~outer_mask; + } + else + { + unsigned outermost = least_bit_hwi (this_mask); + + if (outermost && outermost <= outer_mask) + { + if (noisy) + { + error_at (loop->loc, + "incorrectly nested OpenACC loop parallelism"); + + const oacc_loop *outer; + for (outer = loop->parent; + outer->flags && outer->flags < outermost; + outer = outer->parent) + continue; + inform (outer->loc, "containing loop here"); + } + + this_mask &= ~outermost; + } + } + + loop->mask = this_mask; + mask_all |= this_mask; + + if (loop->child) + { + loop->inner = oacc_loop_fixed_partitions (loop->child, + outer_mask | this_mask); + mask_all |= loop->inner; + } + + if (loop->sibling) + mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask); + + return mask_all; +} + +/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops. + OUTER_MASK is the partitioning this loop is contained within. 
+ Return the cumulative partitioning used by this loop, siblings and + children. */ + +static unsigned +oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask) +{ + bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT); + bool noisy = true; + +#ifdef ACCEL_COMPILER + /* When device_type is supported, we want the device compiler to be + noisy, if the loop parameters are device_type-specific. */ + noisy = false; +#endif + + if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1)) + { + /* Allocate the outermost loop at the outermost available + level. */ + unsigned this_mask = outer_mask + 1; + + if (!(this_mask & loop->inner)) + loop->mask = this_mask; + } + + if (loop->child) + { + unsigned child_mask = outer_mask | loop->mask; + + if (loop->mask || assign) + child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX); + + loop->inner = oacc_loop_auto_partitions (loop->child, child_mask); + } + + if (assign && !loop->mask) + { + /* Allocate the loop at the innermost available level. */ + unsigned this_mask = 0; + + /* Determine the outermost partitioning used within this loop. */ + this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX); + this_mask = least_bit_hwi (this_mask); + + /* Pick the partitioning just inside that one. */ + this_mask >>= 1; + + /* And avoid picking one use by an outer loop. */ + this_mask &= ~outer_mask; + + if (!this_mask && noisy) + warning_at (loop->loc, 0, + "insufficient partitioning available to parallelize loop"); + + loop->mask = this_mask; + } + + if (assign && dump_file) + fprintf (dump_file, "Auto loop %s:%d assigned %d\n", + LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc), + loop->mask); + + unsigned inner_mask = 0; + + if (loop->sibling) + inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask); + + inner_mask |= loop->inner | loop->mask; + + return inner_mask; +} + +/* Walk the OpenACC loop heirarchy to check and assign partitioning + axes. Return mask of partitioning. 
*/ + +static unsigned +oacc_loop_partition (oacc_loop *loop, unsigned outer_mask) +{ + unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask); + + if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX)) + { + mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX); + mask_all |= oacc_loop_auto_partitions (loop, outer_mask); + } + return mask_all; +} + +/* Default fork/join early expander. Delete the function calls if + there is no RTL expander. */ + +bool +default_goacc_fork_join (gcall *ARG_UNUSED (call), + const int *ARG_UNUSED (dims), bool is_fork) +{ + if (is_fork) + return targetm.have_oacc_fork (); + else + return targetm.have_oacc_join (); +} + +/* Default goacc.reduction early expander. + + LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET) + If RES_PTR is not integer-zerop: + SETUP - emit 'LHS = *RES_PTR', LHS = NULL + TEARDOWN - emit '*RES_PTR = VAR' + If LHS is not NULL + emit 'LHS = VAR' */ + +void +default_goacc_reduction (gcall *call) +{ + unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0)); + gimple_stmt_iterator gsi = gsi_for_stmt (call); + tree lhs = gimple_call_lhs (call); + tree var = gimple_call_arg (call, 2); + gimple_seq seq = NULL; + + if (code == IFN_GOACC_REDUCTION_SETUP + || code == IFN_GOACC_REDUCTION_TEARDOWN) + { + /* Setup and Teardown need to copy from/to the receiver object, + if there is one. */ + tree ref_to_res = gimple_call_arg (call, 1); + + if (!integer_zerop (ref_to_res)) + { + tree dst = build_simple_mem_ref (ref_to_res); + tree src = var; + + if (code == IFN_GOACC_REDUCTION_SETUP) + { + src = dst; + dst = lhs; + lhs = NULL; + } + gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src)); + } + } + + /* Copy VAR to LHS, if there is an LHS. 
*/ + if (lhs) + gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var)); + + gsi_replace_with_seq (&gsi, seq, true); +} + +/* Main entry point for oacc transformations which run on the device + compiler after LTO, so we know what the target device is at this + point (including the host fallback). */ + +static unsigned int +execute_oacc_device_lower () +{ + tree attrs = oacc_get_fn_attrib (current_function_decl); + + if (!attrs) + /* Not an offloaded function. */ + return 0; + + /* Parse the default dim argument exactly once. */ + if ((const void *)flag_openacc_dims != &flag_openacc_dims) + { + oacc_parse_default_dims (flag_openacc_dims); + flag_openacc_dims = (char *)&flag_openacc_dims; + } + + /* Discover, partition and process the loops. */ + oacc_loop *loops = oacc_loop_discovery (); + int fn_level = oacc_fn_attrib_level (attrs); + + if (dump_file) + fprintf (dump_file, oacc_fn_attrib_kernels_p (attrs) + ? "Function is kernels offload\n" + : fn_level < 0 ? "Function is parallel offload\n" + : "Function is routine level %d\n", fn_level); + + unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0; + unsigned used_mask = oacc_loop_partition (loops, outer_mask); + int dims[GOMP_DIM_MAX]; + + oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask); + + if (dump_file) + { + const char *comma = "Compute dimensions ["; + for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ") + fprintf (dump_file, "%s%d", comma, dims[ix]); + fprintf (dump_file, "]\n"); + } + + oacc_loop_process (loops); + if (dump_file) + { + fprintf (dump_file, "OpenACC loops\n"); + dump_oacc_loop (dump_file, loops, 0); + fprintf (dump_file, "\n"); + } + + /* Offloaded targets may introduce new basic blocks, which require + dominance information to update SSA. */ + calculate_dominance_info (CDI_DOMINATORS); + + /* Now lower internal loop functions to target-specific code + sequences. 
*/ + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);) + { + gimple *stmt = gsi_stmt (gsi); + if (!is_gimple_call (stmt)) + { + gsi_next (&gsi); + continue; + } + + gcall *call = as_a <gcall *> (stmt); + if (!gimple_call_internal_p (call)) + { + gsi_next (&gsi); + continue; + } + + /* Rewind to allow rescan. */ + gsi_prev (&gsi); + bool rescan = false, remove = false; + enum internal_fn ifn_code = gimple_call_internal_fn (call); + + switch (ifn_code) + { + default: break; + + case IFN_GOACC_LOOP: + oacc_xform_loop (call); + rescan = true; + break; + + case IFN_GOACC_REDUCTION: + /* Mark the function for SSA renaming. */ + mark_virtual_operands_for_renaming (cfun); + + /* If the level is -1, this ended up being an unused + axis. Handle as a default. */ + if (integer_minus_onep (gimple_call_arg (call, 3))) + default_goacc_reduction (call); + else + targetm.goacc.reduction (call); + rescan = true; + break; + + case IFN_UNIQUE: + { + enum ifn_unique_kind kind + = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (call, 0))); + + switch (kind) + { + default: + gcc_unreachable (); + + case IFN_UNIQUE_OACC_FORK: + case IFN_UNIQUE_OACC_JOIN: + if (integer_minus_onep (gimple_call_arg (call, 2))) + remove = true; + else if (!targetm.goacc.fork_join + (call, dims, kind == IFN_UNIQUE_OACC_FORK)) + remove = true; + break; + + case IFN_UNIQUE_OACC_HEAD_MARK: + case IFN_UNIQUE_OACC_TAIL_MARK: + remove = true; + break; + } + break; + } + } + + if (gsi_end_p (gsi)) + /* We rewound past the beginning of the BB. */ + gsi = gsi_start_bb (bb); + else + /* Undo the rewind. */ + gsi_next (&gsi); + + if (remove) + { + if (gimple_vdef (call)) + replace_uses_by (gimple_vdef (call), gimple_vuse (call)); + if (gimple_call_lhs (call)) + { + /* Propagate the data dependency var. 
*/ + gimple *ass = gimple_build_assign (gimple_call_lhs (call), + gimple_call_arg (call, 1)); + gsi_replace (&gsi, ass, false); + } + else + gsi_remove (&gsi, true); + } + else if (!rescan) + /* If not rescanning, advance over the call. */ + gsi_next (&gsi); + } + + free_oacc_loop (loops); + + return 0; +} + +/* Default launch dimension validator. Force everything to 1. A + backend that wants to provide larger dimensions must override this + hook. */ + +bool +default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims, + int ARG_UNUSED (fn_level)) +{ + bool changed = false; + + for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++) + { + if (dims[ix] != 1) + { + dims[ix] = 1; + changed = true; + } + } + + return changed; +} + +/* Default dimension bound is unknown on accelerator and 1 on host. */ + +int +default_goacc_dim_limit (int ARG_UNUSED (axis)) +{ +#ifdef ACCEL_COMPILER + return 0; +#else + return 1; +#endif +} + +namespace { + +const pass_data pass_data_oacc_device_lower = +{ + GIMPLE_PASS, /* type */ + "oaccdevlow", /* name */ + OPTGROUP_OPENMP, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_cfg, /* properties_required */ + 0 /* Possibly PROP_gimple_eomp. 
*/, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */ +}; + +class pass_oacc_device_lower : public gimple_opt_pass +{ +public: + pass_oacc_device_lower (gcc::context *ctxt) + : gimple_opt_pass (pass_data_oacc_device_lower, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) { return flag_openacc; }; + + virtual unsigned int execute (function *) + { + return execute_oacc_device_lower (); + } + +}; // class pass_oacc_device_lower + +} // anon namespace + +gimple_opt_pass * +make_pass_oacc_device_lower (gcc::context *ctxt) +{ + return new pass_oacc_device_lower (ctxt); +} + +/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets, + VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and + LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT + internal functions on non-SIMT targets, and likewise some SIMD internal + functions on SIMT targets. */ + +static unsigned int +execute_omp_device_lower () +{ + int vf = targetm.simt.vf ? targetm.simt.vf () : 1; + basic_block bb; + gimple_stmt_iterator gsi; + FOR_EACH_BB_FN (bb, cfun) + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt)) + continue; + tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE; + tree type = lhs ? TREE_TYPE (lhs) : integer_type_node; + switch (gimple_call_internal_fn (stmt)) + { + case IFN_GOMP_USE_SIMT: + rhs = vf == 1 ? integer_zero_node : integer_one_node; + break; + case IFN_GOMP_SIMT_LANE: + case IFN_GOMP_SIMT_LAST_LANE: + rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE; + break; + case IFN_GOMP_SIMT_VF: + rhs = build_int_cst (type, vf); + break; + case IFN_GOMP_SIMT_ORDERED_PRED: + rhs = vf == 1 ? 
integer_zero_node : NULL_TREE; + if (rhs || !lhs) + unlink_stmt_vdef (stmt); + break; + case IFN_GOMP_SIMT_VOTE_ANY: + case IFN_GOMP_SIMT_XCHG_BFLY: + case IFN_GOMP_SIMT_XCHG_IDX: + rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE; + break; + case IFN_GOMP_SIMD_LANE: + case IFN_GOMP_SIMD_LAST_LANE: + rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE; + break; + case IFN_GOMP_SIMD_VF: + rhs = vf != 1 ? build_one_cst (type) : NULL_TREE; + break; + default: + continue; + } + if (lhs && !rhs) + continue; + stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop (); + gsi_replace (&gsi, stmt, false); + } + if (vf != 1) + cfun->has_force_vectorize_loops = false; + return 0; +} + +namespace { + +const pass_data pass_data_omp_device_lower = +{ + GIMPLE_PASS, /* type */ + "ompdevlow", /* name */ + OPTGROUP_OPENMP, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_cfg, /* properties_required */ + PROP_gimple_lomp_dev, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa, /* todo_flags_finish */ +}; + +class pass_omp_device_lower : public gimple_opt_pass +{ +public: + pass_omp_device_lower (gcc::context *ctxt) + : gimple_opt_pass (pass_data_omp_device_lower, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *ARG_UNUSED (fun)) + { + /* FIXME: this should use PROP_gimple_lomp_dev. */ +#ifdef ACCEL_COMPILER + return true; +#else + return ENABLE_OFFLOADING && (flag_openmp || in_lto_p); +#endif + } + virtual unsigned int execute (function *) + { + return execute_omp_device_lower (); + } + +}; // class pass_expand_omp_ssa + +} // anon namespace + +gimple_opt_pass * +make_pass_omp_device_lower (gcc::context *ctxt) +{ + return new pass_omp_device_lower (ctxt); +} + +/* "omp declare target link" handling pass. 
*/ + +namespace { + +const pass_data pass_data_omp_target_link = +{ + GIMPLE_PASS, /* type */ + "omptargetlink", /* name */ + OPTGROUP_OPENMP, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_ssa, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa, /* todo_flags_finish */ +}; + +class pass_omp_target_link : public gimple_opt_pass +{ +public: + pass_omp_target_link (gcc::context *ctxt) + : gimple_opt_pass (pass_data_omp_target_link, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *fun) + { +#ifdef ACCEL_COMPILER + tree attrs = DECL_ATTRIBUTES (fun->decl); + return lookup_attribute ("omp declare target", attrs) + || lookup_attribute ("omp target entrypoint", attrs); +#else + (void) fun; + return false; +#endif + } + + virtual unsigned execute (function *); +}; + +/* Callback for walk_gimple_stmt used to scan for link var operands. */ + +static tree +find_link_var_op (tree *tp, int *walk_subtrees, void *) +{ + tree t = *tp; + + if (VAR_P (t) && DECL_HAS_VALUE_EXPR_P (t) + && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t))) + { + *walk_subtrees = 0; + return t; + } + + return NULL_TREE; +} + +unsigned +pass_omp_target_link::execute (function *fun) +{ + basic_block bb; + FOR_EACH_BB_FN (bb, fun) + { + gimple_stmt_iterator gsi; + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL)) + gimple_regimplify_operands (gsi_stmt (gsi), &gsi); + } + + return 0; +} + +} // anon namespace + +gimple_opt_pass * +make_pass_omp_target_link (gcc::context *ctxt) +{ + return new pass_omp_target_link (ctxt); +} |