path: root/libgomp/plugin
author     Thomas Schwinge <thomas@codesourcery.com>    2015-01-15 21:11:12 +0100
committer  Thomas Schwinge <tschwinge@gcc.gnu.org>      2015-01-15 21:11:12 +0100
commit     41dbbb3789850dfea98dd8984f69806284f87b6e (patch)
tree       97a0bb274cc7583206397ba37ab5c0bbe01cb04d /libgomp/plugin
parent     96a87981994da859c17259d8c4dccb6602476b0e (diff)
Merge current set of OpenACC changes from gomp-4_0-branch.
contrib/ * gcc_update (files_and_dependencies): Update rules for new libgomp/plugin/Makefrag.am and libgomp/plugin/configfrag.ac files. gcc/ * builtin-types.def (BT_FN_VOID_INT_INT_VAR) (BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR_INT_INT_VAR) (BT_FN_VOID_INT_OMPFN_PTR_SIZE_PTR_PTR_PTR_INT_INT_INT_INT_INT_VAR): New function types. * builtins.c: Include "gomp-constants.h". (expand_builtin_acc_on_device): New function. (expand_builtin, is_inexpensive_builtin): Handle BUILT_IN_ACC_ON_DEVICE. * builtins.def (DEF_GOACC_BUILTIN, DEF_GOACC_BUILTIN_COMPILER): New macros. * cgraph.c (cgraph_node::create): Consider flag_openacc next to flag_openmp. * config.gcc <nvptx-*> (tm_file): Add nvptx/offload.h. <*-intelmic-* | *-intelmicemul-*> (tm_file): Add i386/intelmic-offload.h. * gcc.c (LINK_COMMAND_SPEC, GOMP_SELF_SPECS): For -fopenacc, link to libgomp and its dependencies. * config/arc/arc.h (LINK_COMMAND_SPEC): Likewise. * config/darwin.h (LINK_COMMAND_SPEC_A): Likewise. * config/i386/mingw32.h (GOMP_SELF_SPECS): Likewise. * config/ia64/hpux.h (LIB_SPEC): Likewise. * config/pa/pa-hpux11.h (LIB_SPEC): Likewise. * config/pa/pa64-hpux.h (LIB_SPEC): Likewise. * doc/generic.texi: Update for OpenACC changes. * doc/gimple.texi: Likewise. * doc/invoke.texi: Likewise. * doc/sourcebuild.texi: Likewise. * gimple-pretty-print.c (dump_gimple_omp_for): Handle GF_OMP_FOR_KIND_OACC_LOOP. (dump_gimple_omp_target): Handle GF_OMP_TARGET_KIND_OACC_KERNELS, GF_OMP_TARGET_KIND_OACC_PARALLEL, GF_OMP_TARGET_KIND_OACC_DATA, GF_OMP_TARGET_KIND_OACC_UPDATE, GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA. Dump more data. * gimple.c: Update comments for OpenACC changes. * gimple.def: Likewise. * gimple.h: Likewise. (enum gf_mask): Add GF_OMP_FOR_KIND_OACC_LOOP, GF_OMP_TARGET_KIND_OACC_PARALLEL, GF_OMP_TARGET_KIND_OACC_KERNELS, GF_OMP_TARGET_KIND_OACC_DATA, GF_OMP_TARGET_KIND_OACC_UPDATE, GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA. (gimple_omp_for_cond, gimple_omp_for_set_cond): Sort in the appropriate place. (is_gimple_omp_oacc, is_gimple_omp_offloaded): New functions. * gimplify.c: Include "gomp-constants.h". Update comments for OpenACC changes. (is_gimple_stmt): Handle OACC_PARALLEL, OACC_KERNELS, OACC_DATA, OACC_HOST_DATA, OACC_DECLARE, OACC_UPDATE, OACC_ENTER_DATA, OACC_EXIT_DATA, OACC_CACHE, OACC_LOOP. (gimplify_scan_omp_clauses, gimplify_adjust_omp_clauses): Handle OMP_CLAUSE__CACHE_, OMP_CLAUSE_ASYNC, OMP_CLAUSE_WAIT, OMP_CLAUSE_NUM_GANGS, OMP_CLAUSE_NUM_WORKERS, OMP_CLAUSE_VECTOR_LENGTH, OMP_CLAUSE_GANG, OMP_CLAUSE_WORKER, OMP_CLAUSE_VECTOR, OMP_CLAUSE_DEVICE_RESIDENT, OMP_CLAUSE_USE_DEVICE, OMP_CLAUSE_INDEPENDENT, OMP_CLAUSE_AUTO, OMP_CLAUSE_SEQ. (gimplify_adjust_omp_clauses_1, gimplify_adjust_omp_clauses): Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. Use OMP_CLAUSE_SET_MAP_KIND. (gimplify_oacc_cache): New function. (gimplify_omp_for): Handle OACC_LOOP. (gimplify_omp_workshare): Handle OACC_KERNELS, OACC_PARALLEL, OACC_DATA. (gimplify_omp_target_update): Handle OACC_ENTER_DATA, OACC_EXIT_DATA, OACC_UPDATE. (gimplify_expr): Handle OACC_LOOP, OACC_CACHE, OACC_HOST_DATA, OACC_DECLARE, OACC_KERNELS, OACC_PARALLEL, OACC_DATA, OACC_ENTER_DATA, OACC_EXIT_DATA, OACC_UPDATE. (gimplify_body): Consider flag_openacc next to flag_openmp. * lto-streamer-out.c: Include "gomp-constants.h". 
* omp-builtins.def (BUILT_IN_ACC_GET_DEVICE_TYPE) (BUILT_IN_GOACC_DATA_START, BUILT_IN_GOACC_DATA_END) (BUILT_IN_GOACC_ENTER_EXIT_DATA, BUILT_IN_GOACC_PARALLEL) (BUILT_IN_GOACC_UPDATE, BUILT_IN_GOACC_WAIT) (BUILT_IN_GOACC_GET_THREAD_NUM, BUILT_IN_GOACC_GET_NUM_THREADS) (BUILT_IN_ACC_ON_DEVICE): New builtins. * omp-low.c: Include "gomp-constants.h". Update comments for OpenACC changes. (struct omp_context): Add reduction_map, gwv_below, gwv_this members. (extract_omp_for_data, use_pointer_for_field, install_var_field) (new_omp_context, delete_omp_context, scan_sharing_clauses) (create_omp_child_function, scan_omp_for, scan_omp_target) (check_omp_nesting_restrictions, lower_reduction_clauses) (build_omp_regions_1, diagnose_sb_0, make_gimple_omp_edges): Update for OpenACC changes. (scan_sharing_clauses): Handle OMP_CLAUSE_NUM_GANGS: OMP_CLAUSE_NUM_WORKERS: OMP_CLAUSE_VECTOR_LENGTH, OMP_CLAUSE_ASYNC, OMP_CLAUSE_WAIT, OMP_CLAUSE_GANG, OMP_CLAUSE_WORKER, OMP_CLAUSE_VECTOR, OMP_CLAUSE_DEVICE_RESIDENT, OMP_CLAUSE_USE_DEVICE, OMP_CLAUSE__CACHE_, OMP_CLAUSE_INDEPENDENT, OMP_CLAUSE_AUTO, OMP_CLAUSE_SEQ. Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. (expand_omp_for_static_nochunk, expand_omp_for_static_chunk): Handle GF_OMP_FOR_KIND_OACC_LOOP. (expand_omp_target, lower_omp_target): Handle GF_OMP_TARGET_KIND_OACC_PARALLEL, GF_OMP_TARGET_KIND_OACC_KERNELS, GF_OMP_TARGET_KIND_OACC_UPDATE, GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA, GF_OMP_TARGET_KIND_OACC_DATA. (pass_expand_omp::execute, execute_lower_omp) (pass_diagnose_omp_blocks::gate): Consider flag_openacc next to flag_openmp. (offload_symbol_decl): New variable. (oacc_get_reduction_array_id, oacc_max_threads) (get_offload_symbol_decl, get_base_type, lookup_oacc_reduction) (maybe_lookup_oacc_reduction, enclosing_target_ctx) (oacc_loop_or_target_p, oacc_lower_reduction_var_helper) (oacc_gimple_assign, oacc_initialize_reduction_data) (oacc_finalize_reduction_data, oacc_process_reduction_data): New functions. (is_targetreg_ctx): Remove function. * tree-core.h (enum omp_clause_code): Add OMP_CLAUSE__CACHE_, OMP_CLAUSE_DEVICE_RESIDENT, OMP_CLAUSE_USE_DEVICE, OMP_CLAUSE_GANG, OMP_CLAUSE_ASYNC, OMP_CLAUSE_WAIT, OMP_CLAUSE_AUTO, OMP_CLAUSE_SEQ, OMP_CLAUSE_INDEPENDENT, OMP_CLAUSE_WORKER, OMP_CLAUSE_VECTOR, OMP_CLAUSE_NUM_GANGS, OMP_CLAUSE_NUM_WORKERS, OMP_CLAUSE_VECTOR_LENGTH. * tree.c (omp_clause_code_name, walk_tree_1): Update accordingly. * tree.h (OMP_CLAUSE_GANG_EXPR, OMP_CLAUSE_GANG_STATIC_EXPR) (OMP_CLAUSE_ASYNC_EXPR, OMP_CLAUSE_WAIT_EXPR) (OMP_CLAUSE_VECTOR_EXPR, OMP_CLAUSE_WORKER_EXPR) (OMP_CLAUSE_NUM_GANGS_EXPR, OMP_CLAUSE_NUM_WORKERS_EXPR) (OMP_CLAUSE_VECTOR_LENGTH_EXPR): New macros. * tree-core.h: Update comments for OpenACC changes. (enum omp_clause_map_kind): Remove. (struct tree_omp_clause): Change type of map_kind member from enum omp_clause_map_kind to unsigned char. * tree-inline.c: Update comments for OpenACC changes. * tree-nested.c: Likewise. Include "gomp-constants.h". (convert_nonlocal_reference_stmt, convert_local_reference_stmt) (convert_tramp_reference_stmt, convert_gimple_call): Update for OpenACC changes. Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. Use OMP_CLAUSE_SET_MAP_KIND. * tree-pretty-print.c: Include "gomp-constants.h". 
(dump_omp_clause): Handle OMP_CLAUSE_DEVICE_RESIDENT, OMP_CLAUSE_USE_DEVICE, OMP_CLAUSE__CACHE_, OMP_CLAUSE_GANG, OMP_CLAUSE_ASYNC, OMP_CLAUSE_AUTO, OMP_CLAUSE_SEQ, OMP_CLAUSE_WAIT, OMP_CLAUSE_WORKER, OMP_CLAUSE_VECTOR, OMP_CLAUSE_NUM_GANGS, OMP_CLAUSE_NUM_WORKERS, OMP_CLAUSE_VECTOR_LENGTH, OMP_CLAUSE_INDEPENDENT. Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. (dump_generic_node): Handle OACC_PARALLEL, OACC_KERNELS, OACC_DATA, OACC_HOST_DATA, OACC_DECLARE, OACC_UPDATE, OACC_ENTER_DATA, OACC_EXIT_DATA, OACC_CACHE, OACC_LOOP. * tree-streamer-in.c: Include "gomp-constants.h". (unpack_ts_omp_clause_value_fields) Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. Use OMP_CLAUSE_SET_MAP_KIND. * tree-streamer-out.c: Include "gomp-constants.h". (pack_ts_omp_clause_value_fields): Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. * tree.def (OACC_PARALLEL, OACC_KERNELS, OACC_DATA) (OACC_HOST_DATA, OACC_LOOP, OACC_CACHE, OACC_DECLARE) (OACC_ENTER_DATA, OACC_EXIT_DATA, OACC_UPDATE): New tree codes. * tree.c (omp_clause_num_ops): Update accordingly. * tree.h (OMP_BODY, OMP_CLAUSES, OMP_LOOP_CHECK, OMP_CLAUSE_SIZE): Likewise. (OACC_PARALLEL_BODY, OACC_PARALLEL_CLAUSES, OACC_KERNELS_BODY) (OACC_KERNELS_CLAUSES, OACC_DATA_BODY, OACC_DATA_CLAUSES) (OACC_HOST_DATA_BODY, OACC_HOST_DATA_CLAUSES, OACC_CACHE_CLAUSES) (OACC_DECLARE_CLAUSES, OACC_ENTER_DATA_CLAUSES) (OACC_EXIT_DATA_CLAUSES, OACC_UPDATE_CLAUSES) (OACC_KERNELS_COMBINED, OACC_PARALLEL_COMBINED): New macros. * tree.h (OMP_CLAUSE_MAP_KIND): Cast it to enum gomp_map_kind. (OMP_CLAUSE_SET_MAP_KIND): New macro. * varpool.c (varpool_node::get_create): Consider flag_openacc next to flag_openmp. * config/i386/intelmic-offload.h: New file. * config/nvptx/offload.h: Likewise. gcc/ada/ * gcc-interface/utils.c (DEF_FUNCTION_TYPE_VAR_8) (DEF_FUNCTION_TYPE_VAR_12): New macros. gcc/c-family/ * c.opt (fopenacc): New option. * c-cppbuiltin.c (c_cpp_builtins): Conditionally define _OPENACC. * c-common.c (DEF_FUNCTION_TYPE_VAR_8, DEF_FUNCTION_TYPE_VAR_12): New macros. * c-common.h (c_finish_oacc_wait): New prototype. * c-omp.c: Include "omp-low.h" and "gomp-constants.h". (c_finish_oacc_wait): New function. * c-pragma.c (oacc_pragmas): New variable. (c_pp_lookup_pragma, init_pragma): Handle it. * c-pragma.h (enum pragma_kind): Add PRAGMA_OACC_CACHE, PRAGMA_OACC_DATA, PRAGMA_OACC_ENTER_DATA, PRAGMA_OACC_EXIT_DATA, PRAGMA_OACC_KERNELS, PRAGMA_OACC_LOOP, PRAGMA_OACC_PARALLEL, PRAGMA_OACC_UPDATE, PRAGMA_OACC_WAIT. (enum pragma_omp_clause): Add PRAGMA_OACC_CLAUSE_ASYNC, PRAGMA_OACC_CLAUSE_AUTO, PRAGMA_OACC_CLAUSE_COLLAPSE, PRAGMA_OACC_CLAUSE_COPY, PRAGMA_OACC_CLAUSE_COPYIN, PRAGMA_OACC_CLAUSE_COPYOUT, PRAGMA_OACC_CLAUSE_CREATE, PRAGMA_OACC_CLAUSE_DELETE, PRAGMA_OACC_CLAUSE_DEVICE, PRAGMA_OACC_CLAUSE_DEVICEPTR, PRAGMA_OACC_CLAUSE_FIRSTPRIVATE, PRAGMA_OACC_CLAUSE_GANG, PRAGMA_OACC_CLAUSE_HOST, PRAGMA_OACC_CLAUSE_IF, PRAGMA_OACC_CLAUSE_NUM_GANGS, PRAGMA_OACC_CLAUSE_NUM_WORKERS, PRAGMA_OACC_CLAUSE_PRESENT, PRAGMA_OACC_CLAUSE_PRESENT_OR_COPY, PRAGMA_OACC_CLAUSE_PRESENT_OR_COPYIN, PRAGMA_OACC_CLAUSE_PRESENT_OR_COPYOUT, PRAGMA_OACC_CLAUSE_PRESENT_OR_CREATE, PRAGMA_OACC_CLAUSE_PRIVATE, PRAGMA_OACC_CLAUSE_REDUCTION, PRAGMA_OACC_CLAUSE_SELF, PRAGMA_OACC_CLAUSE_SEQ, PRAGMA_OACC_CLAUSE_VECTOR, PRAGMA_OACC_CLAUSE_VECTOR_LENGTH, PRAGMA_OACC_CLAUSE_WAIT, PRAGMA_OACC_CLAUSE_WORKER. gcc/c/ * c-parser.c: Include "gomp-constants.h". (c_parser_omp_clause_map): Use enum gomp_map_kind instead of enum omp_clause_map_kind. Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. Use OMP_CLAUSE_SET_MAP_KIND. 
(c_parser_pragma): Handle PRAGMA_OACC_ENTER_DATA, PRAGMA_OACC_EXIT_DATA, PRAGMA_OACC_UPDATE. (c_parser_omp_construct): Handle PRAGMA_OACC_CACHE, PRAGMA_OACC_DATA, PRAGMA_OACC_KERNELS, PRAGMA_OACC_LOOP, PRAGMA_OACC_PARALLEL, PRAGMA_OACC_WAIT. (c_parser_omp_clause_name): Handle "auto", "async", "copy", "copyout", "create", "delete", "deviceptr", "gang", "host", "num_gangs", "num_workers", "present", "present_or_copy", "pcopy", "present_or_copyin", "pcopyin", "present_or_copyout", "pcopyout", "present_or_create", "pcreate", "seq", "self", "vector", "vector_length", "wait", "worker". (OACC_DATA_CLAUSE_MASK, OACC_KERNELS_CLAUSE_MASK) (OACC_ENTER_DATA_CLAUSE_MASK, OACC_EXIT_DATA_CLAUSE_MASK) (OACC_LOOP_CLAUSE_MASK, OACC_PARALLEL_CLAUSE_MASK) (OACC_UPDATE_CLAUSE_MASK, OACC_WAIT_CLAUSE_MASK): New macros. (c_parser_omp_variable_list): Handle OMP_CLAUSE__CACHE_. (c_parser_oacc_wait_list, c_parser_oacc_data_clause) (c_parser_oacc_data_clause_deviceptr) (c_parser_omp_clause_num_gangs, c_parser_omp_clause_num_workers) (c_parser_oacc_clause_async, c_parser_oacc_clause_wait) (c_parser_omp_clause_vector_length, c_parser_oacc_all_clauses) (c_parser_oacc_cache, c_parser_oacc_data, c_parser_oacc_kernels) (c_parser_oacc_enter_exit_data, c_parser_oacc_loop) (c_parser_oacc_parallel, c_parser_oacc_update) (c_parser_oacc_wait): New functions. * c-tree.h (c_finish_oacc_parallel, c_finish_oacc_kernels) (c_finish_oacc_data): New prototypes. * c-typeck.c: Include "gomp-constants.h". (handle_omp_array_sections): Handle GOMP_MAP_FORCE_DEVICEPTR. Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. Use OMP_CLAUSE_SET_MAP_KIND. (c_finish_oacc_parallel, c_finish_oacc_kernels) (c_finish_oacc_data): New functions. (c_finish_omp_clauses): Handle OMP_CLAUSE__CACHE_, OMP_CLAUSE_NUM_GANGS, OMP_CLAUSE_NUM_WORKERS, OMP_CLAUSE_VECTOR_LENGTH, OMP_CLAUSE_ASYNC, OMP_CLAUSE_WAIT, OMP_CLAUSE_AUTO, OMP_CLAUSE_SEQ, OMP_CLAUSE_GANG, OMP_CLAUSE_WORKER, OMP_CLAUSE_VECTOR, and OMP_CLAUSE_MAP's GOMP_MAP_FORCE_DEVICEPTR. gcc/cp/ * parser.c: Include "gomp-constants.h". (cp_parser_omp_clause_map): Use enum gomp_map_kind instead of enum omp_clause_map_kind. Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. Use OMP_CLAUSE_SET_MAP_KIND. (cp_parser_omp_construct, cp_parser_pragma): Handle PRAGMA_OACC_CACHE, PRAGMA_OACC_DATA, PRAGMA_OACC_ENTER_DATA, PRAGMA_OACC_EXIT_DATA, PRAGMA_OACC_KERNELS, PRAGMA_OACC_PARALLEL, PRAGMA_OACC_LOOP, PRAGMA_OACC_UPDATE, PRAGMA_OACC_WAIT. (cp_parser_omp_clause_name): Handle "async", "copy", "copyout", "create", "delete", "deviceptr", "host", "num_gangs", "num_workers", "present", "present_or_copy", "pcopy", "present_or_copyin", "pcopyin", "present_or_copyout", "pcopyout", "present_or_create", "pcreate", "vector_length", "wait". (OACC_DATA_CLAUSE_MASK, OACC_ENTER_DATA_CLAUSE_MASK) (OACC_EXIT_DATA_CLAUSE_MASK, OACC_KERNELS_CLAUSE_MASK) (OACC_LOOP_CLAUSE_MASK, OACC_PARALLEL_CLAUSE_MASK) (OACC_UPDATE_CLAUSE_MASK, OACC_WAIT_CLAUSE_MASK): New macros. (cp_parser_omp_var_list_no_open): Handle OMP_CLAUSE__CACHE_. (cp_parser_oacc_data_clause, cp_parser_oacc_data_clause_deviceptr) (cp_parser_oacc_clause_vector_length, cp_parser_oacc_wait_list) (cp_parser_oacc_clause_wait, cp_parser_omp_clause_num_gangs) (cp_parser_omp_clause_num_workers, cp_parser_oacc_clause_async) (cp_parser_oacc_all_clauses, cp_parser_oacc_cache) (cp_parser_oacc_data, cp_parser_oacc_enter_exit_data) (cp_parser_oacc_kernels, cp_parser_oacc_loop) (cp_parser_oacc_parallel, cp_parser_oacc_update) (cp_parser_oacc_wait): New functions. 
* cp-tree.h (finish_oacc_data, finish_oacc_kernels) (finish_oacc_parallel): New prototypes. * semantics.c: Include "gomp-constants.h". (handle_omp_array_sections): Handle GOMP_MAP_FORCE_DEVICEPTR. Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. Use OMP_CLAUSE_SET_MAP_KIND. (finish_omp_clauses): Handle OMP_CLAUSE_ASYNC, OMP_CLAUSE_VECTOR_LENGTH, OMP_CLAUSE_WAIT, OMP_CLAUSE__CACHE_. Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. (finish_oacc_data, finish_oacc_kernels, finish_oacc_parallel): New functions. gcc/fortran/ * lang.opt (fopenacc): New option. * cpp.c (cpp_define_builtins): Conditionally define _OPENACC. * dump-parse-tree.c (show_omp_node): Split part of it into... (show_omp_clauses): ... this new function. (show_omp_node, show_code_node): Handle EXEC_OACC_PARALLEL_LOOP, EXEC_OACC_PARALLEL, EXEC_OACC_KERNELS_LOOP, EXEC_OACC_KERNELS, EXEC_OACC_DATA, EXEC_OACC_HOST_DATA, EXEC_OACC_LOOP, EXEC_OACC_UPDATE, EXEC_OACC_WAIT, EXEC_OACC_CACHE, EXEC_OACC_ENTER_DATA, EXEC_OACC_EXIT_DATA. (show_namespace): Update for OpenACC. * f95-lang.c (DEF_FUNCTION_TYPE_VAR_2, DEF_FUNCTION_TYPE_VAR_8) (DEF_FUNCTION_TYPE_VAR_12, DEF_GOACC_BUILTIN) (DEF_GOACC_BUILTIN_COMPILER): New macros. * types.def (BT_FN_VOID_INT_INT_VAR) (BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR_INT_INT_VAR) (BT_FN_VOID_INT_OMPFN_PTR_SIZE_PTR_PTR_PTR_INT_INT_INT_INT_INT_VAR): New function types. * gfortran.h (gfc_statement): Add ST_OACC_PARALLEL_LOOP, ST_OACC_END_PARALLEL_LOOP, ST_OACC_PARALLEL, ST_OACC_END_PARALLEL, ST_OACC_KERNELS, ST_OACC_END_KERNELS, ST_OACC_DATA, ST_OACC_END_DATA, ST_OACC_HOST_DATA, ST_OACC_END_HOST_DATA, ST_OACC_LOOP, ST_OACC_END_LOOP, ST_OACC_DECLARE, ST_OACC_UPDATE, ST_OACC_WAIT, ST_OACC_CACHE, ST_OACC_KERNELS_LOOP, ST_OACC_END_KERNELS_LOOP, ST_OACC_ENTER_DATA, ST_OACC_EXIT_DATA, ST_OACC_ROUTINE. (struct gfc_expr_list): New data type. (gfc_get_expr_list): New macro. (gfc_omp_map_op): Add OMP_MAP_FORCE_ALLOC, OMP_MAP_FORCE_DEALLOC, OMP_MAP_FORCE_TO, OMP_MAP_FORCE_FROM, OMP_MAP_FORCE_TOFROM, OMP_MAP_FORCE_PRESENT, OMP_MAP_FORCE_DEVICEPTR. (OMP_LIST_FIRST, OMP_LIST_DEVICE_RESIDENT, OMP_LIST_USE_DEVICE) (OMP_LIST_CACHE): New enumerators. (struct gfc_omp_clauses): Add async_expr, gang_expr, worker_expr, vector_expr, num_gangs_expr, num_workers_expr, vector_length_expr, wait_list, tile_list, async, gang, worker, vector, seq, independent, wait, par_auto, gang_static, and loc members. (struct gfc_namespace): Add oacc_declare_clauses member. (gfc_exec_op): Add EXEC_OACC_KERNELS_LOOP, EXEC_OACC_PARALLEL_LOOP, EXEC_OACC_PARALLEL, EXEC_OACC_KERNELS, EXEC_OACC_DATA, EXEC_OACC_HOST_DATA, EXEC_OACC_LOOP, EXEC_OACC_UPDATE, EXEC_OACC_WAIT, EXEC_OACC_CACHE, EXEC_OACC_ENTER_DATA, EXEC_OACC_EXIT_DATA. (gfc_free_expr_list, gfc_resolve_oacc_directive) (gfc_resolve_oacc_declare, gfc_resolve_oacc_parallel_loop_blocks) (gfc_resolve_oacc_blocks): New prototypes. * match.c (match_exit_cycle): Handle EXEC_OACC_LOOP and EXEC_OACC_PARALLEL_LOOP. * match.h (gfc_match_oacc_cache, gfc_match_oacc_wait) (gfc_match_oacc_update, gfc_match_oacc_declare) (gfc_match_oacc_loop, gfc_match_oacc_host_data) (gfc_match_oacc_data, gfc_match_oacc_kernels) (gfc_match_oacc_kernels_loop, gfc_match_oacc_parallel) (gfc_match_oacc_parallel_loop, gfc_match_oacc_enter_data) (gfc_match_oacc_exit_data, gfc_match_oacc_routine): New prototypes. * openmp.c: Include "diagnostic.h" and "gomp-constants.h". (gfc_free_omp_clauses): Update for members added to struct gfc_omp_clauses. (gfc_match_omp_clauses): Change mask paramter to uint64_t. Add openacc parameter. 
(resolve_omp_clauses): Add openacc parameter. Update for OpenACC. (struct fortran_omp_context): Add is_openmp member. (gfc_resolve_omp_parallel_blocks): Initialize it. (gfc_resolve_do_iterator): Update for OpenACC. (gfc_resolve_omp_directive): Call resolve_omp_directive_inside_oacc_region. (OMP_CLAUSE_PRIVATE, OMP_CLAUSE_FIRSTPRIVATE) (OMP_CLAUSE_LASTPRIVATE, OMP_CLAUSE_COPYPRIVATE) (OMP_CLAUSE_SHARED, OMP_CLAUSE_COPYIN, OMP_CLAUSE_REDUCTION) (OMP_CLAUSE_IF, OMP_CLAUSE_NUM_THREADS, OMP_CLAUSE_SCHEDULE) (OMP_CLAUSE_DEFAULT, OMP_CLAUSE_ORDERED, OMP_CLAUSE_COLLAPSE) (OMP_CLAUSE_UNTIED, OMP_CLAUSE_FINAL, OMP_CLAUSE_MERGEABLE) (OMP_CLAUSE_ALIGNED, OMP_CLAUSE_DEPEND, OMP_CLAUSE_INBRANCH) (OMP_CLAUSE_LINEAR, OMP_CLAUSE_NOTINBRANCH, OMP_CLAUSE_PROC_BIND) (OMP_CLAUSE_SAFELEN, OMP_CLAUSE_SIMDLEN, OMP_CLAUSE_UNIFORM) (OMP_CLAUSE_DEVICE, OMP_CLAUSE_MAP, OMP_CLAUSE_TO) (OMP_CLAUSE_FROM, OMP_CLAUSE_NUM_TEAMS, OMP_CLAUSE_THREAD_LIMIT) (OMP_CLAUSE_DIST_SCHEDULE): Use uint64_t. (OMP_CLAUSE_ASYNC, OMP_CLAUSE_NUM_GANGS, OMP_CLAUSE_NUM_WORKERS) (OMP_CLAUSE_VECTOR_LENGTH, OMP_CLAUSE_COPY, OMP_CLAUSE_COPYOUT) (OMP_CLAUSE_CREATE, OMP_CLAUSE_PRESENT) (OMP_CLAUSE_PRESENT_OR_COPY, OMP_CLAUSE_PRESENT_OR_COPYIN) (OMP_CLAUSE_PRESENT_OR_COPYOUT, OMP_CLAUSE_PRESENT_OR_CREATE) (OMP_CLAUSE_DEVICEPTR, OMP_CLAUSE_GANG, OMP_CLAUSE_WORKER) (OMP_CLAUSE_VECTOR, OMP_CLAUSE_SEQ, OMP_CLAUSE_INDEPENDENT) (OMP_CLAUSE_USE_DEVICE, OMP_CLAUSE_DEVICE_RESIDENT) (OMP_CLAUSE_HOST_SELF, OMP_CLAUSE_OACC_DEVICE, OMP_CLAUSE_WAIT) (OMP_CLAUSE_DELETE, OMP_CLAUSE_AUTO, OMP_CLAUSE_TILE): New macros. (gfc_match_omp_clauses): Handle those. (OACC_PARALLEL_CLAUSES, OACC_KERNELS_CLAUSES, OACC_DATA_CLAUSES) (OACC_LOOP_CLAUSES, OACC_PARALLEL_LOOP_CLAUSES) (OACC_KERNELS_LOOP_CLAUSES, OACC_HOST_DATA_CLAUSES) (OACC_DECLARE_CLAUSES, OACC_UPDATE_CLAUSES) (OACC_ENTER_DATA_CLAUSES, OACC_EXIT_DATA_CLAUSES) (OACC_WAIT_CLAUSES): New macros. (gfc_free_expr_list, match_oacc_expr_list, match_oacc_clause_gang) (gfc_match_omp_map_clause, gfc_match_oacc_parallel_loop) (gfc_match_oacc_parallel, gfc_match_oacc_kernels_loop) (gfc_match_oacc_kernels, gfc_match_oacc_data) (gfc_match_oacc_host_data, gfc_match_oacc_loop) (gfc_match_oacc_declare, gfc_match_oacc_update) (gfc_match_oacc_enter_data, gfc_match_oacc_exit_data) (gfc_match_oacc_wait, gfc_match_oacc_cache) (gfc_match_oacc_routine, oacc_is_loop) (resolve_oacc_scalar_int_expr, resolve_oacc_positive_int_expr) (check_symbol_not_pointer, check_array_not_assumed) (resolve_oacc_data_clauses, resolve_oacc_deviceptr_clause) (oacc_compatible_clauses, oacc_is_parallel, oacc_is_kernels) (omp_code_to_statement, oacc_code_to_statement) (resolve_oacc_directive_inside_omp_region) (resolve_omp_directive_inside_oacc_region) (resolve_oacc_nested_loops, resolve_oacc_params_in_parallel) (resolve_oacc_loop_blocks, gfc_resolve_oacc_blocks) (resolve_oacc_loop, resolve_oacc_cache, gfc_resolve_oacc_declare) (gfc_resolve_oacc_directive): New functions. * parse.c (next_free): Update for OpenACC. Move some code into... (verify_token_free): ... this new function. (next_fixed): Update for OpenACC. Move some code into... (verify_token_fixed): ... this new function. (case_executable): Add ST_OACC_UPDATE, ST_OACC_WAIT, ST_OACC_CACHE, ST_OACC_ENTER_DATA, and ST_OACC_EXIT_DATA. (case_exec_markers): Add ST_OACC_PARALLEL_LOOP, ST_OACC_PARALLEL, ST_OACC_KERNELS, ST_OACC_DATA, ST_OACC_HOST_DATA, ST_OACC_LOOP, ST_OACC_KERNELS_LOOP. (case_decl): Add ST_OACC_ROUTINE. (push_state, parse_critical_block, parse_progunit): Update for OpenACC. 
(gfc_ascii_statement): Handle ST_OACC_PARALLEL_LOOP, ST_OACC_END_PARALLEL_LOOP, ST_OACC_PARALLEL, ST_OACC_END_PARALLEL, ST_OACC_KERNELS, ST_OACC_END_KERNELS, ST_OACC_KERNELS_LOOP, ST_OACC_END_KERNELS_LOOP, ST_OACC_DATA, ST_OACC_END_DATA, ST_OACC_HOST_DATA, ST_OACC_END_HOST_DATA, ST_OACC_LOOP, ST_OACC_END_LOOP, ST_OACC_DECLARE, ST_OACC_UPDATE, ST_OACC_WAIT, ST_OACC_CACHE, ST_OACC_ENTER_DATA, ST_OACC_EXIT_DATA, ST_OACC_ROUTINE. (verify_st_order, parse_spec): Handle ST_OACC_DECLARE. (parse_executable): Handle ST_OACC_PARALLEL_LOOP, ST_OACC_KERNELS_LOOP, ST_OACC_LOOP, ST_OACC_PARALLEL, ST_OACC_KERNELS, ST_OACC_DATA, ST_OACC_HOST_DATA. (decode_oacc_directive, parse_oacc_structured_block) (parse_oacc_loop, is_oacc): New functions. * parse.h (struct gfc_state_data): Add oacc_declare_clauses member. (is_oacc): New prototype. * resolve.c (gfc_resolve_blocks, gfc_resolve_code): Handle EXEC_OACC_PARALLEL_LOOP, EXEC_OACC_PARALLEL, EXEC_OACC_KERNELS_LOOP, EXEC_OACC_KERNELS, EXEC_OACC_DATA, EXEC_OACC_HOST_DATA, EXEC_OACC_LOOP, EXEC_OACC_UPDATE, EXEC_OACC_WAIT, EXEC_OACC_CACHE, EXEC_OACC_ENTER_DATA, EXEC_OACC_EXIT_DATA. (resolve_codes): Call gfc_resolve_oacc_declare. * scanner.c (openacc_flag, openacc_locus): New variables. (skip_free_comments): Update for OpenACC. Move some code into... (skip_omp_attribute): ... this new function. (skip_oacc_attribute): New function. (skip_fixed_comments, gfc_next_char_literal): Update for OpenACC. * st.c (gfc_free_statement): Handle EXEC_OACC_PARALLEL_LOOP, EXEC_OACC_PARALLEL, EXEC_OACC_KERNELS_LOOP, EXEC_OACC_KERNELS, EXEC_OACC_DATA, EXEC_OACC_HOST_DATA, EXEC_OACC_LOOP, EXEC_OACC_UPDATE, EXEC_OACC_WAIT, EXEC_OACC_CACHE, EXEC_OACC_ENTER_DATA, EXEC_OACC_EXIT_DATA. * trans-decl.c (gfc_generate_function_code): Update for OpenACC. * trans-openmp.c: Include "gomp-constants.h". (gfc_omp_finish_clause, gfc_trans_omp_clauses): Use GOMP_MAP_* instead of OMP_CLAUSE_MAP_*. Use OMP_CLAUSE_SET_MAP_KIND. (gfc_trans_omp_clauses): Handle OMP_LIST_USE_DEVICE, OMP_LIST_DEVICE_RESIDENT, OMP_LIST_CACHE, and OMP_MAP_FORCE_ALLOC, OMP_MAP_FORCE_DEALLOC, OMP_MAP_FORCE_TO, OMP_MAP_FORCE_FROM, OMP_MAP_FORCE_TOFROM, OMP_MAP_FORCE_PRESENT, OMP_MAP_FORCE_DEVICEPTR, and gfc_omp_clauses' async, seq, independent, wait_list, num_gangs_expr, num_workers_expr, vector_length_expr, vector, vector_expr, worker, worker_expr, gang, gang_expr members. (gfc_trans_omp_do): Handle EXEC_OACC_LOOP. (gfc_convert_expr_to_tree, gfc_trans_oacc_construct) (gfc_trans_oacc_executable_directive) (gfc_trans_oacc_wait_directive, gfc_trans_oacc_combined_directive) (gfc_trans_oacc_declare, gfc_trans_oacc_directive): New functions. * trans-stmt.c (gfc_trans_block_construct): Update for OpenACC. * trans-stmt.h (gfc_trans_oacc_directive, gfc_trans_oacc_declare): New prototypes. * trans.c (tranc_code): Handle EXEC_OACC_CACHE, EXEC_OACC_WAIT, EXEC_OACC_UPDATE, EXEC_OACC_LOOP, EXEC_OACC_HOST_DATA, EXEC_OACC_DATA, EXEC_OACC_KERNELS, EXEC_OACC_KERNELS_LOOP, EXEC_OACC_PARALLEL, EXEC_OACC_PARALLEL_LOOP, EXEC_OACC_ENTER_DATA, EXEC_OACC_EXIT_DATA. * gfortran.texi: Update for OpenACC. * intrinsic.texi: Likewise. * invoke.texi: Likewise. gcc/lto/ * lto-lang.c (DEF_FUNCTION_TYPE_VAR_8, DEF_FUNCTION_TYPE_VAR_12): New macros. * lto.c: Include "gomp-constants.h". gcc/testsuite/ * lib/target-supports.exp (check_effective_target_fopenacc): New procedure. * g++.dg/goacc-gomp/goacc-gomp.exp: New file. * g++.dg/goacc/goacc.exp: Likewise. * gcc.dg/goacc-gomp/goacc-gomp.exp: Likewise. * gcc.dg/goacc/goacc.exp: Likewise. 
* gfortran.dg/goacc/goacc.exp: Likewise. * c-c++-common/cpp/openacc-define-1.c: New file. * c-c++-common/cpp/openacc-define-2.c: Likewise. * c-c++-common/cpp/openacc-define-3.c: Likewise. * c-c++-common/goacc-gomp/nesting-1.c: Likewise. * c-c++-common/goacc-gomp/nesting-fail-1.c: Likewise. * c-c++-common/goacc/acc_on_device-2-off.c: Likewise. * c-c++-common/goacc/acc_on_device-2.c: Likewise. * c-c++-common/goacc/asyncwait-1.c: Likewise. * c-c++-common/goacc/cache-1.c: Likewise. * c-c++-common/goacc/clauses-fail.c: Likewise. * c-c++-common/goacc/collapse-1.c: Likewise. * c-c++-common/goacc/data-1.c: Likewise. * c-c++-common/goacc/data-2.c: Likewise. * c-c++-common/goacc/data-clause-duplicate-1.c: Likewise. * c-c++-common/goacc/deviceptr-1.c: Likewise. * c-c++-common/goacc/deviceptr-2.c: Likewise. * c-c++-common/goacc/deviceptr-3.c: Likewise. * c-c++-common/goacc/if-clause-1.c: Likewise. * c-c++-common/goacc/if-clause-2.c: Likewise. * c-c++-common/goacc/kernels-1.c: Likewise. * c-c++-common/goacc/loop-1.c: Likewise. * c-c++-common/goacc/loop-private-1.c: Likewise. * c-c++-common/goacc/nesting-1.c: Likewise. * c-c++-common/goacc/nesting-data-1.c: Likewise. * c-c++-common/goacc/nesting-fail-1.c: Likewise. * c-c++-common/goacc/parallel-1.c: Likewise. * c-c++-common/goacc/pcopy.c: Likewise. * c-c++-common/goacc/pcopyin.c: Likewise. * c-c++-common/goacc/pcopyout.c: Likewise. * c-c++-common/goacc/pcreate.c: Likewise. * c-c++-common/goacc/pragma_context.c: Likewise. * c-c++-common/goacc/present-1.c: Likewise. * c-c++-common/goacc/reduction-1.c: Likewise. * c-c++-common/goacc/reduction-2.c: Likewise. * c-c++-common/goacc/reduction-3.c: Likewise. * c-c++-common/goacc/reduction-4.c: Likewise. * c-c++-common/goacc/sb-1.c: Likewise. * c-c++-common/goacc/sb-2.c: Likewise. * c-c++-common/goacc/sb-3.c: Likewise. * c-c++-common/goacc/update-1.c: Likewise. * gcc.dg/goacc/acc_on_device-1.c: Likewise. * gfortran.dg/goacc/acc_on_device-1.f95: Likewise. * gfortran.dg/goacc/acc_on_device-2-off.f95: Likewise. * gfortran.dg/goacc/acc_on_device-2.f95: Likewise. * gfortran.dg/goacc/assumed.f95: Likewise. * gfortran.dg/goacc/asyncwait-1.f95: Likewise. * gfortran.dg/goacc/asyncwait-2.f95: Likewise. * gfortran.dg/goacc/asyncwait-3.f95: Likewise. * gfortran.dg/goacc/asyncwait-4.f95: Likewise. * gfortran.dg/goacc/branch.f95: Likewise. * gfortran.dg/goacc/cache-1.f95: Likewise. * gfortran.dg/goacc/coarray.f95: Likewise. * gfortran.dg/goacc/continuation-free-form.f95: Likewise. * gfortran.dg/goacc/cray.f95: Likewise. * gfortran.dg/goacc/critical.f95: Likewise. * gfortran.dg/goacc/data-clauses.f95: Likewise. * gfortran.dg/goacc/data-tree.f95: Likewise. * gfortran.dg/goacc/declare-1.f95: Likewise. * gfortran.dg/goacc/enter-exit-data.f95: Likewise. * gfortran.dg/goacc/fixed-1.f: Likewise. * gfortran.dg/goacc/fixed-2.f: Likewise. * gfortran.dg/goacc/fixed-3.f: Likewise. * gfortran.dg/goacc/fixed-4.f: Likewise. * gfortran.dg/goacc/host_data-tree.f95: Likewise. * gfortran.dg/goacc/if.f95: Likewise. * gfortran.dg/goacc/kernels-tree.f95: Likewise. * gfortran.dg/goacc/list.f95: Likewise. * gfortran.dg/goacc/literal.f95: Likewise. * gfortran.dg/goacc/loop-1.f95: Likewise. * gfortran.dg/goacc/loop-2.f95: Likewise. * gfortran.dg/goacc/loop-3.f95: Likewise. * gfortran.dg/goacc/loop-tree-1.f90: Likewise. * gfortran.dg/goacc/omp.f95: Likewise. * gfortran.dg/goacc/parallel-kernels-clauses.f95: Likewise. * gfortran.dg/goacc/parallel-kernels-regions.f95: Likewise. * gfortran.dg/goacc/parallel-tree.f95: Likewise. 
* gfortran.dg/goacc/parameter.f95: Likewise. * gfortran.dg/goacc/private-1.f95: Likewise. * gfortran.dg/goacc/private-2.f95: Likewise. * gfortran.dg/goacc/private-3.f95: Likewise. * gfortran.dg/goacc/pure-elemental-procedures.f95: Likewise. * gfortran.dg/goacc/reduction-2.f95: Likewise. * gfortran.dg/goacc/reduction.f95: Likewise. * gfortran.dg/goacc/routine-1.f90: Likewise. * gfortran.dg/goacc/routine-2.f90: Likewise. * gfortran.dg/goacc/sentinel-free-form.f95: Likewise. * gfortran.dg/goacc/several-directives.f95: Likewise. * gfortran.dg/goacc/sie.f95: Likewise. * gfortran.dg/goacc/subarrays.f95: Likewise. * gfortran.dg/gomp/map-1.f90: Likewise. * gfortran.dg/openacc-define-1.f90: Likewise. * gfortran.dg/openacc-define-2.f90: Likewise. * gfortran.dg/openacc-define-3.f90: Likewise. * g++.dg/gomp/block-1.C: Update for changed compiler output. * g++.dg/gomp/block-2.C: Likewise. * g++.dg/gomp/block-3.C: Likewise. * g++.dg/gomp/block-5.C: Likewise. * g++.dg/gomp/target-1.C: Likewise. * g++.dg/gomp/target-2.C: Likewise. * g++.dg/gomp/taskgroup-1.C: Likewise. * g++.dg/gomp/teams-1.C: Likewise. * gcc.dg/cilk-plus/jump-openmp.c: Likewise. * gcc.dg/cilk-plus/jump.c: Likewise. * gcc.dg/gomp/block-1.c: Likewise. * gcc.dg/gomp/block-10.c: Likewise. * gcc.dg/gomp/block-2.c: Likewise. * gcc.dg/gomp/block-3.c: Likewise. * gcc.dg/gomp/block-4.c: Likewise. * gcc.dg/gomp/block-5.c: Likewise. * gcc.dg/gomp/block-6.c: Likewise. * gcc.dg/gomp/block-7.c: Likewise. * gcc.dg/gomp/block-8.c: Likewise. * gcc.dg/gomp/block-9.c: Likewise. * gcc.dg/gomp/target-1.c: Likewise. * gcc.dg/gomp/target-2.c: Likewise. * gcc.dg/gomp/taskgroup-1.c: Likewise. * gcc.dg/gomp/teams-1.c: Likewise. include/ * gomp-constants.h: New file. libgomp/ * Makefile.am (search_path): Add $(top_srcdir)/../include. (libgomp_la_SOURCES): Add splay-tree.c, libgomp-plugin.c, oacc-parallel.c, oacc-host.c, oacc-init.c, oacc-mem.c, oacc-async.c, oacc-plugin.c, oacc-cuda.c. [USE_FORTRAN] (libgomp_la_SOURCES): Add openacc.f90. Include $(top_srcdir)/plugin/Makefrag.am. (nodist_libsubinclude_HEADERS): Add openacc.h. [USE_FORTRAN] (nodist_finclude_HEADERS): Add openacc_lib.h, openacc.f90, openacc.mod, openacc_kinds.mod. (omp_lib.mod): Generalize into... (%.mod): ... this new rule. (openacc_kinds.mod, openacc.mod): New rules. * plugin/configfrag.ac: New file. * configure.ac: Move plugin/offloading support into it. Include it. Instantiate testsuite/libgomp-test-support.pt.exp. * plugin/Makefrag.am: New file. * testsuite/Makefile.am (OFFLOAD_TARGETS) (OFFLOAD_ADDITIONAL_OPTIONS, OFFLOAD_ADDITIONAL_LIB_PATHS): Don't export. (libgomp-test-support.exp): New rule. (all-local): Depend on it. * Makefile.in: Regenerate. * testsuite/Makefile.in: Regenerate. * config.h.in: Likewise. * configure: Likewise. * configure.tgt: Harden shell syntax. * env.c: Include "oacc-int.h". (parse_acc_device_type): New function. (gomp_debug_var, goacc_device_type, goacc_device_num): New variables. (initialize_env): Initialize those. Call goacc_runtime_initialize. * error.c (gomp_vdebug, gomp_debug, gomp_vfatal): New functions. (gomp_fatal): Call gomp_vfatal. * libgomp.h: Include "libgomp-plugin.h" and <stdarg.h>. 
(gomp_debug_var, goacc_device_type, goacc_device_num, gomp_vdebug) (gomp_debug, gomp_verror, gomp_vfatal, gomp_init_targets_once) (splay_tree_node, splay_tree, splay_tree_key) (struct target_mem_desc, struct splay_tree_key_s) (struct gomp_memory_mapping, struct acc_dispatch_t) (struct gomp_device_descr, gomp_acc_insert_pointer) (gomp_acc_remove_pointer, target_mem_desc, gomp_copy_from_async) (gomp_unmap_vars, gomp_init_device, gomp_init_tables) (gomp_free_memmap, gomp_fini_device): New declarations. (gomp_vdebug, gomp_debug): New macros. Include "splay-tree.h". * libgomp.map (OACC_2.0): New symbol version. Use for acc_get_num_devices, acc_get_num_devices_h_, acc_set_device_type, acc_set_device_type_h_, acc_get_device_type, acc_get_device_type_h_, acc_set_device_num, acc_set_device_num_h_, acc_get_device_num, acc_get_device_num_h_, acc_async_test, acc_async_test_h_, acc_async_test_all, acc_async_test_all_h_, acc_wait, acc_wait_h_, acc_wait_async, acc_wait_async_h_, acc_wait_all, acc_wait_all_h_, acc_wait_all_async, acc_wait_all_async_h_, acc_init, acc_init_h_, acc_shutdown, acc_shutdown_h_, acc_on_device, acc_on_device_h_, acc_malloc, acc_free, acc_copyin, acc_copyin_32_h_, acc_copyin_64_h_, acc_copyin_array_h_, acc_present_or_copyin, acc_present_or_copyin_32_h_, acc_present_or_copyin_64_h_, acc_present_or_copyin_array_h_, acc_create, acc_create_32_h_, acc_create_64_h_, acc_create_array_h_, acc_present_or_create, acc_present_or_create_32_h_, acc_present_or_create_64_h_, acc_present_or_create_array_h_, acc_copyout, acc_copyout_32_h_, acc_copyout_64_h_, acc_copyout_array_h_, acc_delete, acc_delete_32_h_, acc_delete_64_h_, acc_delete_array_h_, acc_update_device, acc_update_device_32_h_, acc_update_device_64_h_, acc_update_device_array_h_, acc_update_self, acc_update_self_32_h_, acc_update_self_64_h_, acc_update_self_array_h_, acc_map_data, acc_unmap_data, acc_deviceptr, acc_hostptr, acc_is_present, acc_is_present_32_h_, acc_is_present_64_h_, acc_is_present_array_h_, acc_memcpy_to_device, acc_memcpy_from_device, acc_get_current_cuda_device, acc_get_current_cuda_context, acc_get_cuda_stream, acc_set_cuda_stream. (GOACC_2.0): New symbol version. Use for GOACC_data_end, GOACC_data_start, GOACC_enter_exit_data, GOACC_parallel, GOACC_update, GOACC_wait, GOACC_get_thread_num, GOACC_get_num_threads. (GOMP_PLUGIN_1.0): New symbol version. Use for GOMP_PLUGIN_malloc, GOMP_PLUGIN_malloc_cleared, GOMP_PLUGIN_realloc, GOMP_PLUGIN_debug, GOMP_PLUGIN_error, GOMP_PLUGIN_fatal, GOMP_PLUGIN_async_unmap_vars, GOMP_PLUGIN_acc_thread. * libgomp.texi: Update for OpenACC changes, and GOMP_DEBUG environment variable. * libgomp_g.h (GOACC_data_start, GOACC_data_end) (GOACC_enter_exit_data, GOACC_parallel, GOACC_update, GOACC_wait) (GOACC_get_num_threads, GOACC_get_thread_num): New declarations. * splay-tree.h (splay_tree_lookup, splay_tree_insert) (splay_tree_remove): New declarations. (rotate_left, rotate_right, splay_tree_splay, splay_tree_insert) (splay_tree_remove, splay_tree_lookup): Move into... * splay-tree.c: ... this new file. * target.c: Include "oacc-plugin.h", "oacc-int.h", <assert.h>. (splay_tree_node, splay_tree, splay_tree_key) (struct target_mem_desc, struct splay_tree_key_s) (struct gomp_device_descr): Don't declare. (num_devices_openmp): New variable. (gomp_get_num_devices ): Use it. (gomp_init_targets_once): New function. (gomp_get_num_devices ): Use it. (get_kind, gomp_copy_from_async, gomp_free_memmap) (gomp_fini_device, gomp_register_image_for_device): New functions. 
(gomp_map_vars): Add devaddrs parameter. (gomp_update): Add mm parameter. (gomp_init_device): Move most of it into... (gomp_init_tables): ... this new function. (gomp_register_images_for_device): Remove function. (splay_compare, gomp_map_vars, gomp_unmap_vars, gomp_init_device): Make them hidden instead of static. (gomp_map_vars_existing, gomp_map_vars, gomp_unmap_vars) (gomp_update, gomp_init_device, GOMP_target, GOMP_target_data) (GOMP_target_end_data, GOMP_target_update) (gomp_load_plugin_for_device, gomp_target_init): Update for OpenACC changes. * oacc-async.c: New file. * oacc-cuda.c: Likewise. * oacc-host.c: Likewise. * oacc-init.c: Likewise. * oacc-int.h: Likewise. * oacc-mem.c: Likewise. * oacc-parallel.c: Likewise. * oacc-plugin.c: Likewise. * oacc-plugin.h: Likewise. * oacc-ptx.h: Likewise. * openacc.f90: Likewise. * openacc.h: Likewise. * openacc_lib.h: Likewise. * plugin/plugin-host.c: Likewise. * plugin/plugin-nvptx.c: Likewise. * libgomp-plugin.c: Likewise. * libgomp-plugin.h: Likewise. * libgomp_target.h: Remove file after merging content into the former file. Update all users. * testsuite/lib/libgomp.exp: Load libgomp-test-support.exp. (offload_targets_s, offload_targets_s_openacc): New variables. (check_effective_target_openacc_nvidia_accel_present) (check_effective_target_openacc_nvidia_accel_selected): New procedures. (libgomp_init): Update for OpenACC changes. * testsuite/libgomp-test-support.exp.in: New file. * testsuite/libgomp.oacc-c++/c++.exp: Likewise. * testsuite/libgomp.oacc-c/c.exp: Likewise. * testsuite/libgomp.oacc-fortran/fortran.exp: Likewise. * testsuite/libgomp.oacc-c-c++-common/abort-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/abort-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/abort-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/abort-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/asyncwait-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/cache-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/clauses-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/clauses-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/collapse-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/collapse-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/collapse-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/collapse-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/context-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/context-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/context-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/context-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-already-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-already-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-already-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-already-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-already-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-already-6.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-already-7.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-already-8.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/deviceptr-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/if-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/kernels-1.c: Likewise. 
* testsuite/libgomp.oacc-c-c++-common/kernels-empty.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-10.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-11.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-12.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-13.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-14.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-15.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-16.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-17.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-18.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-19.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-20.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-21.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-22.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-23.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-24.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-25.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-26.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-27.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-28.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-29.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-30.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-31.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-32.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-33.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-34.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-35.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-36.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-37.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-38.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-39.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-40.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-41.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-42.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-43.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-44.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-45.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-46.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-47.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-48.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-49.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-50.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-51.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-52.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-53.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-54.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-55.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-56.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-57.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-58.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-59.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-6.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-60.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-61.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-62.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-63.c: Likewise. 
* testsuite/libgomp.oacc-c-c++-common/lib-64.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-65.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-66.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-67.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-68.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-69.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-7.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-70.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-71.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-72.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-73.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-74.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-75.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-76.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-77.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-78.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-80.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-82.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-83.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-84.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-85.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-86.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-87.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-88.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-89.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-9.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-90.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-91.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-92.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/nested-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/nested-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/offset-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-empty.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/pointer-align-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/present-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/present-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-5.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-initial-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/subr.h: Likewise. * testsuite/libgomp.oacc-c-c++-common/subr.ptx: Likewise. * testsuite/libgomp.oacc-c-c++-common/timer.h: Likewise. * testsuite/libgomp.oacc-c-c++-common/update-1-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/update-1.c: Likewise. * testsuite/libgomp.oacc-fortran/abort-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/abort-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f: Likewise. * testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f: Likewise. * testsuite/libgomp.oacc-fortran/asyncwait-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/asyncwait-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/asyncwait-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/collapse-1.f90: Likewise. 
* testsuite/libgomp.oacc-fortran/collapse-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/collapse-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/collapse-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/collapse-5.f90: Likewise. * testsuite/libgomp.oacc-fortran/collapse-6.f90: Likewise. * testsuite/libgomp.oacc-fortran/collapse-7.f90: Likewise. * testsuite/libgomp.oacc-fortran/collapse-8.f90: Likewise. * testsuite/libgomp.oacc-fortran/data-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/data-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/data-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/data-4-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/data-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/data-already-1.f: Likewise. * testsuite/libgomp.oacc-fortran/data-already-2.f: Likewise. * testsuite/libgomp.oacc-fortran/data-already-3.f: Likewise. * testsuite/libgomp.oacc-fortran/data-already-4.f: Likewise. * testsuite/libgomp.oacc-fortran/data-already-5.f: Likewise. * testsuite/libgomp.oacc-fortran/data-already-6.f: Likewise. * testsuite/libgomp.oacc-fortran/data-already-7.f: Likewise. * testsuite/libgomp.oacc-fortran/data-already-8.f: Likewise. * testsuite/libgomp.oacc-fortran/lib-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-10.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-2.f: Likewise. * testsuite/libgomp.oacc-fortran/lib-3.f: Likewise. * testsuite/libgomp.oacc-fortran/lib-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-5.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-6.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-7.f90: Likewise. * testsuite/libgomp.oacc-fortran/lib-8.f90: Likewise. * testsuite/libgomp.oacc-fortran/map-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/openacc_version-1.f: Likewise. * testsuite/libgomp.oacc-fortran/openacc_version-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/pointer-align-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/pset-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/reduction-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/reduction-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/reduction-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/reduction-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/reduction-5.f90: Likewise. * testsuite/libgomp.oacc-fortran/reduction-6.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-3.f90: Likewise. * testsuite/libgomp.oacc-fortran/routine-4.f90: Likewise. * testsuite/libgomp.oacc-fortran/subarrays-1.f90: Likewise. * testsuite/libgomp.oacc-fortran/subarrays-2.f90: Likewise. liboffloadmic/ * plugin/libgomp-plugin-intelmic.cpp (GOMP_OFFLOAD_get_name) (GOMP_OFFLOAD_get_caps, GOMP_OFFLOAD_fini_device): New functions. Co-Authored-By: Bernd Schmidt <bernds@codesourcery.com> Co-Authored-By: Cesar Philippidis <cesar@codesourcery.com> Co-Authored-By: Dmitry Bocharnikov <dmitry.b@samsung.com> Co-Authored-By: Evgeny Gavrin <e.gavrin@samsung.com> Co-Authored-By: Ilmir Usmanov <i.usmanov@samsung.com> Co-Authored-By: Jakub Jelinek <jakub@redhat.com> Co-Authored-By: James Norris <jnorris@codesourcery.com> Co-Authored-By: Julian Brown <julian@codesourcery.com> Co-Authored-By: Nathan Sidwell <nathan@codesourcery.com> Co-Authored-By: Tobias Burnus <burnus@net-b.de> Co-Authored-By: Tom de Vries <tom@codesourcery.com> From-SVN: r219682
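As a minimal illustration of the front-end and runtime support merged here, the following C fragment uses only directives and clauses named in the ChangeLog above (data, parallel loop, copyin, copy). The syntax follows the OpenACC 2.0 specification rather than anything specific to this commit, and such code would be compiled with the new -fopenacc option; per the new builtins listed above, the parallel region is lowered to a call into libgomp's GOACC_parallel entry point.

/* saxpy.c -- illustrative only; standard OpenACC 2.0 directives,
   compiled with the new -fopenacc option.  */
void
saxpy (int n, float a, float *restrict x, float *restrict y)
{
  /* Map the arrays onto the device for the whole region.  */
#pragma acc data copyin(x[0:n]) copy(y[0:n])
  {
    /* Offloaded region and loop.  */
#pragma acc parallel loop
    for (int i = 0; i < n; i++)
      y[i] = a * x[i] + y[i];
  }
}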
Diffstat (limited to 'libgomp/plugin')
-rw-r--r--  libgomp/plugin/Makefrag.am     |   49
-rw-r--r--  libgomp/plugin/configfrag.ac   |  148
-rw-r--r--  libgomp/plugin/plugin-host.c   |  266
-rw-r--r--  libgomp/plugin/plugin-nvptx.c  | 1791
4 files changed, 2254 insertions, 0 deletions
diff --git a/libgomp/plugin/Makefrag.am b/libgomp/plugin/Makefrag.am
new file mode 100644
index 0000000..167485f
--- /dev/null
+++ b/libgomp/plugin/Makefrag.am
@@ -0,0 +1,49 @@
+# Plugins for offload execution, Makefile.am fragment.
+#
+# Copyright (C) 2014-2015 Free Software Foundation, Inc.
+#
+# Contributed by Mentor Embedded.
+#
+# This file is part of the GNU Offloading and Multi Processing Library
+# (libgomp).
+#
+# Libgomp is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+#
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+# <http://www.gnu.org/licenses/>.
+
+if PLUGIN_NVPTX
+# Nvidia PTX OpenACC plugin.
+libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-nvptx.la
+libgomp_plugin_nvptx_la_SOURCES = plugin/plugin-nvptx.c
+libgomp_plugin_nvptx_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_NVPTX_CPPFLAGS)
+libgomp_plugin_nvptx_la_LDFLAGS = $(libgomp_plugin_nvptx_version_info) \
+ $(lt_host_flags)
+libgomp_plugin_nvptx_la_LDFLAGS += $(PLUGIN_NVPTX_LDFLAGS)
+libgomp_plugin_nvptx_la_LIBADD = libgomp.la $(PLUGIN_NVPTX_LIBS)
+libgomp_plugin_nvptx_la_LIBTOOLFLAGS = --tag=disable-static
+endif
+
+libgomp_plugin_host_nonshm_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-host_nonshm.la
+libgomp_plugin_host_nonshm_la_SOURCES = plugin/plugin-host.c
+libgomp_plugin_host_nonshm_la_CPPFLAGS = $(AM_CPPFLAGS) -DHOST_NONSHM_PLUGIN
+libgomp_plugin_host_nonshm_la_LDFLAGS = \
+ $(libgomp_plugin_host_nonshm_version_info) $(lt_host_flags)
+libgomp_plugin_host_nonshm_la_LIBADD = libgomp.la
+libgomp_plugin_host_nonshm_la_LIBTOOLFLAGS = --tag=disable-static
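The two plugins built by this fragment sit behind the OpenACC runtime API that the libgomp changes above export. A hedged usage sketch, assuming the standard <openacc.h> interface installed by this merge (acc_get_num_devices, acc_on_device and the usual acc_device_* enumerators):

/* Illustrative only: query the OpenACC runtime exported under the new
   OACC_2.0 symbol version.  */
#include <stdio.h>
#include <openacc.h>

int
main (void)
{
  /* Number of devices of a given type known to the runtime.  */
  printf ("nvidia devices: %d\n", acc_get_num_devices (acc_device_nvidia));

  /* In host code, acc_on_device (acc_device_host) is true.  */
  if (acc_on_device (acc_device_host))
    printf ("running on the host device\n");

  return 0;
}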
diff --git a/libgomp/plugin/configfrag.ac b/libgomp/plugin/configfrag.ac
new file mode 100644
index 0000000..254c688
--- /dev/null
+++ b/libgomp/plugin/configfrag.ac
@@ -0,0 +1,148 @@
+# Plugins for offload execution, configure.ac fragment. -*- mode: autoconf -*-
+#
+# Copyright (C) 2014-2015 Free Software Foundation, Inc.
+#
+# Contributed by Mentor Embedded.
+#
+# This file is part of the GNU Offloading and Multi Processing Library
+# (libgomp).
+#
+# Libgomp is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+#
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+# <http://www.gnu.org/licenses/>.
+
+offload_targets=
+AC_SUBST(offload_targets)
+plugin_support=yes
+AC_CHECK_LIB(dl, dlsym, , [plugin_support=no])
+if test x"$plugin_support" = xyes; then
+ AC_DEFINE(PLUGIN_SUPPORT, 1,
+ [Define if all infrastructure, needed for plugins, is supported.])
+ offload_targets=host_nonshm
+elif test "x${enable_offload_targets-no}" != xno; then
+ AC_MSG_ERROR([Can't support offloading without support for plugins])
+fi
+
+# Look for the CUDA driver package.
+CUDA_DRIVER_INCLUDE=
+CUDA_DRIVER_LIB=
+AC_SUBST(CUDA_DRIVER_INCLUDE)
+AC_SUBST(CUDA_DRIVER_LIB)
+CUDA_DRIVER_CPPFLAGS=
+CUDA_DRIVER_LDFLAGS=
+AC_ARG_WITH(cuda-driver,
+ [AS_HELP_STRING([--with-cuda-driver=PATH],
+ [specify prefix directory for installed CUDA driver package.
+ Equivalent to --with-cuda-driver-include=PATH/include
+ plus --with-cuda-driver-lib=PATH/lib])])
+AC_ARG_WITH(cuda-driver-include,
+ [AS_HELP_STRING([--with-cuda-driver-include=PATH],
+ [specify directory for installed CUDA driver include files])])
+AC_ARG_WITH(cuda-driver-lib,
+ [AS_HELP_STRING([--with-cuda-driver-lib=PATH],
+ [specify directory for the installed CUDA driver library])])
+if test "x$with_cuda_driver" != x; then
+ CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
+ CUDA_DRIVER_LIB=$with_cuda_driver/lib
+fi
+if test "x$with_cuda_driver_include" != x; then
+ CUDA_DRIVER_INCLUDE=$with_cuda_driver_include
+fi
+if test "x$with_cuda_driver_lib" != x; then
+ CUDA_DRIVER_LIB=$with_cuda_driver_lib
+fi
+if test "x$CUDA_DRIVER_INCLUDE" != x; then
+ CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
+fi
+if test "x$CUDA_DRIVER_LIB" != x; then
+ CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
+fi
+
+PLUGIN_NVPTX=0
+PLUGIN_NVPTX_CPPFLAGS=
+PLUGIN_NVPTX_LDFLAGS=
+PLUGIN_NVPTX_LIBS=
+AC_SUBST(PLUGIN_NVPTX)
+AC_SUBST(PLUGIN_NVPTX_CPPFLAGS)
+AC_SUBST(PLUGIN_NVPTX_LDFLAGS)
+AC_SUBST(PLUGIN_NVPTX_LIBS)
+
+# Get offload targets and path to install tree of offloading compiler.
+offload_additional_options=
+offload_additional_lib_paths=
+AC_SUBST(offload_additional_options)
+AC_SUBST(offload_additional_lib_paths)
+if test x"$enable_offload_targets" != x; then
+ for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do
+ tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'`
+ tgt=`echo $tgt | sed 's/=.*//'`
+ case $tgt in
+ *-intelmic-* | *-intelmicemul-*)
+ tgt_name=intelmic
+ ;;
+ nvptx*)
+ tgt_name=nvptx
+ PLUGIN_NVPTX=$tgt
+ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
+ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
+ PLUGIN_NVPTX_LIBS='-lcuda'
+
+ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
+ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
+ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
+ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
+ PLUGIN_NVPTX_save_LIBS=$LIBS
+ LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM(
+ [#include "cuda.h"],
+ [CUresult r = cuCtxPushCurrent (NULL);])],
+ [PLUGIN_NVPTX=1])
+ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
+ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
+ LIBS=$PLUGIN_NVPTX_save_LIBS
+ case $PLUGIN_NVPTX in
+ nvptx*)
+ PLUGIN_NVPTX=0
+ AC_MSG_ERROR([CUDA driver package required for nvptx support])
+ ;;
+ esac
+ ;;
+ *)
+ AC_MSG_ERROR([unknown offload target specified])
+ ;;
+ esac
+ if test x"$offload_targets" = x; then
+ offload_targets=$tgt_name
+ else
+ offload_targets=$offload_targets,$tgt_name
+ fi
+ if test x"$tgt_dir" != x; then
+ offload_additional_options="$offload_additional_options -B$tgt_dir/libexec/gcc/\$(target_alias)/\$(gcc_version) -B$tgt_dir/bin"
+ offload_additional_lib_paths="$offload_additional_lib_paths:$tgt_dir/lib64:$tgt_dir/lib:$tgt_dir/lib32"
+ else
+ offload_additional_options="$offload_additional_options -B\$(libexecdir)/gcc/\$(target_alias)/\$(gcc_version) -B\$(bindir)"
+ offload_additional_lib_paths="$offload_additional_lib_paths:$toolexeclibdir"
+ fi
+ done
+fi
+AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets",
+ [Define to hold the list of target names suitable for offloading.])
+AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
+AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
+ [Define to 1 if the NVIDIA plugin is built, 0 if not.])
diff --git a/libgomp/plugin/plugin-host.c b/libgomp/plugin/plugin-host.c
new file mode 100644
index 0000000..ebf7f11
--- /dev/null
+++ b/libgomp/plugin/plugin-host.c
@@ -0,0 +1,266 @@
+/* OpenACC Runtime Library: acc_device_host, acc_device_host_nonshm.
+
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Simple implementation of support routines for a shared-memory
+ acc_device_host, and a non-shared memory acc_device_host_nonshm, with the
+ latter built as a plugin. */
+
+#include "openacc.h"
+#include "config.h"
+#ifdef HOST_NONSHM_PLUGIN
+#include "libgomp-plugin.h"
+#include "oacc-plugin.h"
+#else
+#include "libgomp.h"
+#include "oacc-int.h"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HOST_NONSHM_PLUGIN
+#define STATIC
+#define GOMP(X) GOMP_PLUGIN_##X
+#define SELF "host_nonshm plugin: "
+#else
+#define STATIC static
+#define GOMP(X) gomp_##X
+#define SELF "host: "
+#endif
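+
+/* For instance, GOMP (fatal) below expands to GOMP_PLUGIN_fatal when this
+ file is built as the host_nonshm plugin, and to gomp_fatal when it is
+ built directly into libgomp. */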
+
+STATIC const char *
+GOMP_OFFLOAD_get_name (void)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ return "host_nonshm";
+#else
+ return "host";
+#endif
+}
+
+STATIC unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+ unsigned int caps = (GOMP_OFFLOAD_CAP_OPENACC_200
+ | GOMP_OFFLOAD_CAP_NATIVE_EXEC);
+
+#ifndef HOST_NONSHM_PLUGIN
+ caps |= GOMP_OFFLOAD_CAP_SHARED_MEM;
+#endif
+
+ return caps;
+}
+
+STATIC int
+GOMP_OFFLOAD_get_type (void)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ return OFFLOAD_TARGET_TYPE_HOST_NONSHM;
+#else
+ return OFFLOAD_TARGET_TYPE_HOST;
+#endif
+}
+
+STATIC int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+ return 1;
+}
+
+STATIC void
+GOMP_OFFLOAD_register_image (void *host_table __attribute__ ((unused)),
+ void *target_data __attribute__ ((unused)))
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_init_device (int n __attribute__ ((unused)))
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused)))
+{
+}
+
+STATIC int
+GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
+ struct mapping_table **table __attribute__ ((unused)))
+{
+ return 0;
+}
+
+STATIC void *
+GOMP_OFFLOAD_openacc_open_device (int n)
+{
+ return (void *) (intptr_t) n;
+}
+
+STATIC int
+GOMP_OFFLOAD_openacc_close_device (void *hnd)
+{
+ return 0;
+}
+
+STATIC int
+GOMP_OFFLOAD_openacc_get_device_num (void)
+{
+ return 0;
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_set_device_num (int n)
+{
+ if (n > 0)
+ GOMP (fatal) ("device number %u out of range for host execution", n);
+}
+
+STATIC void *
+GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t s)
+{
+ return GOMP (malloc) (s);
+}
+
+STATIC void
+GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *p)
+{
+ free (p);
+}
+
+STATIC void *
+GOMP_OFFLOAD_host2dev (int n __attribute__ ((unused)), void *d, const void *h,
+ size_t s)
+{
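+ /* In the non-shared-memory variant the "device" buffer is a separate
+ host-side allocation, so the bytes must really be copied; in the
+ shared-memory host case (GOMP_OFFLOAD_CAP_SHARED_MEM above) host and
+ device addresses coincide and no copy is needed. The same applies to
+ GOMP_OFFLOAD_dev2host below. */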
+#ifdef HOST_NONSHM_PLUGIN
+ memcpy (d, h, s);
+#endif
+
+ return 0;
+}
+
+STATIC void *
+GOMP_OFFLOAD_dev2host (int n __attribute__ ((unused)), void *h, const void *d,
+ size_t s)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ memcpy (h, d, s);
+#endif
+
+ return 0;
+}
+
+STATIC void
+GOMP_OFFLOAD_run (int n __attribute__ ((unused)), void *fn_ptr, void *vars)
+{
+ void (*fn)(void *) = (void (*)(void *)) fn_ptr;
+
+ fn (vars);
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *),
+ size_t mapnum __attribute__ ((unused)),
+ void **hostaddrs __attribute__ ((unused)),
+ void **devaddrs __attribute__ ((unused)),
+ size_t *sizes __attribute__ ((unused)),
+ unsigned short *kinds __attribute__ ((unused)),
+ int num_gangs __attribute__ ((unused)),
+ int num_workers __attribute__ ((unused)),
+ int vector_length __attribute__ ((unused)),
+ int async __attribute__ ((unused)),
+ void *targ_mem_desc __attribute__ ((unused)))
+{
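+ /* Run the offloaded region directly on the host: with separate "device"
+ memory the function must see the device copies in DEVADDRS, whereas in
+ the shared-memory case the original host addresses are used. */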
+#ifdef HOST_NONSHM_PLUGIN
+ fn (devaddrs);
+#else
+ fn (hostaddrs);
+#endif
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+{
+#ifdef HOST_NONSHM_PLUGIN
+ /* "Asynchronous" launches are executed synchronously on the (non-SHM) host,
+ so there's no point in delaying host-side cleanup -- just do it now. */
+ GOMP_PLUGIN_async_unmap_vars (targ_mem_desc);
+#endif
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_set_async (int async __attribute__ ((unused)))
+{
+}
+
+STATIC int
+GOMP_OFFLOAD_openacc_async_test (int async __attribute__ ((unused)))
+{
+ return 1;
+}
+
+STATIC int
+GOMP_OFFLOAD_openacc_async_test_all (void)
+{
+ return 1;
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait (int async __attribute__ ((unused)))
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_all (void)
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_async (int async1 __attribute__ ((unused)),
+ int async2 __attribute__ ((unused)))
+{
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__ ((unused)))
+{
+}
+
+STATIC void *
+GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data
+ __attribute__ ((unused)))
+{
+ return NULL;
+}
+
+STATIC void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *tls_data
+ __attribute__ ((unused)))
+{
+}
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
new file mode 100644
index 0000000..483cb75
--- /dev/null
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -0,0 +1,1791 @@
+/* Plugin for NVPTX execution.
+
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
+
+ Contributed by Mentor Embedded.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Nvidia PTX-specific parts of OpenACC support. The CUDA driver
+ library appears to hold some implicit state, but the documentation
+ is not clear as to what that state might be, or how one might
+ propagate it from one thread to another. */
+
+#include "openacc.h"
+#include "config.h"
+#include "libgomp-plugin.h"
+#include "oacc-ptx.h"
+#include "oacc-plugin.h"
+
+#include <pthread.h>
+#include <cuda.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <unistd.h>
+#include <assert.h>
+
+#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
+
+static struct
+{
+ CUresult r;
+ char *m;
+} cuda_errlist[] =
+{
+ { CUDA_ERROR_INVALID_VALUE, "invalid value" },
+ { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
+ { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
+ { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
+ { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
+ { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
+ { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
+ { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
+ { CUDA_ERROR_NO_DEVICE, "no device" },
+ { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
+ { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
+ { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
+ { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
+ { CUDA_ERROR_MAP_FAILED, "map error" },
+ { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
+ { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
+ { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
+ { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
+ { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
+ { CUDA_ERROR_NOT_MAPPED, "not mapped" },
+ { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
+ { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
+ { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
+ { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
+ { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
+ { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
+ { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
+ { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
+ { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ "shared object symbol not found" },
+ { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
+ { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
+ { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
+ { CUDA_ERROR_NOT_FOUND, "not found" },
+ { CUDA_ERROR_NOT_READY, "not ready" },
+ { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
+ { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
+ { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
+ { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ "launch incompatibe texturing" },
+ { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
+ { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " },
+ { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" },
+ { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
+ { CUDA_ERROR_ASSERT, "assert" },
+ { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
+ { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ "host memory already registered" },
+ { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
+ { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
+ { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
+ { CUDA_ERROR_UNKNOWN, "unknown" }
+};
+
+static char errmsg[128];
+
+static char *
+cuda_error (CUresult r)
+{
+ int i;
+
+ for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
+ {
+ if (cuda_errlist[i].r == r)
+ return &cuda_errlist[i].m[0];
+ }
+
+ sprintf (&errmsg[0], "unknown result code: %5d", r);
+
+ return &errmsg[0];
+}
+
+struct targ_fn_descriptor
+{
+ CUfunction fn;
+ const char *name;
+};
+
+static bool ptx_inited = false;
+
+struct ptx_stream
+{
+ CUstream stream;
+ pthread_t host_thread;
+ bool multithreaded;
+
+ CUdeviceptr d;
+ void *h;
+ void *h_begin;
+ void *h_end;
+ void *h_next;
+ void *h_prev;
+ void *h_tail;
+
+ struct ptx_stream *next;
+};
+
+/* Thread-specific data for PTX. */
+
+struct nvptx_thread
+{
+ struct ptx_stream *current_stream;
+ struct ptx_device *ptx_dev;
+};
+
+struct map
+{
+ int async;
+ size_t size;
+ char mappings[0];
+};
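+
+/* The page set up by map_init is used as a simple ring buffer of struct map
+ chunks: map_push carves a new chunk out at h_next (wrapping back to
+ h_begin when the end of the page is reached), and map_pop retires the
+ oldest chunk at h_tail once the corresponding kernel launch has
+ completed. */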
+
+static void
+map_init (struct ptx_stream *s)
+{
+ CUresult r;
+
+ int size = getpagesize ();
+
+ assert (s);
+ assert (!s->d);
+ assert (!s->h);
+
+ r = cuMemAllocHost (&s->h, size);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
+
+ r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
+
+ assert (s->h);
+
+ s->h_begin = s->h;
+ s->h_end = s->h_begin + size;
+ s->h_next = s->h_prev = s->h_tail = s->h_begin;
+
+ assert (s->h_next);
+ assert (s->h_end);
+}
+
+static void
+map_fini (struct ptx_stream *s)
+{
+ CUresult r;
+
+ r = cuMemFreeHost (s->h);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
+}
+
+static void
+map_pop (struct ptx_stream *s)
+{
+ struct map *m;
+
+ assert (s != NULL);
+ assert (s->h_next);
+ assert (s->h_prev);
+ assert (s->h_tail);
+
+ m = s->h_tail;
+
+ s->h_tail += m->size;
+
+ if (s->h_tail >= s->h_end)
+ s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+
+ if (s->h_next == s->h_tail)
+ s->h_prev = s->h_next;
+
+ assert (s->h_next >= s->h_begin);
+ assert (s->h_tail >= s->h_begin);
+ assert (s->h_prev >= s->h_begin);
+
+ assert (s->h_next <= s->h_end);
+ assert (s->h_tail <= s->h_end);
+ assert (s->h_prev <= s->h_end);
+}
+
+static void
+map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
+{
+ int left;
+ int offset;
+ struct map *m;
+
+ assert (s != NULL);
+
+ left = s->h_end - s->h_next;
+ size += sizeof (struct map);
+
+ assert (s->h_prev);
+ assert (s->h_next);
+
+ if (size >= left)
+ {
+ m = s->h_prev;
+ m->size += left;
+ s->h_next = s->h_begin;
+
+ if (s->h_next + size > s->h_end)
+ GOMP_PLUGIN_fatal ("unable to push map");
+ }
+
+ assert (s->h_next);
+
+ m = s->h_next;
+ m->async = async;
+ m->size = size;
+
+ offset = (void *)&m->mappings[0] - s->h;
+
+ *d = (void *)(s->d + offset);
+ *h = (void *)(s->h + offset);
+
+ s->h_prev = s->h_next;
+ s->h_next += size;
+
+ assert (s->h_prev);
+ assert (s->h_next);
+
+ assert (s->h_next >= s->h_begin);
+ assert (s->h_tail >= s->h_begin);
+ assert (s->h_prev >= s->h_begin);
+ assert (s->h_next <= s->h_end);
+ assert (s->h_tail <= s->h_end);
+ assert (s->h_prev <= s->h_end);
+
+ return;
+}
+
+struct ptx_device
+{
+ CUcontext ctx;
+ bool ctx_shared;
+ CUdevice dev;
+ struct ptx_stream *null_stream;
+ /* All non-null streams associated with this device (actually context),
+ either created implicitly or passed in from the user (via
+ acc_set_cuda_stream). */
+ struct ptx_stream *active_streams;
+ struct {
+ struct ptx_stream **arr;
+ int size;
+ } async_streams;
+ /* A lock for use when manipulating the above stream list and array. */
+ pthread_mutex_t stream_lock;
+ int ord;
+ bool overlap;
+ bool map;
+ bool concur;
+ int mode;
+ bool mkern;
+
+ struct ptx_device *next;
+};
+
+enum ptx_event_type
+{
+ PTX_EVT_MEM,
+ PTX_EVT_KNL,
+ PTX_EVT_SYNC,
+ PTX_EVT_ASYNC_CLEANUP
+};
+
+struct ptx_event
+{
+ CUevent *evt;
+ int type;
+ void *addr;
+ int ord;
+
+ struct ptx_event *next;
+};
+
+static pthread_mutex_t ptx_event_lock;
+static struct ptx_event *ptx_events;
+
+#define _XSTR(s) _STR(s)
+#define _STR(s) #s
+
+static struct _synames
+{
+ char *n;
+} cuda_symnames[] =
+{
+ { _XSTR (cuCtxCreate) },
+ { _XSTR (cuCtxDestroy) },
+ { _XSTR (cuCtxGetCurrent) },
+ { _XSTR (cuCtxPushCurrent) },
+ { _XSTR (cuCtxSynchronize) },
+ { _XSTR (cuDeviceGet) },
+ { _XSTR (cuDeviceGetAttribute) },
+ { _XSTR (cuDeviceGetCount) },
+ { _XSTR (cuEventCreate) },
+ { _XSTR (cuEventDestroy) },
+ { _XSTR (cuEventQuery) },
+ { _XSTR (cuEventRecord) },
+ { _XSTR (cuInit) },
+ { _XSTR (cuLaunchKernel) },
+ { _XSTR (cuLinkAddData) },
+ { _XSTR (cuLinkComplete) },
+ { _XSTR (cuLinkCreate) },
+ { _XSTR (cuMemAlloc) },
+ { _XSTR (cuMemAllocHost) },
+ { _XSTR (cuMemcpy) },
+ { _XSTR (cuMemcpyDtoH) },
+ { _XSTR (cuMemcpyDtoHAsync) },
+ { _XSTR (cuMemcpyHtoD) },
+ { _XSTR (cuMemcpyHtoDAsync) },
+ { _XSTR (cuMemFree) },
+ { _XSTR (cuMemFreeHost) },
+ { _XSTR (cuMemGetAddressRange) },
+ { _XSTR (cuMemHostGetDevicePointer) },
+ { _XSTR (cuMemHostRegister) },
+ { _XSTR (cuMemHostUnregister) },
+ { _XSTR (cuModuleGetFunction) },
+ { _XSTR (cuModuleLoadData) },
+ { _XSTR (cuStreamDestroy) },
+ { _XSTR (cuStreamQuery) },
+ { _XSTR (cuStreamSynchronize) },
+ { _XSTR (cuStreamWaitEvent) }
+};
+
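+/* Check that the CUDA driver library can be dlopened and that it provides
+ all of the entry points listed above; return 0 on success, -1 otherwise. */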
+static int
+verify_device_library (void)
+{
+ int i;
+ void *dh, *ds;
+
+ dh = dlopen ("libcuda.so", RTLD_LAZY);
+ if (!dh)
+ return -1;
+
+ for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
+ {
+ ds = dlsym (dh, cuda_symnames[i].n);
+ if (!ds)
+ return -1;
+ }
+
+ dlclose (dh);
+
+ return 0;
+}
+
+static inline struct nvptx_thread *
+nvptx_thread (void)
+{
+ return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
+static void
+init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
+{
+ int i;
+ struct ptx_stream *null_stream
+ = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
+
+ null_stream->stream = NULL;
+ null_stream->host_thread = pthread_self ();
+ null_stream->multithreaded = true;
+ null_stream->d = (CUdeviceptr) NULL;
+ null_stream->h = NULL;
+ map_init (null_stream);
+ ptx_dev->null_stream = null_stream;
+
+ ptx_dev->active_streams = NULL;
+ pthread_mutex_init (&ptx_dev->stream_lock, NULL);
+
+ if (concurrency < 1)
+ concurrency = 1;
+
+ /* This is just a guess -- make space for as many async streams as the
+ current device is capable of concurrently executing. This can grow
+ later as necessary. No streams are created yet. */
+ ptx_dev->async_streams.arr
+ = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
+ ptx_dev->async_streams.size = concurrency;
+
+ for (i = 0; i < concurrency; i++)
+ ptx_dev->async_streams.arr[i] = NULL;
+}
+
+static void
+fini_streams_for_device (struct ptx_device *ptx_dev)
+{
+ free (ptx_dev->async_streams.arr);
+
+ while (ptx_dev->active_streams != NULL)
+ {
+ struct ptx_stream *s = ptx_dev->active_streams;
+ ptx_dev->active_streams = ptx_dev->active_streams->next;
+
+ cuStreamDestroy (s->stream);
+ map_fini (s);
+ free (s);
+ }
+
+ map_fini (ptx_dev->null_stream);
+ free (ptx_dev->null_stream);
+}
+
+/* Select a stream for the (OpenACC-semantics) ASYNC argument for the current
+ thread THREAD (and also the current device/context). If CREATE is true,
+ create the stream if it does not exist (or use EXISTING if it is non-NULL),
+ and associate the stream with the same thread argument. Return the stream
+ to use. */
+
+static struct ptx_stream *
+select_stream_for_async (int async, pthread_t thread, bool create,
+ CUstream existing)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ /* Local copy of TLS variable. */
+ struct ptx_device *ptx_dev = nvthd->ptx_dev;
+ struct ptx_stream *stream = NULL;
+ int orig_async = async;
+
+ /* The special value acc_async_noval (-1) maps (for now) to an
+ implicitly-created stream, which is then handled the same as any other
+ numbered async stream. Other options are available, e.g. using the null
+ stream for anonymous async operations, or choosing an idle stream from an
+ active set. But, stick with this for now. */
+ if (async > acc_async_sync)
+ async++;
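+ /* After this adjustment, acc_async_noval uses async_streams.arr[0] and a
+ user-specified async value N uses async_streams.arr[N + 1]. */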
+
+ if (create)
+ pthread_mutex_lock (&ptx_dev->stream_lock);
+
+ /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
+ null stream, and in fact better performance may be obtainable if it doesn't
+ (because the null stream enforces overly-strict synchronisation with
+ respect to other streams for legacy reasons, and that's probably not
+ needed with OpenACC). Maybe investigate later. */
+ if (async == acc_async_sync)
+ stream = ptx_dev->null_stream;
+ else if (async >= 0 && async < ptx_dev->async_streams.size
+ && ptx_dev->async_streams.arr[async] && !(create && existing))
+ stream = ptx_dev->async_streams.arr[async];
+ else if (async >= 0 && create)
+ {
+ if (async >= ptx_dev->async_streams.size)
+ {
+ int i, newsize = ptx_dev->async_streams.size * 2;
+
+ if (async >= newsize)
+ newsize = async + 1;
+
+ ptx_dev->async_streams.arr
+ = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
+ newsize * sizeof (struct ptx_stream *));
+
+ for (i = ptx_dev->async_streams.size; i < newsize; i++)
+ ptx_dev->async_streams.arr[i] = NULL;
+
+ ptx_dev->async_streams.size = newsize;
+ }
+
+ /* Create a new stream on-demand if there isn't one already, or if we're
+ setting a particular async value to an existing (externally-provided)
+ stream. */
+ if (!ptx_dev->async_streams.arr[async] || existing)
+ {
+ CUresult r;
+ struct ptx_stream *s
+ = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
+
+ if (existing)
+ s->stream = existing;
+ else
+ {
+ r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
+ }
+
+ /* If CREATE is true, we're going to be queueing some work on this
+ stream. Associate it with the current host thread. */
+ s->host_thread = thread;
+ s->multithreaded = false;
+
+ s->d = (CUdeviceptr) NULL;
+ s->h = NULL;
+ map_init (s);
+
+ s->next = ptx_dev->active_streams;
+ ptx_dev->active_streams = s;
+ ptx_dev->async_streams.arr[async] = s;
+ }
+
+ stream = ptx_dev->async_streams.arr[async];
+ }
+ else if (async < 0)
+ GOMP_PLUGIN_fatal ("bad async %d", async);
+
+ if (create)
+ {
+ assert (stream != NULL);
+
+ /* If we're trying to use the same stream from different threads
+ simultaneously, set stream->multithreaded to true. This affects the
+ behaviour of acc_async_test_all and acc_wait_all, which are supposed to
+ only wait for asynchronous launches from the same host thread they are
+ invoked on. If multiple threads use the same async value, we make note
+ of that here and fall back to testing/waiting for all threads in those
+ functions. */
+ if (thread != stream->host_thread)
+ stream->multithreaded = true;
+
+ pthread_mutex_unlock (&ptx_dev->stream_lock);
+ }
+ else if (stream && !stream->multithreaded
+ && !pthread_equal (stream->host_thread, thread))
+ GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
+
+ return stream;
+}
+
+static int nvptx_get_num_devices (void);
+
+/* Initialize the CUDA driver library and the plugin's global state; return
+ the number of available devices, or -1 on failure. */
+static int
+nvptx_init (void)
+{
+ CUresult r;
+ int rc;
+
+ if (ptx_inited)
+ return nvptx_get_num_devices ();
+
+ rc = verify_device_library ();
+ if (rc < 0)
+ return -1;
+
+ r = cuInit (0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
+
+ ptx_events = NULL;
+
+ pthread_mutex_init (&ptx_event_lock, NULL);
+
+ ptx_inited = true;
+
+ return nvptx_get_num_devices ();
+}
+
+static void
+nvptx_fini (void)
+{
+ ptx_inited = false;
+}
+
+static void *
+nvptx_open_device (int n)
+{
+ struct ptx_device *ptx_dev;
+ CUdevice dev;
+ CUresult r;
+ int async_engines, pi;
+
+ r = cuDeviceGet (&dev, n);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
+
+ ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
+
+ ptx_dev->ord = n;
+ ptx_dev->dev = dev;
+ ptx_dev->ctx_shared = false;
+
+ r = cuCtxGetCurrent (&ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+
+ if (!ptx_dev->ctx)
+ {
+ r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
+ }
+ else
+ ptx_dev->ctx_shared = true;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->overlap = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->map = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->concur = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->mode = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->mkern = pi;
+
+ r = cuDeviceGetAttribute (&async_engines,
+ CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+ if (r != CUDA_SUCCESS)
+ async_engines = 1;
+
+ init_streams_for_device (ptx_dev, async_engines);
+
+ return (void *) ptx_dev;
+}
+
+static int
+nvptx_close_device (void *targ_data)
+{
+ CUresult r;
+ struct ptx_device *ptx_dev = targ_data;
+
+ if (!ptx_dev)
+ return 0;
+
+ fini_streams_for_device (ptx_dev);
+
+ if (!ptx_dev->ctx_shared)
+ {
+ r = cuCtxDestroy (ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
+ }
+
+ free (ptx_dev);
+
+ return 0;
+}
+
+static int
+nvptx_get_num_devices (void)
+{
+ int n;
+ CUresult r;
+
+ /* This function will be called before the plugin has been initialized in
+ order to enumerate available devices, but CUDA API routines can't be used
+ until cuInit has been called. Just call it now (but don't yet do any
+ further initialization). */
+ if (!ptx_inited)
+ cuInit (0);
+
+ r = cuDeviceGetCount (&n);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
+
+ return n;
+}
+
+
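+/* JIT-link the offloaded PTX code in PTX_CODE, together with the PTX
+ support routines for abort, acc_on_device and the GOACC internal
+ functions, into a loaded CUDA module *MODULE. */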
+static void
+link_ptx (CUmodule *module, char *ptx_code)
+{
+ CUjit_option opts[7];
+ void *optvals[7];
+ float elapsed = 0.0;
+#define LOGSIZE 8192
+ char elog[LOGSIZE];
+ char ilog[LOGSIZE];
+ unsigned long logsize = LOGSIZE;
+ CUlinkState linkstate;
+ CUresult r;
+ void *linkout;
+ size_t linkoutsize __attribute__ ((unused));
+
+ GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);
+
+ opts[0] = CU_JIT_WALL_TIME;
+ optvals[0] = &elapsed;
+
+ opts[1] = CU_JIT_INFO_LOG_BUFFER;
+ optvals[1] = &ilog[0];
+
+ opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+ optvals[2] = (void *) logsize;
+
+ opts[3] = CU_JIT_ERROR_LOG_BUFFER;
+ optvals[3] = &elog[0];
+
+ opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+ optvals[4] = (void *) logsize;
+
+ opts[5] = CU_JIT_LOG_VERBOSE;
+ optvals[5] = (void *) 1;
+
+ opts[6] = CU_JIT_TARGET;
+ optvals[6] = (void *) CU_TARGET_COMPUTE_30;
+
+ r = cuLinkCreate (7, opts, optvals, &linkstate);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
+
+ char *abort_ptx = ABORT_PTX;
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
+ strlen (abort_ptx) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+ GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
+ }
+
+ char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
+ strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+ GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
+ cuda_error (r));
+ }
+
+ char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
+ strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+ GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
+ cuda_error (r));
+ }
+
+ r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
+ strlen (ptx_code) + 1, 0, 0, 0, 0);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+ GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
+ }
+
+ r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
+
+ GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
+ GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
+
+ r = cuModuleLoadData (module, linkout);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
+}
+
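+/* Walk the global list of outstanding events and, for each event belonging
+ to the current device that CUDA reports as completed, perform its
+ type-specific cleanup (e.g. popping the kernel argument mapping, or
+ unmapping variables copied asynchronously) and remove it from the list.
+ MEMMAP_LOCKABLE is false when a caller already holds the memory-map lock,
+ in which case PTX_EVT_ASYNC_CLEANUP events are deferred until later. */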
+static void
+event_gc (bool memmap_lockable)
+{
+ struct ptx_event *ptx_event = ptx_events;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ pthread_mutex_lock (&ptx_event_lock);
+
+ while (ptx_event != NULL)
+ {
+ CUresult r;
+ struct ptx_event *e = ptx_event;
+
+ ptx_event = ptx_event->next;
+
+ if (e->ord != nvthd->ptx_dev->ord)
+ continue;
+
+ r = cuEventQuery (*e->evt);
+ if (r == CUDA_SUCCESS)
+ {
+ CUevent *te;
+
+ te = e->evt;
+
+ switch (e->type)
+ {
+ case PTX_EVT_MEM:
+ case PTX_EVT_SYNC:
+ break;
+
+ case PTX_EVT_KNL:
+ map_pop (e->addr);
+ break;
+
+ case PTX_EVT_ASYNC_CLEANUP:
+ {
+ /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
+ memory-map splay tree lock for the current device, so we
+ can't call it when one of our callers has already claimed
+ the lock. In that case, just delay the GC for this event
+ until later. */
+ if (!memmap_lockable)
+ continue;
+
+ GOMP_PLUGIN_async_unmap_vars (e->addr);
+ }
+ break;
+ }
+
+ cuEventDestroy (*te);
+ free ((void *)te);
+
+ if (ptx_events == e)
+ ptx_events = ptx_events->next;
+ else
+ {
+ struct ptx_event *e_ = ptx_events;
+ while (e_->next != e)
+ e_ = e_->next;
+ e_->next = e_->next->next;
+ }
+
+ free (e);
+ }
+ }
+
+ pthread_mutex_unlock (&ptx_event_lock);
+}
+
+static void
+event_add (enum ptx_event_type type, CUevent *e, void *h)
+{
+ struct ptx_event *ptx_event;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
+ || type == PTX_EVT_ASYNC_CLEANUP);
+
+ ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
+ ptx_event->type = type;
+ ptx_event->evt = e;
+ ptx_event->addr = h;
+ ptx_event->ord = nvthd->ptx_dev->ord;
+
+ pthread_mutex_lock (&ptx_event_lock);
+
+ ptx_event->next = ptx_events;
+ ptx_events = ptx_event;
+
+ pthread_mutex_unlock (&ptx_event_lock);
+}
+
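+/* Launch the offloaded kernel described by the target function descriptor
+ FN on the stream selected for ASYNC, passing it a device pointer to an
+ array holding the MAPNUM device addresses from DEVADDRS. */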
+void
+nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
+ size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
+ int vector_length, int async, void *targ_mem_desc)
+{
+ struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
+ CUfunction function;
+ CUresult r;
+ int i;
+ struct ptx_stream *dev_str;
+ void *kargs[1];
+ void *hp, *dp;
+ unsigned int nthreads_in_block;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ const char *maybe_abort_msg = "(perhaps abort was called)";
+
+ function = targ_fn->fn;
+
+ dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
+ assert (dev_str == nvthd->current_stream);
+
+ /* This reserves a chunk of a pre-allocated page of memory mapped on both
+ the host and the device. HP is a host pointer to the new chunk, and DP is
+ the corresponding device pointer. */
+ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
+
+ GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
+
+ /* Copy the array of arguments to the mapped page. */
+ for (i = 0; i < mapnum; i++)
+ ((void **) hp)[i] = devaddrs[i];
+
+ /* Copy the (device) pointers to arguments to the device (dp and hp might in
+ fact have the same value on a unified-memory system). */
+ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
+
+ GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
+
+ // OpenACC CUDA
+ //
+ // num_gangs blocks
+ // num_workers warps (where a warp is equivalent to 32 threads)
+ // vector length threads
+ //
+
+ /* The OpenACC vector_length clause 'determines the vector length to use for
+ vector or SIMD operations'. The question is how to map this to CUDA.
+
+ In CUDA, the warp size is the vector length of a CUDA device. However, the
+ CUDA interface abstracts away from that, and only shows us warp size
+ indirectly, in the maximum number of threads per block, which is a product
+ of warp size and the number of hyperthreads of a multiprocessor.
+
+ We choose to map the OpenACC vector_length directly onto the number of
+ threads in a block, in the x dimension. This is reflected in GCC code
+ generation that uses threadIdx.x to access vector elements.
+
+ Attempting to use an OpenACC vector_length of more than the maximum number
+ of threads per block will result in a CUDA error. */
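+ /* For instance, a vector_length of 128 yields a launch below with
+ blockDim.x == 128, i.e. four 32-thread warps per gang (CUDA block). */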
+ nthreads_in_block = vector_length;
+
+ kargs[0] = &dp;
+ r = cuLaunchKernel (function,
+ num_gangs, 1, 1,
+ nthreads_in_block, 1, 1,
+ 0, dev_str->stream, kargs, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+
+#ifndef DISABLE_ASYNC
+ if (async < acc_async_noval)
+ {
+ r = cuStreamSynchronize (dev_str->stream);
+ if (r == CUDA_ERROR_LAUNCH_FAILED)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
+ maybe_abort_msg);
+ else if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+ }
+ else
+ {
+ CUevent *e;
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r == CUDA_ERROR_LAUNCH_FAILED)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
+ maybe_abort_msg);
+ else if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ event_gc (true);
+
+ r = cuEventRecord (*e, dev_str->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_KNL, e, (void *)dev_str);
+ }
+#else
+ r = cuCtxSynchronize ();
+ if (r == CUDA_ERROR_LAUNCH_FAILED)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
+ maybe_abort_msg);
+ else if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
+#endif
+
+ GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
+ targ_fn->name);
+
+#ifndef DISABLE_ASYNC
+ if (async < acc_async_noval)
+#endif
+ map_pop (dev_str);
+}
+
+void * openacc_get_current_cuda_context (void);
+
+static void *
+nvptx_alloc (size_t s)
+{
+ CUdeviceptr d;
+ CUresult r;
+
+ r = cuMemAlloc (&d, s);
+ if (r == CUDA_ERROR_OUT_OF_MEMORY)
+ return 0;
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
+ return (void *)d;
+}
+
+static void
+nvptx_free (void *p)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+
+ if ((CUdeviceptr)p != pb)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ r = cuMemFree ((CUdeviceptr)p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+}
+
+static void *
+nvptx_host2dev (void *d, const void *h, size_t s)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!s)
+ return 0;
+
+ if (!d)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+
+ if (!pb)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ if (!h)
+ GOMP_PLUGIN_fatal ("invalid host address");
+
+ if (d == h)
+ GOMP_PLUGIN_fatal ("invalid host or device address");
+
+ if ((void *)(d + s) > (void *)(pb + ps))
+ GOMP_PLUGIN_fatal ("invalid size");
+
+#ifndef DISABLE_ASYNC
+ if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+ {
+ CUevent *e;
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ event_gc (false);
+
+ r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
+ nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
+
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_MEM, e, (void *)h);
+ }
+ else
+#endif
+ {
+ r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+ }
+
+ return 0;
+}
+
+static void *
+nvptx_dev2host (void *h, const void *d, size_t s)
+{
+ CUresult r;
+ CUdeviceptr pb;
+ size_t ps;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!s)
+ return 0;
+
+ if (!d)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+
+ if (!pb)
+ GOMP_PLUGIN_fatal ("invalid device address");
+
+ if (!h)
+ GOMP_PLUGIN_fatal ("invalid host address");
+
+ if (d == h)
+ GOMP_PLUGIN_fatal ("invalid host or device address");
+
+ if ((void *)(d + s) > (void *)(pb + ps))
+ GOMP_PLUGIN_fatal ("invalid size");
+
+#ifndef DISABLE_ASYNC
+ if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+ {
+ CUevent *e;
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
+
+ event_gc (false);
+
+ r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
+ nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
+
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_MEM, e, (void *)h);
+ }
+ else
+#endif
+ {
+ r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
+ }
+
+ return 0;
+}
+
+static void
+nvptx_set_async (int async)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ nvthd->current_stream
+ = select_stream_for_async (async, pthread_self (), true, NULL);
+}
+
+static int
+nvptx_async_test (int async)
+{
+ CUresult r;
+ struct ptx_stream *s;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ if (!s)
+ GOMP_PLUGIN_fatal ("unknown async %d", async);
+
+ r = cuStreamQuery (s->stream);
+ if (r == CUDA_SUCCESS)
+ {
+ /* The oacc-parallel.c:goacc_wait function calls this hook to determine
+ whether all work has completed on this stream, and if so omits the call
+ to the wait hook. If that happens, event_gc might not get called
+ (which prevents variables from getting unmapped and their associated
+ device storage freed), so call it here. */
+ event_gc (true);
+ return 1;
+ }
+ else if (r == CUDA_ERROR_NOT_READY)
+ return 0;
+
+ GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
+
+ return 0;
+}
+
+static int
+nvptx_async_test_all (void)
+{
+ struct ptx_stream *s;
+ pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
+ {
+ if ((s->multithreaded || pthread_equal (s->host_thread, self))
+ && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
+ {
+ pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+ return 0;
+ }
+ }
+
+ pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+ event_gc (true);
+
+ return 1;
+}
+
+static void
+nvptx_wait (int async)
+{
+ CUresult r;
+ struct ptx_stream *s;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ if (!s)
+ GOMP_PLUGIN_fatal ("unknown async %d", async);
+
+ r = cuStreamSynchronize (s->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+
+ event_gc (true);
+}
+
+static void
+nvptx_wait_async (int async1, int async2)
+{
+ CUresult r;
+ CUevent *e;
+ struct ptx_stream *s1, *s2;
+ pthread_t self = pthread_self ();
+
+ /* The stream that is waiting (rather than being waited for) doesn't
+ necessarily have to exist already. */
+ s2 = select_stream_for_async (async2, self, true, NULL);
+
+ s1 = select_stream_for_async (async1, self, false, NULL);
+ if (!s1)
+ GOMP_PLUGIN_fatal ("invalid async 1\n");
+
+ if (s1 == s2)
+ GOMP_PLUGIN_fatal ("identical parameters");
+
+ e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ event_gc (true);
+
+ r = cuEventRecord (*e, s1->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_SYNC, e, NULL);
+
+ r = cuStreamWaitEvent (s2->stream, *e, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+}
+
+static void
+nvptx_wait_all (void)
+{
+ CUresult r;
+ struct ptx_stream *s;
+ pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ /* Wait for active streams initiated by this thread (or by multiple threads)
+ to complete. */
+ for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
+ {
+ if (s->multithreaded || pthread_equal (s->host_thread, self))
+ {
+ r = cuStreamQuery (s->stream);
+ if (r == CUDA_SUCCESS)
+ continue;
+ else if (r != CUDA_ERROR_NOT_READY)
+ GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
+
+ r = cuStreamSynchronize (s->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+ }
+ }
+
+ pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+ event_gc (true);
+}
+
+static void
+nvptx_wait_all_async (int async)
+{
+ CUresult r;
+ struct ptx_stream *waiting_stream, *other_stream;
+ CUevent *e;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+ pthread_t self = pthread_self ();
+
+ /* The stream doing the waiting. This could be the first mention of the
+ stream, so create it if necessary. */
+ waiting_stream
+ = select_stream_for_async (async, self, true, NULL);
+
+ /* Launches on the null stream already block on other streams in the
+ context. */
+ if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
+ return;
+
+ event_gc (true);
+
+ pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ for (other_stream = nvthd->ptx_dev->active_streams;
+ other_stream != NULL;
+ other_stream = other_stream->next)
+ {
+ if (!other_stream->multithreaded
+ && !pthread_equal (other_stream->host_thread, self))
+ continue;
+
+ e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ /* Record an event on the waited-for stream. */
+ r = cuEventRecord (*e, other_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_SYNC, e, NULL);
+
+ r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+ }
+
+ pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+}
+
+static void *
+nvptx_get_current_cuda_device (void)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!nvthd || !nvthd->ptx_dev)
+ return NULL;
+
+ return &nvthd->ptx_dev->dev;
+}
+
+static void *
+nvptx_get_current_cuda_context (void)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!nvthd || !nvthd->ptx_dev)
+ return NULL;
+
+ return nvthd->ptx_dev->ctx;
+}
+
+static void *
+nvptx_get_cuda_stream (int async)
+{
+ struct ptx_stream *s;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (!nvthd || !nvthd->ptx_dev)
+ return NULL;
+
+ s = select_stream_for_async (async, pthread_self (), false, NULL);
+
+ return s ? s->stream : NULL;
+}
+
+static int
+nvptx_set_cuda_stream (int async, void *stream)
+{
+ struct ptx_stream *oldstream;
+ pthread_t self = pthread_self ();
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
+ if (async < 0)
+ GOMP_PLUGIN_fatal ("bad async %d", async);
+
+ /* We have a list of active streams and an array mapping async values to
+ entries of that list. We need to take "ownership" of the passed-in stream,
+ and add it to our list, removing the previous entry also (if there was one)
+ in order to prevent resource leaks. Note the potential for surprise
+ here: maybe we should keep track of passed-in streams and leave it up to
+ the user to tidy those up, but that doesn't work for stream handles
+ returned from acc_get_cuda_stream above... */
+
+ oldstream = select_stream_for_async (async, self, false, NULL);
+
+ if (oldstream)
+ {
+ if (nvthd->ptx_dev->active_streams == oldstream)
+ nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
+ else
+ {
+ struct ptx_stream *s = nvthd->ptx_dev->active_streams;
+ while (s->next != oldstream)
+ s = s->next;
+ s->next = s->next->next;
+ }
+
+ cuStreamDestroy (oldstream->stream);
+ map_fini (oldstream);
+ free (oldstream);
+ }
+
+ pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
+
+ (void) select_stream_for_async (async, self, true, (CUstream) stream);
+
+ return 1;
+}
+
+/* Plugin entry points. */
+
+const char *
+GOMP_OFFLOAD_get_name (void)
+{
+ return "nvptx";
+}
+
+unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+ return GOMP_OFFLOAD_CAP_OPENACC_200;
+}
+
+int
+GOMP_OFFLOAD_get_type (void)
+{
+ return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
+}
+
+int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+ return nvptx_get_num_devices ();
+}
+
+static void **kernel_target_data;
+static void **kernel_host_table;
+
+void
+GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
+{
+ kernel_target_data = target_data;
+ kernel_host_table = host_table;
+}
+
+void
+GOMP_OFFLOAD_init_device (int n __attribute__ ((unused)))
+{
+ (void) nvptx_init ();
+}
+
+void
+GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused)))
+{
+ nvptx_fini ();
+}
+
+int
+GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
+ struct mapping_table **tablep)
+{
+ CUmodule module;
+ void **fn_table;
+ char **fn_names;
+ int fn_entries, i;
+ CUresult r;
+ struct targ_fn_descriptor *targ_fns;
+
+ if (nvptx_init () <= 0)
+ return 0;
+
+ /* This isn't an error, because an image may legitimately have no offloaded
+ regions and so will not call GOMP_offload_register. */
+ if (kernel_target_data == NULL)
+ return 0;
+
+ link_ptx (&module, kernel_target_data[0]);
+
+ /* kernel_target_data[0] -> ptx code
+ kernel_target_data[1] -> variable mappings
+ kernel_target_data[2] -> array of kernel names in ascii
+
+ kernel_host_table[0] -> start of function addresses (__offload_func_table)
+ kernel_host_table[1] -> end of function addresses (__offload_funcs_end)
+
+ The array of kernel names and the function addresses form a
+ one-to-one correspondence. */
+
+ fn_table = kernel_host_table[0];
+ fn_names = (char **) kernel_target_data[2];
+ fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
+
+ *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
+ targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
+ * fn_entries);
+
+ for (i = 0; i < fn_entries; i++)
+ {
+ CUfunction function;
+
+ r = cuModuleGetFunction (&function, module, fn_names[i]);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
+
+ targ_fns[i].fn = function;
+ targ_fns[i].name = (const char *) fn_names[i];
+
+ (*tablep)[i].host_start = (uintptr_t) fn_table[i];
+ (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
+ (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
+ (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
+ }
+
+ return fn_entries;
+}
+
+void *
+GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t size)
+{
+ return nvptx_alloc (size);
+}
+
+void
+GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *ptr)
+{
+ nvptx_free (ptr);
+}
+
+void *
+GOMP_OFFLOAD_dev2host (int ord __attribute__ ((unused)), void *dst,
+ const void *src, size_t n)
+{
+ return nvptx_dev2host (dst, src, n);
+}
+
+void *
+GOMP_OFFLOAD_host2dev (int ord __attribute__ ((unused)), void *dst,
+ const void *src, size_t n)
+{
+ return nvptx_host2dev (dst, src, n);
+}
+
+void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
+
+void
+GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes,
+ unsigned short *kinds, int num_gangs,
+ int num_workers, int vector_length, int async,
+ void *targ_mem_desc)
+{
+ nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
+ num_workers, vector_length, async, targ_mem_desc);
+}
+
+void *
+GOMP_OFFLOAD_openacc_open_device (int n)
+{
+ return nvptx_open_device (n);
+}
+
+int
+GOMP_OFFLOAD_openacc_close_device (void *h)
+{
+ return nvptx_close_device (h);
+}
+
+void
+GOMP_OFFLOAD_openacc_set_device_num (int n)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ assert (n >= 0);
+
+ if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
+ (void) nvptx_open_device (n);
+}
+
+/* This can be called before the device is "opened" for the current thread, in
+ which case we can't tell which device number should be returned. We don't
+ actually want to open the device here, so just return -1 and let the caller
+ (oacc-init.c:acc_get_device_num) handle it. */
+
+int
+GOMP_OFFLOAD_openacc_get_device_num (void)
+{
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ if (nvthd && nvthd->ptx_dev)
+ return nvthd->ptx_dev->ord;
+ else
+ return -1;
+}
+
+void
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+{
+ CUevent *e;
+ CUresult r;
+ struct nvptx_thread *nvthd = nvptx_thread ();
+
+ e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+
+ r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+
+ r = cuEventRecord (*e, nvthd->current_stream->stream);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+
+ event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
+}
+
+int
+GOMP_OFFLOAD_openacc_async_test (int async)
+{
+ return nvptx_async_test (async);
+}
+
+int
+GOMP_OFFLOAD_openacc_async_test_all (void)
+{
+ return nvptx_async_test_all ();
+}
+
+void
+GOMP_OFFLOAD_openacc_async_wait (int async)
+{
+ nvptx_wait (async);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
+{
+ nvptx_wait_async (async1, async2);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_wait_all (void)
+{
+ nvptx_wait_all ();
+}
+
+void
+GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
+{
+ nvptx_wait_all_async (async);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_set_async (int async)
+{
+ nvptx_set_async (async);
+}
+
+void *
+GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
+{
+ struct ptx_device *ptx_dev = (struct ptx_device *) targ_data;
+ struct nvptx_thread *nvthd
+ = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
+ CUresult r;
+ CUcontext thd_ctx;
+
+ r = cuCtxGetCurrent (&thd_ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+
+ assert (ptx_dev->ctx);
+
+ if (!thd_ctx)
+ {
+ r = cuCtxPushCurrent (ptx_dev->ctx);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+ }
+
+ nvthd->current_stream = ptx_dev->null_stream;
+ nvthd->ptx_dev = ptx_dev;
+
+ return (void *) nvthd;
+}
+
+void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
+{
+ free (data);
+}
+
+void *
+GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
+{
+ return nvptx_get_current_cuda_device ();
+}
+
+void *
+GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
+{
+ return nvptx_get_current_cuda_context ();
+}
+
+/* NOTE: This returns a CUstream, not a ptx_stream pointer. */
+
+void *
+GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
+{
+ return nvptx_get_cuda_stream (async);
+}
+
+/* NOTE: This takes a CUstream, not a ptx_stream pointer. */
+
+int
+GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
+{
+ return nvptx_set_cuda_stream (async, stream);
+}