diff options
35 files changed, 2888 insertions, 722 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6e85a5b..69a3044 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2015-11-14 Jakub Jelinek <jakub@redhat.com> + + * omp-low.c (lower_omp_ordered): Add argument to GOMP_SMD_ORDERED_* + internal calls - 0 if ordered simd and 1 for ordered threads simd. + * tree-vectorizer.c (adjust_simduid_builtins): If GOMP_SIMD_ORDERED_* + argument is 1, replace it with GOMP_ordered_* call instead of removing + it. + 2015-11-13 Rich Felker <dalias@libc.org> * config/sh/sh.md (symGOT_load): Suppress __stack_chk_guard diff --git a/gcc/c/ChangeLog b/gcc/c/ChangeLog index 04fe64d..5be190f 100644 --- a/gcc/c/ChangeLog +++ b/gcc/c/ChangeLog @@ -1,3 +1,8 @@ +2015-11-14 Jakub Jelinek <jakub@redhat.com> + + * c-typeck.c (c_finish_omp_clauses): Don't mark + GOMP_MAP_FIRSTPRIVATE_POINTER decls addressable. + 2015-11-14 Marek Polacek <polacek@redhat.com> * c-decl.c: Use RECORD_OR_UNION_TYPE_P throughout. diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index 0215eda..c18c307 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -12918,7 +12918,10 @@ c_finish_omp_clauses (tree clauses, bool is_omp, bool declare_simd) omp_clause_code_name[OMP_CLAUSE_CODE (c)]); remove = true; } - else if (!c_mark_addressable (t)) + else if ((OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP + || (OMP_CLAUSE_MAP_KIND (c) + != GOMP_MAP_FIRSTPRIVATE_POINTER)) + && !c_mark_addressable (t)) remove = true; else if (!(OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP && (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index 2ea5448..baaabc8 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,3 +1,8 @@ +2015-11-14 Jakub Jelinek <jakub@redhat.com> + + * semantics.c (finish_omp_clauses): Don't mark + GOMP_MAP_FIRSTPRIVATE_POINTER decls addressable. + 2015-11-13 Kai Tietz <ktietz70@googlemail.com> Marek Polacek <polacek@redhat.com> Jason Merrill <jason@redhat.com> diff --git a/gcc/cp/semantics.c b/gcc/cp/semantics.c index 67f8590..e7e5d8e 100644 --- a/gcc/cp/semantics.c +++ b/gcc/cp/semantics.c @@ -6581,6 +6581,9 @@ finish_omp_clauses (tree clauses, bool allow_fields, bool declare_simd) } else if (!processing_template_decl && TREE_CODE (TREE_TYPE (t)) != REFERENCE_TYPE + && (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP + || (OMP_CLAUSE_MAP_KIND (c) + != GOMP_MAP_FIRSTPRIVATE_POINTER)) && !cxx_mark_addressable (t)) remove = true; else if (!(OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 4b2b477..987bc3e 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -13951,8 +13951,10 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx) gomp_ordered *ord_stmt = as_a <gomp_ordered *> (stmt); gcall *x; gbind *bind; - bool simd - = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), OMP_CLAUSE_SIMD); + bool simd = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), + OMP_CLAUSE_SIMD); + bool threads = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), + OMP_CLAUSE_THREADS); if (find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), OMP_CLAUSE_DEPEND)) @@ -13975,7 +13977,8 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx) if (simd) { - x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_START, 0); + x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_START, 1, + build_int_cst (NULL_TREE, threads)); cfun->has_simduid_loops = true; } else @@ -13989,7 +13992,8 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx) gimple_omp_set_body (stmt, NULL); if (simd) - x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_END, 0); + x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_END, 1, + build_int_cst (NULL_TREE, threads)); else x = gimple_build_call (builtin_decl_explicit (BUILT_IN_GOMP_ORDERED_END), 0); diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 08c8025..b721c56 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -177,6 +177,21 @@ adjust_simduid_builtins (hash_table<simduid_to_vf> *htab) break; case IFN_GOMP_SIMD_ORDERED_START: case IFN_GOMP_SIMD_ORDERED_END: + if (integer_onep (gimple_call_arg (stmt, 0))) + { + enum built_in_function bcode + = (ifn == IFN_GOMP_SIMD_ORDERED_START + ? BUILT_IN_GOMP_ORDERED_START + : BUILT_IN_GOMP_ORDERED_END); + gimple *g + = gimple_build_call (builtin_decl_explicit (bcode), 0); + tree vdef = gimple_vdef (stmt); + gimple_set_vdef (g, vdef); + SSA_NAME_DEF_STMT (vdef) = g; + gimple_set_vuse (g, gimple_vuse (stmt)); + gsi_replace (&i, g, true); + continue; + } gsi_remove (&i, true); unlink_stmt_vdef (stmt); continue; diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 2f5a4d1..15be014 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,104 @@ +2015-11-14 Jakub Jelinek <jakub@redhat.com> + Aldy Hernandez <aldyh@redhat.com> + Ilya Verbin <ilya.verbin@intel.com> + + * ordered.c (gomp_doacross_init, GOMP_doacross_post, + GOMP_doacross_wait, gomp_doacross_ull_init, GOMP_doacross_ull_post, + GOMP_doacross_ull_wait): For GFS_GUIDED don't divide number of + iterators or IV by chunk size. + * parallel.c (gomp_resolve_num_threads): Don't assume that + if thr->ts.team is non-NULL, then pool must be non-NULL. + * libgomp-plugin.h (GOMP_PLUGIN_target_task_completion): Declare. + * libgomp.map (GOMP_PLUGIN_1.1): New symbol version, export + GOMP_PLUGIN_target_task_completion. + * Makefile.am (libgomp_la_SOURCES): Add priority_queue.c. + * Makefile.in: Regenerate. + * libgomp.h: Shuffle prototypes and forward definitions around so + priority queues can be defined. + (enum gomp_task_kind): Add GOMP_TASK_ASYNC_RUNNING. + (enum gomp_target_task_state): New enum. + (struct gomp_target_task): Add state, tgt, task and team fields. + (gomp_create_target_task): Change return type to bool, add + state argument. + (gomp_target_task_fn): Change return type to bool. + (struct gomp_device_descr): Add async_run_func. + (struct gomp_task): Remove children, next_child, prev_child, + next_queue, prev_queue, next_taskgroup, prev_taskgroup. + Add pnode field. + (struct gomp_taskgroup): Remove children. + Add taskgroup_queue. + (struct gomp_team): Change task_queue type to a priority queue. + (splay_compare): Define inline. + (priority_queue_offset): New. + (priority_node_to_task): New. + (task_to_priority_node): New. + * oacc-mem.c: Do not include splay-tree.h. + * priority_queue.c: New file. + * priority_queue.h: New file. + * splay-tree.c: Do not include splay-tree.h. + (splay_tree_foreach_internal): New. + (splay_tree_foreach): New. + * splay-tree.h: Become re-entrant if splay_tree_prefix is defined. + (splay_tree_callback): Define typedef. + * target.c (splay_compare): Move to libgomp.h. + (GOMP_target): Don't adjust *thr in any way around running offloaded + task. + (GOMP_target_ext): Likewise. Handle target nowait. + (GOMP_target_update_ext, GOMP_target_enter_exit_data): Check + return value from gomp_create_target_task, if false, fallthrough + as if no dependencies exist. + (gomp_target_task_fn): Change return type to bool, return true + if the task should have another part scheduled later. Handle + target nowait. + (gomp_load_plugin_for_device): Initialize async_run. + * task.c (gomp_init_task): Initialize children_queue. + (gomp_clear_parent_in_list): New. + (gomp_clear_parent_in_tree): New. + (gomp_clear_parent): Handle priorities. + (GOMP_task): Likewise. + (priority_queue_move_task_first, + gomp_target_task_completion, GOMP_PLUGIN_target_task_completion): + New functions. + (gomp_create_target_task): Use priority queues. Change return type + to bool, add state argument, return false if for async + {{enter,exit} data,update} constructs no dependencies need to be + waited for, handle target nowait. Set task->fn to NULL instead of + gomp_target_task_fn. + (verify_children_queue): Remove. + (priority_list_upgrade_task): New. + (priority_queue_upgrade_task): New. + (verify_task_queue): Remove. + (priority_list_downgrade_task): New. + (priority_queue_downgrade_task): New. + (gomp_task_run_pre): Use priority queues. + Abstract code out to priority_queue_downgrade_task. + (gomp_task_run_post_handle_dependers): Use priority queues. + (gomp_task_run_post_remove_parent): Likewise. + (gomp_task_run_post_remove_taskgroup): Likewise. + (gomp_barrier_handle_tasks): Likewise. Handle target nowait target + tasks specially. + (GOMP_taskwait): Likewise. + (gomp_task_maybe_wait_for_dependencies): Likewise. Abstract code to + priority-queue_upgrade_task. + (GOMP_taskgroup_start): Use priority queues. + (GOMP_taskgroup_end): Likewise. Handle target nowait target tasks + specially. If taskgroup is NULL, and thr->ts.level is 0, act as a + barrier. + * taskloop.c (GOMP_taskloop): Handle priorities. + * team.c (gomp_new_team): Call priority_queue_init. + (free_team): Call priority_queue_free. + (gomp_free_thread): Call gomp_team_end if thr->ts.team is artificial + team created for target nowait in implicit parallel region. + (gomp_team_start): For nested check, test thr->ts.level instead of + thr->ts.team != NULL. + * testsuite/libgomp.c/doacross-3.c: New test. + * testsuite/libgomp.c/ordered-5.c: New test. + * testsuite/libgomp.c/priority.c: New test. + * testsuite/libgomp.c/target-31.c: New test. + * testsuite/libgomp.c/target-32.c: New test. + * testsuite/libgomp.c/target-33.c: New test. + * testsuite/libgomp.c/target-34.c: New test. + 2015-11-13 Nathan Sidwell <nathan@codesourcery.com> * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: New. diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am index 5411278..a3e1c2b 100644 --- a/libgomp/Makefile.am +++ b/libgomp/Makefile.am @@ -63,7 +63,7 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \ oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \ - oacc-plugin.c oacc-cuda.c + oacc-plugin.c oacc-cuda.c priority_queue.c include $(top_srcdir)/plugin/Makefrag.am diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in index 79745ce..7a1c976 100644 --- a/libgomp/Makefile.in +++ b/libgomp/Makefile.in @@ -168,7 +168,7 @@ am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \ fortran.lo affinity.lo target.lo splay-tree.lo \ libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \ oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \ - $(am__objects_1) + priority_queue.lo $(am__objects_1) libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) DEFAULT_INCLUDES = -I.@am__isrc@ depcomp = $(SHELL) $(top_srcdir)/../depcomp @@ -415,7 +415,7 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ bar.c ptrlock.c time.c fortran.c affinity.c target.c \ splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \ oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \ - $(am__append_2) + priority_queue.c $(am__append_2) # Nvidia PTX OpenACC plugin. @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) @@ -589,6 +589,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ptrlock.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@ diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 24fbb94..ab22e85 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -63,6 +63,7 @@ struct addr_pair extern void *GOMP_PLUGIN_malloc (size_t) __attribute__ ((malloc)); extern void *GOMP_PLUGIN_malloc_cleared (size_t) __attribute__ ((malloc)); extern void *GOMP_PLUGIN_realloc (void *, size_t); +void GOMP_PLUGIN_target_task_completion (void *); extern void GOMP_PLUGIN_debug (int, const char *, ...) __attribute__ ((format (printf, 2, 3))); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 23b516e..c467f97 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -50,6 +50,22 @@ #include <stdlib.h> #include <stdarg.h> +/* Needed for memset in priority_queue.c. */ +#if _LIBGOMP_CHECKING_ +# ifdef STRING_WITH_STRINGS +# include <string.h> +# include <strings.h> +# else +# ifdef HAVE_STRING_H +# include <string.h> +# else +# ifdef HAVE_STRINGS_H +# include <strings.h> +# endif +# endif +# endif +#endif + #ifdef HAVE_ATTRIBUTE_VISIBILITY # pragma GCC visibility push(hidden) #endif @@ -65,6 +81,44 @@ enum memmodel MEMMODEL_SEQ_CST = 5 }; +/* alloc.c */ + +extern void *gomp_malloc (size_t) __attribute__((malloc)); +extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); +extern void *gomp_realloc (void *, size_t); + +/* Avoid conflicting prototypes of alloca() in system headers by using + GCC's builtin alloca(). */ +#define gomp_alloca(x) __builtin_alloca(x) + +/* error.c */ + +extern void gomp_vdebug (int, const char *, va_list); +extern void gomp_debug (int, const char *, ...) + __attribute__ ((format (printf, 2, 3))); +#define gomp_vdebug(KIND, FMT, VALIST) \ + do { \ + if (__builtin_expect (gomp_debug_var, 0)) \ + (gomp_vdebug) ((KIND), (FMT), (VALIST)); \ + } while (0) +#define gomp_debug(KIND, ...) \ + do { \ + if (__builtin_expect (gomp_debug_var, 0)) \ + (gomp_debug) ((KIND), __VA_ARGS__); \ + } while (0) +extern void gomp_verror (const char *, va_list); +extern void gomp_error (const char *, ...) + __attribute__ ((format (printf, 1, 2))); +extern void gomp_vfatal (const char *, va_list) + __attribute__ ((noreturn)); +extern void gomp_fatal (const char *, ...) + __attribute__ ((noreturn, format (printf, 1, 2))); + +struct gomp_task; +struct gomp_taskgroup; +struct htab; + +#include "priority_queue.h" #include "sem.h" #include "mutex.h" #include "bar.h" @@ -298,6 +352,7 @@ extern gomp_mutex_t gomp_managed_threads_lock; #endif extern unsigned long gomp_max_active_levels_var; extern bool gomp_cancel_var; +extern int gomp_max_task_priority_var; extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; extern unsigned long gomp_available_cpus, gomp_managed_threads; extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; @@ -318,13 +373,14 @@ enum gomp_task_kind /* Task created by GOMP_task and waiting to be run. */ GOMP_TASK_WAITING, /* Task currently executing or scheduled and about to execute. */ - GOMP_TASK_TIED + GOMP_TASK_TIED, + /* Used for target tasks that have vars mapped and async run started, + but not yet completed. Once that completes, they will be readded + into the queues as GOMP_TASK_WAITING in order to perform the var + unmapping. */ + GOMP_TASK_ASYNC_RUNNING }; -struct gomp_task; -struct gomp_taskgroup; -struct htab; - struct gomp_task_depend_entry { /* Address of dependency. */ @@ -352,8 +408,8 @@ struct gomp_taskwait { bool in_taskwait; bool in_depend_wait; + /* Number of tasks we are waiting for. */ size_t n_depend; - struct gomp_task *last_parent_depends_on; gomp_sem_t taskwait_sem; }; @@ -361,26 +417,10 @@ struct gomp_taskwait struct gomp_task { - /* Parent circular list. See children description below. */ + /* Parent of this task. */ struct gomp_task *parent; - /* Circular list representing the children of this task. - - In this list we first have parent_depends_on ready to run tasks, - then !parent_depends_on ready to run tasks, and finally already - running tasks. */ - struct gomp_task *children; - struct gomp_task *next_child; - struct gomp_task *prev_child; - /* Circular task_queue in `struct gomp_team'. - - GOMP_TASK_WAITING tasks come before GOMP_TASK_TIED tasks. */ - struct gomp_task *next_queue; - struct gomp_task *prev_queue; - /* Circular queue in gomp_taskgroup->children. - - GOMP_TASK_WAITING tasks come before GOMP_TASK_TIED tasks. */ - struct gomp_task *next_taskgroup; - struct gomp_task *prev_taskgroup; + /* Children of this task. */ + struct priority_queue children_queue; /* Taskgroup this task belongs in. */ struct gomp_taskgroup *taskgroup; /* Tasks that depend on this task. */ @@ -389,8 +429,19 @@ struct gomp_task struct gomp_taskwait *taskwait; /* Number of items in DEPEND. */ size_t depend_count; - /* Number of tasks in the DEPENDERS field above. */ + /* Number of tasks this task depends on. Once this counter reaches + 0, we have no unsatisfied dependencies, and this task can be put + into the various queues to be scheduled. */ size_t num_dependees; + + /* Priority of this task. */ + int priority; + /* The priority node for this task in each of the different queues. + We put this here to avoid allocating space for each priority + node. Then we play offsetof() games to convert between pnode[] + entries and the gomp_task in which they reside. */ + struct priority_node pnode[3]; + struct gomp_task_icv icv; void (*fn) (void *); void *fn_data; @@ -407,21 +458,32 @@ struct gomp_task struct gomp_task_depend_entry depend[]; }; +/* This structure describes a single #pragma omp taskgroup. */ + struct gomp_taskgroup { struct gomp_taskgroup *prev; - /* Circular list of tasks that belong in this taskgroup. - - Tasks are chained by next/prev_taskgroup within gomp_task, and - are sorted by GOMP_TASK_WAITING tasks, and then GOMP_TASK_TIED - tasks. */ - struct gomp_task *children; + /* Queue of tasks that belong in this taskgroup. */ + struct priority_queue taskgroup_queue; bool in_taskgroup_wait; bool cancelled; gomp_sem_t taskgroup_sem; size_t num_children; }; +/* Various state of OpenMP async offloading tasks. */ +enum gomp_target_task_state +{ + GOMP_TARGET_TASK_DATA, + GOMP_TARGET_TASK_BEFORE_MAP, + GOMP_TARGET_TASK_FALLBACK, + GOMP_TARGET_TASK_READY_TO_RUN, + GOMP_TARGET_TASK_RUNNING, + GOMP_TARGET_TASK_FINISHED +}; + +/* This structure describes a target task. */ + struct gomp_target_task { struct gomp_device_descr *devicep; @@ -430,6 +492,10 @@ struct gomp_target_task size_t *sizes; unsigned short *kinds; unsigned int flags; + enum gomp_target_task_state state; + struct target_mem_desc *tgt; + struct gomp_task *task; + struct gomp_team *team; void *hostaddrs[]; }; @@ -495,9 +561,8 @@ struct gomp_team struct gomp_work_share work_shares[8]; gomp_mutex_t task_lock; - /* Scheduled tasks. Chain fields are next/prev_queue within a - gomp_task. */ - struct gomp_task *task_queue; + /* Scheduled tasks. */ + struct priority_queue task_queue; /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team. */ unsigned int task_count; /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled. */ @@ -627,39 +692,6 @@ extern bool gomp_affinity_init_level (int, unsigned long, bool); extern void gomp_affinity_print_place (void *); extern void gomp_get_place_proc_ids_8 (int, int64_t *); -/* alloc.c */ - -extern void *gomp_malloc (size_t) __attribute__((malloc)); -extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); -extern void *gomp_realloc (void *, size_t); - -/* Avoid conflicting prototypes of alloca() in system headers by using - GCC's builtin alloca(). */ -#define gomp_alloca(x) __builtin_alloca(x) - -/* error.c */ - -extern void gomp_vdebug (int, const char *, va_list); -extern void gomp_debug (int, const char *, ...) - __attribute__ ((format (printf, 2, 3))); -#define gomp_vdebug(KIND, FMT, VALIST) \ - do { \ - if (__builtin_expect (gomp_debug_var, 0)) \ - (gomp_vdebug) ((KIND), (FMT), (VALIST)); \ - } while (0) -#define gomp_debug(KIND, ...) \ - do { \ - if (__builtin_expect (gomp_debug_var, 0)) \ - (gomp_debug) ((KIND), __VA_ARGS__); \ - } while (0) -extern void gomp_verror (const char *, va_list); -extern void gomp_error (const char *, ...) - __attribute__ ((format (printf, 1, 2))); -extern void gomp_vfatal (const char *, va_list) - __attribute__ ((noreturn)); -extern void gomp_fatal (const char *, ...) - __attribute__ ((noreturn, format (printf, 1, 2))); - /* iter.c */ extern int gomp_iter_static_next (long *, long *); @@ -715,10 +747,10 @@ extern void gomp_init_task (struct gomp_task *, struct gomp_task *, extern void gomp_end_task (void); extern void gomp_barrier_handle_tasks (gomp_barrier_state_t); extern void gomp_task_maybe_wait_for_dependencies (void **); -extern void gomp_create_target_task (struct gomp_device_descr *, +extern bool gomp_create_target_task (struct gomp_device_descr *, void (*) (void *), size_t, void **, size_t *, unsigned short *, unsigned int, - void **); + void **, enum gomp_target_task_state); static void inline gomp_finish_task (struct gomp_task *task) @@ -739,8 +771,9 @@ extern void gomp_free_thread (void *); extern void gomp_init_targets_once (void); extern int gomp_get_num_devices (void); -extern void gomp_target_task_fn (void *); +extern bool gomp_target_task_fn (void *); +/* Splay tree definitions. */ typedef struct splay_tree_node_s *splay_tree_node; typedef struct splay_tree_s *splay_tree; typedef struct splay_tree_key_s *splay_tree_key; @@ -800,6 +833,21 @@ struct splay_tree_key_s { uintptr_t async_refcount; }; +/* The comparison function. */ + +static inline int +splay_compare (splay_tree_key x, splay_tree_key y) +{ + if (x->host_start == x->host_end + && y->host_start == y->host_end) + return 0; + if (x->host_end <= y->host_start) + return -1; + if (x->host_start >= y->host_end) + return 1; + return 0; +} + #include "splay-tree.h" typedef struct acc_dispatch_t @@ -877,6 +925,7 @@ struct gomp_device_descr void *(*host2dev_func) (int, void *, const void *, size_t); void *(*dev2dev_func) (int, void *, const void *, size_t); void (*run_func) (int, void *, void *); + void (*async_run_func) (int, void *, void *, void *); /* Splay tree containing information about mapped memory regions. */ struct splay_tree_s mem_map; @@ -1016,4 +1065,34 @@ extern int gomp_test_nest_lock_25 (omp_nest_lock_25_t *) __GOMP_NOTHROW; # define ialias_call(fn) fn #endif +/* Helper function for priority_node_to_task() and + task_to_priority_node(). + + Return the offset from a task to its priority_node entry. The + priority_node entry is has a type of TYPE. */ + +static inline size_t +priority_queue_offset (enum priority_queue_type type) +{ + return offsetof (struct gomp_task, pnode[(int) type]); +} + +/* Return the task associated with a priority NODE of type TYPE. */ + +static inline struct gomp_task * +priority_node_to_task (enum priority_queue_type type, + struct priority_node *node) +{ + return (struct gomp_task *) ((char *) node - priority_queue_offset (type)); +} + +/* Return the priority node of type TYPE for a given TASK. */ + +static inline struct priority_node * +task_to_priority_node (enum priority_queue_type type, + struct gomp_task *task) +{ + return (struct priority_node *) ((char *) task + + priority_queue_offset (type)); +} #endif /* LIBGOMP_H */ diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map index d16710f..4d42c42 100644 --- a/libgomp/libgomp.map +++ b/libgomp/libgomp.map @@ -407,3 +407,8 @@ GOMP_PLUGIN_1.0 { GOMP_PLUGIN_async_unmap_vars; GOMP_PLUGIN_acc_thread; }; + +GOMP_PLUGIN_1.1 { + global: + GOMP_PLUGIN_target_task_completion; +} GOMP_PLUGIN_1.0; diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c index 5410906..2488480 100644 --- a/libgomp/oacc-mem.c +++ b/libgomp/oacc-mem.c @@ -31,7 +31,6 @@ #include "libgomp.h" #include "gomp-constants.h" #include "oacc-int.h" -#include "splay-tree.h" #include <stdint.h> #include <assert.h> diff --git a/libgomp/ordered.c b/libgomp/ordered.c index fdac3ee..52df385 100644 --- a/libgomp/ordered.c +++ b/libgomp/ordered.c @@ -297,6 +297,8 @@ gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) if (ws->sched == GFS_STATIC) num_ents = team->nthreads; + else if (ws->sched == GFS_GUIDED) + num_ents = counts[0]; else num_ents = (counts[0] - 1) / chunk_size + 1; if (num_bits <= MAX_COLLAPSED_BITS) @@ -366,6 +368,8 @@ GOMP_doacross_post (long *counts) if (__builtin_expect (ws->sched == GFS_STATIC, 1)) ent = thr->ts.team_id; + else if (ws->sched == GFS_GUIDED) + ent = counts[0]; else ent = counts[0] / doacross->chunk_size; unsigned long *array = (unsigned long *) (doacross->array @@ -426,6 +430,8 @@ GOMP_doacross_wait (long first, ...) else ent = first / ws->chunk_size % thr->ts.team->nthreads; } + else if (ws->sched == GFS_GUIDED) + ent = first; else ent = first / doacross->chunk_size; unsigned long *array = (unsigned long *) (doacross->array @@ -520,6 +526,8 @@ gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) if (ws->sched == GFS_STATIC) num_ents = team->nthreads; + else if (ws->sched == GFS_GUIDED) + num_ents = counts[0]; else num_ents = (counts[0] - 1) / chunk_size + 1; if (num_bits <= MAX_COLLAPSED_BITS) @@ -595,6 +603,8 @@ GOMP_doacross_ull_post (gomp_ull *counts) if (__builtin_expect (ws->sched == GFS_STATIC, 1)) ent = thr->ts.team_id; + else if (ws->sched == GFS_GUIDED) + ent = counts[0]; else ent = counts[0] / doacross->chunk_size_ull; @@ -676,6 +686,8 @@ GOMP_doacross_ull_wait (gomp_ull first, ...) else ent = first / ws->chunk_size_ull % thr->ts.team->nthreads; } + else if (ws->sched == GFS_GUIDED) + ent = first; else ent = first / doacross->chunk_size_ull; diff --git a/libgomp/parallel.c b/libgomp/parallel.c index 6d5ef05..228086f 100644 --- a/libgomp/parallel.c +++ b/libgomp/parallel.c @@ -85,7 +85,7 @@ gomp_resolve_num_threads (unsigned specified, unsigned count) nested parallel, so there is just one thread in the contention group as well, no need to handle it atomically. */ pool = thr->thread_pool; - if (thr->ts.team == NULL) + if (thr->ts.team == NULL || pool == NULL) { num_threads = max_num_threads; if (num_threads > icv->thread_limit_var) diff --git a/libgomp/priority_queue.c b/libgomp/priority_queue.c new file mode 100644 index 0000000..4bc5f9b --- /dev/null +++ b/libgomp/priority_queue.c @@ -0,0 +1,300 @@ +/* Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by Aldy Hernandez <aldyh@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* Priority queue implementation of GOMP tasks. */ + +#include "libgomp.h" + +#if _LIBGOMP_CHECKING_ +#include <stdio.h> + +/* Sanity check to verify whether a TASK is in LIST. Return TRUE if + found, FALSE otherwise. + + TYPE is the type of priority queue this task resides in. */ + +static inline bool +priority_queue_task_in_list_p (enum priority_queue_type type, + struct priority_list *list, + struct gomp_task *task) +{ + struct priority_node *p = list->tasks; + do + { + if (priority_node_to_task (type, p) == task) + return true; + p = p->next; + } + while (p != list->tasks); + return false; +} + +/* Tree version of priority_queue_task_in_list_p. */ + +static inline bool +priority_queue_task_in_tree_p (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task) +{ + struct priority_list *list + = priority_queue_lookup_priority (head, task->priority); + if (!list) + return false; + return priority_queue_task_in_list_p (type, list, task); +} + +/* Generic version of priority_queue_task_in_list_p that works for + trees or lists. */ + +bool +priority_queue_task_in_queue_p (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task) +{ + if (priority_queue_empty_p (head, MEMMODEL_RELAXED)) + return false; + if (priority_queue_multi_p (head)) + return priority_queue_task_in_tree_p (type, head, task); + else + return priority_queue_task_in_list_p (type, &head->l, task); +} + +/* Sanity check LIST to make sure the tasks therein are in the right + order. LIST is a priority list of type TYPE. + + The expected order is that GOMP_TASK_WAITING tasks come before + GOMP_TASK_TIED/GOMP_TASK_ASYNC_RUNNING ones. + + If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING + tasks come before !parent_depends_on WAITING tasks. This is only + applicable to the children queue, and the caller is expected to + ensure that we are verifying the children queue. */ + +static void +priority_list_verify (enum priority_queue_type type, + struct priority_list *list, bool check_deps) +{ + bool seen_tied = false; + bool seen_plain_waiting = false; + struct priority_node *p = list->tasks; + while (1) + { + struct gomp_task *t = priority_node_to_task (type, p); + if (seen_tied && t->kind == GOMP_TASK_WAITING) + gomp_fatal ("priority_queue_verify: WAITING task after TIED"); + if (t->kind >= GOMP_TASK_TIED) + seen_tied = true; + else if (check_deps && t->kind == GOMP_TASK_WAITING) + { + if (t->parent_depends_on) + { + if (seen_plain_waiting) + gomp_fatal ("priority_queue_verify: " + "parent_depends_on after !parent_depends_on"); + } + else + seen_plain_waiting = true; + } + p = p->next; + if (p == list->tasks) + break; + } +} + +/* Callback type for priority_tree_verify_callback. */ +struct cbtype +{ + enum priority_queue_type type; + bool check_deps; +}; + +/* Verify every task in NODE. + + Callback for splay_tree_foreach. */ + +static void +priority_tree_verify_callback (prio_splay_tree_key key, void *data) +{ + struct cbtype *cb = (struct cbtype *) data; + priority_list_verify (cb->type, &key->l, cb->check_deps); +} + +/* Generic version of priority_list_verify. + + Sanity check HEAD to make sure the tasks therein are in the right + order. The priority_queue holds tasks of type TYPE. + + If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING + tasks come before !parent_depends_on WAITING tasks. This is only + applicable to the children queue, and the caller is expected to + ensure that we are verifying the children queue. */ + +void +priority_queue_verify (enum priority_queue_type type, + struct priority_queue *head, bool check_deps) +{ + if (priority_queue_empty_p (head, MEMMODEL_RELAXED)) + return; + if (priority_queue_multi_p (head)) + { + struct cbtype cb = { type, check_deps }; + prio_splay_tree_foreach (&head->t, + priority_tree_verify_callback, &cb); + } + else + priority_list_verify (type, &head->l, check_deps); +} +#endif /* _LIBGOMP_CHECKING_ */ + +/* Remove NODE from priority queue HEAD, wherever it may be inside the + tree. HEAD contains tasks of type TYPE. */ + +void +priority_tree_remove (enum priority_queue_type type, + struct priority_queue *head, + struct priority_node *node) +{ + /* ?? The only reason this function is not inlined is because we + need to find the priority within gomp_task (which has not been + completely defined in the header file). If the lack of inlining + is a concern, we could pass the priority number as a + parameter, or we could move this to libgomp.h. */ + int priority = priority_node_to_task (type, node)->priority; + + /* ?? We could avoid this lookup by keeping a pointer to the key in + the priority_node. */ + struct priority_list *list + = priority_queue_lookup_priority (head, priority); +#if _LIBGOMP_CHECKING_ + if (!list) + gomp_fatal ("Unable to find priority %d", priority); +#endif + /* If NODE was the last in its priority, clean up the priority. */ + if (priority_list_remove (list, node, MEMMODEL_RELAXED)) + { + prio_splay_tree_remove (&head->t, (prio_splay_tree_key) list); + list->tasks = NULL; +#if _LIBGOMP_CHECKING_ + memset (list, 0xaf, sizeof (*list)); +#endif + free (list); + } +} + +/* Return the highest priority WAITING task in a splay tree NODE. If + there are no WAITING tasks available, return NULL. + + NODE is a priority list containing tasks of type TYPE. + + The right most node in a tree contains the highest priority. + Recurse down to find such a node. If the task at that max node is + not WAITING, bubble back up and look at the remaining tasks + in-order. */ + +static struct gomp_task * +priority_tree_next_task_1 (enum priority_queue_type type, + prio_splay_tree_node node) +{ + again: + if (!node) + return NULL; + struct gomp_task *ret = priority_tree_next_task_1 (type, node->right); + if (ret) + return ret; + ret = priority_node_to_task (type, node->key.l.tasks); + if (ret->kind == GOMP_TASK_WAITING) + return ret; + node = node->left; + goto again; +} + +/* Return the highest priority WAITING task from within Q1 and Q2, + while giving preference to tasks from Q1. Q1 is a queue containing + items of type TYPE1. Q2 is a queue containing items of type TYPE2. + + Since we are mostly interested in Q1, if there are no WAITING tasks + in Q1, we don't bother checking Q2, and just return NULL. + + As a special case, Q2 can be NULL, in which case, we just choose + the highest priority WAITING task in Q1. This is an optimization + to speed up looking through only one queue. + + If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to + TRUE, otherwise it is set to FALSE. */ + +struct gomp_task * +priority_tree_next_task (enum priority_queue_type type1, + struct priority_queue *q1, + enum priority_queue_type type2, + struct priority_queue *q2, + bool *q1_chosen_p) +{ + struct gomp_task *t1 = priority_tree_next_task_1 (type1, q1->t.root); + if (!t1 + /* Special optimization when only searching through one queue. */ + || !q2) + { + *q1_chosen_p = true; + return t1; + } + struct gomp_task *t2 = priority_tree_next_task_1 (type2, q2->t.root); + if (!t2 || t1->priority > t2->priority) + { + *q1_chosen_p = true; + return t1; + } + if (t2->priority > t1->priority) + { + *q1_chosen_p = false; + return t2; + } + /* If we get here, the priorities are the same, so we must look at + parent_depends_on to make our decision. */ +#if _LIBGOMP_CHECKING_ + if (t1 != t2) + gomp_fatal ("priority_tree_next_task: t1 != t2"); +#endif + if (t2->parent_depends_on && !t1->parent_depends_on) + { + *q1_chosen_p = false; + return t2; + } + *q1_chosen_p = true; + return t1; +} + +/* Priority splay trees comparison function. */ +static inline int +prio_splay_compare (prio_splay_tree_key x, prio_splay_tree_key y) +{ + if (x->l.priority == y->l.priority) + return 0; + return x->l.priority < y->l.priority ? -1 : 1; +} + +/* Define another splay tree instantiation, for priority_list's. */ +#define splay_tree_prefix prio +#define splay_tree_c +#include "splay-tree.h" diff --git a/libgomp/priority_queue.h b/libgomp/priority_queue.h new file mode 100644 index 0000000..e9c369b --- /dev/null +++ b/libgomp/priority_queue.h @@ -0,0 +1,485 @@ +/* Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by Aldy Hernandez <aldyh@redhat.com>. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* Header file for a priority queue of GOMP tasks. */ + +/* ?? Perhaps all the priority_tree_* functions are complex and rare + enough to go out-of-line and be moved to priority_queue.c. ?? */ + +#ifndef _PRIORITY_QUEUE_H_ +#define _PRIORITY_QUEUE_H_ + +/* One task. */ + +struct priority_node +{ + /* Next and previous chains in a circular doubly linked list for + tasks within this task's priority. */ + struct priority_node *next, *prev; +}; + +/* All tasks within the same priority. */ + +struct priority_list +{ + /* Priority of the tasks in this set. */ + int priority; + + /* Tasks. */ + struct priority_node *tasks; + + /* This points to the last of the higher priority WAITING tasks. + Remember that for the children queue, we have: + + parent_depends_on WAITING tasks. + !parent_depends_on WAITING tasks. + TIED tasks. + + This is a pointer to the last of the parent_depends_on WAITING + tasks which are essentially, higher priority items within their + priority. */ + struct priority_node *last_parent_depends_on; +}; + +/* Another splay tree instantiation, for priority_list's. */ +typedef struct prio_splay_tree_node_s *prio_splay_tree_node; +typedef struct prio_splay_tree_s *prio_splay_tree; +typedef struct prio_splay_tree_key_s *prio_splay_tree_key; +struct prio_splay_tree_key_s { + /* This structure must only containing a priority_list, as we cast + prio_splay_tree_key to priority_list throughout. */ + struct priority_list l; +}; +#define splay_tree_prefix prio +#include "splay-tree.h" + +/* The entry point into a priority queue of tasks. + + There are two alternate implementations with which to store tasks: + as a balanced tree of sorts, or as a simple list of tasks. If + there are only priority-0 items (ROOT is NULL), we use the simple + list, otherwise (ROOT is non-NULL) we use the tree. */ + +struct priority_queue +{ + /* If t.root != NULL, this is a splay tree of priority_lists to hold + all tasks. This is only used if multiple priorities are in play, + otherwise we use the priority_list `l' below to hold all + (priority-0) tasks. */ + struct prio_splay_tree_s t; + + /* If T above is NULL, only priority-0 items exist, so keep them + in a simple list. */ + struct priority_list l; +}; + +enum priority_insert_type { + /* Insert at the beginning of a priority list. */ + PRIORITY_INSERT_BEGIN, + /* Insert at the end of a priority list. */ + PRIORITY_INSERT_END +}; + +/* Used to determine in which queue a given priority node belongs in. + See pnode field of gomp_task. */ + +enum priority_queue_type +{ + PQ_TEAM, /* Node belongs in gomp_team's task_queue. */ + PQ_CHILDREN, /* Node belongs in parent's children_queue. */ + PQ_TASKGROUP, /* Node belongs in taskgroup->taskgroup_queue. */ + PQ_IGNORED = 999 +}; + +/* Priority queue implementation prototypes. */ + +extern bool priority_queue_task_in_queue_p (enum priority_queue_type, + struct priority_queue *, + struct gomp_task *); +extern void priority_queue_dump (enum priority_queue_type, + struct priority_queue *); +extern void priority_queue_verify (enum priority_queue_type, + struct priority_queue *, bool); +extern void priority_tree_remove (enum priority_queue_type, + struct priority_queue *, + struct priority_node *); +extern struct gomp_task *priority_tree_next_task (enum priority_queue_type, + struct priority_queue *, + enum priority_queue_type, + struct priority_queue *, + bool *); + +/* Return TRUE if there is more than one priority in HEAD. This is + used throughout to to choose between the fast path (priority 0 only + items) and a world with multiple priorities. */ + +static inline bool +priority_queue_multi_p (struct priority_queue *head) +{ + return __builtin_expect (head->t.root != NULL, 0); +} + +/* Initialize a priority queue. */ + +static inline void +priority_queue_init (struct priority_queue *head) +{ + head->t.root = NULL; + /* To save a few microseconds, we don't initialize head->l.priority + to 0 here. It is implied that priority will be 0 if head->t.root + == NULL. + + priority_tree_insert() will fix this when we encounter multiple + priorities. */ + head->l.tasks = NULL; + head->l.last_parent_depends_on = NULL; +} + +static inline void +priority_queue_free (struct priority_queue *head) +{ + /* There's nothing to do, as tasks were freed as they were removed + in priority_queue_remove. */ +} + +/* Forward declarations. */ +static inline size_t priority_queue_offset (enum priority_queue_type); +static inline struct gomp_task *priority_node_to_task + (enum priority_queue_type, + struct priority_node *); +static inline struct priority_node *task_to_priority_node + (enum priority_queue_type, + struct gomp_task *); + +/* Return TRUE if priority queue HEAD is empty. + + MODEL IS MEMMODEL_ACQUIRE if we should use an acquire atomic to + read from the root of the queue, otherwise MEMMODEL_RELAXED if we + should use a plain load. */ + +static inline _Bool +priority_queue_empty_p (struct priority_queue *head, enum memmodel model) +{ + /* Note: The acquire barriers on the loads here synchronize with + the write of a NULL in gomp_task_run_post_remove_parent. It is + not necessary that we synchronize with other non-NULL writes at + this point, but we must ensure that all writes to memory by a + child thread task work function are seen before we exit from + GOMP_taskwait. */ + if (priority_queue_multi_p (head)) + { + if (model == MEMMODEL_ACQUIRE) + return __atomic_load_n (&head->t.root, MEMMODEL_ACQUIRE) == NULL; + return head->t.root == NULL; + } + if (model == MEMMODEL_ACQUIRE) + return __atomic_load_n (&head->l.tasks, MEMMODEL_ACQUIRE) == NULL; + return head->l.tasks == NULL; +} + +/* Look for a given PRIORITY in HEAD. Return it if found, otherwise + return NULL. This only applies to the tree variant in HEAD. There + is no point in searching for priorities in HEAD->L. */ + +static inline struct priority_list * +priority_queue_lookup_priority (struct priority_queue *head, int priority) +{ + if (head->t.root == NULL) + return NULL; + struct prio_splay_tree_key_s k; + k.l.priority = priority; + return (struct priority_list *) + prio_splay_tree_lookup (&head->t, &k); +} + +/* Insert task in DATA, with PRIORITY, in the priority list in LIST. + LIST contains items of type TYPE. + + If POS is PRIORITY_INSERT_BEGIN, the new task is inserted at the + top of its respective priority. If POS is PRIORITY_INSERT_END, the + task is inserted at the end of its priority. + + If ADJUST_PARENT_DEPENDS_ON is TRUE, LIST is a children queue, and + we must keep track of higher and lower priority WAITING tasks by + keeping the queue's last_parent_depends_on field accurate. This + only applies to the children queue, and the caller must ensure LIST + is a children queue in this case. + + If ADJUST_PARENT_DEPENDS_ON is TRUE, TASK_IS_PARENT_DEPENDS_ON is + set to the task's parent_depends_on field. If + ADJUST_PARENT_DEPENDS_ON is FALSE, this field is irrelevant. + + Return the new priority_node. */ + +static inline void +priority_list_insert (enum priority_queue_type type, + struct priority_list *list, + struct gomp_task *task, + int priority, + enum priority_insert_type pos, + bool adjust_parent_depends_on, + bool task_is_parent_depends_on) +{ + struct priority_node *node = task_to_priority_node (type, task); + if (list->tasks) + { + /* If we are keeping track of higher/lower priority items, + but this is a lower priority WAITING task + (parent_depends_on != NULL), put it after all ready to + run tasks. See the comment in + priority_queue_upgrade_task for a visual on how tasks + should be organized. */ + if (adjust_parent_depends_on + && pos == PRIORITY_INSERT_BEGIN + && list->last_parent_depends_on + && !task_is_parent_depends_on) + { + struct priority_node *last_parent_depends_on + = list->last_parent_depends_on; + node->next = last_parent_depends_on->next; + node->prev = last_parent_depends_on; + } + /* Otherwise, put it at the top/bottom of the queue. */ + else + { + node->next = list->tasks; + node->prev = list->tasks->prev; + if (pos == PRIORITY_INSERT_BEGIN) + list->tasks = node; + } + node->next->prev = node; + node->prev->next = node; + } + else + { + node->next = node; + node->prev = node; + list->tasks = node; + } + if (adjust_parent_depends_on + && list->last_parent_depends_on == NULL + && task_is_parent_depends_on) + list->last_parent_depends_on = node; +} + +/* Tree version of priority_list_insert. */ + +static inline void +priority_tree_insert (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task, + int priority, + enum priority_insert_type pos, + bool adjust_parent_depends_on, + bool task_is_parent_depends_on) +{ + if (__builtin_expect (head->t.root == NULL, 0)) + { + /* The first time around, transfer any priority 0 items to the + tree. */ + if (head->l.tasks != NULL) + { + prio_splay_tree_node k = gomp_malloc (sizeof (*k)); + k->left = NULL; + k->right = NULL; + k->key.l.priority = 0; + k->key.l.tasks = head->l.tasks; + k->key.l.last_parent_depends_on = head->l.last_parent_depends_on; + prio_splay_tree_insert (&head->t, k); + head->l.tasks = NULL; + } + } + struct priority_list *list + = priority_queue_lookup_priority (head, priority); + if (!list) + { + prio_splay_tree_node k = gomp_malloc (sizeof (*k)); + k->left = NULL; + k->right = NULL; + k->key.l.priority = priority; + k->key.l.tasks = NULL; + k->key.l.last_parent_depends_on = NULL; + prio_splay_tree_insert (&head->t, k); + list = &k->key.l; + } + priority_list_insert (type, list, task, priority, pos, + adjust_parent_depends_on, + task_is_parent_depends_on); +} + +/* Generic version of priority_*_insert. */ + +static inline void +priority_queue_insert (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task, + int priority, + enum priority_insert_type pos, + bool adjust_parent_depends_on, + bool task_is_parent_depends_on) +{ +#if _LIBGOMP_CHECKING_ + if (priority_queue_task_in_queue_p (type, head, task)) + gomp_fatal ("Attempt to insert existing task %p", task); +#endif + if (priority_queue_multi_p (head) || __builtin_expect (priority > 0, 0)) + priority_tree_insert (type, head, task, priority, pos, + adjust_parent_depends_on, + task_is_parent_depends_on); + else + priority_list_insert (type, &head->l, task, priority, pos, + adjust_parent_depends_on, + task_is_parent_depends_on); +} + +/* If multiple priorities are in play, return the highest priority + task from within Q1 and Q2, while giving preference to tasks from + Q1. If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to + TRUE, otherwise it is set to FALSE. + + If multiple priorities are not in play (only 0 priorities are + available), the next task is chosen exclusively from Q1. + + As a special case, Q2 can be NULL, in which case, we just choose + the highest priority WAITING task in Q1. This is an optimization + to speed up looking through only one queue. + + We assume Q1 has at least one item. */ + +static inline struct gomp_task * +priority_queue_next_task (enum priority_queue_type t1, + struct priority_queue *q1, + enum priority_queue_type t2, + struct priority_queue *q2, + bool *q1_chosen_p) +{ +#if _LIBGOMP_CHECKING_ + if (priority_queue_empty_p (q1, MEMMODEL_RELAXED)) + gomp_fatal ("priority_queue_next_task: Q1 is empty"); +#endif + if (priority_queue_multi_p (q1)) + { + struct gomp_task *t + = priority_tree_next_task (t1, q1, t2, q2, q1_chosen_p); + /* If T is NULL, there are no WAITING tasks in Q1. In which + case, return any old (non-waiting) task which will cause the + caller to do the right thing when checking T->KIND == + GOMP_TASK_WAITING. */ + if (!t) + { +#if _LIBGOMP_CHECKING_ + if (*q1_chosen_p == false) + gomp_fatal ("priority_queue_next_task inconsistency"); +#endif + return priority_node_to_task (t1, q1->t.root->key.l.tasks); + } + return t; + } + else + { + *q1_chosen_p = true; + return priority_node_to_task (t1, q1->l.tasks); + } +} + +/* Remove NODE from LIST. + + If we are removing the one and only item in the list, and MODEL is + MEMMODEL_RELEASE, use an atomic release to clear the list. + + If the list becomes empty after the remove, return TRUE. */ + +static inline bool +priority_list_remove (struct priority_list *list, + struct priority_node *node, + enum memmodel model) +{ + bool empty = false; + node->prev->next = node->next; + node->next->prev = node->prev; + if (list->tasks == node) + { + if (node->next != node) + list->tasks = node->next; + else + { + /* We access task->children in GOMP_taskwait outside of + the task lock mutex region, so need a release barrier + here to ensure memory written by child_task->fn above + is flushed before the NULL is written. */ + if (model == MEMMODEL_RELEASE) + __atomic_store_n (&list->tasks, NULL, MEMMODEL_RELEASE); + else + list->tasks = NULL; + empty = true; + goto remove_out; + } + } +remove_out: +#if _LIBGOMP_CHECKING_ + memset (node, 0xaf, sizeof (*node)); +#endif + return empty; +} + +/* This is the generic version of priority_list_remove. + + Remove NODE from priority queue HEAD. HEAD contains tasks of type TYPE. + + If we are removing the one and only item in the priority queue and + MODEL is MEMMODEL_RELEASE, use an atomic release to clear the queue. + + If the queue becomes empty after the remove, return TRUE. */ + +static inline bool +priority_queue_remove (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task, + enum memmodel model) +{ +#if _LIBGOMP_CHECKING_ + if (!priority_queue_task_in_queue_p (type, head, task)) + gomp_fatal ("Attempt to remove missing task %p", task); +#endif + if (priority_queue_multi_p (head)) + { + priority_tree_remove (type, head, task_to_priority_node (type, task)); + if (head->t.root == NULL) + { + if (model == MEMMODEL_RELEASE) + /* Errr, we store NULL twice, the alternative would be to + use an atomic release directly in the splay tree + routines. Worth it? */ + __atomic_store_n (&head->t.root, NULL, MEMMODEL_RELEASE); + return true; + } + return false; + } + else + return priority_list_remove (&head->l, + task_to_priority_node (type, task), model); +} + +#endif /* _PRIORITY_QUEUE_H_ */ diff --git a/libgomp/splay-tree.c b/libgomp/splay-tree.c index 030ca8f..862bbb8 100644 --- a/libgomp/splay-tree.c +++ b/libgomp/splay-tree.c @@ -37,9 +37,6 @@ are amortized O(log n) time for a tree with n nodes. */ #include "libgomp.h" -#include "splay-tree.h" - -extern int splay_compare (splay_tree_key, splay_tree_key); /* Rotate the edge joining the left child N with its parent P. PP is the grandparents' pointer to P. */ @@ -215,3 +212,27 @@ splay_tree_lookup (splay_tree sp, splay_tree_key key) else return NULL; } + +/* Helper function for splay_tree_foreach. + + Run FUNC on every node in KEY. */ + +static void +splay_tree_foreach_internal (splay_tree_node node, splay_tree_callback func, + void *data) +{ + if (!node) + return; + func (&node->key, data); + splay_tree_foreach_internal (node->left, func, data); + /* Yeah, whatever. GCC can fix my tail recursion. */ + splay_tree_foreach_internal (node->right, func, data); +} + +/* Run FUNC on each of the nodes in SP. */ + +attribute_hidden void +splay_tree_foreach (splay_tree sp, splay_tree_callback func, void *data) +{ + splay_tree_foreach_internal (sp->root, func, data); +} diff --git a/libgomp/splay-tree.h b/libgomp/splay-tree.h index 085021c..92c51bf 100644 --- a/libgomp/splay-tree.h +++ b/libgomp/splay-tree.h @@ -33,7 +33,17 @@ typedef struct splay_tree_node_s *splay_tree_node; typedef struct splay_tree_s *splay_tree; typedef struct splay_tree_key_s *splay_tree_key; define splay_tree_key_s structure, and define - splay_compare inline function. */ + splay_compare inline function. + + Alternatively, they can define splay_tree_prefix macro before + including this header and then all the above types, the + splay_compare function and the splay_tree_{lookup,insert_remove} + function will be prefixed by that prefix. If splay_tree_prefix + macro is defined, this header must be included twice: once where + you need the header file definitions, and once where you need the + .c implementation routines. In the latter case, you must also + define the macro splay_tree_c. See the include of splay-tree.h in + priority_queue.[hc] for an example. */ /* For an easily readable description of splay-trees, see: @@ -43,8 +53,37 @@ typedef struct splay_tree_key_s *splay_tree_key; The major feature of splay trees is that all basic tree operations are amortized O(log n) time for a tree with n nodes. */ -#ifndef _SPLAY_TREE_H -#define _SPLAY_TREE_H 1 +#ifdef splay_tree_prefix +# define splay_tree_name_1(prefix, name) prefix ## _ ## name +# define splay_tree_name(prefix, name) splay_tree_name_1 (prefix, name) +# define splay_tree_node_s \ + splay_tree_name (splay_tree_prefix, splay_tree_node_s) +# define splay_tree_s \ + splay_tree_name (splay_tree_prefix, splay_tree_s) +# define splay_tree_key_s \ + splay_tree_name (splay_tree_prefix, splay_tree_key_s) +# define splay_tree_node \ + splay_tree_name (splay_tree_prefix, splay_tree_node) +# define splay_tree \ + splay_tree_name (splay_tree_prefix, splay_tree) +# define splay_tree_key \ + splay_tree_name (splay_tree_prefix, splay_tree_key) +# define splay_compare \ + splay_tree_name (splay_tree_prefix, splay_compare) +# define splay_tree_lookup \ + splay_tree_name (splay_tree_prefix, splay_tree_lookup) +# define splay_tree_insert \ + splay_tree_name (splay_tree_prefix, splay_tree_insert) +# define splay_tree_remove \ + splay_tree_name (splay_tree_prefix, splay_tree_remove) +# define splay_tree_foreach \ + splay_tree_name (splay_tree_prefix, splay_tree_foreach) +# define splay_tree_callback \ + splay_tree_name (splay_tree_prefix, splay_tree_callback) +#endif + +#ifndef splay_tree_c +/* Header file definitions and prototypes. */ /* The nodes in the splay tree. */ struct splay_tree_node_s { @@ -59,8 +98,33 @@ struct splay_tree_s { splay_tree_node root; }; +typedef void (*splay_tree_callback) (splay_tree_key, void *); + extern splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key); extern void splay_tree_insert (splay_tree, splay_tree_node); extern void splay_tree_remove (splay_tree, splay_tree_key); +extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *); +#else /* splay_tree_c */ +# ifdef splay_tree_prefix +# include "splay-tree.c" +# undef splay_tree_name_1 +# undef splay_tree_name +# undef splay_tree_node_s +# undef splay_tree_s +# undef splay_tree_key_s +# undef splay_tree_node +# undef splay_tree +# undef splay_tree_key +# undef splay_compare +# undef splay_tree_lookup +# undef splay_tree_insert +# undef splay_tree_remove +# undef splay_tree_foreach +# undef splay_tree_callback +# undef splay_tree_c +# endif +#endif /* #ifndef splay_tree_c */ -#endif /* _SPLAY_TREE_H */ +#ifdef splay_tree_prefix +# undef splay_tree_prefix +#endif diff --git a/libgomp/target.c b/libgomp/target.c index 1bddc6f..cf9d0e6 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -92,23 +92,6 @@ gomp_realloc_unlock (void *old, size_t size) return ret; } -/* The comparison function. */ - -attribute_hidden int -splay_compare (splay_tree_key x, splay_tree_key y) -{ - if (x->host_start == x->host_end - && y->host_start == y->host_end) - return 0; - if (x->host_end <= y->host_start) - return -1; - if (x->host_start >= y->host_end) - return 1; - return 0; -} - -#include "splay-tree.h" - attribute_hidden void gomp_init_targets_once (void) { @@ -1365,17 +1348,7 @@ GOMP_target (int device, void (*fn) (void *), const void *unused, struct target_mem_desc *tgt_vars = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false, GOMP_MAP_VARS_TARGET); - struct gomp_thread old_thr, *thr = gomp_thread (); - old_thr = *thr; - memset (thr, '\0', sizeof (*thr)); - if (gomp_places_list) - { - thr->place = old_thr.place; - thr->ts.place_partition_len = gomp_places_list_len; - } devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start); - gomp_free_thread (thr); - *thr = old_thr; gomp_unmap_vars (tgt_vars, true); } @@ -1404,10 +1377,52 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum, (void) num_teams; (void) thread_limit; - /* If there are depend clauses, but nowait is not present, - block the parent task until the dependencies are resolved - and then just continue with the rest of the function as if it - is a merged task. */ + if (flags & GOMP_TARGET_FLAG_NOWAIT) + { + struct gomp_thread *thr = gomp_thread (); + /* Create a team if we don't have any around, as nowait + target tasks make sense to run asynchronously even when + outside of any parallel. */ + if (__builtin_expect (thr->ts.team == NULL, 0)) + { + struct gomp_team *team = gomp_new_team (1); + struct gomp_task *task = thr->task; + struct gomp_task_icv *icv = task ? &task->icv : &gomp_global_icv; + team->prev_ts = thr->ts; + thr->ts.team = team; + thr->ts.team_id = 0; + thr->ts.work_share = &team->work_shares[0]; + thr->ts.last_work_share = NULL; +#ifdef HAVE_SYNC_BUILTINS + thr->ts.single_count = 0; +#endif + thr->ts.static_trip = 0; + thr->task = &team->implicit_task[0]; + gomp_init_task (thr->task, NULL, icv); + if (task) + { + thr->task = task; + gomp_end_task (); + free (task); + thr->task = &team->implicit_task[0]; + } + else + pthread_setspecific (gomp_thread_destructor, thr); + } + if (thr->ts.team + && !thr->task->final_task) + { + gomp_create_target_task (devicep, fn, mapnum, hostaddrs, + sizes, kinds, flags, depend, + GOMP_TARGET_TASK_BEFORE_MAP); + return; + } + } + + /* If there are depend clauses, but nowait is not present + (or we are in a final task), block the parent task until the + dependencies are resolved and then just continue with the rest + of the function as if it is a merged task. */ if (depend != NULL) { struct gomp_thread *thr = gomp_thread (); @@ -1427,17 +1442,7 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum, struct target_mem_desc *tgt_vars = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true, GOMP_MAP_VARS_TARGET); - struct gomp_thread old_thr, *thr = gomp_thread (); - old_thr = *thr; - memset (thr, '\0', sizeof (*thr)); - if (gomp_places_list) - { - thr->place = old_thr.place; - thr->ts.place_partition_len = gomp_places_list_len; - } devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start); - gomp_free_thread (thr); - *thr = old_thr; gomp_unmap_vars (tgt_vars, true); } @@ -1544,23 +1549,25 @@ GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs, && thr->ts.team && !thr->task->final_task) { - gomp_create_target_task (devicep, (void (*) (void *)) NULL, - mapnum, hostaddrs, sizes, kinds, - flags | GOMP_TARGET_FLAG_UPDATE, - depend); - return; + if (gomp_create_target_task (devicep, (void (*) (void *)) NULL, + mapnum, hostaddrs, sizes, kinds, + flags | GOMP_TARGET_FLAG_UPDATE, + depend, GOMP_TARGET_TASK_DATA)) + return; + } + else + { + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ + if (team + && (gomp_team_barrier_cancelled (&team->barrier) + || (thr->task->taskgroup + && thr->task->taskgroup->cancelled))) + return; + + gomp_task_maybe_wait_for_dependencies (depend); } - - struct gomp_team *team = thr->ts.team; - /* If parallel or taskgroup has been cancelled, don't start new - tasks. */ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup - && thr->task->taskgroup->cancelled))) - return; - - gomp_task_maybe_wait_for_dependencies (depend); } } @@ -1664,22 +1671,25 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs, && thr->ts.team && !thr->task->final_task) { - gomp_create_target_task (devicep, (void (*) (void *)) NULL, - mapnum, hostaddrs, sizes, kinds, - flags, depend); - return; + if (gomp_create_target_task (devicep, (void (*) (void *)) NULL, + mapnum, hostaddrs, sizes, kinds, + flags, depend, + GOMP_TARGET_TASK_DATA)) + return; + } + else + { + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ + if (team + && (gomp_team_barrier_cancelled (&team->barrier) + || (thr->task->taskgroup + && thr->task->taskgroup->cancelled))) + return; + + gomp_task_maybe_wait_for_dependencies (depend); } - - struct gomp_team *team = thr->ts.team; - /* If parallel or taskgroup has been cancelled, don't start new - tasks. */ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup - && thr->task->taskgroup->cancelled))) - return; - - gomp_task_maybe_wait_for_dependencies (depend); } } @@ -1711,38 +1721,65 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs, gomp_exit_data (devicep, mapnum, hostaddrs, sizes, kinds); } -void +bool gomp_target_task_fn (void *data) { struct gomp_target_task *ttask = (struct gomp_target_task *) data; + struct gomp_device_descr *devicep = ttask->devicep; + if (ttask->fn != NULL) { - /* GOMP_target_ext */ + if (devicep == NULL + || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + { + ttask->state = GOMP_TARGET_TASK_FALLBACK; + gomp_target_fallback_firstprivate (ttask->fn, ttask->mapnum, + ttask->hostaddrs, ttask->sizes, + ttask->kinds); + return false; + } + + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + { + gomp_unmap_vars (ttask->tgt, true); + return false; + } + + void *fn_addr = gomp_get_target_fn_addr (devicep, ttask->fn); + ttask->tgt + = gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL, + ttask->sizes, ttask->kinds, true, + GOMP_MAP_VARS_TARGET); + ttask->state = GOMP_TARGET_TASK_READY_TO_RUN; + + devicep->async_run_func (devicep->target_id, fn_addr, + (void *) ttask->tgt->tgt_start, (void *) ttask); + return true; } - else if (ttask->devicep == NULL - || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) - return; + else if (devicep == NULL + || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + return false; size_t i; if (ttask->flags & GOMP_TARGET_FLAG_UPDATE) - gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes, + gomp_update (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes, ttask->kinds, true); else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0) for (i = 0; i < ttask->mapnum; i++) if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT) { - gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1, - &ttask->hostaddrs[i], NULL, &ttask->sizes[i], - &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA); + gomp_map_vars (devicep, ttask->sizes[i] + 1, &ttask->hostaddrs[i], + NULL, &ttask->sizes[i], &ttask->kinds[i], true, + GOMP_MAP_VARS_ENTER_DATA); i += ttask->sizes[i]; } else - gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL, - &ttask->sizes[i], &ttask->kinds[i], - true, GOMP_MAP_VARS_ENTER_DATA); + gomp_map_vars (devicep, 1, &ttask->hostaddrs[i], NULL, &ttask->sizes[i], + &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA); else - gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs, - ttask->sizes, ttask->kinds); + gomp_exit_data (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes, + ttask->kinds); + return false; } void @@ -2187,6 +2224,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) { DLSYM (run); + DLSYM (async_run); DLSYM (dev2dev); } if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200) diff --git a/libgomp/task.c b/libgomp/task.c index 1246c6a..541008d 100644 --- a/libgomp/task.c +++ b/libgomp/task.c @@ -65,6 +65,14 @@ void gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task, struct gomp_task_icv *prev_icv) { + /* It would seem that using memset here would be a win, but it turns + out that partially filling gomp_task allows us to keep the + overhead of task creation low. In the nqueens-1.c test, for a + sufficiently large N, we drop the overhead from 5-6% to 1%. + + Note, the nqueens-1.c test in serial mode is a good test to + benchmark the overhead of creating tasks as there are millions of + tiny tasks created that all run undeferred. */ task->parent = parent_task; task->icv = *prev_icv; task->kind = GOMP_TASK_IMPLICIT; @@ -73,7 +81,7 @@ gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task, task->final_task = false; task->copy_ctors_done = false; task->parent_depends_on = false; - task->children = NULL; + priority_queue_init (&task->children_queue); task->taskgroup = NULL; task->dependers = NULL; task->depend_hash = NULL; @@ -92,24 +100,66 @@ gomp_end_task (void) thr->task = task->parent; } -/* Orphan the task in CHILDREN and all its siblings. */ +/* Clear the parent field of every task in LIST. */ static inline void -gomp_clear_parent (struct gomp_task *children) +gomp_clear_parent_in_list (struct priority_list *list) { - struct gomp_task *task = children; - - if (task) + struct priority_node *p = list->tasks; + if (p) do { - task->parent = NULL; - task = task->next_child; + priority_node_to_task (PQ_CHILDREN, p)->parent = NULL; + p = p->next; } - while (task != children); + while (p != list->tasks); } -/* Helper function for GOMP_task and gomp_create_target_task. Depend clause - handling for undeferred task creation. */ +/* Splay tree version of gomp_clear_parent_in_list. + + Clear the parent field of every task in NODE within SP, and free + the node when done. */ + +static void +gomp_clear_parent_in_tree (prio_splay_tree sp, prio_splay_tree_node node) +{ + if (!node) + return; + prio_splay_tree_node left = node->left, right = node->right; + gomp_clear_parent_in_list (&node->key.l); +#if _LIBGOMP_CHECKING_ + memset (node, 0xaf, sizeof (*node)); +#endif + /* No need to remove the node from the tree. We're nuking + everything, so just free the nodes and our caller can clear the + entire splay tree. */ + free (node); + gomp_clear_parent_in_tree (sp, left); + gomp_clear_parent_in_tree (sp, right); +} + +/* Clear the parent field of every task in Q and remove every task + from Q. */ + +static inline void +gomp_clear_parent (struct priority_queue *q) +{ + if (priority_queue_multi_p (q)) + { + gomp_clear_parent_in_tree (&q->t, q->t.root); + /* All the nodes have been cleared in gomp_clear_parent_in_tree. + No need to remove anything. We can just nuke everything. */ + q->t.root = NULL; + } + else + gomp_clear_parent_in_list (&q->l); +} + +/* Helper function for GOMP_task and gomp_create_target_task. + + For a TASK with in/out dependencies, fill in the various dependency + queues. PARENT is the parent of said task. DEPEND is as in + GOMP_task. */ static void gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent, @@ -260,8 +310,8 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0) priority = 0; - /* FIXME, use priority. */ - (void) priority; + else if (priority > gomp_max_task_priority_var) + priority = gomp_max_task_priority_var; if (!if_clause || team == NULL || (thr->task && thr->task->final_task) @@ -283,6 +333,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), task.kind = GOMP_TASK_UNDEFERRED; task.final_task = (thr->task && thr->task->final_task) || (flags & GOMP_TASK_FLAG_FINAL); + task.priority = priority; if (thr->task) { task.in_tied_task = thr->task->in_tied_task; @@ -308,10 +359,10 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), child thread, but seeing a stale non-NULL value is not a problem. Once past the task_lock acquisition, this thread will see the real value of task.children. */ - if (task.children != NULL) + if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED)) { gomp_mutex_lock (&team->task_lock); - gomp_clear_parent (task.children); + gomp_clear_parent (&task.children_queue); gomp_mutex_unlock (&team->task_lock); } gomp_end_task (); @@ -333,6 +384,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1) & ~(uintptr_t) (arg_align - 1)); gomp_init_task (task, parent, gomp_icv (false)); + task->priority = priority; task->kind = GOMP_TASK_UNDEFERRED; task->in_tied_task = parent->in_tied_task; task->taskgroup = taskgroup; @@ -368,53 +420,36 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), gomp_task_handle_depend (task, parent, depend); if (task->num_dependees) { + /* Tasks that depend on other tasks are not put into the + various waiting queues, so we are done for now. Said + tasks are instead put into the queues via + gomp_task_run_post_handle_dependers() after their + dependencies have been satisfied. After which, they + can be picked up by the various scheduling + points. */ gomp_mutex_unlock (&team->task_lock); return; } } - if (parent->children) - { - task->next_child = parent->children; - task->prev_child = parent->children->prev_child; - task->next_child->prev_child = task; - task->prev_child->next_child = task; - } - else - { - task->next_child = task; - task->prev_child = task; - } - parent->children = task; + + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, + task, priority, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); if (taskgroup) - { - /* If applicable, place task into its taskgroup. */ - if (taskgroup->children) - { - task->next_taskgroup = taskgroup->children; - task->prev_taskgroup = taskgroup->children->prev_taskgroup; - task->next_taskgroup->prev_taskgroup = task; - task->prev_taskgroup->next_taskgroup = task; - } - else - { - task->next_taskgroup = task; - task->prev_taskgroup = task; - } - taskgroup->children = task; - } - if (team->task_queue) - { - task->next_queue = team->task_queue; - task->prev_queue = team->task_queue->prev_queue; - task->next_queue->prev_queue = task; - task->prev_queue->next_queue = task; - } - else - { - task->next_queue = task; - task->prev_queue = task; - team->task_queue = task; - } + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task, priority, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + + priority_queue_insert (PQ_TEAM, &team->task_queue, + task, priority, + PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + ++team->task_count; ++team->task_queued_count; gomp_team_barrier_set_task_pending (&team->barrier); @@ -445,13 +480,119 @@ ialias (GOMP_taskgroup_end) #undef UTYPE #undef GOMP_taskloop -/* Called for nowait target tasks. */ +static void inline +priority_queue_move_task_first (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task) +{ +#if _LIBGOMP_CHECKING_ + if (!priority_queue_task_in_queue_p (type, head, task)) + gomp_fatal ("Attempt to move first missing task %p", task); +#endif + struct priority_list *list; + if (priority_queue_multi_p (head)) + { + list = priority_queue_lookup_priority (head, task->priority); +#if _LIBGOMP_CHECKING_ + if (!list) + gomp_fatal ("Unable to find priority %d", task->priority); +#endif + } + else + list = &head->l; + priority_list_remove (list, task_to_priority_node (type, task), 0); + priority_list_insert (type, list, task, task->priority, + PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN, + task->parent_depends_on); +} + +/* Actual body of GOMP_PLUGIN_target_task_completion that is executed + with team->task_lock held, or is executed in the thread that called + gomp_target_task_fn if GOMP_PLUGIN_target_task_completion has been + run before it acquires team->task_lock. */ + +static void +gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task) +{ + struct gomp_task *parent = task->parent; + if (parent) + priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue, + task); + + struct gomp_taskgroup *taskgroup = task->taskgroup; + if (taskgroup) + priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task); + + priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority, + PRIORITY_INSERT_BEGIN, false, + task->parent_depends_on); + task->kind = GOMP_TASK_WAITING; + if (parent && parent->taskwait) + { + if (parent->taskwait->in_taskwait) + { + /* One more task has had its dependencies met. + Inform any waiters. */ + parent->taskwait->in_taskwait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } + else if (parent->taskwait->in_depend_wait) + { + /* One more task has had its dependencies met. + Inform any waiters. */ + parent->taskwait->in_depend_wait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } + } + if (taskgroup && taskgroup->in_taskgroup_wait) + { + /* One more task has had its dependencies met. + Inform any waiters. */ + taskgroup->in_taskgroup_wait = false; + gomp_sem_post (&taskgroup->taskgroup_sem); + } + + ++team->task_queued_count; + gomp_team_barrier_set_task_pending (&team->barrier); + /* I'm afraid this can't be done after releasing team->task_lock, + as gomp_target_task_completion is run from unrelated thread and + therefore in between gomp_mutex_unlock and gomp_team_barrier_wake + the team could be gone already. */ + if (team->nthreads > team->task_running_count) + gomp_team_barrier_wake (&team->barrier, 1); +} + +/* Signal that a target task TTASK has completed the asynchronously + running phase and should be requeued as a task to handle the + variable unmapping. */ void +GOMP_PLUGIN_target_task_completion (void *data) +{ + struct gomp_target_task *ttask = (struct gomp_target_task *) data; + struct gomp_task *task = ttask->task; + struct gomp_team *team = ttask->team; + + gomp_mutex_lock (&team->task_lock); + if (ttask->state == GOMP_TARGET_TASK_READY_TO_RUN) + { + ttask->state = GOMP_TARGET_TASK_FINISHED; + gomp_mutex_unlock (&team->task_lock); + } + ttask->state = GOMP_TARGET_TASK_FINISHED; + gomp_target_task_completion (team, task); + gomp_mutex_unlock (&team->task_lock); +} + +/* Called for nowait target tasks. */ + +bool gomp_create_target_task (struct gomp_device_descr *devicep, void (*fn) (void *), size_t mapnum, void **hostaddrs, size_t *sizes, unsigned short *kinds, - unsigned int flags, void **depend) + unsigned int flags, void **depend, + enum gomp_target_task_state state) { struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; @@ -460,7 +601,7 @@ gomp_create_target_task (struct gomp_device_descr *devicep, if (team && (gomp_team_barrier_cancelled (&team->barrier) || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) - return; + return true; struct gomp_target_task *ttask; struct gomp_task *task; @@ -468,19 +609,45 @@ gomp_create_target_task (struct gomp_device_descr *devicep, struct gomp_taskgroup *taskgroup = parent->taskgroup; bool do_wake; size_t depend_size = 0; + uintptr_t depend_cnt = 0; + size_t tgt_align = 0, tgt_size = 0; if (depend != NULL) - depend_size = ((uintptr_t) depend[0] - * sizeof (struct gomp_task_depend_entry)); + { + depend_cnt = (uintptr_t) depend[0]; + depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry); + } + if (fn) + { + /* GOMP_MAP_FIRSTPRIVATE need to be copied first, as they are + firstprivate on the target task. */ + size_t i; + for (i = 0; i < mapnum; i++) + if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE) + { + size_t align = (size_t) 1 << (kinds[i] >> 8); + if (tgt_align < align) + tgt_align = align; + tgt_size = (tgt_size + align - 1) & ~(align - 1); + tgt_size += sizes[i]; + } + if (tgt_align) + tgt_size += tgt_align - 1; + else + tgt_size = 0; + } + task = gomp_malloc (sizeof (*task) + depend_size + sizeof (*ttask) + mapnum * (sizeof (void *) + sizeof (size_t) - + sizeof (unsigned short))); + + sizeof (unsigned short)) + + tgt_size); gomp_init_task (task, parent, gomp_icv (false)); + task->priority = 0; task->kind = GOMP_TASK_WAITING; task->in_tied_task = parent->in_tied_task; task->taskgroup = taskgroup; - ttask = (struct gomp_target_task *) &task->depend[(uintptr_t) depend[0]]; + ttask = (struct gomp_target_task *) &task->depend[depend_cnt]; ttask->devicep = devicep; ttask->fn = fn; ttask->mapnum = mapnum; @@ -489,8 +656,29 @@ gomp_create_target_task (struct gomp_device_descr *devicep, memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t)); ttask->kinds = (unsigned short *) &ttask->sizes[mapnum]; memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short)); + if (tgt_align) + { + char *tgt = (char *) &ttask->kinds[mapnum]; + size_t i; + uintptr_t al = (uintptr_t) tgt & (tgt_align - 1); + if (al) + tgt += tgt_align - al; + tgt_size = 0; + for (i = 0; i < mapnum; i++) + if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE) + { + size_t align = (size_t) 1 << (kinds[i] >> 8); + tgt_size = (tgt_size + align - 1) & ~(align - 1); + memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]); + ttask->hostaddrs[i] = tgt + tgt_size; + tgt_size = tgt_size + sizes[i]; + } + } ttask->flags = flags; - task->fn = gomp_target_task_fn; + ttask->state = state; + ttask->task = task; + ttask->team = team; + task->fn = NULL; task->fn_data = ttask; task->final_task = 0; gomp_mutex_lock (&team->task_lock); @@ -501,62 +689,78 @@ gomp_create_target_task (struct gomp_device_descr *devicep, gomp_mutex_unlock (&team->task_lock); gomp_finish_task (task); free (task); - return; + return true; } - if (taskgroup) - taskgroup->num_children++; if (depend_size) { gomp_task_handle_depend (task, parent, depend); if (task->num_dependees) { + if (taskgroup) + taskgroup->num_children++; gomp_mutex_unlock (&team->task_lock); - return; + return true; } } - if (parent->children) - { - task->next_child = parent->children; - task->prev_child = parent->children->prev_child; - task->next_child->prev_child = task; - task->prev_child->next_child = task; - } - else + if (state == GOMP_TARGET_TASK_DATA) { - task->next_child = task; - task->prev_child = task; + gomp_mutex_unlock (&team->task_lock); + gomp_finish_task (task); + free (task); + return false; } - parent->children = task; if (taskgroup) + taskgroup->num_children++; + /* For async offloading, if we don't need to wait for dependencies, + run the gomp_target_task_fn right away, essentially schedule the + mapping part of the task in the current thread. */ + if (devicep != NULL + && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) { - /* If applicable, place task into its taskgroup. */ - if (taskgroup->children) - { - task->next_taskgroup = taskgroup->children; - task->prev_taskgroup = taskgroup->children->prev_taskgroup; - task->next_taskgroup->prev_taskgroup = task; - task->prev_taskgroup->next_taskgroup = task; - } + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0, + PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + if (taskgroup) + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task, 0, PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + task->pnode[PQ_TEAM].next = NULL; + task->pnode[PQ_TEAM].prev = NULL; + task->kind = GOMP_TASK_TIED; + ++team->task_count; + gomp_mutex_unlock (&team->task_lock); + + thr->task = task; + gomp_target_task_fn (task->fn_data); + thr->task = parent; + + gomp_mutex_lock (&team->task_lock); + task->kind = GOMP_TASK_ASYNC_RUNNING; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, task); else - { - task->next_taskgroup = task; - task->prev_taskgroup = task; - } - taskgroup->children = task; - } - if (team->task_queue) - { - task->next_queue = team->task_queue; - task->prev_queue = team->task_queue->prev_queue; - task->next_queue->prev_queue = task; - task->prev_queue->next_queue = task; - } - else - { - task->next_queue = task; - task->prev_queue = task; - team->task_queue = task; + ttask->state = GOMP_TARGET_TASK_RUNNING; + gomp_mutex_unlock (&team->task_lock); + return true; } + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + if (taskgroup) + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, task, 0, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); + priority_queue_insert (PQ_TEAM, &team->task_queue, task, 0, + PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); ++team->task_count; ++team->task_queued_count; gomp_team_barrier_set_task_pending (&team->barrier); @@ -565,210 +769,214 @@ gomp_create_target_task (struct gomp_device_descr *devicep, gomp_mutex_unlock (&team->task_lock); if (do_wake) gomp_team_barrier_wake (&team->barrier, 1); + return true; } -#if _LIBGOMP_CHECKING -/* Sanity check TASK to make sure it is in its parent's children - queue, and that the tasks therein are in the right order. +/* Given a parent_depends_on task in LIST, move it to the front of its + priority so it is run as soon as possible. - The expected order is: - parent_depends_on WAITING tasks - !parent_depends_on WAITING tasks - TIED tasks + Care is taken to update the list's LAST_PARENT_DEPENDS_ON field. - PARENT is the alleged parent of TASK. */ + We rearrange the queue such that all parent_depends_on tasks are + first, and last_parent_depends_on points to the last such task we + rearranged. For example, given the following tasks in a queue + where PD[123] are the parent_depends_on tasks: -static void -verify_children_queue (struct gomp_task *task, struct gomp_task *parent) -{ - if (task->parent != parent) - gomp_fatal ("verify_children_queue: incompatible parents"); - /* It's OK, Annie was an orphan and she turned out all right. */ - if (!parent) - return; + task->children + | + V + C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4 - bool seen_tied = false; - bool seen_plain_waiting = false; - bool found = false; - struct gomp_task *t = parent->children; - while (1) + We rearrange such that: + + task->children + | +--- last_parent_depends_on + | | + V V + PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4. */ + +static void inline +priority_list_upgrade_task (struct priority_list *list, + struct priority_node *node) +{ + struct priority_node *last_parent_depends_on + = list->last_parent_depends_on; + if (last_parent_depends_on) { - if (t == task) - found = true; - if (seen_tied && t->kind == GOMP_TASK_WAITING) - gomp_fatal ("verify_children_queue: WAITING task after TIED"); - if (t->kind == GOMP_TASK_TIED) - seen_tied = true; - else if (t->kind == GOMP_TASK_WAITING) - { - if (t->parent_depends_on) - { - if (seen_plain_waiting) - gomp_fatal ("verify_children_queue: parent_depends_on after " - "!parent_depends_on"); - } - else - seen_plain_waiting = true; - } - t = t->next_child; - if (t == parent->children) - break; + node->prev->next = node->next; + node->next->prev = node->prev; + node->prev = last_parent_depends_on; + node->next = last_parent_depends_on->next; + node->prev->next = node; + node->next->prev = node; } - if (!found) - gomp_fatal ("verify_children_queue: child not found in parent queue"); + else if (node != list->tasks) + { + node->prev->next = node->next; + node->next->prev = node->prev; + node->prev = list->tasks->prev; + node->next = list->tasks; + list->tasks = node; + node->prev->next = node; + node->next->prev = node; + } + list->last_parent_depends_on = node; } -/* Sanity check TASK to make sure it is in its taskgroup queue (if - applicable), and that the tasks therein are in the right order. +/* Given a parent_depends_on TASK in its parent's children_queue, move + it to the front of its priority so it is run as soon as possible. - The expected order is that GOMP_TASK_WAITING tasks must come before - GOMP_TASK_TIED tasks. + PARENT is passed as an optimization. - TASK is the task. */ + (This function could be defined in priority_queue.c, but we want it + inlined, and putting it in priority_queue.h is not an option, given + that gomp_task has not been properly defined at that point). */ -static void -verify_taskgroup_queue (struct gomp_task *task) +static void inline +priority_queue_upgrade_task (struct gomp_task *task, + struct gomp_task *parent) { - struct gomp_taskgroup *taskgroup = task->taskgroup; - if (!taskgroup) - return; - - bool seen_tied = false; - bool found = false; - struct gomp_task *t = taskgroup->children; - while (1) + struct priority_queue *head = &parent->children_queue; + struct priority_node *node = &task->pnode[PQ_CHILDREN]; +#if _LIBGOMP_CHECKING_ + if (!task->parent_depends_on) + gomp_fatal ("priority_queue_upgrade_task: task must be a " + "parent_depends_on task"); + if (!priority_queue_task_in_queue_p (PQ_CHILDREN, head, task)) + gomp_fatal ("priority_queue_upgrade_task: cannot find task=%p", task); +#endif + if (priority_queue_multi_p (head)) { - if (t == task) - found = true; - if (t->kind == GOMP_TASK_WAITING && seen_tied) - gomp_fatal ("verify_taskgroup_queue: WAITING task after TIED"); - if (t->kind == GOMP_TASK_TIED) - seen_tied = true; - t = t->next_taskgroup; - if (t == taskgroup->children) - break; + struct priority_list *list + = priority_queue_lookup_priority (head, task->priority); + priority_list_upgrade_task (list, node); } - if (!found) - gomp_fatal ("verify_taskgroup_queue: child not found in parent queue"); + else + priority_list_upgrade_task (&head->l, node); } -/* Verify that TASK is in the team's task queue. */ +/* Given a CHILD_TASK in LIST that is about to be executed, move it out of + the way in LIST so that other tasks can be considered for + execution. LIST contains tasks of type TYPE. -static void -verify_task_queue (struct gomp_task *task, struct gomp_team *team) + Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field + if applicable. */ + +static void inline +priority_list_downgrade_task (enum priority_queue_type type, + struct priority_list *list, + struct gomp_task *child_task) { - struct gomp_task *t = team->task_queue; - if (team) - while (1) - { - if (t == task) - return; - t = t->next_queue; - if (t == team->task_queue) - break; - } - gomp_fatal ("verify_team_queue: child not in team"); + struct priority_node *node = task_to_priority_node (type, child_task); + if (list->tasks == node) + list->tasks = node->next; + else if (node->next != list->tasks) + { + /* The task in NODE is about to become TIED and TIED tasks + cannot come before WAITING tasks. If we're about to + leave the queue in such an indeterminate state, rewire + things appropriately. However, a TIED task at the end is + perfectly fine. */ + struct gomp_task *next_task = priority_node_to_task (type, node->next); + if (next_task->kind == GOMP_TASK_WAITING) + { + /* Remove from list. */ + node->prev->next = node->next; + node->next->prev = node->prev; + /* Rewire at the end. */ + node->next = list->tasks; + node->prev = list->tasks->prev; + list->tasks->prev->next = node; + list->tasks->prev = node; + } + } + + /* If the current task is the last_parent_depends_on for its + priority, adjust last_parent_depends_on appropriately. */ + if (__builtin_expect (child_task->parent_depends_on, 0) + && list->last_parent_depends_on == node) + { + struct gomp_task *prev_child = priority_node_to_task (type, node->prev); + if (node->prev != node + && prev_child->kind == GOMP_TASK_WAITING + && prev_child->parent_depends_on) + list->last_parent_depends_on = node->prev; + else + { + /* There are no more parent_depends_on entries waiting + to run, clear the list. */ + list->last_parent_depends_on = NULL; + } + } } + +/* Given a TASK in HEAD that is about to be executed, move it out of + the way so that other tasks can be considered for execution. HEAD + contains tasks of type TYPE. + + Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field + if applicable. + + (This function could be defined in priority_queue.c, but we want it + inlined, and putting it in priority_queue.h is not an option, given + that gomp_task has not been properly defined at that point). */ + +static void inline +priority_queue_downgrade_task (enum priority_queue_type type, + struct priority_queue *head, + struct gomp_task *task) +{ +#if _LIBGOMP_CHECKING_ + if (!priority_queue_task_in_queue_p (type, head, task)) + gomp_fatal ("Attempt to downgrade missing task %p", task); #endif + if (priority_queue_multi_p (head)) + { + struct priority_list *list + = priority_queue_lookup_priority (head, task->priority); + priority_list_downgrade_task (type, list, task); + } + else + priority_list_downgrade_task (type, &head->l, task); +} + +/* Setup CHILD_TASK to execute. This is done by setting the task to + TIED, and updating all relevant queues so that CHILD_TASK is no + longer chosen for scheduling. Also, remove CHILD_TASK from the + overall team task queue entirely. + + Return TRUE if task or its containing taskgroup has been + cancelled. */ static inline bool gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, struct gomp_team *team) { -#if _LIBGOMP_CHECKING - verify_children_queue (child_task, parent); - verify_taskgroup_queue (child_task); - verify_task_queue (child_task, team); +#if _LIBGOMP_CHECKING_ + if (child_task->parent) + priority_queue_verify (PQ_CHILDREN, + &child_task->parent->children_queue, true); + if (child_task->taskgroup) + priority_queue_verify (PQ_TASKGROUP, + &child_task->taskgroup->taskgroup_queue, false); + priority_queue_verify (PQ_TEAM, &team->task_queue, false); #endif + /* Task is about to go tied, move it out of the way. */ if (parent) - { - /* Adjust children such that it will point to a next child, - while the current one is scheduled to be executed. This way, - GOMP_taskwait (and others) can schedule a next task while - waiting. - - Do not remove it entirely from the circular list, as it is - still a child, though not one we should consider first (say - by GOMP_taskwait). */ - if (parent->children == child_task) - parent->children = child_task->next_child; - /* TIED tasks cannot come before WAITING tasks. If we're about - to make this task TIED, rewire things appropriately. - However, a TIED task at the end is perfectly fine. */ - else if (child_task->next_child->kind == GOMP_TASK_WAITING - && child_task->next_child != parent->children) - { - /* Remove from the list. */ - child_task->prev_child->next_child = child_task->next_child; - child_task->next_child->prev_child = child_task->prev_child; - /* Rewire at the end of its siblings. */ - child_task->next_child = parent->children; - child_task->prev_child = parent->children->prev_child; - parent->children->prev_child->next_child = child_task; - parent->children->prev_child = child_task; - } + priority_queue_downgrade_task (PQ_CHILDREN, &parent->children_queue, + child_task); - /* If the current task (child_task) is at the top of the - parent's last_parent_depends_on, it's about to be removed - from it. Adjust last_parent_depends_on appropriately. */ - if (__builtin_expect (child_task->parent_depends_on, 0) - && parent->taskwait->last_parent_depends_on == child_task) - { - /* The last_parent_depends_on list was built with all - parent_depends_on entries linked to the prev_child. Grab - the next last_parent_depends_on head from this prev_child if - available... */ - if (child_task->prev_child->kind == GOMP_TASK_WAITING - && child_task->prev_child->parent_depends_on) - parent->taskwait->last_parent_depends_on = child_task->prev_child; - else - { - /* ...otherwise, there are no more parent_depends_on - entries waiting to run. In which case, clear the - list. */ - parent->taskwait->last_parent_depends_on = NULL; - } - } - } - - /* Adjust taskgroup to point to the next taskgroup. See note above - regarding adjustment of children as to why the child_task is not - removed entirely from the circular list. */ + /* Task is about to go tied, move it out of the way. */ struct gomp_taskgroup *taskgroup = child_task->taskgroup; if (taskgroup) - { - if (taskgroup->children == child_task) - taskgroup->children = child_task->next_taskgroup; - /* TIED tasks cannot come before WAITING tasks. If we're about - to make this task TIED, rewire things appropriately. - However, a TIED task at the end is perfectly fine. */ - else if (child_task->next_taskgroup->kind == GOMP_TASK_WAITING - && child_task->next_taskgroup != taskgroup->children) - { - /* Remove from the list. */ - child_task->prev_taskgroup->next_taskgroup - = child_task->next_taskgroup; - child_task->next_taskgroup->prev_taskgroup - = child_task->prev_taskgroup; - /* Rewire at the end of its taskgroup. */ - child_task->next_taskgroup = taskgroup->children; - child_task->prev_taskgroup = taskgroup->children->prev_taskgroup; - taskgroup->children->prev_taskgroup->next_taskgroup = child_task; - taskgroup->children->prev_taskgroup = child_task; - } - } + priority_queue_downgrade_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + child_task); - /* Remove child_task from the task_queue. */ - child_task->prev_queue->next_queue = child_task->next_queue; - child_task->next_queue->prev_queue = child_task->prev_queue; - if (team->task_queue == child_task) - { - if (child_task->next_queue != child_task) - team->task_queue = child_task->next_queue; - else - team->task_queue = NULL; - } + priority_queue_remove (PQ_TEAM, &team->task_queue, child_task, + MEMMODEL_RELAXED); + child_task->pnode[PQ_TEAM].next = NULL; + child_task->pnode[PQ_TEAM].prev = NULL; child_task->kind = GOMP_TASK_TIED; if (--team->task_queued_count == 0) @@ -808,8 +1016,11 @@ gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task) } } -/* After CHILD_TASK has been run, adjust the various task queues to - give higher priority to the tasks that depend on CHILD_TASK. +/* After a CHILD_TASK has been run, adjust the dependency queue for + each task that depends on CHILD_TASK, to record the fact that there + is one less dependency to worry about. If a task that depended on + CHILD_TASK now has no dependencies, place it in the various queues + so it gets scheduled to run. TEAM is the team to which CHILD_TASK belongs to. */ @@ -822,99 +1033,60 @@ gomp_task_run_post_handle_dependers (struct gomp_task *child_task, for (i = 0; i < count; i++) { struct gomp_task *task = child_task->dependers->elem[i]; + + /* CHILD_TASK satisfies a dependency for TASK. Keep track of + TASK's remaining dependencies. Once TASK has no other + depenencies, put it into the various queues so it will get + scheduled for execution. */ if (--task->num_dependees != 0) continue; struct gomp_taskgroup *taskgroup = task->taskgroup; if (parent) { - if (parent->children) - { - /* If parent is in gomp_task_maybe_wait_for_dependencies - and it doesn't need to wait for this task, put it after - all ready to run tasks it needs to wait for. */ - if (parent->taskwait && parent->taskwait->last_parent_depends_on - && !task->parent_depends_on) - { - /* Put depender in last_parent_depends_on. */ - struct gomp_task *last_parent_depends_on - = parent->taskwait->last_parent_depends_on; - task->next_child = last_parent_depends_on->next_child; - task->prev_child = last_parent_depends_on; - } - else - { - /* Make depender a sibling of child_task, and place - it at the top of said sibling list. */ - task->next_child = parent->children; - task->prev_child = parent->children->prev_child; - parent->children = task; - } - task->next_child->prev_child = task; - task->prev_child->next_child = task; - } - else - { - /* Make depender a sibling of child_task. */ - task->next_child = task; - task->prev_child = task; - parent->children = task; - } + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, + task, task->priority, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/true, + task->parent_depends_on); if (parent->taskwait) { if (parent->taskwait->in_taskwait) { + /* One more task has had its dependencies met. + Inform any waiters. */ parent->taskwait->in_taskwait = false; gomp_sem_post (&parent->taskwait->taskwait_sem); } else if (parent->taskwait->in_depend_wait) { + /* One more task has had its dependencies met. + Inform any waiters. */ parent->taskwait->in_depend_wait = false; gomp_sem_post (&parent->taskwait->taskwait_sem); } - if (parent->taskwait->last_parent_depends_on == NULL - && task->parent_depends_on) - parent->taskwait->last_parent_depends_on = task; } } - /* If depender is in a taskgroup, put it at the TOP of its - taskgroup. */ if (taskgroup) { - if (taskgroup->children) - { - task->next_taskgroup = taskgroup->children; - task->prev_taskgroup = taskgroup->children->prev_taskgroup; - task->next_taskgroup->prev_taskgroup = task; - task->prev_taskgroup->next_taskgroup = task; - } - else - { - task->next_taskgroup = task; - task->prev_taskgroup = task; - } - taskgroup->children = task; + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task, task->priority, + PRIORITY_INSERT_BEGIN, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); if (taskgroup->in_taskgroup_wait) { + /* One more task has had its dependencies met. + Inform any waiters. */ taskgroup->in_taskgroup_wait = false; gomp_sem_post (&taskgroup->taskgroup_sem); } } - /* Put depender of child_task at the END of the team's - task_queue. */ - if (team->task_queue) - { - task->next_queue = team->task_queue; - task->prev_queue = team->task_queue->prev_queue; - task->next_queue->prev_queue = task; - task->prev_queue->next_queue = task; - } - else - { - task->next_queue = task; - task->prev_queue = task; - team->task_queue = task; - } + priority_queue_insert (PQ_TEAM, &team->task_queue, + task, task->priority, + PRIORITY_INSERT_END, + /*adjust_parent_depends_on=*/false, + task->parent_depends_on); ++team->task_count; ++team->task_queued_count; ++ret; @@ -964,27 +1136,15 @@ gomp_task_run_post_remove_parent (struct gomp_task *child_task) gomp_sem_post (&parent->taskwait->taskwait_sem); } - /* Remove CHILD_TASK from its sibling list. */ - child_task->prev_child->next_child = child_task->next_child; - child_task->next_child->prev_child = child_task->prev_child; - if (parent->children != child_task) - return; - if (child_task->next_child != child_task) - parent->children = child_task->next_child; - else + if (priority_queue_remove (PQ_CHILDREN, &parent->children_queue, + child_task, MEMMODEL_RELEASE) + && parent->taskwait && parent->taskwait->in_taskwait) { - /* We access task->children in GOMP_taskwait - outside of the task lock mutex region, so - need a release barrier here to ensure memory - written by child_task->fn above is flushed - before the NULL is written. */ - __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE); - if (parent->taskwait && parent->taskwait->in_taskwait) - { - parent->taskwait->in_taskwait = false; - gomp_sem_post (&parent->taskwait->taskwait_sem); - } + parent->taskwait->in_taskwait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); } + child_task->pnode[PQ_CHILDREN].next = NULL; + child_task->pnode[PQ_CHILDREN].prev = NULL; } /* Remove CHILD_TASK from its taskgroup. */ @@ -995,8 +1155,11 @@ gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task) struct gomp_taskgroup *taskgroup = child_task->taskgroup; if (taskgroup == NULL) return; - child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup; - child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup; + bool empty = priority_queue_remove (PQ_TASKGROUP, + &taskgroup->taskgroup_queue, + child_task, MEMMODEL_RELAXED); + child_task->pnode[PQ_TASKGROUP].next = NULL; + child_task->pnode[PQ_TASKGROUP].prev = NULL; if (taskgroup->num_children > 1) --taskgroup->num_children; else @@ -1008,18 +1171,10 @@ gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task) before the NULL is written. */ __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE); } - if (taskgroup->children != child_task) - return; - if (child_task->next_taskgroup != child_task) - taskgroup->children = child_task->next_taskgroup; - else + if (empty && taskgroup->in_taskgroup_wait) { - taskgroup->children = NULL; - if (taskgroup->in_taskgroup_wait) - { - taskgroup->in_taskgroup_wait = false; - gomp_sem_post (&taskgroup->taskgroup_sem); - } + taskgroup->in_taskgroup_wait = false; + gomp_sem_post (&taskgroup->taskgroup_sem); } } @@ -1049,9 +1204,13 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t state) while (1) { bool cancelled = false; - if (team->task_queue != NULL) + if (!priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED)) { - child_task = team->task_queue; + bool ignored; + child_task + = priority_queue_next_task (PQ_TEAM, &team->task_queue, + PQ_IGNORED, NULL, + &ignored); cancelled = gomp_task_run_pre (child_task, child_task->parent, team); if (__builtin_expect (cancelled, 0)) @@ -1082,7 +1241,29 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t state) if (child_task) { thr->task = child_task; - child_task->fn (child_task->fn_data); + if (__builtin_expect (child_task->fn == NULL, 0)) + { + if (gomp_target_task_fn (child_task->fn_data)) + { + thr->task = task; + gomp_mutex_lock (&team->task_lock); + child_task->kind = GOMP_TASK_ASYNC_RUNNING; + team->task_running_count--; + struct gomp_target_task *ttask + = (struct gomp_target_task *) child_task->fn_data; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, child_task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + child_task = NULL; + continue; + } + } + else + child_task->fn (child_task->fn_data); thr->task = task; } else @@ -1094,7 +1275,7 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t state) size_t new_tasks = gomp_task_run_post_handle_depend (child_task, team); gomp_task_run_post_remove_parent (child_task); - gomp_clear_parent (child_task->children); + gomp_clear_parent (&child_task->children_queue); gomp_task_run_post_remove_taskgroup (child_task); to_free = child_task; child_task = NULL; @@ -1140,15 +1321,16 @@ GOMP_taskwait (void) child thread task work function are seen before we exit from GOMP_taskwait. */ if (task == NULL - || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL) + || priority_queue_empty_p (&task->children_queue, MEMMODEL_ACQUIRE)) return; memset (&taskwait, 0, sizeof (taskwait)); + bool child_q = false; gomp_mutex_lock (&team->task_lock); while (1) { bool cancelled = false; - if (task->children == NULL) + if (priority_queue_empty_p (&task->children_queue, MEMMODEL_RELAXED)) { bool destroy_taskwait = task->taskwait != NULL; task->taskwait = NULL; @@ -1162,9 +1344,12 @@ GOMP_taskwait (void) gomp_sem_destroy (&taskwait.taskwait_sem); return; } - if (task->children->kind == GOMP_TASK_WAITING) + struct gomp_task *next_task + = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, + PQ_TEAM, &team->task_queue, &child_q); + if (next_task->kind == GOMP_TASK_WAITING) { - child_task = task->children; + child_task = next_task; cancelled = gomp_task_run_pre (child_task, task, team); if (__builtin_expect (cancelled, 0)) @@ -1180,8 +1365,10 @@ GOMP_taskwait (void) } else { - /* All tasks we are waiting for are already running - in other threads. Wait for them. */ + /* All tasks we are waiting for are either running in other + threads, or they are tasks that have not had their + dependencies met (so they're not even in the queue). Wait + for them. */ if (task->taskwait == NULL) { taskwait.in_depend_wait = false; @@ -1205,7 +1392,28 @@ GOMP_taskwait (void) if (child_task) { thr->task = child_task; - child_task->fn (child_task->fn_data); + if (__builtin_expect (child_task->fn == NULL, 0)) + { + if (gomp_target_task_fn (child_task->fn_data)) + { + thr->task = task; + gomp_mutex_lock (&team->task_lock); + child_task->kind = GOMP_TASK_ASYNC_RUNNING; + struct gomp_target_task *ttask + = (struct gomp_target_task *) child_task->fn_data; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, child_task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + child_task = NULL; + continue; + } + } + else + child_task->fn (child_task->fn_data); thr->task = task; } else @@ -1217,21 +1425,16 @@ GOMP_taskwait (void) size_t new_tasks = gomp_task_run_post_handle_depend (child_task, team); - /* Remove child_task from children list, and set up the next - sibling to be run. */ - child_task->prev_child->next_child = child_task->next_child; - child_task->next_child->prev_child = child_task->prev_child; - if (task->children == child_task) + if (child_q) { - if (child_task->next_child != child_task) - task->children = child_task->next_child; - else - task->children = NULL; + priority_queue_remove (PQ_CHILDREN, &task->children_queue, + child_task, MEMMODEL_RELAXED); + child_task->pnode[PQ_CHILDREN].next = NULL; + child_task->pnode[PQ_CHILDREN].prev = NULL; } - /* Orphan all the children of CHILD_TASK. */ - gomp_clear_parent (child_task->children); - /* Remove CHILD_TASK from its taskgroup. */ + gomp_clear_parent (&child_task->children_queue); + gomp_task_run_post_remove_taskgroup (child_task); to_free = child_task; @@ -1248,8 +1451,16 @@ GOMP_taskwait (void) } } -/* This is like GOMP_taskwait, but we only wait for tasks that the - upcoming task depends on. +/* An undeferred task is about to run. Wait for all tasks that this + undeferred task depends on. + + This is done by first putting all known ready dependencies + (dependencies that have their own dependencies met) at the top of + the scheduling queues. Then we iterate through these imminently + ready tasks (and possibly other high priority tasks), and run them. + If we run out of ready dependencies to execute, we either wait for + the reamining dependencies to finish, or wait for them to get + scheduled so we can run them. DEPEND is as in GOMP_task. */ @@ -1261,7 +1472,6 @@ gomp_task_maybe_wait_for_dependencies (void **depend) struct gomp_team *team = thr->ts.team; struct gomp_task_depend_entry elem, *ent = NULL; struct gomp_taskwait taskwait; - struct gomp_task *last_parent_depends_on = NULL; size_t ndepend = (uintptr_t) depend[0]; size_t nout = (uintptr_t) depend[1]; size_t i; @@ -1285,54 +1495,11 @@ gomp_task_maybe_wait_for_dependencies (void **depend) { tsk->parent_depends_on = true; ++num_awaited; - /* If a task we need to wait for is not already - running and is ready to be scheduled, move it to - front, so that we run it as soon as possible. - - We rearrange the children queue such that all - parent_depends_on tasks are first, and - last_parent_depends_on points to the last such task - we rearranged. For example, given the following - children where PD[123] are the parent_depends_on - tasks: - - task->children - | - V - C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4 - - We rearrange such that: - - task->children - | +--- last_parent_depends_on - | | - V V - PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4 - */ - + /* If depenency TSK itself has no dependencies and is + ready to run, move it up front so that we run it as + soon as possible. */ if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING) - { - if (last_parent_depends_on) - { - tsk->prev_child->next_child = tsk->next_child; - tsk->next_child->prev_child = tsk->prev_child; - tsk->prev_child = last_parent_depends_on; - tsk->next_child = last_parent_depends_on->next_child; - tsk->prev_child->next_child = tsk; - tsk->next_child->prev_child = tsk; - } - else if (tsk != task->children) - { - tsk->prev_child->next_child = tsk->next_child; - tsk->next_child->prev_child = tsk->prev_child; - tsk->prev_child = task->children->prev_child; - tsk->next_child = task->children; - task->children = tsk; - tsk->prev_child->next_child = tsk; - tsk->next_child->prev_child = tsk; - } - last_parent_depends_on = tsk; - } + priority_queue_upgrade_task (tsk, task); } } } @@ -1344,7 +1511,6 @@ gomp_task_maybe_wait_for_dependencies (void **depend) memset (&taskwait, 0, sizeof (taskwait)); taskwait.n_depend = num_awaited; - taskwait.last_parent_depends_on = last_parent_depends_on; gomp_sem_init (&taskwait.taskwait_sem, 0); task->taskwait = &taskwait; @@ -1363,9 +1529,28 @@ gomp_task_maybe_wait_for_dependencies (void **depend) gomp_sem_destroy (&taskwait.taskwait_sem); return; } - if (task->children->kind == GOMP_TASK_WAITING) + + /* Theoretically when we have multiple priorities, we should + chose between the highest priority item in + task->children_queue and team->task_queue here, so we should + use priority_queue_next_task(). However, since we are + running an undeferred task, perhaps that makes all tasks it + depends on undeferred, thus a priority of INF? This would + make it unnecessary to take anything into account here, + but the dependencies. + + On the other hand, if we want to use priority_queue_next_task(), + care should be taken to only use priority_queue_remove() + below if the task was actually removed from the children + queue. */ + bool ignored; + struct gomp_task *next_task + = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, + PQ_IGNORED, NULL, &ignored); + + if (next_task->kind == GOMP_TASK_WAITING) { - child_task = task->children; + child_task = next_task; cancelled = gomp_task_run_pre (child_task, task, team); if (__builtin_expect (cancelled, 0)) @@ -1380,8 +1565,10 @@ gomp_task_maybe_wait_for_dependencies (void **depend) } } else - /* All tasks we are waiting for are already running - in other threads. Wait for them. */ + /* All tasks we are waiting for are either running in other + threads, or they are tasks that have not had their + dependencies met (so they're not even in the queue). Wait + for them. */ taskwait.in_depend_wait = true; gomp_mutex_unlock (&team->task_lock); if (do_wake) @@ -1398,7 +1585,28 @@ gomp_task_maybe_wait_for_dependencies (void **depend) if (child_task) { thr->task = child_task; - child_task->fn (child_task->fn_data); + if (__builtin_expect (child_task->fn == NULL, 0)) + { + if (gomp_target_task_fn (child_task->fn_data)) + { + thr->task = task; + gomp_mutex_lock (&team->task_lock); + child_task->kind = GOMP_TASK_ASYNC_RUNNING; + struct gomp_target_task *ttask + = (struct gomp_target_task *) child_task->fn_data; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, child_task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + child_task = NULL; + continue; + } + } + else + child_task->fn (child_task->fn_data); thr->task = task; } else @@ -1412,18 +1620,12 @@ gomp_task_maybe_wait_for_dependencies (void **depend) if (child_task->parent_depends_on) --taskwait.n_depend; - /* Remove child_task from sibling list. */ - child_task->prev_child->next_child = child_task->next_child; - child_task->next_child->prev_child = child_task->prev_child; - if (task->children == child_task) - { - if (child_task->next_child != child_task) - task->children = child_task->next_child; - else - task->children = NULL; - } + priority_queue_remove (PQ_CHILDREN, &task->children_queue, + child_task, MEMMODEL_RELAXED); + child_task->pnode[PQ_CHILDREN].next = NULL; + child_task->pnode[PQ_CHILDREN].prev = NULL; - gomp_clear_parent (child_task->children); + gomp_clear_parent (&child_task->children_queue); gomp_task_run_post_remove_taskgroup (child_task); to_free = child_task; child_task = NULL; @@ -1463,7 +1665,7 @@ GOMP_taskgroup_start (void) return; taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup)); taskgroup->prev = task->taskgroup; - taskgroup->children = NULL; + priority_queue_init (&taskgroup->taskgroup_queue); taskgroup->in_taskgroup_wait = false; taskgroup->cancelled = false; taskgroup->num_children = 0; @@ -1485,6 +1687,17 @@ GOMP_taskgroup_end (void) if (team == NULL) return; taskgroup = task->taskgroup; + if (__builtin_expect (taskgroup == NULL, 0) + && thr->ts.level == 0) + { + /* This can happen if GOMP_taskgroup_start is called when + thr->ts.team == NULL, but inside of the taskgroup there + is #pragma omp target nowait that creates an implicit + team with a single thread. In this case, we want to wait + for all outstanding tasks in this team. */ + gomp_team_barrier_wait (&team->barrier); + return; + } /* The acquire barrier on load of taskgroup->num_children here synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup. @@ -1495,19 +1708,25 @@ GOMP_taskgroup_end (void) if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0) goto finish; + bool unused; gomp_mutex_lock (&team->task_lock); while (1) { bool cancelled = false; - if (taskgroup->children == NULL) + if (priority_queue_empty_p (&taskgroup->taskgroup_queue, + MEMMODEL_RELAXED)) { if (taskgroup->num_children) { - if (task->children == NULL) + if (priority_queue_empty_p (&task->children_queue, + MEMMODEL_RELAXED)) goto do_wait; - child_task = task->children; - } - else + child_task + = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, + PQ_TEAM, &team->task_queue, + &unused); + } + else { gomp_mutex_unlock (&team->task_lock); if (to_free) @@ -1519,7 +1738,9 @@ GOMP_taskgroup_end (void) } } else - child_task = taskgroup->children; + child_task + = priority_queue_next_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + PQ_TEAM, &team->task_queue, &unused); if (child_task->kind == GOMP_TASK_WAITING) { cancelled @@ -1539,8 +1760,10 @@ GOMP_taskgroup_end (void) { child_task = NULL; do_wait: - /* All tasks we are waiting for are already running - in other threads. Wait for them. */ + /* All tasks we are waiting for are either running in other + threads, or they are tasks that have not had their + dependencies met (so they're not even in the queue). Wait + for them. */ taskgroup->in_taskgroup_wait = true; } gomp_mutex_unlock (&team->task_lock); @@ -1558,7 +1781,28 @@ GOMP_taskgroup_end (void) if (child_task) { thr->task = child_task; - child_task->fn (child_task->fn_data); + if (__builtin_expect (child_task->fn == NULL, 0)) + { + if (gomp_target_task_fn (child_task->fn_data)) + { + thr->task = task; + gomp_mutex_lock (&team->task_lock); + child_task->kind = GOMP_TASK_ASYNC_RUNNING; + struct gomp_target_task *ttask + = (struct gomp_target_task *) child_task->fn_data; + /* If GOMP_PLUGIN_target_task_completion has run already + in between gomp_target_task_fn and the mutex lock, + perform the requeuing here. */ + if (ttask->state == GOMP_TARGET_TASK_FINISHED) + gomp_target_task_completion (team, child_task); + else + ttask->state = GOMP_TARGET_TASK_RUNNING; + child_task = NULL; + continue; + } + } + else + child_task->fn (child_task->fn_data); thr->task = task; } else @@ -1570,7 +1814,7 @@ GOMP_taskgroup_end (void) size_t new_tasks = gomp_task_run_post_handle_depend (child_task, team); gomp_task_run_post_remove_parent (child_task); - gomp_clear_parent (child_task->children); + gomp_clear_parent (&child_task->children_queue); gomp_task_run_post_remove_taskgroup (child_task); to_free = child_task; child_task = NULL; diff --git a/libgomp/taskloop.c b/libgomp/taskloop.c index f57a5a1..bcee326 100644 --- a/libgomp/taskloop.c +++ b/libgomp/taskloop.c @@ -155,8 +155,8 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), else ialias_call (GOMP_taskgroup_start) (); - /* FIXME, use priority. */ - (void) priority; + if (priority > gomp_max_task_priority_var) + priority = gomp_max_task_priority_var; if ((flags & GOMP_TASK_FLAG_IF) == 0 || team == NULL || (thr->task && thr->task->final_task) @@ -175,6 +175,7 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), for (i = 0; i < num_tasks; i++) { gomp_init_task (&task[i], parent, gomp_icv (false)); + task[i].priority = priority; task[i].kind = GOMP_TASK_UNDEFERRED; task[i].final_task = (thr->task && thr->task->final_task) || (flags & GOMP_TASK_FLAG_FINAL); @@ -198,10 +199,11 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), task_step -= step; fn (arg); arg += arg_size; - if (task[i].children != NULL) + if (!priority_queue_empty_p (&task[i].children_queue, + MEMMODEL_RELAXED)) { gomp_mutex_lock (&team->task_lock); - gomp_clear_parent (task[i].children); + gomp_clear_parent (&task[i].children_queue); gomp_mutex_unlock (&team->task_lock); } gomp_end_task (); @@ -213,6 +215,7 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), struct gomp_task task; gomp_init_task (&task, thr->task, gomp_icv (false)); + task.priority = priority; task.kind = GOMP_TASK_UNDEFERRED; task.final_task = (thr->task && thr->task->final_task) || (flags & GOMP_TASK_FLAG_FINAL); @@ -228,10 +231,11 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), if (i == nfirst) task_step -= step; fn (data); - if (task.children != NULL) + if (!priority_queue_empty_p (&task.children_queue, + MEMMODEL_RELAXED)) { gomp_mutex_lock (&team->task_lock); - gomp_clear_parent (task.children); + gomp_clear_parent (&task.children_queue); gomp_mutex_unlock (&team->task_lock); } gomp_end_task (); @@ -254,6 +258,7 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1) & ~(uintptr_t) (arg_align - 1)); gomp_init_task (task, parent, gomp_icv (false)); + task->priority = priority; task->kind = GOMP_TASK_UNDEFERRED; task->in_tied_task = parent->in_tied_task; task->taskgroup = taskgroup; @@ -298,48 +303,20 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), for (i = 0; i < num_tasks; i++) { struct gomp_task *task = tasks[i]; - if (parent->children) - { - task->next_child = parent->children; - task->prev_child = parent->children->prev_child; - task->next_child->prev_child = task; - task->prev_child->next_child = task; - } - else - { - task->next_child = task; - task->prev_child = task; - } - parent->children = task; + priority_queue_insert (PQ_CHILDREN, &parent->children_queue, + task, priority, + PRIORITY_INSERT_BEGIN, + /*last_parent_depends_on=*/false, + task->parent_depends_on); if (taskgroup) - { - if (taskgroup->children) - { - task->next_taskgroup = taskgroup->children; - task->prev_taskgroup = taskgroup->children->prev_taskgroup; - task->next_taskgroup->prev_taskgroup = task; - task->prev_taskgroup->next_taskgroup = task; - } - else - { - task->next_taskgroup = task; - task->prev_taskgroup = task; - } - taskgroup->children = task; - } - if (team->task_queue) - { - task->next_queue = team->task_queue; - task->prev_queue = team->task_queue->prev_queue; - task->next_queue->prev_queue = task; - task->prev_queue->next_queue = task; - } - else - { - task->next_queue = task; - task->prev_queue = task; - team->task_queue = task; - } + priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, + task, priority, PRIORITY_INSERT_BEGIN, + /*last_parent_depends_on=*/false, + task->parent_depends_on); + priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority, + PRIORITY_INSERT_END, + /*last_parent_depends_on=*/false, + task->parent_depends_on); ++team->task_count; ++team->task_queued_count; } diff --git a/libgomp/team.c b/libgomp/team.c index 67e25b3..34b77c0 100644 --- a/libgomp/team.c +++ b/libgomp/team.c @@ -193,7 +193,7 @@ gomp_new_team (unsigned nthreads) team->ordered_release = (void *) &team->implicit_task[nthreads]; team->ordered_release[0] = &team->master_release; - team->task_queue = NULL; + priority_queue_init (&team->task_queue); team->task_count = 0; team->task_queued_count = 0; team->task_running_count = 0; @@ -214,6 +214,7 @@ free_team (struct gomp_team *team) #endif gomp_barrier_destroy (&team->barrier); gomp_mutex_destroy (&team->task_lock); + priority_queue_free (&team->task_queue); free (team); } @@ -271,6 +272,8 @@ gomp_free_thread (void *arg __attribute__((unused))) free (pool); thr->thread_pool = NULL; } + if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0)) + gomp_team_end (); if (thr->task != NULL) { struct gomp_task *task = thr->task; @@ -300,7 +303,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, struct gomp_thread **affinity_thr = NULL; thr = gomp_thread (); - nested = thr->ts.team != NULL; + nested = thr->ts.level; pool = thr->thread_pool; task = thr->task; icv = task ? &task->icv : &gomp_global_icv; diff --git a/libgomp/testsuite/libgomp.c/doacross-3.c b/libgomp/testsuite/libgomp.c/doacross-3.c new file mode 100644 index 0000000..eef0d5e --- /dev/null +++ b/libgomp/testsuite/libgomp.c/doacross-3.c @@ -0,0 +1,225 @@ +extern void abort (void); + +#define N 256 +int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6]; +volatile int d, e; +volatile unsigned long long f; + +int +main () +{ + unsigned long long i; + int j, k, l, m; + #pragma omp parallel private (l) + { + #pragma omp for schedule(guided, 3) ordered (1) nowait + for (i = 1; i < N + f; i++) + { + #pragma omp atomic write + a[i] = 1; + #pragma omp ordered depend(sink: i - 1) + if (i > 1) + { + #pragma omp atomic read + l = a[i - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + a[i] = 2; + if (i < N - 1) + { + #pragma omp atomic read + l = a[i + 1]; + if (l == 3) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + a[i] = 3; + } + #pragma omp for schedule(guided) ordered (3) nowait + for (i = 3; i < N / 16 - 1 + f; i++) + for (j = 0; j < 8; j += 2) + for (k = 1; k <= 3; k++) + { + #pragma omp atomic write + b[i][j][k] = 1; + #pragma omp ordered depend(sink: i, j - 2, k - 1) \ + depend(sink: i - 2, j - 2, k + 1) + #pragma omp ordered depend(sink: i - 3, j + 2, k - 2) + if (j >= 2 && k > 1) + { + #pragma omp atomic read + l = b[i][j - 2][k - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + b[i][j][k] = 2; + if (i >= 5 && j >= 2 && k < 3) + { + #pragma omp atomic read + l = b[i - 2][j - 2][k + 1]; + if (l < 2) + abort (); + } + if (i >= 6 && j < N / 16 - 3 && k == 3) + { + #pragma omp atomic read + l = b[i - 3][j + 2][k - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + b[i][j][k] = 3; + } +#define A(n) int n; +#define B(n) A(n##0) A(n##1) A(n##2) A(n##3) +#define C(n) B(n##0) B(n##1) B(n##2) B(n##3) +#define D(n) C(n##0) C(n##1) C(n##2) C(n##3) + D(m) +#undef A + #pragma omp for collapse (2) ordered(61) schedule(guided, 15) + for (i = 2; i < N / 32 + f; i++) + for (j = 7; j > 1; j--) + for (k = 6; k >= 0; k -= 2) +#define A(n) for (n = 4; n < 5; n++) + D(m) +#undef A + { + #pragma omp atomic write + c[i][j][k] = 1; +#define A(n) ,n +#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321) + #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \ + depend (sink:i - 2, j + 1, k - 4 E(m)) \ + depend(sink: i - 1, j - 2, k - 2 E(m)) + if (k <= 4) + { + l = c[i][j][k + 2]; + if (l < 2) + abort (); + } + #pragma omp atomic write + c[i][j][k] = 2; + if (i >= 4 && j < 7 && k >= 4) + { + l = c[i - 2][j + 1][k - 4]; + if (l < 2) + abort (); + } + if (i >= 3 && j >= 4 && k >= 2) + { + l = c[i - 1][j - 2][k - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend (source) + #pragma omp atomic write + c[i][j][k] = 3; + } + #pragma omp for schedule(guided, 5) ordered (3) nowait + for (j = 0; j < N / 16 - 1; j++) + for (k = 0; k < 8; k += 2) + for (i = 3; i <= 5 + f; i++) + { + #pragma omp atomic write + g[j][k][i] = 1; + #pragma omp ordered depend(sink: j, k - 2, i - 1) \ + depend(sink: j - 2, k - 2, i + 1) + #pragma omp ordered depend(sink: j - 3, k + 2, i - 2) + if (k >= 2 && i > 3) + { + #pragma omp atomic read + l = g[j][k - 2][i - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + g[j][k][i] = 2; + if (j >= 2 && k >= 2 && i < 5) + { + #pragma omp atomic read + l = g[j - 2][k - 2][i + 1]; + if (l < 2) + abort (); + } + if (j >= 3 && k < N / 16 - 3 && i == 5) + { + #pragma omp atomic read + l = g[j - 3][k + 2][i - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + g[j][k][i] = 3; + } + #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k) + for (i = 2; i < f + 3; i++) + for (j = d + 1; j >= 0; j--) + for (k = 0; k < d; k++) + for (l = 0; l < d + 2; l++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l) + if (!e) + abort (); + } + #pragma omp single + { + if (i != 3 || j != -1 || k != 0) + abort (); + i = 8; j = 9; k = 10; + } + #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m) + for (i = 2; i < f + 3; i++) + for (j = d + 1; j >= 0; j--) + for (k = 0; k < d + 2; k++) + for (m = 0; m < d; m++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m) + abort (); + } + #pragma omp single + if (i != 3 || j != -1 || k != 2 || m != 0) + abort (); + #pragma omp for collapse(2) ordered(4) nowait + for (i = 2; i < f + 3; i++) + for (j = d; j > 0; j--) + for (k = 0; k < d + 2; k++) + for (l = 0; l < d + 4; l++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l) + if (!e) + abort (); + } + #pragma omp for nowait + for (i = 0; i < N; i++) + if (a[i] != 3 * (i >= 1)) + abort (); + #pragma omp for collapse(2) private(k) nowait + for (i = 0; i < N / 16; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 4; k++) + if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1)) + abort (); + #pragma omp for collapse(3) nowait + for (i = 0; i < N / 32; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 8; k++) + if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0)) + abort (); + #pragma omp for collapse(2) private(k) nowait + for (i = 0; i < N / 16; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 6; k++) + if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3)) + abort (); + } + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/ordered-5.c b/libgomp/testsuite/libgomp.c/ordered-5.c new file mode 100644 index 0000000..fac2440 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/ordered-5.c @@ -0,0 +1,30 @@ +/* { dg-do run } */ +/* { dg-additional-options "-msse2" { target sse2_runtime } } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ + +extern void abort (void); +int a[1024], b = -1; + +int +main () +{ + int i; + #pragma omp parallel for simd ordered + for (i = 0; i < 1024; i++) + { + a[i] = i; + #pragma omp ordered threads simd + { + if (b + 1 != i) + abort (); + b = i; + } + a[i] += 3; + } + if (b != 1023) + abort (); + for (i = 0; i < 1024; i++) + if (a[i] != i + 3) + abort (); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/priority.c b/libgomp/testsuite/libgomp.c/priority.c new file mode 100644 index 0000000..012f09d --- /dev/null +++ b/libgomp/testsuite/libgomp.c/priority.c @@ -0,0 +1,62 @@ +/* { dg-do run } */ +/* { dg-set-target-env-var OMP_MAX_TASK_PRIORITY "10" } */ + +/* This test verifies that the "priority" clause of omp task works as + advertised. + + Testing the OpenMP task scheduler is a bit tricky, especially when + trying to determine what ran first (without explicitly calling + time() and/or synchronizing between threads). What we do here is + run in single threaded mode which guarantees that we won't run into + data races while accessing the "prio" array. + + We give each task a priority from 0..63, while setting + OMP_MAX_TASK_PRIORITY to 10, which basically gives us 10 lower + priority tasks, and the rest scheduled to run earlier. We verify + that the priority < 10 tasks run last. */ + +#include <omp.h> +#include <stdlib.h> + +#define N 64 + +int main() +{ + int tsknum=0, prio[N]; + int max_priority = omp_get_max_task_priority (); + int saved_tsknum = -1; + int i; + +#pragma omp parallel num_threads(1) +#pragma omp single private (i) + { + for (i = 0; i < N; i++) + #pragma omp task priority(i ^ 1) + { + int t; + #pragma omp atomic capture seq_cst + t = tsknum++; + prio[t] = i ^ 1; + } + #pragma omp atomic read seq_cst + saved_tsknum = tsknum; + } + + /* If any of the tasks have run before all tasks were created, don't + make any assumption on the task order. Otherwise, we should have + tasks with >= max_priority scheduled first in arbitrary order, + followed by the rest of tasks in decreasing priority order, as + there is only one thread that can schedule them. */ + if (saved_tsknum == 0) + { + for (i = 0; i < N; i++) + if (i < N - max_priority) + { + if (prio[i] < max_priority) + abort (); + } + else if (i != N - prio[i] - 1) + abort (); + } + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/target-31.c b/libgomp/testsuite/libgomp.c/target-31.c new file mode 100644 index 0000000..255327c --- /dev/null +++ b/libgomp/testsuite/libgomp.c/target-31.c @@ -0,0 +1,163 @@ +#include <omp.h> +#include <stdlib.h> + +int a = 1, b = 2, c = 3, d = 4; +int e[2] = { 5, 6 }, f[2] = { 7, 8 }, g[2] = { 9, 10 }, h[2] = { 11, 12 }; + +__attribute__((noinline, noclone)) void +use (int *k, int *l, int *m, int *n, int *o, int *p, int *q, int *r) +{ + asm volatile ("" : : "r" (k) : "memory"); + asm volatile ("" : : "r" (l) : "memory"); + asm volatile ("" : : "r" (m) : "memory"); + asm volatile ("" : : "r" (n) : "memory"); + asm volatile ("" : : "r" (o) : "memory"); + asm volatile ("" : : "r" (p) : "memory"); + asm volatile ("" : : "r" (q) : "memory"); + asm volatile ("" : : "r" (r) : "memory"); +} + +#pragma omp declare target to (use) + +int +main () +{ + int err = 0, r = -1, t[4]; + long s[4] = { -1, -2, -3, -4 }; + int j = 13, k = 14, l[2] = { 15, 16 }, m[2] = { 17, 18 }; + #pragma omp target private (a, b, e, f) firstprivate (c, d, g, h) map(from: r, s, t) \ + map(tofrom: err, j, l) map(to: k, m) + #pragma omp teams num_teams (4) thread_limit (8) private (b, f) firstprivate (d, h, k, m) + { + int u1 = k, u2[2] = { m[0], m[1] }; + int u3[64]; + int i; + for (i = 0; i < 64; i++) + u3[i] = k + i; + #pragma omp parallel num_threads (1) + { + if (c != 3 || d != 4 || g[0] != 9 || g[1] != 10 || h[0] != 11 || h[1] != 12 || k != 14 || m[0] != 17 || m[1] != 18) + #pragma omp atomic write + err = 1; + b = omp_get_team_num (); + if (b >= 4) + #pragma omp atomic write + err = 1; + if (b == 0) + { + a = omp_get_num_teams (); + e[0] = 2 * a; + e[1] = 3 * a; + } + f[0] = 2 * b; + f[1] = 3 * b; + #pragma omp atomic update + c++; + #pragma omp atomic update + g[0] += 2; + #pragma omp atomic update + g[1] += 3; + d++; + h[0] += 2; + h[1] += 3; + k += b; + m[0] += 2 * b; + m[1] += 3 * b; + } + use (&a, &b, &c, &d, e, f, g, h); + #pragma omp parallel firstprivate (u1, u2) + { + int w = omp_get_thread_num (); + int x = 19; + int y[2] = { 20, 21 }; + int v = 24; + int ll[64]; + if (u1 != 14 || u2[0] != 17 || u2[1] != 18) + #pragma omp atomic write + err = 1; + u1 += w; + u2[0] += 2 * w; + u2[1] += 3 * w; + use (&u1, u2, &t[b], l, &k, m, &j, h); + #pragma omp master + t[b] = omp_get_num_threads (); + #pragma omp atomic update + j++; + #pragma omp atomic update + l[0] += 2; + #pragma omp atomic update + l[1] += 3; + #pragma omp atomic update + k += 4; + #pragma omp atomic update + m[0] += 5; + #pragma omp atomic update + m[1] += 6; + x += w; + y[0] += 2 * w; + y[1] += 3 * w; + #pragma omp simd safelen(32) private (v) + for (i = 0; i < 64; i++) + { + v = 3 * i; + ll[i] = u1 + v * u2[0] + u2[1] + x + y[0] + y[1] + v + h[0] + u3[i]; + } + #pragma omp barrier + use (&u1, u2, &t[b], l, &k, m, &x, y); + if (w < 0 || w > 8 || w != omp_get_thread_num () || u1 != 14 + w + || u2[0] != 17 + 2 * w || u2[1] != 18 + 3 * w + || x != 19 + w || y[0] != 20 + 2 * w || y[1] != 21 + 3 * w + || v != 24) + #pragma omp atomic write + err = 1; + for (i = 0; i < 64; i++) + if (ll[i] != u1 + 3 * i * u2[0] + u2[1] + x + y[0] + y[1] + 3 * i + 13 + 14 + i) + #pragma omp atomic write + err = 1; + } + #pragma omp parallel num_threads (1) + { + if (b == 0) + { + r = a; + if (a != omp_get_num_teams () + || e[0] != 2 * a + || e[1] != 3 * a) + #pragma omp atomic write + err = 1; + } + int v1, v2, v3; + #pragma omp atomic read + v1 = c; + #pragma omp atomic read + v2 = g[0]; + #pragma omp atomic read + v3 = g[1]; + s[b] = v1 * 65536L + v2 * 256L + v3; + if (d != 5 || h[0] != 13 || h[1] != 15 + || k != 14 + b + 4 * t[b] + || m[0] != 17 + 2 * b + 5 * t[b] + || m[1] != 18 + 3 * b + 6 * t[b] + || b != omp_get_team_num () + || f[0] != 2 * b || f[1] != 3 * b) + #pragma omp atomic write + err = 1; + } + } + if (err != 0) abort (); + if (r < 1 || r > 4) abort (); + if (a != 1 || b != 2 || c != 3 || d != 4) abort (); + if (e[0] != 5 || e[1] != 6 || f[0] != 7 || f[1] != 8) abort (); + if (g[0] != 9 || g[1] != 10 || h[0] != 11 || h[1] != 12) abort (); + int i, cnt = 0; + for (i = 0; i < r; i++) + if ((s[i] >> 16) < 3 + 1 || (s[i] >> 16) > 3 + 4 + || ((s[i] >> 8) & 0xff) < 9 + 2 * 1 || ((s[i] >> 8) & 0xff) > 9 + 2 * 4 + || (s[i] & 0xff) < 10 + 3 * 1 || (s[i] & 0xff) > 10 + 3 * 4 + || t[i] < 1 || t[i] > 8) + abort (); + else + cnt += t[i]; + if (j != 13 + cnt || l[0] != 15 + 2 * cnt || l[1] != 16 + 3 * cnt) abort (); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/target-32.c b/libgomp/testsuite/libgomp.c/target-32.c new file mode 100644 index 0000000..233877b --- /dev/null +++ b/libgomp/testsuite/libgomp.c/target-32.c @@ -0,0 +1,54 @@ +#include <stdlib.h> +#include <unistd.h> + +int main () +{ + int a = 0, b = 0, c = 0, d[7]; + + #pragma omp parallel + #pragma omp single + { + #pragma omp task depend(out: d[0]) + a = 2; + + #pragma omp target enter data nowait map(to: a,b,c) depend(in: d[0]) depend(out: d[1]) + + #pragma omp target nowait map(alloc: a) depend(in: d[1]) depend(out: d[2]) + a++; + + #pragma omp target nowait map(alloc: b) depend(in: d[2]) depend(out: d[3]) + { + usleep (1000); + #pragma omp atomic update + b |= 4; + } + + #pragma omp target nowait map(alloc: b) depend(in: d[2]) depend(out: d[4]) + { + usleep (5000); + #pragma omp atomic update + b |= 1; + } + + #pragma omp target nowait map(alloc: c) depend(in: d[3], d[4]) depend(out: d[5]) + { + usleep (5000); + #pragma omp atomic update + c |= 8; + } + + #pragma omp target nowait map(alloc: c) depend(in: d[3], d[4]) depend(out: d[6]) + { + usleep (1000); + #pragma omp atomic update + c |= 2; + } + + #pragma omp target exit data map(always,from: a,b,c) depend(in: d[5], d[6]) + } + + if (a != 3 || b != 5 || c != 10) + abort (); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/target-33.c b/libgomp/testsuite/libgomp.c/target-33.c new file mode 100644 index 0000000..1bed4b6 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/target-33.c @@ -0,0 +1,93 @@ +extern void abort (void); + +int +main () +{ + int a = 1, b = 2, c = 4, d[7]; + #pragma omp taskgroup + { + #pragma omp target enter data nowait map (to: a, b, c) depend(out: d[0]) + #pragma omp target nowait map (alloc: a, b) depend(in: d[0]) depend(out: d[1]) + { + #pragma omp atomic update + a |= 4; + #pragma omp atomic update + b |= 8; + } + #pragma omp target nowait map (alloc: a, c) depend(in: d[0]) depend(out: d[2]) + { + #pragma omp atomic update + a |= 16; + #pragma omp atomic update + c |= 32; + } + #pragma omp target exit data nowait map (from: a, b, c) depend(in: d[1], d[2]) + } + if (a != 21 || b != 10 || c != 36) + abort (); + #pragma omp target map (tofrom: a, b) nowait + { + a &= ~16; + b &= ~2; + } + #pragma omp target map (tofrom: c) nowait + { + c |= 8; + } + #pragma omp barrier + if (a != 5 || b != 8 || c != 44) + abort (); + #pragma omp target map (tofrom: a, b) nowait + { + a |= 32; + b |= 4; + } + #pragma omp target map (tofrom: c) nowait + { + c &= ~4; + } + #pragma omp taskwait + if (a != 37 || b != 12 || c != 40) + abort (); + #pragma omp target nowait map (tofrom: a, b) depend(out: d[3]) + { + #pragma omp atomic update + a = a + 9; + b -= 8; + } + #pragma omp target nowait map (tofrom: a, c) depend(out: d[4]) + { + #pragma omp atomic update + a = a + 4; + c >>= 1; + } + #pragma omp task if (0) depend (in: d[3], d[4]) shared (a, b, c) + if (a != 50 || b != 4 || c != 20) + abort (); + #pragma omp task shared (a) + a += 50; + #pragma omp target nowait map (tofrom: b) + b++; + #pragma omp target map (tofrom: c) nowait + c--; + #pragma omp taskwait + if (a != 100 || b != 5 || c != 19) + abort (); + #pragma omp target map (tofrom: a) nowait depend(out: d[5]) + a++; + #pragma omp target map (tofrom: b) nowait depend(out: d[6]) + b++; + #pragma omp target map (tofrom: a, b) depend(in: d[5], d[6]) + { + if (a != 101 || b != 6) + a = -9; + else + { + a = 24; + b = 38; + } + } + if (a != 24 || b != 38) + abort (); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/target-34.c b/libgomp/testsuite/libgomp.c/target-34.c new file mode 100644 index 0000000..66d9f54 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/target-34.c @@ -0,0 +1,112 @@ +extern void abort (void); + +int +main () +{ + int a = 1, b = 2, c = 4, d[7]; + #pragma omp parallel + { + #pragma omp single + { + #pragma omp taskgroup + { + #pragma omp target enter data nowait map (to: a, b, c) depend(out: d[0]) + #pragma omp target nowait map (alloc: a, b) depend(in: d[0]) depend(out: d[1]) + { + #pragma omp atomic update + a |= 4; + #pragma omp atomic update + b |= 8; + } + #pragma omp target nowait map (alloc: a, c) depend(in: d[0]) depend(out: d[2]) + { + #pragma omp atomic update + a |= 16; + #pragma omp atomic update + c |= 32; + } + #pragma omp target exit data nowait map (from: a, b, c) depend(in: d[1], d[2]) + } + if (a != 21 || b != 10 || c != 36) + abort (); + #pragma omp target map (tofrom: a, b) nowait + { + a &= ~16; + b &= ~2; + } + #pragma omp target map (tofrom: c) nowait + { + c |= 8; + } + } /* Implicit barrier here. */ + #pragma omp single + { + if (a != 5 || b != 8 || c != 44) + abort (); + #pragma omp target map (tofrom: a, b) nowait + { + a |= 32; + b |= 4; + } + #pragma omp target map (tofrom: c) nowait + c &= ~4; + #pragma omp taskwait + if (a != 37 || b != 12 || c != 40) + abort (); + #pragma omp target nowait map (tofrom: a, b) depend(out: d[3]) + { + #pragma omp atomic update + a = a + 9; + b -= 8; + } + #pragma omp target nowait map (tofrom: a, c) depend(out: d[4]) + { + #pragma omp atomic update + a = a + 4; + c >>= 1; + } + #pragma omp task if (0) depend (in: d[3], d[4]) shared (a, b, c) + if (a != 50 || b != 4 || c != 20) + abort (); + #pragma omp task shared (a) + a += 50; + #pragma omp target nowait map (tofrom: b) + b++; + #pragma omp target map (tofrom: c) nowait + c--; + #pragma omp taskwait + if (a != 100 || b != 5 || c != 19) + abort (); + #pragma omp target map (tofrom: a) nowait depend(out: d[5]) + a++; + #pragma omp target map (tofrom: b) nowait depend(out: d[6]) + b++; + #pragma omp target map (tofrom: a, b) depend(in: d[5], d[6]) + { + if (a != 101 || b != 6) + a = -9; + else + { + a = 24; + b = 38; + } + } + if (a != 24 || b != 38) + abort (); + } /* Implicit barrier here. */ + #pragma omp master + { + #pragma omp target nowait map (tofrom: a, b) + { + a *= 2; + b++; + } + #pragma omp target map (tofrom: c) nowait + c--; + } + #pragma omp barrier + if (a != 48 || b != 39 || c != 18) + abort (); + } + return 0; +} diff --git a/liboffloadmic/ChangeLog b/liboffloadmic/ChangeLog index 93419bd..4086d54 100644 --- a/liboffloadmic/ChangeLog +++ b/liboffloadmic/ChangeLog @@ -1,3 +1,20 @@ +2015-11-14 Ilya Verbin <ilya.verbin@intel.com> + + * runtime/offload_host.cpp (task_completion_callback): New + variable. + (offload_proxy_task_completed_ooo): Call task_completion_callback. + (__offload_register_task_callback): New function. + * runtime/offload_host.h (__offload_register_task_callback): New + declaration. + * plugin/libgomp-plugin-intelmic.cpp (offload): Add async_data + argument, handle async offloading. + (register_main_image): Call register_main_image. + (GOMP_OFFLOAD_init_device, get_target_table, GOMP_OFFLOAD_alloc, + GOMP_OFFLOAD_free, GOMP_OFFLOAD_host2dev, GOMP_OFFLOAD_dev2host, + GOMP_OFFLOAD_dev2dev) Adjust offload callers. + (GOMP_OFFLOAD_async_run): New function. + (GOMP_OFFLOAD_run): Implement using GOMP_OFFLOAD_async_run. + 2015-10-26 Ilya Verbin <ilya.verbin@intel.com> Aleksander Ivanushenko <aleksander.ivanushenko@intel.com> diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp index 26ac6fe..772e198 100644 --- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp +++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp @@ -192,11 +192,23 @@ GOMP_OFFLOAD_get_num_devices (void) static void offload (const char *file, uint64_t line, int device, const char *name, - int num_vars, VarDesc *vars, VarDesc2 *vars2) + int num_vars, VarDesc *vars, VarDesc2 *vars2, const void **async_data) { OFFLOAD ofld = __offload_target_acquire1 (&device, file, line); if (ofld) - __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL, NULL); + { + if (async_data == NULL) + __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL, + NULL); + else + { + OffloadFlags flags; + flags.flags = 0; + flags.bits.omp_async = 1; + __offload_offload3 (ofld, name, 0, num_vars, vars, NULL, 0, NULL, + async_data, 0, NULL, flags, NULL); + } + } else { fprintf (stderr, "%s:%d: Offload target acquire failed\n", file, line); @@ -208,6 +220,10 @@ static void register_main_image () { __offload_register_image (&main_target_image); + + /* liboffloadmic will call GOMP_PLUGIN_target_task_completion when + asynchronous task on target is completed. */ + __offload_register_task_callback (GOMP_PLUGIN_target_task_completion); } /* liboffloadmic loads and runs offload_target_main on all available devices @@ -218,7 +234,7 @@ GOMP_OFFLOAD_init_device (int device) TRACE (""); pthread_once (&main_image_is_registered, register_main_image); offload (__FILE__, __LINE__, device, "__offload_target_init_proc", 0, - NULL, NULL); + NULL, NULL, NULL); } extern "C" void @@ -240,7 +256,7 @@ get_target_table (int device, int &num_funcs, int &num_vars, void **&table) VarDesc2 vd1g[2] = { { "num_funcs", 0 }, { "num_vars", 0 } }; offload (__FILE__, __LINE__, device, "__offload_target_table_p1", 2, - vd1, vd1g); + vd1, vd1g, NULL); int table_size = num_funcs + 2 * num_vars; if (table_size > 0) @@ -254,7 +270,7 @@ get_target_table (int device, int &num_funcs, int &num_vars, void **&table) VarDesc2 vd2g = { "table", 0 }; offload (__FILE__, __LINE__, device, "__offload_target_table_p2", 1, - &vd2, &vd2g); + &vd2, &vd2g, NULL); } } @@ -401,8 +417,8 @@ GOMP_OFFLOAD_alloc (int device, size_t size) vd1[1].size = sizeof (void *); VarDesc2 vd1g[2] = { { "size", 0 }, { "tgt_ptr", 0 } }; - offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g); - + offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g, + NULL); return tgt_ptr; } @@ -416,7 +432,8 @@ GOMP_OFFLOAD_free (int device, void *tgt_ptr) vd1.size = sizeof (void *); VarDesc2 vd1g = { "tgt_ptr", 0 }; - offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g); + offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g, + NULL); } extern "C" void * @@ -435,7 +452,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr, VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } }; offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p1", 2, - vd1, vd1g); + vd1, vd1g, NULL); VarDesc vd2 = vd_host2tgt; vd2.ptr = (void *) host_ptr; @@ -443,7 +460,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr, VarDesc2 vd2g = { "var", 0 }; offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p2", 1, - &vd2, &vd2g); + &vd2, &vd2g, NULL); return tgt_ptr; } @@ -464,7 +481,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr, VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } }; offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p1", 2, - vd1, vd1g); + vd1, vd1g, NULL); VarDesc vd2 = vd_tgt2host; vd2.ptr = (void *) host_ptr; @@ -472,7 +489,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr, VarDesc2 vd2g = { "var", 0 }; offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p2", 1, - &vd2, &vd2g); + &vd2, &vd2g, NULL); return host_ptr; } @@ -495,22 +512,32 @@ GOMP_OFFLOAD_dev2dev (int device, void *dst_ptr, const void *src_ptr, VarDesc2 vd1g[3] = { { "dst_ptr", 0 }, { "src_ptr", 0 }, { "size", 0 } }; offload (__FILE__, __LINE__, device, "__offload_target_tgt2tgt", 3, vd1, - vd1g); + vd1g, NULL); return dst_ptr; } extern "C" void -GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars) +GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars, + void *async_data) { - TRACE ("(tgt_fn = %p, tgt_vars = %p)", tgt_fn, tgt_vars); + TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p, async_data = %p)", device, + tgt_fn, tgt_vars, async_data); - VarDesc vd1[2] = { vd_host2tgt, vd_host2tgt }; - vd1[0].ptr = &tgt_fn; - vd1[0].size = sizeof (void *); - vd1[1].ptr = &tgt_vars; - vd1[1].size = sizeof (void *); - VarDesc2 vd1g[2] = { { "tgt_fn", 0 }, { "tgt_vars", 0 } }; + VarDesc vd[2] = { vd_host2tgt, vd_host2tgt }; + vd[0].ptr = &tgt_fn; + vd[0].size = sizeof (void *); + vd[1].ptr = &tgt_vars; + vd[1].size = sizeof (void *); + + offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd, NULL, + (const void **) async_data); +} + +extern "C" void +GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars) +{ + TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p)", device, tgt_fn, tgt_vars); - offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd1, vd1g); + GOMP_OFFLOAD_async_run (device, tgt_fn, tgt_vars, NULL); } diff --git a/liboffloadmic/runtime/offload_host.cpp b/liboffloadmic/runtime/offload_host.cpp index c10dc4e..e52019d 100644 --- a/liboffloadmic/runtime/offload_host.cpp +++ b/liboffloadmic/runtime/offload_host.cpp @@ -64,6 +64,8 @@ static void __offload_fini_library(void); #define GET_OFFLOAD_NUMBER(timer_data) \ timer_data? timer_data->offload_number : 0 +static void (*task_completion_callback)(void *); + extern "C" { #ifdef TARGET_WINNT // Windows does not support imports from libraries without actually @@ -2507,7 +2509,7 @@ extern "C" { const void *info ) { - /* TODO: Call callback function, pass info. */ + task_completion_callback ((void *) info); } } @@ -5669,6 +5671,11 @@ extern "C" void __offload_unregister_image(const void *target_image) } } +extern "C" void __offload_register_task_callback(void (*cb)(void *)) +{ + task_completion_callback = cb; +} + // Runtime trace interface for user programs void __offload_console_trace(int level) diff --git a/liboffloadmic/runtime/offload_host.h b/liboffloadmic/runtime/offload_host.h index afd5c99..2a43fd6 100644 --- a/liboffloadmic/runtime/offload_host.h +++ b/liboffloadmic/runtime/offload_host.h @@ -376,6 +376,9 @@ extern "C" bool __offload_target_image_is_executable(const void *target_image); extern "C" bool __offload_register_image(const void* image); extern "C" void __offload_unregister_image(const void* image); +// Registers asynchronous task completion callback +extern "C" void __offload_register_task_callback(void (*cb)(void *)); + // Initializes offload runtime library. DLL_LOCAL extern int __offload_init_library(void); |