diff options
author | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2024-05-07 08:41:51 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-07 08:41:51 -0500 |
commit | 41ca9104ac1e0bf248d4082f45c5ad03ddd55727 (patch) | |
tree | 60565343107ec5893bf5530aaf5ff82311777930 /openmp/runtime | |
parent | 1d87465a0a95cee9accc5dce7abdabbbc3f3c122 (diff) | |
download | llvm-41ca9104ac1e0bf248d4082f45c5ad03ddd55727.zip llvm-41ca9104ac1e0bf248d4082f45c5ad03ddd55727.tar.gz llvm-41ca9104ac1e0bf248d4082f45c5ad03ddd55727.tar.bz2 |
[OpenMP] Fix task state and taskteams for serial teams (#86859)
* Serial teams now use a stack (similar to dispatch buffers)
* Serial teams always use `t_task_team[0]` as the task team and the
second pointer is a next pointer for the stack
`t_task_team[1]` is interpreted as a stack of task teams where each
level is a nested level
```
inner serial team outer serial team
[ t_task_team[0] ] -> (task_team) [ t_task_team[0] ] -> (task_team)
[ next ] ----------------> [ next ] -> ...
```
* Remove the task state memo stack from thread structure.
* Instead of a thread-private stack, use team structure to store
th_task_state of the primary thread. When coming out of a parallel,
restore the primary thread's task state. The new field in the team
structure doesn't cause sizeof(team) to change and is in the cache line
which is only read/written by the primary thread.
Fixes: #50602
Fixes: #69368
Fixes: #69733
Fixes: #79416
Diffstat (limited to 'openmp/runtime')
-rw-r--r-- | openmp/runtime/src/kmp.h | 29 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_barrier.cpp | 15 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_csupport.cpp | 11 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_runtime.cpp | 179 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_tasking.cpp | 193 | ||||
-rw-r--r-- | openmp/runtime/test/target/issue-81488.c | 36 | ||||
-rw-r--r-- | openmp/runtime/test/tasking/issue-50602.c | 40 | ||||
-rw-r--r-- | openmp/runtime/test/tasking/issue-69368.c | 27 | ||||
-rw-r--r-- | openmp/runtime/test/tasking/issue-69733.c | 147 | ||||
-rw-r--r-- | openmp/runtime/test/tasking/issue-79416.c | 33 | ||||
-rw-r--r-- | openmp/runtime/test/tasking/task_teams_stress_test.cpp | 318 |
11 files changed, 792 insertions, 236 deletions
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 18ccf10..64a3ea6 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -2871,6 +2871,11 @@ union KMP_ALIGN_CACHE kmp_task_team { char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)]; }; +typedef struct kmp_task_team_list_t { + kmp_task_team_t *task_team; + kmp_task_team_list_t *next; +} kmp_task_team_list_t; + #if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5) // Free lists keep same-size free memory slots for fast memory allocation // routines @@ -3008,10 +3013,6 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { kmp_task_team_t *th_task_team; // Task team struct kmp_taskdata_t *th_current_task; // Innermost Task being executed kmp_uint8 th_task_state; // alternating 0/1 for task team identification - kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state - // at nested levels - kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack - kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack kmp_uint32 th_reap_state; // Non-zero indicates thread is not // tasking, thus safe to reap @@ -3133,6 +3134,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team { kmp_disp_t *t_dispatch; // thread's dispatch data kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2 kmp_proc_bind_t t_proc_bind; // bind type for par region + int t_primary_task_state; // primary thread's task state saved #if USE_ITT_BUILD kmp_uint64 t_region_time; // region begin timestamp #endif /* USE_ITT_BUILD */ @@ -3204,6 +3206,12 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team { distributedBarrier *b; // Distributed barrier data associated with team } kmp_base_team_t; +// Assert that the list structure fits and aligns within +// the double task team pointer +KMP_BUILD_ASSERT(sizeof(kmp_task_team_t *[2]) == sizeof(kmp_task_team_list_t)); +KMP_BUILD_ASSERT(alignof(kmp_task_team_t *[2]) == + alignof(kmp_task_team_list_t)); + union KMP_ALIGN_CACHE kmp_team { kmp_base_team_t t; double t_align; /* use worst case alignment */ @@ -4114,9 +4122,10 @@ extern void __kmp_fulfill_event(kmp_event_t *event); extern void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team); extern void __kmp_reap_task_teams(void); +extern void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team); +extern void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team); extern void __kmp_wait_to_unref_task_teams(void); -extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, - int always); +extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team); extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team); extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team #if USE_ITT_BUILD @@ -4127,6 +4136,14 @@ extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team int wait = 1); extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid); +#if KMP_DEBUG +#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) \ + KMP_DEBUG_ASSERT( \ + __kmp_tasking_mode != tskm_task_teams || team->t.t_nproc == 1 || \ + thr->th.th_task_team == team->t.t_task_team[thr->th.th_task_state]) +#else +#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) /* Nothing */ +#endif extern int __kmp_is_address_mapped(void *addr); extern kmp_uint64 __kmp_hardware_timestamp(void); diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index e9ab15f..b381694 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -1858,8 +1858,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, } if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) - // use 0 to only setup the current team if nthreads > 1 - __kmp_task_team_setup(this_thr, team, 0); + __kmp_task_team_setup(this_thr, team); if (cancellable) { cancelled = __kmp_linear_barrier_gather_cancellable( @@ -2042,7 +2041,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered == TRUE); __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); - __kmp_task_team_setup(this_thr, team, 0); + __kmp_task_team_setup(this_thr, team); #if USE_ITT_BUILD if (__itt_sync_create_ptr || KMP_ITT_DEBUG) @@ -2243,9 +2242,7 @@ void __kmp_join_barrier(int gtid) { __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state], this_thr->th.th_task_team)); - if (this_thr->th.th_task_team) - KMP_DEBUG_ASSERT(this_thr->th.th_task_team == - team->t.t_task_team[this_thr->th.th_task_state]); + KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr); } #endif /* KMP_DEBUG */ @@ -2440,10 +2437,8 @@ void __kmp_fork_barrier(int gtid, int tid) { } #endif - if (__kmp_tasking_mode != tskm_immediate_exec) { - // 0 indicates setup current task team if nthreads > 1 - __kmp_task_team_setup(this_thr, team, 0); - } + if (__kmp_tasking_mode != tskm_immediate_exec) + __kmp_task_team_setup(this_thr, team); /* The primary thread may have changed its blocktime between join barrier and fork barrier. Copy the blocktime info to the thread, where diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index 0268f69..f45fe64 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -654,6 +654,12 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { serial_team->t.t_dispatch->th_disp_buffer->next; __kmp_free(disp_buffer); } + + /* pop the task team stack */ + if (serial_team->t.t_serialized > 1) { + __kmp_pop_task_team_node(this_thr, serial_team); + } + this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore --serial_team->t.t_serialized; @@ -692,6 +698,11 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_current_task->td_flags.executing = 1; if (__kmp_tasking_mode != tskm_immediate_exec) { + // Restore task state from serial team structure + KMP_DEBUG_ASSERT(serial_team->t.t_primary_task_state == 0 || + serial_team->t.t_primary_task_state == 1); + this_thr->th.th_task_state = + (kmp_uint8)serial_team->t.t_primary_task_state; // Copy the task team from the new child / old parent team to the thread. this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]; diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index 95acf4d..4be67f3 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -1042,6 +1042,41 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, } } + // Take care of primary thread's task state + if (__kmp_tasking_mode != tskm_immediate_exec) { + if (use_hot_team) { + KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th); + KA_TRACE( + 20, + ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team " + "%p, new task_team %p / team %p\n", + __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, + team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state], + team)); + + // Store primary thread's current task state on new team + KMP_CHECK_UPDATE(team->t.t_primary_task_state, + master_th->th.th_task_state); + + // Restore primary thread's task state to hot team's state + // by using thread 1's task state + if (team->t.t_nproc > 1) { + KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 || + team->t.t_threads[1]->th.th_task_state == 1); + KMP_CHECK_UPDATE(master_th->th.th_task_state, + team->t.t_threads[1]->th.th_task_state); + } else { + master_th->th.th_task_state = 0; + } + } else { + // Store primary thread's current task_state on new team + KMP_CHECK_UPDATE(team->t.t_primary_task_state, + master_th->th.th_task_state); + // Are not using hot team, so set task state to 0. + master_th->th.th_task_state = 0; + } + } + if (__kmp_display_affinity && team->t.t_display_affinity != 1) { for (i = 0; i < team->t.t_nproc; i++) { kmp_info_t *thr = team->t.t_threads[i]; @@ -1145,18 +1180,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { KMP_DEBUG_ASSERT(serial_team); KMP_MB(); - if (__kmp_tasking_mode != tskm_immediate_exec) { - KMP_DEBUG_ASSERT( - this_thr->th.th_task_team == - this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); - KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == - NULL); - KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " - "team %p, new task_team = NULL\n", - global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); - this_thr->th.th_task_team = NULL; - } - kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { proc_bind = proc_bind_false; @@ -1242,6 +1265,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { serial_team->t.t_serialized = 1; serial_team->t.t_nproc = 1; serial_team->t.t_parent = this_thr->th.th_team; + // Save previous team's task state on serial team structure + serial_team->t.t_primary_task_state = this_thr->th.th_task_state; serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; this_thr->th.th_team = serial_team; serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; @@ -1281,6 +1306,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { this_thr->th.th_team_nproc = 1; this_thr->th.th_team_master = this_thr; this_thr->th.th_team_serialized = 1; + this_thr->th.th_task_team = NULL; + this_thr->th.th_task_state = 0; serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; @@ -1332,6 +1359,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { } this_thr->th.th_dispatch = serial_team->t.t_dispatch; + /* allocate/push task team stack */ + __kmp_push_task_team_node(this_thr, serial_team); + KMP_MB(); } KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); @@ -1985,17 +2015,12 @@ int __kmp_fork_call(ident_t *loc, int gtid, ap); } // End parallel closely nested in teams construct -#if KMP_DEBUG - if (__kmp_tasking_mode != tskm_immediate_exec) { - KMP_DEBUG_ASSERT(master_th->th.th_task_team == - parent_team->t.t_task_team[master_th->th.th_task_state]); - } -#endif - // Need this to happen before we determine the number of threads, not while // we are allocating the team //__kmp_push_current_task_to_thread(master_th, parent_team, 0); + KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th); + // Determine the number of threads int enter_teams = __kmp_is_entering_teams(active_level, level, teams_level, ap); @@ -2186,64 +2211,6 @@ int __kmp_fork_call(ident_t *loc, int gtid, ompd_bp_parallel_begin(); #endif - if (__kmp_tasking_mode != tskm_immediate_exec) { - // Set primary thread's task team to team's task team. Unless this is hot - // team, it should be NULL. - KMP_DEBUG_ASSERT(master_th->th.th_task_team == - parent_team->t.t_task_team[master_th->th.th_task_state]); - KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " - "%p, new task_team %p / team %p\n", - __kmp_gtid_from_thread(master_th), - master_th->th.th_task_team, parent_team, - team->t.t_task_team[master_th->th.th_task_state], team)); - - if (active_level || master_th->th.th_task_team) { - // Take a memo of primary thread's task_state - KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); - if (master_th->th.th_task_state_top >= - master_th->th.th_task_state_stack_sz) { // increase size - kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; - kmp_uint8 *old_stack, *new_stack; - kmp_uint32 i; - new_stack = (kmp_uint8 *)__kmp_allocate(new_size); - for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { - new_stack[i] = master_th->th.th_task_state_memo_stack[i]; - } - for (i = master_th->th.th_task_state_stack_sz; i < new_size; - ++i) { // zero-init rest of stack - new_stack[i] = 0; - } - old_stack = master_th->th.th_task_state_memo_stack; - master_th->th.th_task_state_memo_stack = new_stack; - master_th->th.th_task_state_stack_sz = new_size; - __kmp_free(old_stack); - } - // Store primary thread's task_state on stack - master_th->th - .th_task_state_memo_stack[master_th->th.th_task_state_top] = - master_th->th.th_task_state; - master_th->th.th_task_state_top++; -#if KMP_NESTED_HOT_TEAMS - if (master_th->th.th_hot_teams && - active_level < __kmp_hot_teams_max_level && - team == master_th->th.th_hot_teams[active_level].hot_team) { - // Restore primary thread's nested state if nested hot team - master_th->th.th_task_state = - master_th->th - .th_task_state_memo_stack[master_th->th.th_task_state_top]; - } else { -#endif - master_th->th.th_task_state = 0; -#if KMP_NESTED_HOT_TEAMS - } -#endif - } -#if !KMP_NESTED_HOT_TEAMS - KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || - (team == root->r.r_hot_team)); -#endif - } - KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", @@ -2451,8 +2418,7 @@ void __kmp_join_call(ident_t *loc, int gtid __kmp_gtid_from_thread(master_th), team, team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team)); - KMP_DEBUG_ASSERT(master_th->th.th_task_team == - team->t.t_task_team[master_th->th.th_task_state]); + KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th); } #endif @@ -2690,24 +2656,11 @@ void __kmp_join_call(ident_t *loc, int gtid } if (__kmp_tasking_mode != tskm_immediate_exec) { - if (master_th->th.th_task_state_top > - 0) { // Restore task state from memo stack - KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); - // Remember primary thread's state if we re-use this nested hot team - master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = - master_th->th.th_task_state; - --master_th->th.th_task_state_top; // pop - // Now restore state at this level - master_th->th.th_task_state = - master_th->th - .th_task_state_memo_stack[master_th->th.th_task_state_top]; - } else if (team != root->r.r_hot_team) { - // Reset the task state of primary thread if we are not hot team because - // in this case all the worker threads will be free, and their task state - // will be reset. If not reset the primary's, the task state will be - // inconsistent. - master_th->th.th_task_state = 0; - } + // Restore primary thread's task state from team structure + KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 || + team->t.t_primary_task_state == 1); + master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state; + // Copy the task team from the parent team to the primary thread master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state]; @@ -4396,17 +4349,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, this_thr->th.th_next_pool = NULL; - if (!this_thr->th.th_task_state_memo_stack) { - size_t i; - this_thr->th.th_task_state_memo_stack = - (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); - this_thr->th.th_task_state_top = 0; - this_thr->th.th_task_state_stack_sz = 4; - for (i = 0; i < this_thr->th.th_task_state_stack_sz; - ++i) // zero init the stack - this_thr->th.th_task_state_memo_stack[i] = 0; - } - KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); @@ -4463,8 +4405,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, TCW_4(__kmp_nth, __kmp_nth + 1); new_thr->th.th_task_state = 0; - new_thr->th.th_task_state_top = 0; - new_thr->th.th_task_state_stack_sz = 4; if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { // Make sure pool thread has transitioned to waiting on own thread struct @@ -5262,6 +5202,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, // Activate team threads via th_used_in_team __kmp_add_threads_to_team(team, new_nproc); } + // When decreasing team size, threads no longer in the team should + // unref task team. + if (__kmp_tasking_mode != tskm_immediate_exec) { + for (f = new_nproc; f < team->t.t_nproc; f++) { + kmp_info_t *th = team->t.t_threads[f]; + KMP_DEBUG_ASSERT(th); + th->th.th_task_team = NULL; + } + } #if KMP_NESTED_HOT_TEAMS if (__kmp_hot_teams_mode == 0) { // AC: saved number of threads should correspond to team's value in this @@ -5272,11 +5221,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, /* release the extra threads we don't need any more */ for (f = new_nproc; f < team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); - if (__kmp_tasking_mode != tskm_immediate_exec) { - // When decreasing team size, threads no longer in the team should - // unref task team. - team->t.t_threads[f]->th.th_task_team = NULL; - } __kmp_free_thread(team->t.t_threads[f]); team->t.t_threads[f] = NULL; } @@ -6248,11 +6192,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { thread->th.th_pri_common = NULL; } - if (thread->th.th_task_state_memo_stack != NULL) { - __kmp_free(thread->th.th_task_state_memo_stack); - thread->th.th_task_state_memo_stack = NULL; - } - #if KMP_USE_BGET if (thread->th.th_local.bget_data != NULL) { __kmp_finalize_bget(thread); diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index 6303bb0..a782027 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -1511,8 +1511,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, KA_TRACE(30, ("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid)); - // 1 indicates setup the current team regardless of nthreads - __kmp_task_team_setup(thread, team, 1); + __kmp_task_team_setup(thread, team); thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state]; } kmp_task_team_t *task_team = thread->th.th_task_team; @@ -3390,8 +3389,6 @@ static inline int __kmp_execute_tasks_template( nthreads = task_team->tt.tt_nproc; unfinished_threads = &(task_team->tt.tt_unfinished_threads); - KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks || - task_team->tt.tt_hidden_helper_task_encountered); KMP_DEBUG_ASSERT(*unfinished_threads >= 0); while (1) { // Outer loop keeps trying to find tasks in case of single thread @@ -3943,6 +3940,20 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) { __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock); } +static inline void __kmp_task_team_init(kmp_task_team_t *task_team, + kmp_team_t *team) { + int team_nth = team->t.t_nproc; + // Only need to init if task team is isn't active or team size changed + if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) { + TCW_4(task_team->tt.tt_found_tasks, FALSE); + TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); + TCW_4(task_team->tt.tt_nproc, team_nth); + KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth); + TCW_4(task_team->tt.tt_active, TRUE); + } +} + // __kmp_allocate_task_team: // Allocates a task team associated with a specific team, taking it from // the global task team free list if possible. Also initializes data @@ -3950,7 +3961,6 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) { static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, kmp_team_t *team) { kmp_task_team_t *task_team = NULL; - int nthreads; KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", (thread ? __kmp_gtid_from_thread(thread) : -1), team)); @@ -3992,14 +4002,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, // task_team->tt.tt_next = NULL; } - TCW_4(task_team->tt.tt_found_tasks, FALSE); - TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); - TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); - task_team->tt.tt_nproc = nthreads = team->t.t_nproc; - - KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); - TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); - TCW_4(task_team->tt.tt_active, TRUE); + __kmp_task_team_init(task_team, team); KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " "unfinished_threads init'd to %d\n", @@ -4053,6 +4056,40 @@ void __kmp_reap_task_teams(void) { } } +// View the array of two task team pointers as a pair of pointers: +// 1) a single task_team pointer +// 2) next pointer for stack +// Serial teams can create a stack of task teams for nested serial teams. +void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) { + KMP_DEBUG_ASSERT(team->t.t_nproc == 1); + kmp_task_team_list_t *current = + (kmp_task_team_list_t *)(&team->t.t_task_team[0]); + kmp_task_team_list_t *node = + (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t)); + node->task_team = current->task_team; + node->next = current->next; + thread->th.th_task_team = current->task_team = NULL; + current->next = node; +} + +// Serial team pops a task team off the stack +void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) { + KMP_DEBUG_ASSERT(team->t.t_nproc == 1); + kmp_task_team_list_t *current = + (kmp_task_team_list_t *)(&team->t.t_task_team[0]); + if (current->task_team) { + __kmp_free_task_team(thread, current->task_team); + } + kmp_task_team_list_t *next = current->next; + if (next) { + current->task_team = next->task_team; + current->next = next->next; + KMP_DEBUG_ASSERT(next != current); + __kmp_free(next); + thread->th.th_task_team = current->task_team; + } +} + // __kmp_wait_to_unref_task_teams: // Some threads could still be in the fork barrier release code, possibly // trying to steal tasks. Wait for each thread to unreference its task team. @@ -4117,55 +4154,34 @@ void __kmp_wait_to_unref_task_teams(void) { } } -void __kmp_shift_task_state_stack(kmp_info_t *this_thr, kmp_uint8 value) { - // Shift values from th_task_state_top+1 to task_state_stack_sz - if (this_thr->th.th_task_state_top + 1 >= - this_thr->th.th_task_state_stack_sz) { // increase size - kmp_uint32 new_size = 2 * this_thr->th.th_task_state_stack_sz; - kmp_uint8 *old_stack, *new_stack; - kmp_uint32 i; - new_stack = (kmp_uint8 *)__kmp_allocate(new_size); - for (i = 0; i <= this_thr->th.th_task_state_top; ++i) { - new_stack[i] = this_thr->th.th_task_state_memo_stack[i]; - } - // If we need to reallocate do the shift at the same time. - for (; i < this_thr->th.th_task_state_stack_sz; ++i) { - new_stack[i + 1] = this_thr->th.th_task_state_memo_stack[i]; - } - for (i = this_thr->th.th_task_state_stack_sz; i < new_size; - ++i) { // zero-init rest of stack - new_stack[i] = 0; - } - old_stack = this_thr->th.th_task_state_memo_stack; - this_thr->th.th_task_state_memo_stack = new_stack; - this_thr->th.th_task_state_stack_sz = new_size; - __kmp_free(old_stack); - } else { - kmp_uint8 *end; - kmp_uint32 i; - - end = &this_thr->th - .th_task_state_memo_stack[this_thr->th.th_task_state_stack_sz]; - - for (i = this_thr->th.th_task_state_stack_sz - 1; - i > this_thr->th.th_task_state_top; i--, end--) - end[0] = end[-1]; - } - this_thr->th.th_task_state_memo_stack[this_thr->th.th_task_state_top + 1] = - value; -} - // __kmp_task_team_setup: Create a task_team for the current team, but use // an already created, unused one if it already exists. -void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { +void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) { KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + // For the serial and root teams, setup the first task team pointer to point + // to task team. The other pointer is a stack of task teams from previous + // serial levels. + if (team == this_thr->th.th_serial_team || + team == this_thr->th.th_root->r.r_root_team) { + KMP_DEBUG_ASSERT(team->t.t_nproc == 1); + if (team->t.t_task_team[0] == NULL) { + team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team); + KA_TRACE( + 20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p" + " for serial/root team %p\n", + __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team)); + + } else + __kmp_task_team_init(team->t.t_task_team[0], team); + return; + } + // If this task_team hasn't been created yet, allocate it. It will be used in // the region after the next. // If it exists, it is the current task team and shouldn't be touched yet as // it may still be in use. - if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && - (always || team->t.t_nproc > 1)) { + if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) { team->t.t_task_team[this_thr->th.th_task_state] = __kmp_allocate_task_team(this_thr, team); KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p" @@ -4174,52 +4190,31 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id, this_thr->th.th_task_state)); } - if (this_thr->th.th_task_state == 1 && always && team->t.t_nproc == 1) { - // fix task state stack to adjust for proxy and helper tasks - KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d needs to shift stack" - " for team %d at parity=%d\n", - __kmp_gtid_from_thread(this_thr), team->t.t_id, - this_thr->th.th_task_state)); - __kmp_shift_task_state_stack(this_thr, this_thr->th.th_task_state); - } // After threads exit the release, they will call sync, and then point to this // other task_team; make sure it is allocated and properly initialized. As // threads spin in the barrier release phase, they will continue to use the // previous task_team struct(above), until they receive the signal to stop // checking for tasks (they can't safely reference the kmp_team_t struct, - // which could be reallocated by the primary thread). No task teams are formed - // for serialized teams. - if (team->t.t_nproc > 1) { - int other_team = 1 - this_thr->th.th_task_state; - KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2); - if (team->t.t_task_team[other_team] == NULL) { // setup other team as well - team->t.t_task_team[other_team] = - __kmp_allocate_task_team(this_thr, team); - KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new " - "task_team %p for team %d at parity=%d\n", - __kmp_gtid_from_thread(this_thr), - team->t.t_task_team[other_team], team->t.t_id, other_team)); - } else { // Leave the old task team struct in place for the upcoming region; - // adjust as needed - kmp_task_team_t *task_team = team->t.t_task_team[other_team]; - if (!task_team->tt.tt_active || - team->t.t_nproc != task_team->tt.tt_nproc) { - TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); - TCW_4(task_team->tt.tt_found_tasks, FALSE); - TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); - TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); - KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, - team->t.t_nproc); - TCW_4(task_team->tt.tt_active, TRUE); - } - // if team size has changed, the first thread to enable tasking will - // realloc threads_data if necessary - KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team " - "%p for team %d at parity=%d\n", - __kmp_gtid_from_thread(this_thr), - team->t.t_task_team[other_team], team->t.t_id, other_team)); - } + // which could be reallocated by the primary thread). + int other_team = 1 - this_thr->th.th_task_state; + KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2); + if (team->t.t_task_team[other_team] == NULL) { // setup other team as well + team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team); + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new " + "task_team %p for team %d at parity=%d\n", + __kmp_gtid_from_thread(this_thr), + team->t.t_task_team[other_team], team->t.t_id, other_team)); + } else { // Leave the old task team struct in place for the upcoming region; + // adjust as needed + kmp_task_team_t *task_team = team->t.t_task_team[other_team]; + __kmp_task_team_init(task_team, team); + // if team size has changed, the first thread to enable tasking will + // realloc threads_data if necessary + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team " + "%p for team %d at parity=%d\n", + __kmp_gtid_from_thread(this_thr), + team->t.t_task_team[other_team], team->t.t_id, other_team)); } // For regular thread, task enabling should be called when the task is going @@ -4245,9 +4240,11 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { // __kmp_task_team_sync: Propagation of task team data from team to threads // which happens just after the release phase of a team barrier. This may be -// called by any thread, but only for teams with # threads > 1. +// called by any thread. This is not called for serial or root teams. void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team); + KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team); // Toggle the th_task_state field, to switch which task_team this thread // refers to @@ -4265,8 +4262,7 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { } // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the -// barrier gather phase. Only called by primary thread if #threads in team > 1 -// or if proxy tasks were created. +// barrier gather phase. Only called by the primary thread. // // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off // by passing in 0 optionally as the last argument. When wait is zero, primary @@ -4300,9 +4296,6 @@ void __kmp_task_team_wait( ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: " "setting active to false, setting local and team's pointer to NULL\n", __kmp_gtid_from_thread(this_thr), task_team)); - KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || - task_team->tt.tt_found_proxy_tasks == TRUE || - task_team->tt.tt_hidden_helper_task_encountered == TRUE); TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); diff --git a/openmp/runtime/test/target/issue-81488.c b/openmp/runtime/test/target/issue-81488.c new file mode 100644 index 0000000..adac7d6 --- /dev/null +++ b/openmp/runtime/test/target/issue-81488.c @@ -0,0 +1,36 @@ +// RUN: %libomp-compile +// RUN: env OMP_NUM_THREADS=1 LIBOMP_USE_HIDDEN_HELPER_TASK=1 \ +// RUN: LIBOMP_NUM_HIDDEN_HELPER_THREADS=8 %libomp-run + +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +#define Nz 8 +#define DEVICE_ID 0 + +int a[Nz]; + +int main(void) { + for (int n = 0; n < 10; ++n) { + for (int k = 0; k < Nz; ++k) { + a[k] = -1; + } +#pragma omp parallel shared(a) + { +#pragma omp single + { +#pragma omp target teams distribute parallel for nowait device(DEVICE_ID) \ + map(tofrom : a[0 : 8]) + for (int i = 0; i < Nz; ++i) { + a[i] = i; + } + } +#pragma omp barrier + } + for (int k = 0; k < Nz; ++k) { + printf("a[%d] = %d\n", k, a[k]); + } + } + return 0; +} diff --git a/openmp/runtime/test/tasking/issue-50602.c b/openmp/runtime/test/tasking/issue-50602.c new file mode 100644 index 0000000..b691204 --- /dev/null +++ b/openmp/runtime/test/tasking/issue-50602.c @@ -0,0 +1,40 @@ +// RUN: %libomp-compile-and-run +// RUN: env OMP_NUM_THREADS=1 %libomp-run +// RUN: %libomp-compile -DUSE_HIDDEN_HELPERS=1 +// RUN: %libomp-run +// RUN: env OMP_NUM_THREADS=1 %libomp-run +#include <omp.h> + +int main(int argc, char *argv[]) { + int i; + + omp_set_max_active_levels(1); + omp_set_dynamic(0); + + for (i = 0; i < 10; ++i) { +#pragma omp parallel + { +#ifndef USE_HIDDEN_HELPERS + omp_event_handle_t event; +#endif + int a = 0; + +#ifdef USE_HIDDEN_HELPERS +#pragma omp target map(tofrom : a) nowait +#else +#pragma omp task shared(a) detach(event) +#endif + { a = 1; } + +#pragma omp parallel + { a = 2; } + +#ifndef USE_HIDDEN_HELPERS + omp_fulfill_event(event); +#endif + +#pragma omp taskwait + } + } + return 0; +} diff --git a/openmp/runtime/test/tasking/issue-69368.c b/openmp/runtime/test/tasking/issue-69368.c new file mode 100644 index 0000000..57bd741 --- /dev/null +++ b/openmp/runtime/test/tasking/issue-69368.c @@ -0,0 +1,27 @@ +// RUN: %libomp-compile-and-run +// RUN: env OMP_NUM_THREADS=1 %libomp-run + +int main() { + int i; + int a[2]; + volatile int attempt = 0; + + for (i = 0; i < 10; ++i) { + a[0] = a[1] = 0; +#pragma omp parallel for + for (int i = 0; i < 2; i++) { + a[i] = 2; + } + if (a[0] != 2 || a[1] != 2) + return 1; + +#pragma omp teams distribute parallel for if (attempt >= 2) + for (int i = 0; i < 2; i++) { + a[i] = 1; + } + if (a[0] != 1 || a[1] != 1) + return 1; + } + + return 0; +} diff --git a/openmp/runtime/test/tasking/issue-69733.c b/openmp/runtime/test/tasking/issue-69733.c new file mode 100644 index 0000000..5775b01 --- /dev/null +++ b/openmp/runtime/test/tasking/issue-69733.c @@ -0,0 +1,147 @@ +// RUN: %libomp-compile-and-run + +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +int a; + +void inc_a() { +#pragma omp atomic + a++; +} + +void root_team_detached() { + a = 0; + omp_event_handle_t ev; +#pragma omp task detach(ev) + inc_a(); + omp_fulfill_event(ev); + if (a != 1) { + fprintf(stderr, "error: root_team_detached(): a != 1\n"); + exit(EXIT_FAILURE); + } +} + +void root_team_hidden_helpers() { + a = 0; +#pragma omp target nowait + inc_a(); + +#pragma omp taskwait + + if (a != 1) { + fprintf(stderr, "error: root_team_hidden_helpers(): a != 1\n"); + exit(EXIT_FAILURE); + } +} + +void parallel_detached(int nth1) { + a = 0; + omp_event_handle_t *evs = + (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nth1); +#pragma omp parallel num_threads(nth1) + { + int tid = omp_get_thread_num(); + omp_event_handle_t e = evs[tid]; +#pragma omp task detach(e) + inc_a(); + omp_fulfill_event(e); + } + free(evs); + if (a != nth1) { + fprintf(stderr, "error: parallel_detached(): a (%d) != %d\n", a, nth1); + exit(EXIT_FAILURE); + } +} + +void parallel_hidden_helpers(int nth1) { + a = 0; +#pragma omp parallel num_threads(nth1) + { +#pragma omp target nowait + inc_a(); + } + if (a != nth1) { + fprintf(stderr, "error: parallel_hidden_helpers(): a (%d) != %d\n", a, + nth1); + exit(EXIT_FAILURE); + } +} + +void nested_parallel_detached(int nth1, int nth2) { + a = 0; + omp_event_handle_t **evs = + (omp_event_handle_t **)malloc(sizeof(omp_event_handle_t *) * nth1); +#pragma omp parallel num_threads(nth1) + { + int tid = omp_get_thread_num(); + evs[tid] = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nth2); +#pragma omp parallel num_threads(nth2) shared(tid) + { + int tid2 = omp_get_thread_num(); + omp_event_handle_t e = evs[tid][tid2]; +#pragma omp task detach(e) + inc_a(); + omp_fulfill_event(e); + } + free(evs[tid]); + } + free(evs); + if (a != nth1 * nth2) { + fprintf(stderr, "error: nested_parallel_detached(): a (%d) != %d * %d\n", a, + nth1, nth2); + exit(EXIT_FAILURE); + } +} + +void nested_parallel_hidden_helpers(int nth1, int nth2) { + a = 0; +#pragma omp parallel num_threads(nth1) + { +#pragma omp parallel num_threads(nth2) + { +#pragma omp target nowait + inc_a(); + } + } + if (a != nth1 * nth2) { + fprintf(stderr, + "error: nested_parallel_hidden_helpers(): a (%d) != %d * %d\n", a, + nth1, nth2); + exit(EXIT_FAILURE); + } +} + +int main() { + int i, nth1, nth2; + + omp_set_max_active_levels(2); + omp_set_dynamic(0); + + for (i = 0; i < 10; ++i) + root_team_detached(); + + for (i = 0; i < 10; ++i) + root_team_hidden_helpers(); + + for (i = 0; i < 10; ++i) + for (nth1 = 1; nth1 <= 4; ++nth1) + parallel_detached(nth1); + + for (i = 0; i < 10; ++i) + for (nth1 = 1; nth1 <= 4; ++nth1) + parallel_hidden_helpers(nth1); + + for (i = 0; i < 10; ++i) + for (nth1 = 1; nth1 <= 4; ++nth1) + for (nth2 = 1; nth2 <= 4; ++nth2) + nested_parallel_detached(nth1, nth2); + + for (i = 0; i < 10; ++i) + for (nth1 = 1; nth1 <= 4; ++nth1) + for (nth2 = 1; nth2 <= 4; ++nth2) + nested_parallel_hidden_helpers(nth1, nth2); + + return 0; +} diff --git a/openmp/runtime/test/tasking/issue-79416.c b/openmp/runtime/test/tasking/issue-79416.c new file mode 100644 index 0000000..ee96fce --- /dev/null +++ b/openmp/runtime/test/tasking/issue-79416.c @@ -0,0 +1,33 @@ +// RUN: %libomp-compile-and-run +#include <stdio.h> +#include <stdlib.h> + +int a; + +void run(int nteams, int nth) { + a = 0; +#pragma omp teams num_teams(nteams) + { +#pragma omp parallel num_threads(nth) + { +#pragma omp task + { +#pragma omp atomic + a++; + } + } + } + if (a == 0) + exit(EXIT_FAILURE); +} + +int main() { + int i, nteams, nth; + for (nteams = 1; nteams <= 2; ++nteams) + for (nth = 1; nth <= 3; ++nth) + for (i = 0; i < 10; ++i) { + printf("run(%d, %d)\n", nteams, nth); + run(nteams, nth); + } + return EXIT_SUCCESS; +} diff --git a/openmp/runtime/test/tasking/task_teams_stress_test.cpp b/openmp/runtime/test/tasking/task_teams_stress_test.cpp new file mode 100644 index 0000000..e781a89 --- /dev/null +++ b/openmp/runtime/test/tasking/task_teams_stress_test.cpp @@ -0,0 +1,318 @@ +// RUN: %libomp-cxx-compile +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run +// +// RUN: %libomp-cxx-compile -DUSE_HIDDEN_HELPERS=1 +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run +// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run + +// This test stresses the task team mechanism by running a simple +// increment task over and over with varying number of threads and nesting. +// The test covers nested serial teams and mixing serial teams with +// normal active teams. + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +// The number of times to run each test +#define NTIMES 5 + +// Regular single increment task +void task_inc_a(int *a) { +#pragma omp task + { +#pragma omp atomic + (*a)++; + } +} + +// Splitting increment task that binary splits the incrementing task +void task_inc_split_a(int *a, int low, int high) { +#pragma omp task firstprivate(low, high) + { + if (low == high) { +#pragma omp atomic + (*a)++; + } else if (low < high) { + int mid = (high - low) / 2 + low; + task_inc_split_a(a, low, mid); + task_inc_split_a(a, mid + 1, high); + } + } +} + +#ifdef USE_HIDDEN_HELPERS +// Hidden helper tasks force serial regions to create task teams +void task_inc_a_hidden_helper(int *a) { +#pragma omp target map(tofrom : a[0]) nowait + { +#pragma omp atomic + (*a)++; + } +} +#else +// Detached tasks force serial regions to create task teams +void task_inc_a_detached(int *a, omp_event_handle_t handle) { +#pragma omp task detach(handle) + { +#pragma omp atomic + (*a)++; + omp_fulfill_event(handle); + } +} +#endif + +void check_a(int *a, int expected) { + if (*a != expected) { + fprintf(stderr, + "FAIL: a = %d instead of expected = %d. Compile with " + "-DVERBOSE for more verbose output.\n", + *a, expected); + exit(EXIT_FAILURE); + } +} + +// Every thread creates a single "increment" task +void test_tasks(omp_event_handle_t *handles, int expected, int *a) { + int tid = omp_get_thread_num(); + + task_inc_a(a); + +#pragma omp barrier + check_a(a, expected); +#pragma omp barrier + check_a(a, expected); +#pragma omp barrier + +#ifdef USE_HIDDEN_HELPERS + task_inc_a_hidden_helper(a); +#else + task_inc_a_detached(a, handles[tid]); +#endif + +#pragma omp barrier + check_a(a, 2 * expected); +#pragma omp barrier + task_inc_a(a); +#pragma omp barrier + check_a(a, 3 * expected); +} + +// Testing single level of parallelism with increment tasks +void test_base(int nthreads) { +#ifdef VERBOSE +#pragma omp master + printf(" test_base(%d)\n", nthreads); +#endif + int a = 0; + omp_event_handle_t *handles; + handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads); +#pragma omp parallel num_threads(nthreads) shared(a) + { test_tasks(handles, nthreads, &a); } + free(handles); +} + +// Testing nested parallel with increment tasks +// first = nthreads of outer parallel +// second = nthreads of nested parallel +void test_nest(int first, int second) { +#ifdef VERBOSE +#pragma omp master + printf(" test_nest(%d, %d)\n", first, second); +#endif +#pragma omp parallel num_threads(first) + { test_base(second); } +} + +// Testing 2-level nested parallels with increment tasks +// first = nthreads of outer parallel +// second = nthreads of nested parallel +// third = nthreads of second nested parallel +void test_nest2(int first, int second, int third) { +#ifdef VERBOSE +#pragma omp master + printf(" test_nest2(%d, %d, %d)\n", first, second, third); +#endif +#pragma omp parallel num_threads(first) + { test_nest(second, third); } +} + +// Testing 3-level nested parallels with increment tasks +// first = nthreads of outer parallel +// second = nthreads of nested parallel +// third = nthreads of second nested parallel +// fourth = nthreads of third nested parallel +void test_nest3(int first, int second, int third, int fourth) { +#ifdef VERBOSE +#pragma omp master + printf(" test_nest3(%d, %d, %d, %d)\n", first, second, third, fourth); +#endif +#pragma omp parallel num_threads(first) + { test_nest2(second, third, fourth); } +} + +// Testing 4-level nested parallels with increment tasks +// first = nthreads of outer parallel +// second = nthreads of nested parallel +// third = nthreads of second nested parallel +// fourth = nthreads of third nested parallel +// fifth = nthreads of fourth nested parallel +void test_nest4(int first, int second, int third, int fourth, int fifth) { +#ifdef VERBOSE +#pragma omp master + printf("test_nest4(%d, %d, %d, %d, %d)\n", first, second, third, fourth, + fifth); +#endif +#pragma omp parallel num_threads(first) + { test_nest3(second, third, fourth, fifth); } +} + +// Single thread starts a binary splitting "increment" task +// Detached tasks are still single "increment" task +void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) { + int tid = omp_get_thread_num(); + +#pragma omp single + task_inc_split_a(a, 1, expected); // task team A + +#pragma omp barrier + check_a(a, expected); +#pragma omp barrier + check_a(a, expected); +#pragma omp barrier + +#ifdef USE_HIDDEN_HELPERS + task_inc_a_hidden_helper(a); +#else + task_inc_a_detached(a, handles[tid]); +#endif + +#pragma omp barrier + check_a(a, 2 * expected); +#pragma omp barrier +#pragma omp single + task_inc_split_a(a, 1, expected); // task team B +#pragma omp barrier + check_a(a, 3 * expected); +} + +// Testing single level of parallelism with splitting incrementing tasks +void test_base_split(int nthreads) { +#ifdef VERBOSE +#pragma omp master + printf(" test_base_split(%d)\n", nthreads); +#endif + int a = 0; + omp_event_handle_t *handles; + handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads); +#pragma omp parallel num_threads(nthreads) shared(a) + { test_tasks_split(handles, nthreads, &a); } + free(handles); +} + +// Testing nested parallels with splitting tasks +// first = nthreads of outer parallel +// second = nthreads of nested parallel +void test_nest_split(int first, int second) { +#ifdef VERBOSE +#pragma omp master + printf(" test_nest_split(%d, %d)\n", first, second); +#endif +#pragma omp parallel num_threads(first) + { test_base_split(second); } +} + +// Testing doubly nested parallels with splitting tasks +// first = nthreads of outer parallel +// second = nthreads of nested parallel +// third = nthreads of second nested parallel +void test_nest2_split(int first, int second, int third) { +#ifdef VERBOSE +#pragma omp master + printf("test_nest2_split(%d, %d, %d)\n", first, second, third); +#endif +#pragma omp parallel num_threads(first) + { test_nest_split(second, third); } +} + +template <typename... Args> +void run_ntimes(int n, void (*func)(Args...), Args... args) { + for (int i = 0; i < n; ++i) { + func(args...); + } +} + +int main() { + omp_set_max_active_levels(5); + + run_ntimes(NTIMES, test_base, 4); + run_ntimes(NTIMES, test_base, 1); + run_ntimes(NTIMES, test_base, 8); + run_ntimes(NTIMES, test_base, 2); + run_ntimes(NTIMES, test_base, 6); + run_ntimes(NTIMES, test_nest, 1, 1); + run_ntimes(NTIMES, test_nest, 1, 5); + run_ntimes(NTIMES, test_nest, 2, 6); + run_ntimes(NTIMES, test_nest, 1, 1); + run_ntimes(NTIMES, test_nest, 4, 3); + run_ntimes(NTIMES, test_nest, 3, 2); + run_ntimes(NTIMES, test_nest, 1, 1); + run_ntimes(NTIMES, test_nest2, 1, 1, 2); + run_ntimes(NTIMES, test_nest2, 1, 2, 1); + run_ntimes(NTIMES, test_nest2, 2, 2, 1); + run_ntimes(NTIMES, test_nest2, 2, 1, 1); + run_ntimes(NTIMES, test_nest2, 4, 2, 1); + run_ntimes(NTIMES, test_nest2, 4, 2, 2); + run_ntimes(NTIMES, test_nest2, 1, 1, 1); + run_ntimes(NTIMES, test_nest2, 4, 2, 2); + run_ntimes(NTIMES, test_nest3, 1, 1, 1, 1); + run_ntimes(NTIMES, test_nest3, 1, 2, 1, 1); + run_ntimes(NTIMES, test_nest3, 1, 1, 2, 1); + run_ntimes(NTIMES, test_nest3, 1, 1, 1, 2); + run_ntimes(NTIMES, test_nest3, 2, 1, 1, 1); + run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1); + run_ntimes(NTIMES, test_nest4, 2, 1, 1, 1, 1); + run_ntimes(NTIMES, test_nest4, 1, 2, 1, 1, 1); + run_ntimes(NTIMES, test_nest4, 1, 1, 2, 1, 1); + run_ntimes(NTIMES, test_nest4, 1, 1, 1, 2, 1); + run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 2); + run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1); + run_ntimes(NTIMES, test_nest4, 1, 2, 1, 2, 1); + + run_ntimes(NTIMES, test_base_split, 4); + run_ntimes(NTIMES, test_base_split, 2); + + run_ntimes(NTIMES, test_base_split, 7); + + run_ntimes(NTIMES, test_base_split, 1); + run_ntimes(NTIMES, test_nest_split, 4, 2); + run_ntimes(NTIMES, test_nest_split, 2, 1); + + run_ntimes(NTIMES, test_nest_split, 7, 2); + run_ntimes(NTIMES, test_nest_split, 1, 1); + run_ntimes(NTIMES, test_nest_split, 1, 4); + + run_ntimes(NTIMES, test_nest2_split, 1, 1, 2); + run_ntimes(NTIMES, test_nest2_split, 1, 2, 1); + run_ntimes(NTIMES, test_nest2_split, 2, 2, 1); + run_ntimes(NTIMES, test_nest2_split, 2, 1, 1); + run_ntimes(NTIMES, test_nest2_split, 4, 2, 1); + run_ntimes(NTIMES, test_nest2_split, 4, 2, 2); + run_ntimes(NTIMES, test_nest2_split, 1, 1, 1); + run_ntimes(NTIMES, test_nest2_split, 4, 2, 2); + + printf("PASS\n"); + return EXIT_SUCCESS; +} |