aboutsummaryrefslogtreecommitdiff
path: root/openmp/runtime
diff options
context:
space:
mode:
authorJonathan Peyton <jonathan.l.peyton@intel.com>2024-05-07 08:41:51 -0500
committerGitHub <noreply@github.com>2024-05-07 08:41:51 -0500
commit41ca9104ac1e0bf248d4082f45c5ad03ddd55727 (patch)
tree60565343107ec5893bf5530aaf5ff82311777930 /openmp/runtime
parent1d87465a0a95cee9accc5dce7abdabbbc3f3c122 (diff)
downloadllvm-41ca9104ac1e0bf248d4082f45c5ad03ddd55727.zip
llvm-41ca9104ac1e0bf248d4082f45c5ad03ddd55727.tar.gz
llvm-41ca9104ac1e0bf248d4082f45c5ad03ddd55727.tar.bz2
[OpenMP] Fix task state and taskteams for serial teams (#86859)
* Serial teams now use a stack (similar to dispatch buffers) * Serial teams always use `t_task_team[0]` as the task team and the second pointer is a next pointer for the stack `t_task_team[1]` is interpreted as a stack of task teams where each level is a nested level ``` inner serial team outer serial team [ t_task_team[0] ] -> (task_team) [ t_task_team[0] ] -> (task_team) [ next ] ----------------> [ next ] -> ... ``` * Remove the task state memo stack from thread structure. * Instead of a thread-private stack, use team structure to store th_task_state of the primary thread. When coming out of a parallel, restore the primary thread's task state. The new field in the team structure doesn't cause sizeof(team) to change and is in the cache line which is only read/written by the primary thread. Fixes: #50602 Fixes: #69368 Fixes: #69733 Fixes: #79416
Diffstat (limited to 'openmp/runtime')
-rw-r--r--openmp/runtime/src/kmp.h29
-rw-r--r--openmp/runtime/src/kmp_barrier.cpp15
-rw-r--r--openmp/runtime/src/kmp_csupport.cpp11
-rw-r--r--openmp/runtime/src/kmp_runtime.cpp179
-rw-r--r--openmp/runtime/src/kmp_tasking.cpp193
-rw-r--r--openmp/runtime/test/target/issue-81488.c36
-rw-r--r--openmp/runtime/test/tasking/issue-50602.c40
-rw-r--r--openmp/runtime/test/tasking/issue-69368.c27
-rw-r--r--openmp/runtime/test/tasking/issue-69733.c147
-rw-r--r--openmp/runtime/test/tasking/issue-79416.c33
-rw-r--r--openmp/runtime/test/tasking/task_teams_stress_test.cpp318
11 files changed, 792 insertions, 236 deletions
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 18ccf10..64a3ea6 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2871,6 +2871,11 @@ union KMP_ALIGN_CACHE kmp_task_team {
char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)];
};
+typedef struct kmp_task_team_list_t {
+ kmp_task_team_t *task_team;
+ kmp_task_team_list_t *next;
+} kmp_task_team_list_t;
+
#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
// Free lists keep same-size free memory slots for fast memory allocation
// routines
@@ -3008,10 +3013,6 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
kmp_task_team_t *th_task_team; // Task team struct
kmp_taskdata_t *th_current_task; // Innermost Task being executed
kmp_uint8 th_task_state; // alternating 0/1 for task team identification
- kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state
- // at nested levels
- kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack
- kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack
kmp_uint32 th_reap_state; // Non-zero indicates thread is not
// tasking, thus safe to reap
@@ -3133,6 +3134,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
kmp_disp_t *t_dispatch; // thread's dispatch data
kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2
kmp_proc_bind_t t_proc_bind; // bind type for par region
+ int t_primary_task_state; // primary thread's task state saved
#if USE_ITT_BUILD
kmp_uint64 t_region_time; // region begin timestamp
#endif /* USE_ITT_BUILD */
@@ -3204,6 +3206,12 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
distributedBarrier *b; // Distributed barrier data associated with team
} kmp_base_team_t;
+// Assert that the list structure fits and aligns within
+// the double task team pointer
+KMP_BUILD_ASSERT(sizeof(kmp_task_team_t *[2]) == sizeof(kmp_task_team_list_t));
+KMP_BUILD_ASSERT(alignof(kmp_task_team_t *[2]) ==
+ alignof(kmp_task_team_list_t));
+
union KMP_ALIGN_CACHE kmp_team {
kmp_base_team_t t;
double t_align; /* use worst case alignment */
@@ -4114,9 +4122,10 @@ extern void __kmp_fulfill_event(kmp_event_t *event);
extern void __kmp_free_task_team(kmp_info_t *thread,
kmp_task_team_t *task_team);
extern void __kmp_reap_task_teams(void);
+extern void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team);
+extern void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team);
extern void __kmp_wait_to_unref_task_teams(void);
-extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team,
- int always);
+extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team);
extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team);
extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
#if USE_ITT_BUILD
@@ -4127,6 +4136,14 @@ extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
int wait = 1);
extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread,
int gtid);
+#if KMP_DEBUG
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) \
+ KMP_DEBUG_ASSERT( \
+ __kmp_tasking_mode != tskm_task_teams || team->t.t_nproc == 1 || \
+ thr->th.th_task_team == team->t.t_task_team[thr->th.th_task_state])
+#else
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) /* Nothing */
+#endif
extern int __kmp_is_address_mapped(void *addr);
extern kmp_uint64 __kmp_hardware_timestamp(void);
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index e9ab15f..b381694 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -1858,8 +1858,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
}
if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
- // use 0 to only setup the current team if nthreads > 1
- __kmp_task_team_setup(this_thr, team, 0);
+ __kmp_task_team_setup(this_thr, team);
if (cancellable) {
cancelled = __kmp_linear_barrier_gather_cancellable(
@@ -2042,7 +2041,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
TRUE);
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
- __kmp_task_team_setup(this_thr, team, 0);
+ __kmp_task_team_setup(this_thr, team);
#if USE_ITT_BUILD
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
@@ -2243,9 +2242,7 @@ void __kmp_join_barrier(int gtid) {
__kmp_gtid_from_thread(this_thr), team_id,
team->t.t_task_team[this_thr->th.th_task_state],
this_thr->th.th_task_team));
- if (this_thr->th.th_task_team)
- KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
- team->t.t_task_team[this_thr->th.th_task_state]);
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
}
#endif /* KMP_DEBUG */
@@ -2440,10 +2437,8 @@ void __kmp_fork_barrier(int gtid, int tid) {
}
#endif
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // 0 indicates setup current task team if nthreads > 1
- __kmp_task_team_setup(this_thr, team, 0);
- }
+ if (__kmp_tasking_mode != tskm_immediate_exec)
+ __kmp_task_team_setup(this_thr, team);
/* The primary thread may have changed its blocktime between join barrier
and fork barrier. Copy the blocktime info to the thread, where
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 0268f69..f45fe64 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -654,6 +654,12 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
serial_team->t.t_dispatch->th_disp_buffer->next;
__kmp_free(disp_buffer);
}
+
+ /* pop the task team stack */
+ if (serial_team->t.t_serialized > 1) {
+ __kmp_pop_task_team_node(this_thr, serial_team);
+ }
+
this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
--serial_team->t.t_serialized;
@@ -692,6 +698,11 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_current_task->td_flags.executing = 1;
if (__kmp_tasking_mode != tskm_immediate_exec) {
+ // Restore task state from serial team structure
+ KMP_DEBUG_ASSERT(serial_team->t.t_primary_task_state == 0 ||
+ serial_team->t.t_primary_task_state == 1);
+ this_thr->th.th_task_state =
+ (kmp_uint8)serial_team->t.t_primary_task_state;
// Copy the task team from the new child / old parent team to the thread.
this_thr->th.th_task_team =
this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 95acf4d..4be67f3 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -1042,6 +1042,41 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
}
}
+ // Take care of primary thread's task state
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ if (use_hot_team) {
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
+ KA_TRACE(
+ 20,
+ ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
+ "%p, new task_team %p / team %p\n",
+ __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
+ team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
+ team));
+
+ // Store primary thread's current task state on new team
+ KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+ master_th->th.th_task_state);
+
+ // Restore primary thread's task state to hot team's state
+ // by using thread 1's task state
+ if (team->t.t_nproc > 1) {
+ KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
+ team->t.t_threads[1]->th.th_task_state == 1);
+ KMP_CHECK_UPDATE(master_th->th.th_task_state,
+ team->t.t_threads[1]->th.th_task_state);
+ } else {
+ master_th->th.th_task_state = 0;
+ }
+ } else {
+ // Store primary thread's current task_state on new team
+ KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+ master_th->th.th_task_state);
+ // Are not using hot team, so set task state to 0.
+ master_th->th.th_task_state = 0;
+ }
+ }
+
if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
for (i = 0; i < team->t.t_nproc; i++) {
kmp_info_t *thr = team->t.t_threads[i];
@@ -1145,18 +1180,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
KMP_DEBUG_ASSERT(serial_team);
KMP_MB();
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- KMP_DEBUG_ASSERT(
- this_thr->th.th_task_team ==
- this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
- KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
- NULL);
- KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
- "team %p, new task_team = NULL\n",
- global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
- this_thr->th.th_task_team = NULL;
- }
-
kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
@@ -1242,6 +1265,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
serial_team->t.t_serialized = 1;
serial_team->t.t_nproc = 1;
serial_team->t.t_parent = this_thr->th.th_team;
+ // Save previous team's task state on serial team structure
+ serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
@@ -1281,6 +1306,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_team_nproc = 1;
this_thr->th.th_team_master = this_thr;
this_thr->th.th_team_serialized = 1;
+ this_thr->th.th_task_team = NULL;
+ this_thr->th.th_task_state = 0;
serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
@@ -1332,6 +1359,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
+ /* allocate/push task team stack */
+ __kmp_push_task_team_node(this_thr, serial_team);
+
KMP_MB();
}
KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
@@ -1985,17 +2015,12 @@ int __kmp_fork_call(ident_t *loc, int gtid,
ap);
} // End parallel closely nested in teams construct
-#if KMP_DEBUG
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- parent_team->t.t_task_team[master_th->th.th_task_state]);
- }
-#endif
-
// Need this to happen before we determine the number of threads, not while
// we are allocating the team
//__kmp_push_current_task_to_thread(master_th, parent_team, 0);
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
+
// Determine the number of threads
int enter_teams =
__kmp_is_entering_teams(active_level, level, teams_level, ap);
@@ -2186,64 +2211,6 @@ int __kmp_fork_call(ident_t *loc, int gtid,
ompd_bp_parallel_begin();
#endif
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // Set primary thread's task team to team's task team. Unless this is hot
- // team, it should be NULL.
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- parent_team->t.t_task_team[master_th->th.th_task_state]);
- KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
- "%p, new task_team %p / team %p\n",
- __kmp_gtid_from_thread(master_th),
- master_th->th.th_task_team, parent_team,
- team->t.t_task_team[master_th->th.th_task_state], team));
-
- if (active_level || master_th->th.th_task_team) {
- // Take a memo of primary thread's task_state
- KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
- if (master_th->th.th_task_state_top >=
- master_th->th.th_task_state_stack_sz) { // increase size
- kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
- kmp_uint8 *old_stack, *new_stack;
- kmp_uint32 i;
- new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
- for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
- new_stack[i] = master_th->th.th_task_state_memo_stack[i];
- }
- for (i = master_th->th.th_task_state_stack_sz; i < new_size;
- ++i) { // zero-init rest of stack
- new_stack[i] = 0;
- }
- old_stack = master_th->th.th_task_state_memo_stack;
- master_th->th.th_task_state_memo_stack = new_stack;
- master_th->th.th_task_state_stack_sz = new_size;
- __kmp_free(old_stack);
- }
- // Store primary thread's task_state on stack
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top] =
- master_th->th.th_task_state;
- master_th->th.th_task_state_top++;
-#if KMP_NESTED_HOT_TEAMS
- if (master_th->th.th_hot_teams &&
- active_level < __kmp_hot_teams_max_level &&
- team == master_th->th.th_hot_teams[active_level].hot_team) {
- // Restore primary thread's nested state if nested hot team
- master_th->th.th_task_state =
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top];
- } else {
-#endif
- master_th->th.th_task_state = 0;
-#if KMP_NESTED_HOT_TEAMS
- }
-#endif
- }
-#if !KMP_NESTED_HOT_TEAMS
- KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
- (team == root->r.r_hot_team));
-#endif
- }
-
KA_TRACE(
20,
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
@@ -2451,8 +2418,7 @@ void __kmp_join_call(ident_t *loc, int gtid
__kmp_gtid_from_thread(master_th), team,
team->t.t_task_team[master_th->th.th_task_state],
master_th->th.th_task_team));
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- team->t.t_task_team[master_th->th.th_task_state]);
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
}
#endif
@@ -2690,24 +2656,11 @@ void __kmp_join_call(ident_t *loc, int gtid
}
if (__kmp_tasking_mode != tskm_immediate_exec) {
- if (master_th->th.th_task_state_top >
- 0) { // Restore task state from memo stack
- KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
- // Remember primary thread's state if we re-use this nested hot team
- master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
- master_th->th.th_task_state;
- --master_th->th.th_task_state_top; // pop
- // Now restore state at this level
- master_th->th.th_task_state =
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top];
- } else if (team != root->r.r_hot_team) {
- // Reset the task state of primary thread if we are not hot team because
- // in this case all the worker threads will be free, and their task state
- // will be reset. If not reset the primary's, the task state will be
- // inconsistent.
- master_th->th.th_task_state = 0;
- }
+ // Restore primary thread's task state from team structure
+ KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
+ team->t.t_primary_task_state == 1);
+ master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
+
// Copy the task team from the parent team to the primary thread
master_th->th.th_task_team =
parent_team->t.t_task_team[master_th->th.th_task_state];
@@ -4396,17 +4349,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
this_thr->th.th_next_pool = NULL;
- if (!this_thr->th.th_task_state_memo_stack) {
- size_t i;
- this_thr->th.th_task_state_memo_stack =
- (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
- this_thr->th.th_task_state_top = 0;
- this_thr->th.th_task_state_stack_sz = 4;
- for (i = 0; i < this_thr->th.th_task_state_stack_sz;
- ++i) // zero init the stack
- this_thr->th.th_task_state_memo_stack[i] = 0;
- }
-
KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
@@ -4463,8 +4405,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
TCW_4(__kmp_nth, __kmp_nth + 1);
new_thr->th.th_task_state = 0;
- new_thr->th.th_task_state_top = 0;
- new_thr->th.th_task_state_stack_sz = 4;
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
// Make sure pool thread has transitioned to waiting on own thread struct
@@ -5262,6 +5202,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
// Activate team threads via th_used_in_team
__kmp_add_threads_to_team(team, new_nproc);
}
+ // When decreasing team size, threads no longer in the team should
+ // unref task team.
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ for (f = new_nproc; f < team->t.t_nproc; f++) {
+ kmp_info_t *th = team->t.t_threads[f];
+ KMP_DEBUG_ASSERT(th);
+ th->th.th_task_team = NULL;
+ }
+ }
#if KMP_NESTED_HOT_TEAMS
if (__kmp_hot_teams_mode == 0) {
// AC: saved number of threads should correspond to team's value in this
@@ -5272,11 +5221,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
/* release the extra threads we don't need any more */
for (f = new_nproc; f < team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // When decreasing team size, threads no longer in the team should
- // unref task team.
- team->t.t_threads[f]->th.th_task_team = NULL;
- }
__kmp_free_thread(team->t.t_threads[f]);
team->t.t_threads[f] = NULL;
}
@@ -6248,11 +6192,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
thread->th.th_pri_common = NULL;
}
- if (thread->th.th_task_state_memo_stack != NULL) {
- __kmp_free(thread->th.th_task_state_memo_stack);
- thread->th.th_task_state_memo_stack = NULL;
- }
-
#if KMP_USE_BGET
if (thread->th.th_local.bget_data != NULL) {
__kmp_finalize_bget(thread);
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 6303bb0..a782027 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -1511,8 +1511,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
KA_TRACE(30,
("T#%d creating task team in __kmp_task_alloc for proxy task\n",
gtid));
- // 1 indicates setup the current team regardless of nthreads
- __kmp_task_team_setup(thread, team, 1);
+ __kmp_task_team_setup(thread, team);
thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
}
kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -3390,8 +3389,6 @@ static inline int __kmp_execute_tasks_template(
nthreads = task_team->tt.tt_nproc;
unfinished_threads = &(task_team->tt.tt_unfinished_threads);
- KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
- task_team->tt.tt_hidden_helper_task_encountered);
KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
while (1) { // Outer loop keeps trying to find tasks in case of single thread
@@ -3943,6 +3940,20 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
__kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
}
+static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
+ kmp_team_t *team) {
+ int team_nth = team->t.t_nproc;
+ // Only need to init if task team is isn't active or team size changed
+ if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
+ TCW_4(task_team->tt.tt_found_tasks, FALSE);
+ TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+ TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
+ TCW_4(task_team->tt.tt_nproc, team_nth);
+ KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
+ TCW_4(task_team->tt.tt_active, TRUE);
+ }
+}
+
// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible. Also initializes data
@@ -3950,7 +3961,6 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
kmp_team_t *team) {
kmp_task_team_t *task_team = NULL;
- int nthreads;
KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
(thread ? __kmp_gtid_from_thread(thread) : -1), team));
@@ -3992,14 +4002,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
// task_team->tt.tt_next = NULL;
}
- TCW_4(task_team->tt.tt_found_tasks, FALSE);
- TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
-
- KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- TCW_4(task_team->tt.tt_active, TRUE);
+ __kmp_task_team_init(task_team, team);
KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
"unfinished_threads init'd to %d\n",
@@ -4053,6 +4056,40 @@ void __kmp_reap_task_teams(void) {
}
}
+// View the array of two task team pointers as a pair of pointers:
+// 1) a single task_team pointer
+// 2) next pointer for stack
+// Serial teams can create a stack of task teams for nested serial teams.
+void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ kmp_task_team_list_t *current =
+ (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+ kmp_task_team_list_t *node =
+ (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
+ node->task_team = current->task_team;
+ node->next = current->next;
+ thread->th.th_task_team = current->task_team = NULL;
+ current->next = node;
+}
+
+// Serial team pops a task team off the stack
+void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ kmp_task_team_list_t *current =
+ (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+ if (current->task_team) {
+ __kmp_free_task_team(thread, current->task_team);
+ }
+ kmp_task_team_list_t *next = current->next;
+ if (next) {
+ current->task_team = next->task_team;
+ current->next = next->next;
+ KMP_DEBUG_ASSERT(next != current);
+ __kmp_free(next);
+ thread->th.th_task_team = current->task_team;
+ }
+}
+
// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks. Wait for each thread to unreference its task team.
@@ -4117,55 +4154,34 @@ void __kmp_wait_to_unref_task_teams(void) {
}
}
-void __kmp_shift_task_state_stack(kmp_info_t *this_thr, kmp_uint8 value) {
- // Shift values from th_task_state_top+1 to task_state_stack_sz
- if (this_thr->th.th_task_state_top + 1 >=
- this_thr->th.th_task_state_stack_sz) { // increase size
- kmp_uint32 new_size = 2 * this_thr->th.th_task_state_stack_sz;
- kmp_uint8 *old_stack, *new_stack;
- kmp_uint32 i;
- new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
- for (i = 0; i <= this_thr->th.th_task_state_top; ++i) {
- new_stack[i] = this_thr->th.th_task_state_memo_stack[i];
- }
- // If we need to reallocate do the shift at the same time.
- for (; i < this_thr->th.th_task_state_stack_sz; ++i) {
- new_stack[i + 1] = this_thr->th.th_task_state_memo_stack[i];
- }
- for (i = this_thr->th.th_task_state_stack_sz; i < new_size;
- ++i) { // zero-init rest of stack
- new_stack[i] = 0;
- }
- old_stack = this_thr->th.th_task_state_memo_stack;
- this_thr->th.th_task_state_memo_stack = new_stack;
- this_thr->th.th_task_state_stack_sz = new_size;
- __kmp_free(old_stack);
- } else {
- kmp_uint8 *end;
- kmp_uint32 i;
-
- end = &this_thr->th
- .th_task_state_memo_stack[this_thr->th.th_task_state_stack_sz];
-
- for (i = this_thr->th.th_task_state_stack_sz - 1;
- i > this_thr->th.th_task_state_top; i--, end--)
- end[0] = end[-1];
- }
- this_thr->th.th_task_state_memo_stack[this_thr->th.th_task_state_top + 1] =
- value;
-}
-
// __kmp_task_team_setup: Create a task_team for the current team, but use
// an already created, unused one if it already exists.
-void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
+void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+ // For the serial and root teams, setup the first task team pointer to point
+ // to task team. The other pointer is a stack of task teams from previous
+ // serial levels.
+ if (team == this_thr->th.th_serial_team ||
+ team == this_thr->th.th_root->r.r_root_team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ if (team->t.t_task_team[0] == NULL) {
+ team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
+ KA_TRACE(
+ 20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
+ " for serial/root team %p\n",
+ __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
+
+ } else
+ __kmp_task_team_init(team->t.t_task_team[0], team);
+ return;
+ }
+
// If this task_team hasn't been created yet, allocate it. It will be used in
// the region after the next.
// If it exists, it is the current task team and shouldn't be touched yet as
// it may still be in use.
- if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
- (always || team->t.t_nproc > 1)) {
+ if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
team->t.t_task_team[this_thr->th.th_task_state] =
__kmp_allocate_task_team(this_thr, team);
KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
@@ -4174,52 +4190,31 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
this_thr->th.th_task_state));
}
- if (this_thr->th.th_task_state == 1 && always && team->t.t_nproc == 1) {
- // fix task state stack to adjust for proxy and helper tasks
- KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d needs to shift stack"
- " for team %d at parity=%d\n",
- __kmp_gtid_from_thread(this_thr), team->t.t_id,
- this_thr->th.th_task_state));
- __kmp_shift_task_state_stack(this_thr, this_thr->th.th_task_state);
- }
// After threads exit the release, they will call sync, and then point to this
// other task_team; make sure it is allocated and properly initialized. As
// threads spin in the barrier release phase, they will continue to use the
// previous task_team struct(above), until they receive the signal to stop
// checking for tasks (they can't safely reference the kmp_team_t struct,
- // which could be reallocated by the primary thread). No task teams are formed
- // for serialized teams.
- if (team->t.t_nproc > 1) {
- int other_team = 1 - this_thr->th.th_task_state;
- KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
- if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
- team->t.t_task_team[other_team] =
- __kmp_allocate_task_team(this_thr, team);
- KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
- "task_team %p for team %d at parity=%d\n",
- __kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[other_team], team->t.t_id, other_team));
- } else { // Leave the old task team struct in place for the upcoming region;
- // adjust as needed
- kmp_task_team_t *task_team = team->t.t_task_team[other_team];
- if (!task_team->tt.tt_active ||
- team->t.t_nproc != task_team->tt.tt_nproc) {
- TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
- TCW_4(task_team->tt.tt_found_tasks, FALSE);
- TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
- team->t.t_nproc);
- TCW_4(task_team->tt.tt_active, TRUE);
- }
- // if team size has changed, the first thread to enable tasking will
- // realloc threads_data if necessary
- KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
- "%p for team %d at parity=%d\n",
- __kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[other_team], team->t.t_id, other_team));
- }
+ // which could be reallocated by the primary thread).
+ int other_team = 1 - this_thr->th.th_task_state;
+ KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
+ if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
+ team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
+ "task_team %p for team %d at parity=%d\n",
+ __kmp_gtid_from_thread(this_thr),
+ team->t.t_task_team[other_team], team->t.t_id, other_team));
+ } else { // Leave the old task team struct in place for the upcoming region;
+ // adjust as needed
+ kmp_task_team_t *task_team = team->t.t_task_team[other_team];
+ __kmp_task_team_init(task_team, team);
+ // if team size has changed, the first thread to enable tasking will
+ // realloc threads_data if necessary
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
+ "%p for team %d at parity=%d\n",
+ __kmp_gtid_from_thread(this_thr),
+ team->t.t_task_team[other_team], team->t.t_id, other_team));
}
// For regular thread, task enabling should be called when the task is going
@@ -4245,9 +4240,11 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier. This may be
-// called by any thread, but only for teams with # threads > 1.
+// called by any thread. This is not called for serial or root teams.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+ KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
+ KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
// Toggle the th_task_state field, to switch which task_team this thread
// refers to
@@ -4265,8 +4262,7 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
}
// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
-// barrier gather phase. Only called by primary thread if #threads in team > 1
-// or if proxy tasks were created.
+// barrier gather phase. Only called by the primary thread.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, primary
@@ -4300,9 +4296,6 @@ void __kmp_task_team_wait(
("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
"setting active to false, setting local and team's pointer to NULL\n",
__kmp_gtid_from_thread(this_thr), task_team));
- KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
- task_team->tt.tt_found_proxy_tasks == TRUE ||
- task_team->tt.tt_hidden_helper_task_encountered == TRUE);
TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
diff --git a/openmp/runtime/test/target/issue-81488.c b/openmp/runtime/test/target/issue-81488.c
new file mode 100644
index 0000000..adac7d6
--- /dev/null
+++ b/openmp/runtime/test/target/issue-81488.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile
+// RUN: env OMP_NUM_THREADS=1 LIBOMP_USE_HIDDEN_HELPER_TASK=1 \
+// RUN: LIBOMP_NUM_HIDDEN_HELPER_THREADS=8 %libomp-run
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#define Nz 8
+#define DEVICE_ID 0
+
+int a[Nz];
+
+int main(void) {
+ for (int n = 0; n < 10; ++n) {
+ for (int k = 0; k < Nz; ++k) {
+ a[k] = -1;
+ }
+#pragma omp parallel shared(a)
+ {
+#pragma omp single
+ {
+#pragma omp target teams distribute parallel for nowait device(DEVICE_ID) \
+ map(tofrom : a[0 : 8])
+ for (int i = 0; i < Nz; ++i) {
+ a[i] = i;
+ }
+ }
+#pragma omp barrier
+ }
+ for (int k = 0; k < Nz; ++k) {
+ printf("a[%d] = %d\n", k, a[k]);
+ }
+ }
+ return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-50602.c b/openmp/runtime/test/tasking/issue-50602.c
new file mode 100644
index 0000000..b691204
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-50602.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+// RUN: %libomp-compile -DUSE_HIDDEN_HELPERS=1
+// RUN: %libomp-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+#include <omp.h>
+
+int main(int argc, char *argv[]) {
+ int i;
+
+ omp_set_max_active_levels(1);
+ omp_set_dynamic(0);
+
+ for (i = 0; i < 10; ++i) {
+#pragma omp parallel
+ {
+#ifndef USE_HIDDEN_HELPERS
+ omp_event_handle_t event;
+#endif
+ int a = 0;
+
+#ifdef USE_HIDDEN_HELPERS
+#pragma omp target map(tofrom : a) nowait
+#else
+#pragma omp task shared(a) detach(event)
+#endif
+ { a = 1; }
+
+#pragma omp parallel
+ { a = 2; }
+
+#ifndef USE_HIDDEN_HELPERS
+ omp_fulfill_event(event);
+#endif
+
+#pragma omp taskwait
+ }
+ }
+ return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-69368.c b/openmp/runtime/test/tasking/issue-69368.c
new file mode 100644
index 0000000..57bd741
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-69368.c
@@ -0,0 +1,27 @@
+// RUN: %libomp-compile-and-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+
+int main() {
+ int i;
+ int a[2];
+ volatile int attempt = 0;
+
+ for (i = 0; i < 10; ++i) {
+ a[0] = a[1] = 0;
+#pragma omp parallel for
+ for (int i = 0; i < 2; i++) {
+ a[i] = 2;
+ }
+ if (a[0] != 2 || a[1] != 2)
+ return 1;
+
+#pragma omp teams distribute parallel for if (attempt >= 2)
+ for (int i = 0; i < 2; i++) {
+ a[i] = 1;
+ }
+ if (a[0] != 1 || a[1] != 1)
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-69733.c b/openmp/runtime/test/tasking/issue-69733.c
new file mode 100644
index 0000000..5775b01
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-69733.c
@@ -0,0 +1,147 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+int a;
+
+void inc_a() {
+#pragma omp atomic
+ a++;
+}
+
+void root_team_detached() {
+ a = 0;
+ omp_event_handle_t ev;
+#pragma omp task detach(ev)
+ inc_a();
+ omp_fulfill_event(ev);
+ if (a != 1) {
+ fprintf(stderr, "error: root_team_detached(): a != 1\n");
+ exit(EXIT_FAILURE);
+ }
+}
+
+void root_team_hidden_helpers() {
+ a = 0;
+#pragma omp target nowait
+ inc_a();
+
+#pragma omp taskwait
+
+ if (a != 1) {
+ fprintf(stderr, "error: root_team_hidden_helpers(): a != 1\n");
+ exit(EXIT_FAILURE);
+ }
+}
+
+void parallel_detached(int nth1) {
+ a = 0;
+ omp_event_handle_t *evs =
+ (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nth1);
+#pragma omp parallel num_threads(nth1)
+ {
+ int tid = omp_get_thread_num();
+ omp_event_handle_t e = evs[tid];
+#pragma omp task detach(e)
+ inc_a();
+ omp_fulfill_event(e);
+ }
+ free(evs);
+ if (a != nth1) {
+ fprintf(stderr, "error: parallel_detached(): a (%d) != %d\n", a, nth1);
+ exit(EXIT_FAILURE);
+ }
+}
+
+void parallel_hidden_helpers(int nth1) {
+ a = 0;
+#pragma omp parallel num_threads(nth1)
+ {
+#pragma omp target nowait
+ inc_a();
+ }
+ if (a != nth1) {
+ fprintf(stderr, "error: parallel_hidden_helpers(): a (%d) != %d\n", a,
+ nth1);
+ exit(EXIT_FAILURE);
+ }
+}
+
+void nested_parallel_detached(int nth1, int nth2) {
+ a = 0;
+ omp_event_handle_t **evs =
+ (omp_event_handle_t **)malloc(sizeof(omp_event_handle_t *) * nth1);
+#pragma omp parallel num_threads(nth1)
+ {
+ int tid = omp_get_thread_num();
+ evs[tid] = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nth2);
+#pragma omp parallel num_threads(nth2) shared(tid)
+ {
+ int tid2 = omp_get_thread_num();
+ omp_event_handle_t e = evs[tid][tid2];
+#pragma omp task detach(e)
+ inc_a();
+ omp_fulfill_event(e);
+ }
+ free(evs[tid]);
+ }
+ free(evs);
+ if (a != nth1 * nth2) {
+ fprintf(stderr, "error: nested_parallel_detached(): a (%d) != %d * %d\n", a,
+ nth1, nth2);
+ exit(EXIT_FAILURE);
+ }
+}
+
+void nested_parallel_hidden_helpers(int nth1, int nth2) {
+ a = 0;
+#pragma omp parallel num_threads(nth1)
+ {
+#pragma omp parallel num_threads(nth2)
+ {
+#pragma omp target nowait
+ inc_a();
+ }
+ }
+ if (a != nth1 * nth2) {
+ fprintf(stderr,
+ "error: nested_parallel_hidden_helpers(): a (%d) != %d * %d\n", a,
+ nth1, nth2);
+ exit(EXIT_FAILURE);
+ }
+}
+
+int main() {
+ int i, nth1, nth2;
+
+ omp_set_max_active_levels(2);
+ omp_set_dynamic(0);
+
+ for (i = 0; i < 10; ++i)
+ root_team_detached();
+
+ for (i = 0; i < 10; ++i)
+ root_team_hidden_helpers();
+
+ for (i = 0; i < 10; ++i)
+ for (nth1 = 1; nth1 <= 4; ++nth1)
+ parallel_detached(nth1);
+
+ for (i = 0; i < 10; ++i)
+ for (nth1 = 1; nth1 <= 4; ++nth1)
+ parallel_hidden_helpers(nth1);
+
+ for (i = 0; i < 10; ++i)
+ for (nth1 = 1; nth1 <= 4; ++nth1)
+ for (nth2 = 1; nth2 <= 4; ++nth2)
+ nested_parallel_detached(nth1, nth2);
+
+ for (i = 0; i < 10; ++i)
+ for (nth1 = 1; nth1 <= 4; ++nth1)
+ for (nth2 = 1; nth2 <= 4; ++nth2)
+ nested_parallel_hidden_helpers(nth1, nth2);
+
+ return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-79416.c b/openmp/runtime/test/tasking/issue-79416.c
new file mode 100644
index 0000000..ee96fce
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-79416.c
@@ -0,0 +1,33 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+
+int a;
+
+void run(int nteams, int nth) {
+ a = 0;
+#pragma omp teams num_teams(nteams)
+ {
+#pragma omp parallel num_threads(nth)
+ {
+#pragma omp task
+ {
+#pragma omp atomic
+ a++;
+ }
+ }
+ }
+ if (a == 0)
+ exit(EXIT_FAILURE);
+}
+
+int main() {
+ int i, nteams, nth;
+ for (nteams = 1; nteams <= 2; ++nteams)
+ for (nth = 1; nth <= 3; ++nth)
+ for (i = 0; i < 10; ++i) {
+ printf("run(%d, %d)\n", nteams, nth);
+ run(nteams, nth);
+ }
+ return EXIT_SUCCESS;
+}
diff --git a/openmp/runtime/test/tasking/task_teams_stress_test.cpp b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
new file mode 100644
index 0000000..e781a89
--- /dev/null
+++ b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
@@ -0,0 +1,318 @@
+// RUN: %libomp-cxx-compile
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
+//
+// RUN: %libomp-cxx-compile -DUSE_HIDDEN_HELPERS=1
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
+
+// This test stresses the task team mechanism by running a simple
+// increment task over and over with varying number of threads and nesting.
+// The test covers nested serial teams and mixing serial teams with
+// normal active teams.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+// The number of times to run each test
+#define NTIMES 5
+
+// Regular single increment task
+void task_inc_a(int *a) {
+#pragma omp task
+ {
+#pragma omp atomic
+ (*a)++;
+ }
+}
+
+// Splitting increment task that binary splits the incrementing task
+void task_inc_split_a(int *a, int low, int high) {
+#pragma omp task firstprivate(low, high)
+ {
+ if (low == high) {
+#pragma omp atomic
+ (*a)++;
+ } else if (low < high) {
+ int mid = (high - low) / 2 + low;
+ task_inc_split_a(a, low, mid);
+ task_inc_split_a(a, mid + 1, high);
+ }
+ }
+}
+
+#ifdef USE_HIDDEN_HELPERS
+// Hidden helper tasks force serial regions to create task teams
+void task_inc_a_hidden_helper(int *a) {
+#pragma omp target map(tofrom : a[0]) nowait
+ {
+#pragma omp atomic
+ (*a)++;
+ }
+}
+#else
+// Detached tasks force serial regions to create task teams
+void task_inc_a_detached(int *a, omp_event_handle_t handle) {
+#pragma omp task detach(handle)
+ {
+#pragma omp atomic
+ (*a)++;
+ omp_fulfill_event(handle);
+ }
+}
+#endif
+
+void check_a(int *a, int expected) {
+ if (*a != expected) {
+ fprintf(stderr,
+ "FAIL: a = %d instead of expected = %d. Compile with "
+ "-DVERBOSE for more verbose output.\n",
+ *a, expected);
+ exit(EXIT_FAILURE);
+ }
+}
+
+// Every thread creates a single "increment" task
+void test_tasks(omp_event_handle_t *handles, int expected, int *a) {
+ int tid = omp_get_thread_num();
+
+ task_inc_a(a);
+
+#pragma omp barrier
+ check_a(a, expected);
+#pragma omp barrier
+ check_a(a, expected);
+#pragma omp barrier
+
+#ifdef USE_HIDDEN_HELPERS
+ task_inc_a_hidden_helper(a);
+#else
+ task_inc_a_detached(a, handles[tid]);
+#endif
+
+#pragma omp barrier
+ check_a(a, 2 * expected);
+#pragma omp barrier
+ task_inc_a(a);
+#pragma omp barrier
+ check_a(a, 3 * expected);
+}
+
+// Testing single level of parallelism with increment tasks
+void test_base(int nthreads) {
+#ifdef VERBOSE
+#pragma omp master
+ printf(" test_base(%d)\n", nthreads);
+#endif
+ int a = 0;
+ omp_event_handle_t *handles;
+ handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
+#pragma omp parallel num_threads(nthreads) shared(a)
+ { test_tasks(handles, nthreads, &a); }
+ free(handles);
+}
+
+// Testing nested parallel with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+void test_nest(int first, int second) {
+#ifdef VERBOSE
+#pragma omp master
+ printf(" test_nest(%d, %d)\n", first, second);
+#endif
+#pragma omp parallel num_threads(first)
+ { test_base(second); }
+}
+
+// Testing 2-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+void test_nest2(int first, int second, int third) {
+#ifdef VERBOSE
+#pragma omp master
+ printf(" test_nest2(%d, %d, %d)\n", first, second, third);
+#endif
+#pragma omp parallel num_threads(first)
+ { test_nest(second, third); }
+}
+
+// Testing 3-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+// fourth = nthreads of third nested parallel
+void test_nest3(int first, int second, int third, int fourth) {
+#ifdef VERBOSE
+#pragma omp master
+ printf(" test_nest3(%d, %d, %d, %d)\n", first, second, third, fourth);
+#endif
+#pragma omp parallel num_threads(first)
+ { test_nest2(second, third, fourth); }
+}
+
+// Testing 4-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+// fourth = nthreads of third nested parallel
+// fifth = nthreads of fourth nested parallel
+void test_nest4(int first, int second, int third, int fourth, int fifth) {
+#ifdef VERBOSE
+#pragma omp master
+ printf("test_nest4(%d, %d, %d, %d, %d)\n", first, second, third, fourth,
+ fifth);
+#endif
+#pragma omp parallel num_threads(first)
+ { test_nest3(second, third, fourth, fifth); }
+}
+
+// Single thread starts a binary splitting "increment" task
+// Detached tasks are still single "increment" task
+void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) {
+ int tid = omp_get_thread_num();
+
+#pragma omp single
+ task_inc_split_a(a, 1, expected); // task team A
+
+#pragma omp barrier
+ check_a(a, expected);
+#pragma omp barrier
+ check_a(a, expected);
+#pragma omp barrier
+
+#ifdef USE_HIDDEN_HELPERS
+ task_inc_a_hidden_helper(a);
+#else
+ task_inc_a_detached(a, handles[tid]);
+#endif
+
+#pragma omp barrier
+ check_a(a, 2 * expected);
+#pragma omp barrier
+#pragma omp single
+ task_inc_split_a(a, 1, expected); // task team B
+#pragma omp barrier
+ check_a(a, 3 * expected);
+}
+
+// Testing single level of parallelism with splitting incrementing tasks
+void test_base_split(int nthreads) {
+#ifdef VERBOSE
+#pragma omp master
+ printf(" test_base_split(%d)\n", nthreads);
+#endif
+ int a = 0;
+ omp_event_handle_t *handles;
+ handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
+#pragma omp parallel num_threads(nthreads) shared(a)
+ { test_tasks_split(handles, nthreads, &a); }
+ free(handles);
+}
+
+// Testing nested parallels with splitting tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+void test_nest_split(int first, int second) {
+#ifdef VERBOSE
+#pragma omp master
+ printf(" test_nest_split(%d, %d)\n", first, second);
+#endif
+#pragma omp parallel num_threads(first)
+ { test_base_split(second); }
+}
+
+// Testing doubly nested parallels with splitting tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+void test_nest2_split(int first, int second, int third) {
+#ifdef VERBOSE
+#pragma omp master
+ printf("test_nest2_split(%d, %d, %d)\n", first, second, third);
+#endif
+#pragma omp parallel num_threads(first)
+ { test_nest_split(second, third); }
+}
+
+template <typename... Args>
+void run_ntimes(int n, void (*func)(Args...), Args... args) {
+ for (int i = 0; i < n; ++i) {
+ func(args...);
+ }
+}
+
+int main() {
+ omp_set_max_active_levels(5);
+
+ run_ntimes(NTIMES, test_base, 4);
+ run_ntimes(NTIMES, test_base, 1);
+ run_ntimes(NTIMES, test_base, 8);
+ run_ntimes(NTIMES, test_base, 2);
+ run_ntimes(NTIMES, test_base, 6);
+ run_ntimes(NTIMES, test_nest, 1, 1);
+ run_ntimes(NTIMES, test_nest, 1, 5);
+ run_ntimes(NTIMES, test_nest, 2, 6);
+ run_ntimes(NTIMES, test_nest, 1, 1);
+ run_ntimes(NTIMES, test_nest, 4, 3);
+ run_ntimes(NTIMES, test_nest, 3, 2);
+ run_ntimes(NTIMES, test_nest, 1, 1);
+ run_ntimes(NTIMES, test_nest2, 1, 1, 2);
+ run_ntimes(NTIMES, test_nest2, 1, 2, 1);
+ run_ntimes(NTIMES, test_nest2, 2, 2, 1);
+ run_ntimes(NTIMES, test_nest2, 2, 1, 1);
+ run_ntimes(NTIMES, test_nest2, 4, 2, 1);
+ run_ntimes(NTIMES, test_nest2, 4, 2, 2);
+ run_ntimes(NTIMES, test_nest2, 1, 1, 1);
+ run_ntimes(NTIMES, test_nest2, 4, 2, 2);
+ run_ntimes(NTIMES, test_nest3, 1, 1, 1, 1);
+ run_ntimes(NTIMES, test_nest3, 1, 2, 1, 1);
+ run_ntimes(NTIMES, test_nest3, 1, 1, 2, 1);
+ run_ntimes(NTIMES, test_nest3, 1, 1, 1, 2);
+ run_ntimes(NTIMES, test_nest3, 2, 1, 1, 1);
+ run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
+ run_ntimes(NTIMES, test_nest4, 2, 1, 1, 1, 1);
+ run_ntimes(NTIMES, test_nest4, 1, 2, 1, 1, 1);
+ run_ntimes(NTIMES, test_nest4, 1, 1, 2, 1, 1);
+ run_ntimes(NTIMES, test_nest4, 1, 1, 1, 2, 1);
+ run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 2);
+ run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
+ run_ntimes(NTIMES, test_nest4, 1, 2, 1, 2, 1);
+
+ run_ntimes(NTIMES, test_base_split, 4);
+ run_ntimes(NTIMES, test_base_split, 2);
+
+ run_ntimes(NTIMES, test_base_split, 7);
+
+ run_ntimes(NTIMES, test_base_split, 1);
+ run_ntimes(NTIMES, test_nest_split, 4, 2);
+ run_ntimes(NTIMES, test_nest_split, 2, 1);
+
+ run_ntimes(NTIMES, test_nest_split, 7, 2);
+ run_ntimes(NTIMES, test_nest_split, 1, 1);
+ run_ntimes(NTIMES, test_nest_split, 1, 4);
+
+ run_ntimes(NTIMES, test_nest2_split, 1, 1, 2);
+ run_ntimes(NTIMES, test_nest2_split, 1, 2, 1);
+ run_ntimes(NTIMES, test_nest2_split, 2, 2, 1);
+ run_ntimes(NTIMES, test_nest2_split, 2, 1, 1);
+ run_ntimes(NTIMES, test_nest2_split, 4, 2, 1);
+ run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
+ run_ntimes(NTIMES, test_nest2_split, 1, 1, 1);
+ run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
+
+ printf("PASS\n");
+ return EXIT_SUCCESS;
+}