aboutsummaryrefslogtreecommitdiff
path: root/libgomp
diff options
context:
space:
mode:
authorAndrew Stubbs <ams@codesourcery.com>2020-09-17 12:53:39 +0100
committerAndrew Stubbs <ams@codesourcery.com>2020-09-29 11:48:04 +0100
commit6f513951972b47e3f86898ff2ce59ed8abe2dd39 (patch)
tree7d429d1099482b891546e466971af6061965605c /libgomp
parent95e10b8aa1066dbd5c433e613652674b0636fcd1 (diff)
downloadgcc-6f513951972b47e3f86898ff2ce59ed8abe2dd39.zip
gcc-6f513951972b47e3f86898ff2ce59ed8abe2dd39.tar.gz
gcc-6f513951972b47e3f86898ff2ce59ed8abe2dd39.tar.bz2
libgomp: disable barriers in nested teams
Both GCN and NVPTX allow nested parallel regions, but the barrier implementation did not allow the nested teams to run independently of each other (due to hardware limitations). This patch fixes that, under the assumption that each thread will create a new subteam of one thread, by simply not using barriers when there's no other thread to synchronise. libgomp/ChangeLog: * config/gcn/bar.c (gomp_barrier_wait_end): Skip the barrier if the total number of threads is one. (gomp_team_barrier_wake): Likewise. (gomp_team_barrier_wait_end): Likewise. (gomp_team_barrier_wait_cancel_end): Likewise. * config/nvptx/bar.c (gomp_barrier_wait_end): Likewise. (gomp_team_barrier_wake): Likewise. (gomp_team_barrier_wait_end): Likewise. (gomp_team_barrier_wait_cancel_end): Likewise. * testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c: New test.
Diffstat (limited to 'libgomp')
-rw-r--r--libgomp/config/gcn/bar.c15
-rw-r--r--libgomp/config/nvptx/bar.c18
-rw-r--r--libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c31
3 files changed, 53 insertions, 11 deletions
diff --git a/libgomp/config/gcn/bar.c b/libgomp/config/gcn/bar.c
index 02fd197..a21529a 100644
--- a/libgomp/config/gcn/bar.c
+++ b/libgomp/config/gcn/bar.c
@@ -43,7 +43,8 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
__atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
MEMMODEL_RELAXED);
}
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
}
void
@@ -71,7 +72,8 @@ gomp_barrier_wait_last (gomp_barrier_t *bar)
void
gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
{
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
}
void
@@ -97,7 +99,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
state &= ~BAR_CANCELLED;
state += BAR_INCR - BAR_WAS_LAST;
__atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
return;
}
}
@@ -172,7 +175,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
{
state += BAR_INCR - BAR_WAS_LAST;
__atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
return false;
}
}
@@ -195,7 +199,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
abort();
}
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
if (__builtin_expect (gen & BAR_CANCELLED, 0))
return true;
diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index 125ca3e..1116561 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -41,7 +41,8 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
__atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
MEMMODEL_RELEASE);
}
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
}
void
@@ -69,7 +70,8 @@ gomp_barrier_wait_last (gomp_barrier_t *bar)
void
gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
{
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
}
void
@@ -95,7 +97,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
state &= ~BAR_CANCELLED;
state += BAR_INCR - BAR_WAS_LAST;
__atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
return;
}
}
@@ -104,7 +107,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
state &= ~BAR_CANCELLED;
do
{
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
{
@@ -158,7 +162,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
{
state += BAR_INCR - BAR_WAS_LAST;
__atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
return false;
}
}
@@ -169,7 +174,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
generation = state;
do
{
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
if (__builtin_expect (gen & BAR_CANCELLED, 0))
return true;
diff --git a/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c
new file mode 100644
index 0000000..e777271
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c
@@ -0,0 +1,31 @@
+/* Ensure that nested parallel regions work even when the number of loop
+ iterations is not divisible by the number of threads. */
+
+#include <stdlib.h>
+
+int main() {
+ int A[30][40], B[30][40];
+ size_t n = 30;
+
+ for (size_t i = 0; i < 30; ++i)
+ for (size_t j = 0; j < 40; ++j)
+ A[i][j] = 42;
+
+#pragma omp target map(A[0:30][0:40], B[0:30][0:40])
+ {
+#pragma omp parallel for num_threads(8)
+ for (size_t i = 0; i < n; ++i)
+ {
+#pragma omp parallel for
+ for (size_t j = 0; j < n; ++j)
+ {
+ B[i][j] = A[i][j];
+ }
+ }
+ }
+
+for (size_t i = 0; i < n; ++i)
+ for (size_t j = 0; j < n; ++j)
+ if (B[i][j] != 42)
+ abort ();
+}