path: root/libgomp
author     Andrew Stubbs <ams@codesourcery.com>       2019-11-13 12:38:09 +0000
committer  Andrew Stubbs <ams@gcc.gnu.org>            2019-11-13 12:38:09 +0000
commit     cee1645106465bc593b4cf31716b0a8ddd59af61 (patch)
tree       7509f5cbce6597d2c912531eb3324118262c88b0 /libgomp
parent     fa4999953db61cf94c0e57a9ab8b006d950e54ca (diff)
Optimize GCN OpenMP malloc performance
2019-11-13  Andrew Stubbs  <ams@codesourcery.com>

	libgomp/
	* config/gcn/team.c (gomp_gcn_enter_kernel): Set up the team arena
	and use team_malloc variants.
	(gomp_gcn_exit_kernel): Use team_free.
	* libgomp.h (TEAM_ARENA_SIZE): Define.
	(TEAM_ARENA_START): Define.
	(TEAM_ARENA_FREE): Define.
	(TEAM_ARENA_END): Define.
	(team_malloc): New function.
	(team_malloc_cleared): New function.
	(team_free): New function.
	* team.c (gomp_new_team): Initialize and use team_malloc.
	(free_team): Use team_free.
	(gomp_free_thread): Use team_free.
	(gomp_pause_host): Use team_free.
	* work.c (gomp_init_work_share): Use team_malloc.
	(gomp_fini_work_share): Use team_free.

From-SVN: r278136
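The patch replaces heap calls on the per-team hot path with a simple bump allocator carved out of a host-allocated arena: each team gets its own TEAM_ARENA_SIZE slice, an allocation is a pointer bump, and the whole slice is discarded when the kernel exits. The sketch below is a minimal, host-side illustration of that pattern, assuming ordinary pointers and a single-threaded bump (the real code keeps the three pointers in LDS and bumps the free pointer with an atomic add); the names arena_malloc and arena_free are illustrative, not libgomp symbols.

#include <stdlib.h>

struct arena
{
  char *start;  /* First byte of this team's slice.  */
  char *free;   /* Next unallocated byte.  */
  char *end;    /* One past the last byte of the slice.  */
};

static void *
arena_malloc (struct arena *a, size_t size)
{
  /* 4-byte align the request, as team_malloc does.  */
  size = (size + 3) & ~(size_t) 3;

  char *result = a->free;
  a->free += size;              /* The real code bumps this with an atomic add.  */

  /* Out of arena space: fall back to the heap, as the patch does.  */
  if (result + size > a->end)
    return malloc (size);

  return result;
}

static void
arena_free (struct arena *a, void *ptr)
{
  /* Arena memory dies with the team; only heap fallbacks need free().  */
  if ((char *) ptr < a->start || (char *) ptr >= a->end)
    free (ptr);
}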
Diffstat (limited to 'libgomp')
-rw-r--r--   libgomp/ChangeLog            19
-rw-r--r--   libgomp/config/gcn/team.c    20
-rw-r--r--   libgomp/libgomp.h            63
-rw-r--r--   libgomp/team.c               12
-rw-r--r--   libgomp/work.c                4
5 files changed, 106 insertions, 12 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index a7ce28f..59dae9c 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,4 +1,23 @@
2019-11-13 Andrew Stubbs <ams@codesourcery.com>
+
+ * config/gcn/team.c (gomp_gcn_enter_kernel): Set up the team arena
+ and use team_malloc variants.
+ (gomp_gcn_exit_kernel): Use team_free.
+ * libgomp.h (TEAM_ARENA_SIZE): Define.
+ (TEAM_ARENA_START): Define.
+ (TEAM_ARENA_FREE): Define.
+ (TEAM_ARENA_END): Define.
+ (team_malloc): New function.
+ (team_malloc_cleared): New function.
+ (team_free): New function.
+ * team.c (gomp_new_team): Initialize and use team_malloc.
+ (free_team): Use team_free.
+ (gomp_free_thread): Use team_free.
+ (gomp_pause_host): Use team_free.
+ * work.c (gomp_init_work_share): Use team_malloc.
+ (gomp_fini_work_share): Use team_free.
+
+2019-11-13 Andrew Stubbs <ams@codesourcery.com>
Kwok Cheung Yeung <kcy@codesourcery.com>
Julian Brown <julian@codesourcery.com>
Tom de Vries <tom@codesourcery.com>
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index c566482..20d4191 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -57,16 +57,28 @@ gomp_gcn_enter_kernel (void)
/* Starting additional threads is not supported. */
gomp_global_icv.dyn_var = true;
+ /* Initialize the team arena for optimized memory allocation.
+ The arena has been allocated on the host side, and the address
+ passed in via the kernargs. Each team takes a small slice of it. */
+ register void **kernargs asm("s8");
+ void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid);
+ void * __lds *arena_start = (void * __lds *)TEAM_ARENA_START;
+ void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE;
+ void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END;
+ *arena_start = team_arena;
+ *arena_free = team_arena;
+ *arena_end = team_arena + TEAM_ARENA_SIZE;
+
/* Allocate and initialize the team-local-storage data. */
- struct gomp_thread *thrs = gomp_malloc_cleared (sizeof (*thrs)
+ struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
* numthreads);
set_gcn_thrs (thrs);
/* Allocate and initialize a pool of threads in the team.
The threads are already running, of course, we just need to manage
the communication between them. */
- struct gomp_thread_pool *pool = gomp_malloc (sizeof (*pool));
- pool->threads = gomp_malloc (sizeof (void *) * numthreads);
+ struct gomp_thread_pool *pool = team_malloc (sizeof (*pool));
+ pool->threads = team_malloc (sizeof (void *) * numthreads);
for (int tid = 0; tid < numthreads; tid++)
pool->threads[tid] = &thrs[tid];
pool->threads_size = numthreads;
@@ -91,7 +103,7 @@ void
gomp_gcn_exit_kernel (void)
{
gomp_free_thread (gcn_thrs ());
- free (gcn_thrs ());
+ team_free (gcn_thrs ());
}
/* This function contains the idle loop in which a thread waits
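A minimal sketch of the initialization the hunk above performs, modelling the three LDS slots as a plain struct; the struct and the names are illustrative assumptions, while the slicing arithmetic (the host-provided arena base plus TEAM_ARENA_SIZE times the team id, as read from kernargs[4] above) follows the patch.

#include <stddef.h>

#define TEAM_ARENA_SIZE (64 * 1024)   /* Matches the value used by the patch.  */

struct team_arena_state
{
  char *start;
  char *free;
  char *end;
};

static inline void
init_team_arena (struct team_arena_state *state, char *arena_base,
                 unsigned teamid)
{
  /* Team N owns bytes [N * TEAM_ARENA_SIZE, (N + 1) * TEAM_ARENA_SIZE)
     of the block the host allocated for all teams.  */
  char *slice = arena_base + (size_t) teamid * TEAM_ARENA_SIZE;

  state->start = slice;
  state->free = slice;
  state->end = slice + TEAM_ARENA_SIZE;
}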
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 19e1241..bab733d 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -106,6 +106,69 @@ extern void gomp_aligned_free (void *);
GCC's builtin alloca(). */
#define gomp_alloca(x) __builtin_alloca(x)
+/* Optimized allocators for team-specific data that will die with the team. */
+
+#ifdef __AMDGCN__
+/* The arena is initialized in config/gcn/team.c. */
+#define TEAM_ARENA_SIZE 64*1024 /* Must match the value in plugin-gcn.c. */
+#define TEAM_ARENA_START 16 /* LDS offset of start pointer. */
+#define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */
+#define TEAM_ARENA_END 32 /* LDS offset of end pointer. */
+
+static inline void * __attribute__((malloc))
+team_malloc (size_t size)
+{
+ /* 4-byte align the size. */
+ size = (size + 3) & ~3;
+
+ /* Allocate directly from the arena.
+ The compiler does not support DS atomics, yet. */
+ void *result;
+ asm ("ds_add_rtn_u64 %0, %1, %2\n\ts_waitcnt 0"
+ : "=v"(result) : "v"(TEAM_ARENA_FREE), "v"(size), "e"(1L) : "memory");
+
+ /* Handle OOM. */
+ if (result + size > *(void * __lds *)TEAM_ARENA_END)
+ {
+ /* While this is experimental, let's make sure we know when OOM
+ happens. */
+ const char msg[] = "GCN team arena exhausted\n";
+ write (2, msg, sizeof(msg)-1);
+
+ /* Fall back to using the heap (slowly). */
+ result = gomp_malloc (size);
+ }
+ return result;
+}
+
+static inline void * __attribute__((malloc))
+team_malloc_cleared (size_t size)
+{
+ char *result = team_malloc (size);
+
+ /* Clear the allocated memory. */
+ __builtin_memset (result, 0, size);
+
+ return result;
+}
+
+static inline void
+team_free (void *ptr)
+{
+ /* The whole arena is freed when the kernel exits.
+ However, if we fell back to using heap then we should free it.
+ It would be better if this function could be a no-op, but at least
+ LDS loads are cheap. */
+ if (ptr < *(void * __lds *)TEAM_ARENA_START
+ || ptr >= *(void * __lds *)TEAM_ARENA_END)
+ free (ptr);
+}
+#else
+#define team_malloc(...) gomp_malloc (__VA_ARGS__)
+#define team_malloc_cleared(...) gomp_malloc_cleared (__VA_ARGS__)
+#define team_free(...) free (__VA_ARGS__)
+#endif
+
/* error.c */
extern void gomp_vdebug (int, const char *, va_list);
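The inline asm in team_malloc issues ds_add_rtn_u64 to perform an atomic fetch-and-add on the 64-bit free pointer held in LDS, since, as the comment notes, the compiler cannot yet generate DS atomics itself. A portable sketch of the same reservation step, using GCC's __atomic_fetch_add on an ordinary shared variable instead of an LDS slot (an assumption made purely for illustration), looks like this:

#include <stddef.h>
#include <stdint.h>

/* Shared bump pointer; in the real code this value lives at the
   TEAM_ARENA_FREE offset in LDS.  */
static uintptr_t arena_free_ptr;

static inline void *
reserve_bytes (size_t size)
{
  /* 4-byte align the request, as team_malloc does.  */
  size = (size + 3) & ~(size_t) 3;

  /* Atomically bump the free pointer and return its previous value,
     which is the start of the newly reserved block.  */
  uintptr_t old = __atomic_fetch_add (&arena_free_ptr, size, __ATOMIC_RELAXED);
  return (void *) old;
}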
diff --git a/libgomp/team.c b/libgomp/team.c
index b26caaa..cdfb9ba 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -171,7 +171,7 @@ gomp_new_team (unsigned nthreads)
{
size_t extra = sizeof (team->ordered_release[0])
+ sizeof (team->implicit_task[0]);
- team = gomp_malloc (sizeof (*team) + nthreads * extra);
+ team = team_malloc (sizeof (*team) + nthreads * extra);
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_init (&team->work_share_list_free_lock);
@@ -221,7 +221,7 @@ free_team (struct gomp_team *team)
gomp_barrier_destroy (&team->barrier);
gomp_mutex_destroy (&team->task_lock);
priority_queue_free (&team->task_queue);
- free (team);
+ team_free (team);
}
static void
@@ -285,8 +285,8 @@ gomp_free_thread (void *arg __attribute__((unused)))
if (pool->last_team)
free_team (pool->last_team);
#ifndef __nvptx__
- free (pool->threads);
- free (pool);
+ team_free (pool->threads);
+ team_free (pool);
#endif
thr->thread_pool = NULL;
}
@@ -1082,8 +1082,8 @@ gomp_pause_host (void)
if (pool->last_team)
free_team (pool->last_team);
#ifndef __nvptx__
- free (pool->threads);
- free (pool);
+ team_free (pool->threads);
+ team_free (pool);
#endif
thr->thread_pool = NULL;
}
diff --git a/libgomp/work.c b/libgomp/work.c
index a589b8b..28bb0c1 100644
--- a/libgomp/work.c
+++ b/libgomp/work.c
@@ -120,7 +120,7 @@ gomp_init_work_share (struct gomp_work_share *ws, size_t ordered,
else
ordered = nthreads * sizeof (*ws->ordered_team_ids);
if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE)
- ws->ordered_team_ids = gomp_malloc (ordered);
+ ws->ordered_team_ids = team_malloc (ordered);
else
ws->ordered_team_ids = ws->inline_ordered_team_ids;
memset (ws->ordered_team_ids, '\0', ordered);
@@ -142,7 +142,7 @@ gomp_fini_work_share (struct gomp_work_share *ws)
{
gomp_mutex_destroy (&ws->lock);
if (ws->ordered_team_ids != ws->inline_ordered_team_ids)
- free (ws->ordered_team_ids);
+ team_free (ws->ordered_team_ids);
gomp_ptrlock_destroy (&ws->next_ws);
}