author     Andrew Stubbs <ams@codesourcery.com>    2023-01-30 14:43:00 +0000
committer  Andrew Stubbs <ams@codesourcery.com>    2023-12-06 16:48:57 +0000
commit     e7d6c277fa28c0b9b621d23c471e0388d2912644 (patch)
tree       3ef9390ef49f8deefa281fd7ad2a145ad85254a6 /libgomp
parent     e9a19ead498fcc89186b724c6e76854f7751a89b (diff)
amdgcn, libgomp: low-latency allocator
This implements the OpenMP low-latency memory allocator for AMD GCN using the
small per-team LDS memory (Local Data Store).
Since addresses can now refer to LDS space, the "Global" address space is
no longer compatible. This patch therefore switches the backend to use
entirely "Flat" addressing (which supports both memories). A future patch
will re-enable "global" instructions for cases where it is known to be safe
to do so.
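
As a point of reference, here is a minimal usage sketch (not part of this
patch) showing how device code would obtain memory from the pool this commit
implements.  Per the libgomp.texi hunk below, omp_low_lat_mem_space is only
backed by LDS when the access trait is cgroup; the null fallback trait used
here is an illustrative choice so that a failed pool allocation returns NULL
rather than aborting.

/* Usage sketch only; not part of this commit.  Allocates from
   omp_low_lat_mem_space with access(cgroup), which this patch backs with
   LDS memory on amdgcn; fallback(null_fb) makes a failed allocation
   return NULL instead of aborting.  */
#include <omp.h>
#include <stdio.h>

int
main (void)
{
  int ok = 0;
#pragma omp target map(tofrom: ok)
  {
    omp_alloctrait_t traits[2] = { { omp_atk_access, omp_atv_cgroup },
                                   { omp_atk_fallback, omp_atv_null_fb } };
    omp_allocator_handle_t lowlat
      = omp_init_allocator (omp_low_lat_mem_space, 2, traits);

    int *p = (int *) omp_alloc (64 * sizeof (int), lowlat);
    if (p)
      {
        p[0] = 42;
        ok = p[0];
        omp_free (p, lowlat);
      }
    omp_destroy_allocator (lowlat);
  }
  printf ("ok = %d\n", ok);
  return 0;
}

The predefined omp_cgroup_mem_alloc and omp_pteam_mem_alloc allocators reach
the same pool without explicit traits, falling back to global memory when it
is exhausted (see the libgomp.texi hunk below).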
gcc/ChangeLog:
* config/gcn/gcn-builtins.def (DISPATCH_PTR): New built-in.
* config/gcn/gcn.cc (gcn_init_machine_status): Disable global
addressing.
(gcn_expand_builtin_1): Implement GCN_BUILTIN_DISPATCH_PTR.
libgomp/ChangeLog:
* config/gcn/libgomp-gcn.h (TEAM_ARENA_START): Move to here.
(TEAM_ARENA_FREE): Likewise.
(TEAM_ARENA_END): Likewise.
(GCN_LOWLAT_HEAP): New.
* config/gcn/team.c (LITTLEENDIAN_CPU): New, and import hsa.h.
(__gcn_lowlat_init): New prototype.
(gomp_gcn_enter_kernel): Initialize the low-latency heap.
* libgomp.h (TEAM_ARENA_START): Move to libgomp-gcn.h.
(TEAM_ARENA_FREE): Likewise.
(TEAM_ARENA_END): Likewise.
* plugin/plugin-gcn.c (lowlat_size): New variable.
(print_kernel_dispatch): Label the group_segment_size purpose.
(init_environment_variables): Read GOMP_GCN_LOWLAT_POOL.
(create_kernel_dispatch): Pass low-latency heap allocation to kernel.
(run_kernel): Use shadow; don't assume values.
* testsuite/libgomp.c/omp_alloc-traits.c: Enable for amdgcn.
* config/gcn/allocator.c: New file.
* libgomp.texi: Document low-latency implementation details.
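
The new config/gcn/allocator.c obtains its heap primitives from the shared
basic-allocator.c by defining BASIC_ALLOC_PREFIX.  Purely for orientation,
the entry points as they can be inferred from the call sites in this patch
look roughly as follows; the authoritative declarations are generated inside
basic-allocator.c and are not reproduced in the diff.

/* Prototypes inferred from call sites in config/gcn/allocator.c and
   config/gcn/team.c; basic-allocator.c generates these names from
   BASIC_ALLOC_PREFIX __gcn_lowlat.  __gcn_lowlat_init is called once per
   kernel launch from gomp_gcn_enter_kernel; the others back the
   MEMSPACE_* hooks.  */
void __gcn_lowlat_init (void *heap, size_t size);
void *__gcn_lowlat_alloc (char *heap, size_t size);
void *__gcn_lowlat_calloc (char *heap, size_t size);
void __gcn_lowlat_free (char *heap, void *addr, size_t size);
void *__gcn_lowlat_realloc (char *heap, void *addr, size_t oldsize,
                            size_t size);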
Diffstat (limited to 'libgomp')
-rw-r--r--  libgomp/config/gcn/allocator.c                  127
-rw-r--r--  libgomp/config/gcn/libgomp-gcn.h                  6
-rw-r--r--  libgomp/config/gcn/team.c                        12
-rw-r--r--  libgomp/libgomp.h                                 3
-rw-r--r--  libgomp/libgomp.texi                             13
-rw-r--r--  libgomp/plugin/plugin-gcn.c                      35
-rw-r--r--  libgomp/testsuite/libgomp.c/omp_alloc-traits.c    2
7 files changed, 188 insertions, 10 deletions
diff --git a/libgomp/config/gcn/allocator.c b/libgomp/config/gcn/allocator.c
new file mode 100644
index 0000000..e9a95d6
--- /dev/null
+++ b/libgomp/config/gcn/allocator.c
@@ -0,0 +1,127 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The low-latency allocators use space reserved in LDS memory when the
+   kernel is launched.  The heap is initialized in gomp_gcn_enter_kernel and
+   all allocations are forgotten when the kernel exits.  Allocations to other
+   memory spaces all use the system malloc syscall.
+
+   The pointers returned are 64-bit "Flat" addresses indistinguishable from
+   regular pointers, but only compatible with the "flat_load/store"
+   instructions.  The compiler has been coded to assign default address
+   spaces accordingly.
+
+   LDS memory is not visible to other teams, and therefore may only be used
+   when the memspace access trait is set accordingly.  */
+
+#include "libgomp.h"
+#include <stdlib.h>
+
+#define BASIC_ALLOC_PREFIX __gcn_lowlat
+#define BASIC_ALLOC_YIELD asm ("s_sleep 1" ::: "memory")
+#include "../../basic-allocator.c"
+
+/* The low-latency heap is located in LDS memory, but we need the __flat
+   address space for compatibility reasons.  */
+#define FLAT_HEAP_PTR \
+  ((void *) (uintptr_t) (void __flat *) (void __lds *) GCN_LOWLAT_HEAP)
+
+static void *
+gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
+{
+  if (memspace == omp_low_lat_mem_space)
+    {
+      char *shared_pool = FLAT_HEAP_PTR;
+
+      return __gcn_lowlat_alloc (shared_pool, size);
+    }
+  else
+    return malloc (size);
+}
+
+static void *
+gcn_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
+{
+  if (memspace == omp_low_lat_mem_space)
+    {
+      char *shared_pool = FLAT_HEAP_PTR;
+
+      return __gcn_lowlat_calloc (shared_pool, size);
+    }
+  else
+    return calloc (1, size);
+}
+
+static void
+gcn_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
+{
+  if (memspace == omp_low_lat_mem_space)
+    {
+      char *shared_pool = FLAT_HEAP_PTR;
+
+      __gcn_lowlat_free (shared_pool, addr, size);
+    }
+  else
+    free (addr);
+}
+
+static void *
+gcn_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
+                      size_t oldsize, size_t size)
+{
+  if (memspace == omp_low_lat_mem_space)
+    {
+      char *shared_pool = FLAT_HEAP_PTR;
+
+      return __gcn_lowlat_realloc (shared_pool, addr, oldsize, size);
+    }
+  else
+    return realloc (addr, size);
+}
+
+static inline int
+gcn_memspace_validate (omp_memspace_handle_t memspace, unsigned access)
+{
+  /* Disallow use of low-latency memory when it must be accessible by
+     all threads.  */
+  return (memspace != omp_low_lat_mem_space
+          || access != omp_atv_all);
+}
+
+#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
+  gcn_memspace_alloc (MEMSPACE, SIZE)
+#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
+  gcn_memspace_calloc (MEMSPACE, SIZE)
+#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
+  gcn_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
+#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
+  gcn_memspace_free (MEMSPACE, ADDR, SIZE)
+#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
+  gcn_memspace_validate (MEMSPACE, ACCESS)
+
+/* The default low-latency memspace implies omp_atv_all, which is incompatible
+   with the LDS memory space.  */
+#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1
+
+#include "../../allocator.c"
diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h
index f62b7dd..05b6fb6 100644
--- a/libgomp/config/gcn/libgomp-gcn.h
+++ b/libgomp/config/gcn/libgomp-gcn.h
@@ -33,6 +33,12 @@
 #define DEFAULT_GCN_STACK_SIZE (32*1024)
 #define DEFAULT_TEAM_ARENA_SIZE (64*1024)
 
+/* These define the LDS location of data needed by OpenMP.  */
+#define TEAM_ARENA_START 16 /* LDS offset of free pointer.  */
+#define TEAM_ARENA_FREE 24  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_END 32   /* LDS offset of end pointer.  */
+#define GCN_LOWLAT_HEAP 40  /* LDS offset of the OpenMP low-latency heap.  */
+
 struct heap {
   int64_t size;
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index fb20cbb..7ee6115 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -29,6 +29,12 @@
 #include <stdlib.h>
 #include <string.h>
 
+#define LITTLEENDIAN_CPU
+#include "hsa.h"
+
+/* Defined in basic-allocator.c via config/amdgcn/allocator.c.  */
+void __gcn_lowlat_init (void *heap, size_t size);
+
 static void gomp_thread_start (struct gomp_thread_pool *);
 extern void build_indirect_map (void);
 
@@ -75,6 +81,12 @@ gomp_gcn_enter_kernel (void)
   *arena_free = team_arena;
   *arena_end = team_arena + kernargs->arena_size_per_team;
 
+  /* Initialize the low-latency heap.  The header is the size.  */
+  void __lds *lowlat = (void __lds *) GCN_LOWLAT_HEAP;
+  hsa_kernel_dispatch_packet_t *queue_ptr = __builtin_gcn_dispatch_ptr ();
+  __gcn_lowlat_init ((void *) (uintptr_t) (void __flat *) lowlat,
+                     queue_ptr->group_segment_size - GCN_LOWLAT_HEAP);
+
   /* Allocate and initialize the team-local-storage data.  */
   struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
                                                   * numthreads);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 15a767c..fa29f42 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -114,9 +114,6 @@ extern void gomp_aligned_free (void *);
 #ifdef __AMDGCN__
 #include "libgomp-gcn.h"
 /* The arena is initialized in config/gcn/team.c.  */
-#define TEAM_ARENA_START 16 /* LDS offset of free pointer.  */
-#define TEAM_ARENA_FREE 24  /* LDS offset of free pointer.  */
-#define TEAM_ARENA_END 32   /* LDS offset of end pointer.  */
 
 static inline void * __attribute__((malloc))
 team_malloc (size_t size)
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 8d57c17..67a1112 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -5836,6 +5836,19 @@ The implementation remark:
   available devices (``host fallback'').
 @item The available stack size can be changed using the @code{GCN_STACK_SIZE}
   environment variable; the default is 32 kiB per thread.
+@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when
+  the @code{access} trait is set to @code{cgroup}.  The default pool size
+  is automatically scaled to share the 64 kiB LDS memory between the number
+  of teams configured to run on each compute-unit, but may be adjusted at
+  runtime by setting environment variable
+  @code{GOMP_GCN_LOWLAT_POOL=@var{bytes}}.
+@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory
+  because the definition implies the @code{omp_atv_all} trait; main
+  graphics memory is used instead.
+@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and
+  @code{omp_thread_mem_alloc} all use low-latency memory as first
+  preference, and fall back to main graphics memory when the low-latency
+  pool is exhausted.
 @end itemize
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 8aabbd9..7f8178c 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -550,6 +550,7 @@ static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
 
 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
 static int stack_size = DEFAULT_GCN_STACK_SIZE;
+static int lowlat_size = -1;
 
 /* Flag to decide whether print to stderr information about what is going on.
    Set in init_debug depending on environment variables.  */
@@ -1016,8 +1017,8 @@ print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
            dispatch->private_segment_size);
-  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
-           dispatch->group_segment_size);
+  fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
+           "", dispatch->group_segment_size);
   fprintf (stderr, "\n");
 }
 
@@ -1088,6 +1089,10 @@ init_environment_variables (void)
       if (tmp)
         stack_size = tmp;;
     }
+
+  const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
+  if (lowlat)
+    lowlat_size = atoi (lowlat);
 }
 
 /* Return malloc'd string with name of SYMBOL.  */
@@ -1930,7 +1935,25 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
   shadow->signal = sync_signal.handle;
 
   shadow->private_segment_size = kernel->private_segment_size;
-  shadow->group_segment_size = kernel->group_segment_size;
+
+  if (lowlat_size < 0)
+    {
+      /* Divide the LDS between the number of running teams.
+         Allocate not less than is defined in the kernel metadata.  */
+      int teams_per_cu = num_teams / get_cu_count (agent);
+      int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
+      shadow->group_segment_size
+        = (kernel->group_segment_size > LDS_per_team
+           ? kernel->group_segment_size
+           : LDS_per_team);
+    }
+  else if (lowlat_size < GCN_LOWLAT_HEAP+8)
+    /* Ensure that there's space for the OpenMP libgomp data.  */
+    shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
+  else
+    shadow->group_segment_size = (lowlat_size > 65536
+                                  ? 65536
+                                  : lowlat_size);
 
   /* We expect kernels to request a single pointer, explicitly, and the rest
      of struct kernargs, implicitly.  If they request anything else
@@ -2290,9 +2313,9 @@ run_kernel (struct kernel_info *kernel, void *vars,
       print_kernel_dispatch (shadow, 2);
     }
 
-  packet->private_segment_size = kernel->private_segment_size;
-  packet->group_segment_size = kernel->group_segment_size;
-  packet->kernel_object = kernel->object;
+  packet->private_segment_size = shadow->private_segment_size;
+  packet->group_segment_size = shadow->group_segment_size;
+  packet->kernel_object = shadow->object;
   packet->kernarg_address = shadow->kernarg_address;
   hsa_signal_t s;
   s.handle = shadow->signal;
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-traits.c b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c
index 4ff0fca..e9acc86 100644
--- a/libgomp/testsuite/libgomp.c/omp_alloc-traits.c
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c
@@ -1,7 +1,7 @@
 /* { dg-do run } */
 
 /* { dg-require-effective-target offload_device } */
-/* { dg-xfail-if "not implemented" { ! offload_target_nvptx } } */
+/* { dg-xfail-if "not implemented" { ! { offload_target_nvptx || offload_target_amdgcn } } } */
 
 /* Test that GPU low-latency allocation is limited to team access.  */
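
For readers following the create_kernel_dispatch hunk above, here is a
standalone sketch of the pool-sizing decision it adds.  The 80-compute-unit
device and 240-team launch used in the example are illustrative assumptions,
not values taken from the patch; only the arithmetic mirrors the code above.

/* Standalone sketch of the group_segment_size choice made in
   create_kernel_dispatch; the device and launch sizes in main are
   illustrative assumptions, not values from the patch.  */
#include <stdio.h>

static unsigned
lowlat_pool_size (int lowlat_size, unsigned kernel_group_segment_size,
                  int num_teams, int cu_count)
{
  if (lowlat_size < 0)
    {
      /* Default: divide the 64 kiB LDS between the teams sharing each
         compute unit, but never go below the kernel's own request.  */
      int teams_per_cu = num_teams / cu_count;
      unsigned lds_per_team = teams_per_cu ? 65536 / teams_per_cu : 65536;
      return (kernel_group_segment_size > lds_per_team
              ? kernel_group_segment_size : lds_per_team);
    }
  else if (lowlat_size < 40 + 8)        /* GCN_LOWLAT_HEAP + 8.  */
    return 40 + 8;                      /* Keep room for libgomp's LDS data.  */
  else
    return lowlat_size > 65536 ? 65536 : (unsigned) lowlat_size;
}

int
main (void)
{
  /* 240 teams on an 80-CU device -> 3 teams per CU -> 21845 bytes each.  */
  printf ("%u\n", lowlat_pool_size (-1, 512, 240, 80));
  /* GOMP_GCN_LOWLAT_POOL=16384 selects the pool size explicitly.  */
  printf ("%u\n", lowlat_pool_size (16384, 512, 240, 80));
  return 0;
}

In effect, the plugin clamps the per-team LDS pool to at most 64 kiB and at
least GCN_LOWLAT_HEAP + 8 bytes, so libgomp's own LDS bookkeeping always fits.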