diff options
author | Andrew Stubbs <ams@codesourcery.com> | 2021-12-03 17:46:41 +0000 |
---|---|---|
committer | Andrew Stubbs <ams@codesourcery.com> | 2023-12-06 16:48:57 +0000 |
commit | 30486fab717a90dc7516722c24ef9c5ea246c350 (patch) | |
tree | 473b0117e6eb85aa3d358ef4bf9ccdc3cff5ec6b /libgomp/config | |
parent | 458e7c937924bbcef80eb006af0b61420dbfc1c1 (diff) | |
download | gcc-30486fab717a90dc7516722c24ef9c5ea246c350.zip gcc-30486fab717a90dc7516722c24ef9c5ea246c350.tar.gz gcc-30486fab717a90dc7516722c24ef9c5ea246c350.tar.bz2 |
libgomp, nvptx: low-latency memory allocator
This patch adds support for allocating low-latency ".shared" memory on
NVPTX GPU device, via the omp_low_lat_mem_space and omp_alloc. The memory
can be allocated, reallocated, and freed using a basic but fast algorithm,
is thread safe and the size of the low-latency heap can be configured using
the GOMP_NVPTX_LOWLAT_POOL environment variable.
The use of the PTX dynamic_smem_size feature means that low-latency allocator
will not work with the PTX 3.1 multilib.
For now, the omp_low_lat_mem_alloc allocator also works, but that will change
when I implement the access traits.
libgomp/ChangeLog:
* allocator.c (MEMSPACE_ALLOC): New macro.
(MEMSPACE_CALLOC): New macro.
(MEMSPACE_REALLOC): New macro.
(MEMSPACE_FREE): New macro.
(predefined_alloc_mapping): New array. Add _Static_assert to match.
(ARRAY_SIZE): New macro.
(omp_aligned_alloc): Use MEMSPACE_ALLOC.
Implement fall-backs for predefined allocators. Simplify existing
fall-backs.
(omp_free): Use MEMSPACE_FREE.
(omp_calloc): Use MEMSPACE_CALLOC. Implement fall-backs for
predefined allocators. Simplify existing fall-backs.
(omp_realloc): Use MEMSPACE_REALLOC, MEMSPACE_ALLOC, and MEMSPACE_FREE.
Implement fall-backs for predefined allocators. Simplify existing
fall-backs.
* config/nvptx/team.c (__nvptx_lowlat_pool): New asm variable.
(__nvptx_lowlat_init): New prototype.
(gomp_nvptx_main): Call __nvptx_lowlat_init.
* libgomp.texi: Update memory space table.
* plugin/plugin-nvptx.c (lowlat_pool_size): New variable.
(GOMP_OFFLOAD_init_device): Read the GOMP_NVPTX_LOWLAT_POOL envvar.
(GOMP_OFFLOAD_run): Apply lowlat_pool_size.
* basic-allocator.c: New file.
* config/nvptx/allocator.c: New file.
* testsuite/libgomp.c/omp_alloc-1.c: New test.
* testsuite/libgomp.c/omp_alloc-2.c: New test.
* testsuite/libgomp.c/omp_alloc-3.c: New test.
* testsuite/libgomp.c/omp_alloc-4.c: New test.
* testsuite/libgomp.c/omp_alloc-5.c: New test.
* testsuite/libgomp.c/omp_alloc-6.c: New test.
Co-authored-by: Kwok Cheung Yeung <kcy@codesourcery.com>
Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
Diffstat (limited to 'libgomp/config')
-rw-r--r-- | libgomp/config/nvptx/allocator.c | 120 | ||||
-rw-r--r-- | libgomp/config/nvptx/team.c | 18 |
2 files changed, 138 insertions, 0 deletions
diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c new file mode 100644 index 0000000..6014fba --- /dev/null +++ b/libgomp/config/nvptx/allocator.c @@ -0,0 +1,120 @@ +/* Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* The low-latency allocators use space reserved in .shared memory when the + kernel is launched. The heap is initialized in gomp_nvptx_main and all + allocations are forgotten when the kernel exits. Allocations to other + memory spaces all use the system malloc syscall. + + The root heap descriptor is stored elsewhere in shared memory, and each + free chunk contains a similar descriptor for the next free chunk in the + chain. + + The descriptor is two 16-bit values: offset and size, which describe the + location of a chunk of memory available for allocation. The offset is + relative to the base of the heap. The special value 0xffff, 0xffff + indicates that the heap is locked. The descriptor is encoded into a + single 32-bit integer so that it may be easily accessed atomically. + + Memory is allocated to the first free chunk that fits. The free chain + is always stored in order of the offset to assist coalescing adjacent + chunks. */ + +#include "libgomp.h" +#include <stdlib.h> + +#define BASIC_ALLOC_PREFIX __nvptx_lowlat +#include "../../basic-allocator.c" + +/* There should be some .shared space reserved for us. There's no way to + express this magic extern sizeless array in C so use asm. */ +asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n"); + +static void * +nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size) +{ + if (memspace == omp_low_lat_mem_space) + { + char *shared_pool; + asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (shared_pool)); + + return __nvptx_lowlat_alloc (shared_pool, size); + } + else + return malloc (size); +} + +static void * +nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size) +{ + if (memspace == omp_low_lat_mem_space) + { + char *shared_pool; + asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (shared_pool)); + + return __nvptx_lowlat_calloc (shared_pool, size); + } + else + return calloc (1, size); +} + +static void +nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size) +{ + if (memspace == omp_low_lat_mem_space) + { + char *shared_pool; + asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (shared_pool)); + + __nvptx_lowlat_free (shared_pool, addr, size); + } + else + free (addr); +} + +static void * +nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr, + size_t oldsize, size_t size) +{ + if (memspace == omp_low_lat_mem_space) + { + char *shared_pool; + asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (shared_pool)); + + return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size); + } + else + return realloc (addr, size); +} + +#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \ + nvptx_memspace_alloc (MEMSPACE, SIZE) +#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \ + nvptx_memspace_calloc (MEMSPACE, SIZE) +#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \ + nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE) +#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \ + nvptx_memspace_free (MEMSPACE, ADDR, SIZE) + +#include "../../allocator.c" diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c index 59521fa..9243774 100644 --- a/libgomp/config/nvptx/team.c +++ b/libgomp/config/nvptx/team.c @@ -37,6 +37,12 @@ int __gomp_team_num __attribute__((shared,nocommon)); static void gomp_thread_start (struct gomp_thread_pool *); extern void build_indirect_map (void); +/* There should be some .shared space reserved for us. There's no way to + express this magic extern sizeless array in C so use asm. */ +asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n"); + +/* Defined in basic-allocator.c via config/nvptx/allocator.c. */ +void __nvptx_lowlat_init (void *heap, size_t size); /* This externally visible function handles target region entry. It sets up a per-team thread pool and transfers control by calling FN (FN_DATA) @@ -68,6 +74,18 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data) nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs)); memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs)); + /* Find the low-latency heap details .... */ + uint32_t *shared_pool; + uint32_t shared_pool_size = 0; + asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool)); +#if __PTX_ISA_VERSION_MAJOR__ > 4 \ + || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR__ >= 1) + asm ("mov.u32\t%0, %%dynamic_smem_size;\n" + : "=r"(shared_pool_size)); +#endif + __nvptx_lowlat_init (shared_pool, shared_pool_size); + + /* Initialize the thread pool. */ struct gomp_thread_pool *pool = alloca (sizeof (*pool)); pool->threads = alloca (ntids * sizeof (*pool->threads)); for (tid = 0; tid < ntids; tid++) |