aboutsummaryrefslogtreecommitdiff
path: root/libgomp/config
diff options
context:
space:
mode:
authorAndrew Stubbs <ams@codesourcery.com>2021-12-03 17:46:41 +0000
committerAndrew Stubbs <ams@codesourcery.com>2023-12-06 16:48:57 +0000
commit30486fab717a90dc7516722c24ef9c5ea246c350 (patch)
tree473b0117e6eb85aa3d358ef4bf9ccdc3cff5ec6b /libgomp/config
parent458e7c937924bbcef80eb006af0b61420dbfc1c1 (diff)
downloadgcc-30486fab717a90dc7516722c24ef9c5ea246c350.zip
gcc-30486fab717a90dc7516722c24ef9c5ea246c350.tar.gz
gcc-30486fab717a90dc7516722c24ef9c5ea246c350.tar.bz2
libgomp, nvptx: low-latency memory allocator
This patch adds support for allocating low-latency ".shared" memory on NVPTX GPU devices, via the omp_low_lat_mem_space memory space and omp_alloc. The memory can be allocated, reallocated, and freed using a basic but fast algorithm, is thread safe, and the size of the low-latency heap can be configured using the GOMP_NVPTX_LOWLAT_POOL environment variable. The use of the PTX dynamic_smem_size feature means that the low-latency allocator will not work with the PTX 3.1 multilib. For now, the omp_low_lat_mem_alloc allocator also works, but that will change when I implement the access traits. libgomp/ChangeLog: * allocator.c (MEMSPACE_ALLOC): New macro. (MEMSPACE_CALLOC): New macro. (MEMSPACE_REALLOC): New macro. (MEMSPACE_FREE): New macro. (predefined_alloc_mapping): New array. Add _Static_assert to match. (ARRAY_SIZE): New macro. (omp_aligned_alloc): Use MEMSPACE_ALLOC. Implement fall-backs for predefined allocators. Simplify existing fall-backs. (omp_free): Use MEMSPACE_FREE. (omp_calloc): Use MEMSPACE_CALLOC. Implement fall-backs for predefined allocators. Simplify existing fall-backs. (omp_realloc): Use MEMSPACE_REALLOC, MEMSPACE_ALLOC, and MEMSPACE_FREE. Implement fall-backs for predefined allocators. Simplify existing fall-backs. * config/nvptx/team.c (__nvptx_lowlat_pool): New asm variable. (__nvptx_lowlat_init): New prototype. (gomp_nvptx_main): Call __nvptx_lowlat_init. * libgomp.texi: Update memory space table. * plugin/plugin-nvptx.c (lowlat_pool_size): New variable. (GOMP_OFFLOAD_init_device): Read the GOMP_NVPTX_LOWLAT_POOL envvar. (GOMP_OFFLOAD_run): Apply lowlat_pool_size. * basic-allocator.c: New file. * config/nvptx/allocator.c: New file. * testsuite/libgomp.c/omp_alloc-1.c: New test. * testsuite/libgomp.c/omp_alloc-2.c: New test. * testsuite/libgomp.c/omp_alloc-3.c: New test. * testsuite/libgomp.c/omp_alloc-4.c: New test. * testsuite/libgomp.c/omp_alloc-5.c: New test. * testsuite/libgomp.c/omp_alloc-6.c: New test. 
Co-authored-by: Kwok Cheung Yeung <kcy@codesourcery.com> Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
Diffstat (limited to 'libgomp/config')
-rw-r--r--libgomp/config/nvptx/allocator.c120
-rw-r--r--libgomp/config/nvptx/team.c18
2 files changed, 138 insertions, 0 deletions
diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c
new file mode 100644
index 0000000..6014fba
--- /dev/null
+++ b/libgomp/config/nvptx/allocator.c
@@ -0,0 +1,120 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The low-latency allocators use space reserved in .shared memory when the
+ kernel is launched. The heap is initialized in gomp_nvptx_main and all
+ allocations are forgotten when the kernel exits. Allocations to other
+ memory spaces all use the system malloc syscall.
+
+ The root heap descriptor is stored elsewhere in shared memory, and each
+ free chunk contains a similar descriptor for the next free chunk in the
+ chain.
+
+ The descriptor is two 16-bit values: offset and size, which describe the
+ location of a chunk of memory available for allocation. The offset is
+ relative to the base of the heap. The special value 0xffff, 0xffff
+ indicates that the heap is locked. The descriptor is encoded into a
+ single 32-bit integer so that it may be easily accessed atomically.
+
+ Memory is allocated to the first free chunk that fits. The free chain
+ is always stored in order of the offset to assist coalescing adjacent
+ chunks. */
+
+#include "libgomp.h"
+#include <stdlib.h>
+
+/* Pull in the basic allocator implementation for this file.
+   NOTE(review): BASIC_ALLOC_PREFIX presumably selects the name prefix of
+   the functions that basic-allocator.c defines, since the code below calls
+   __nvptx_lowlat_alloc, __nvptx_lowlat_calloc, __nvptx_lowlat_realloc and
+   __nvptx_lowlat_free -- confirm against basic-allocator.c.  */
+#define BASIC_ALLOC_PREFIX __nvptx_lowlat
+#include "../../basic-allocator.c"
+
+/* There should be some .shared space reserved for us. There's no way to
+ express this magic extern sizeless array in C so use asm. */
+asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
+
+/* Allocate SIZE bytes from MEMSPACE.  Requests for omp_low_lat_mem_space
+   are handed to __nvptx_lowlat_alloc together with the address of
+   __nvptx_lowlat_pool; every other memory space is forwarded to malloc.
+   Returns whatever the selected allocator returns.  */
+static void *
+nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
+{
+  char *pool_base;
+
+  /* Anything that is not the low-latency space uses the ordinary heap.  */
+  if (memspace != omp_low_lat_mem_space)
+    return malloc (size);
+
+  /* Turn the .shared address of the pool into a generic pointer.  */
+  asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (pool_base));
+  return __nvptx_lowlat_alloc (pool_base, size);
+}
+
+/* Zero-initializing counterpart of nvptx_memspace_alloc: obtain SIZE bytes
+   from MEMSPACE.  omp_low_lat_mem_space is served by __nvptx_lowlat_calloc
+   using the address of __nvptx_lowlat_pool; all other memory spaces go
+   through calloc with a single element of SIZE bytes.  */
+static void *
+nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
+{
+  char *pool_base;
+
+  /* Anything that is not the low-latency space uses the ordinary heap.  */
+  if (memspace != omp_low_lat_mem_space)
+    return calloc (1, size);
+
+  /* Turn the .shared address of the pool into a generic pointer.  */
+  asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (pool_base));
+  return __nvptx_lowlat_calloc (pool_base, size);
+}
+
+/* Release the block at ADDR that was obtained from MEMSPACE.  For
+   omp_low_lat_mem_space the block and its SIZE are passed to
+   __nvptx_lowlat_free along with the address of __nvptx_lowlat_pool; for
+   any other memory space ADDR is passed to free and SIZE is not used.  */
+static void
+nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
+{
+  char *pool_base;
+
+  /* Anything that is not the low-latency space uses the ordinary heap.  */
+  if (memspace != omp_low_lat_mem_space)
+    {
+      free (addr);
+      return;
+    }
+
+  /* Turn the .shared address of the pool into a generic pointer.  */
+  asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (pool_base));
+  __nvptx_lowlat_free (pool_base, addr, size);
+}
+
+/* Resize the block at ADDR, obtained from MEMSPACE, from OLDSIZE to SIZE
+   bytes.  For omp_low_lat_mem_space the request is passed to
+   __nvptx_lowlat_realloc along with the address of __nvptx_lowlat_pool;
+   for any other memory space it is passed to realloc and OLDSIZE is not
+   used.  Returns whatever the selected allocator returns.  */
+static void *
+nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
+			size_t oldsize, size_t size)
+{
+  char *pool_base;
+
+  /* Anything that is not the low-latency space uses the ordinary heap.  */
+  if (memspace != omp_low_lat_mem_space)
+    return realloc (addr, size);
+
+  /* Turn the .shared address of the pool into a generic pointer.  */
+  asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r" (pool_base));
+  return __nvptx_lowlat_realloc (pool_base, addr, oldsize, size);
+}
+
+/* Map each MEMSPACE_* operation onto the matching nvptx_memspace_*
+   function defined above.  These must be defined before the generic
+   allocator.c is included below.
+   NOTE(review): presumably allocator.c uses these macros in omp_alloc and
+   friends, as the ChangeLog for this patch describes -- confirm there.  */
+#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
+ nvptx_memspace_alloc (MEMSPACE, SIZE)
+#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
+ nvptx_memspace_calloc (MEMSPACE, SIZE)
+#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
+ nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
+#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
+ nvptx_memspace_free (MEMSPACE, ADDR, SIZE)
+
+#include "../../allocator.c"
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index 59521fa..9243774 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -37,6 +37,12 @@ int __gomp_team_num __attribute__((shared,nocommon));
static void gomp_thread_start (struct gomp_thread_pool *);
extern void build_indirect_map (void);
+/* There should be some .shared space reserved for us. There's no way to
+ express this magic extern sizeless array in C so use asm. */
+asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
+
+/* Defined in basic-allocator.c via config/nvptx/allocator.c. */
+void __nvptx_lowlat_init (void *heap, size_t size);
/* This externally visible function handles target region entry. It
sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
@@ -68,6 +74,18 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
+ /* Find the low-latency heap details .... */
+ uint32_t *shared_pool;
+ uint32_t shared_pool_size = 0;
+ asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
+#if __PTX_ISA_VERSION_MAJOR__ > 4 \
+ || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR__ >= 1)
+ asm ("mov.u32\t%0, %%dynamic_smem_size;\n"
+ : "=r"(shared_pool_size));
+#endif
+ __nvptx_lowlat_init (shared_pool, shared_pool_size);
+
+ /* Initialize the thread pool. */
struct gomp_thread_pool *pool = alloca (sizeof (*pool));
pool->threads = alloca (ntids * sizeof (*pool->threads));
for (tid = 0; tid < ntids; tid++)