diff options
Diffstat (limited to 'libgomp')
-rw-r--r-- | libgomp/ChangeLog.omp | 55 | ||||
-rw-r--r-- | libgomp/libgomp-plugin.h | 1 | ||||
-rw-r--r-- | libgomp/libgomp.h | 3 | ||||
-rw-r--r-- | libgomp/libgomp.map | 6 | ||||
-rw-r--r-- | libgomp/libgomp.texi | 112 | ||||
-rw-r--r-- | libgomp/omp.h.in | 4 | ||||
-rw-r--r-- | libgomp/omp_lib.f90.in | 23 | ||||
-rw-r--r-- | libgomp/omp_lib.h.in | 25 | ||||
-rw-r--r-- | libgomp/plugin/cuda-lib.def | 1 | ||||
-rw-r--r-- | libgomp/plugin/plugin-gcn.c | 80 | ||||
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 9 | ||||
-rw-r--r-- | libgomp/target.c | 83 | ||||
-rw-r--r-- | libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c | 62 | ||||
-rw-r--r-- | libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-3.c | 80 | ||||
-rw-r--r-- | libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c | 62 | ||||
-rw-r--r-- | libgomp/testsuite/libgomp.c/declare-variant-4-gfx942.c | 8 | ||||
-rw-r--r-- | libgomp/testsuite/libgomp.c/declare-variant-4.h | 8 | ||||
-rw-r--r-- | libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c | 7 | ||||
-rw-r--r-- | libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90 | 67 | ||||
-rw-r--r-- | libgomp/testsuite/libgomp.fortran/omp_target_memset.f90 | 39 |
20 files changed, 728 insertions, 7 deletions
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index 5ecab6d..2bf31a9 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,3 +1,58 @@ +2025-06-10 Tobias Burnus <tburnus@baylibre.com> + + Backported from master: + 2025-06-10 Tobias Burnus <tburnus@baylibre.com> + + * testsuite/libgomp.c/declare-variant-4.h (gfx942): New variant function. + * testsuite/libgomp.c/declare-variant-4-gfx942.c: New test. + +2025-06-10 Tobias Burnus <tburnus@baylibre.com> + + Backported from master: + 2025-06-02 Tobias Burnus <tburnus@baylibre.com> + + PR libgomp/120444 + * libgomp-plugin.h (GOMP_OFFLOAD_memset): Declare. + * libgomp.h (struct gomp_device_descr): Add memset_func. + * libgomp.map (GOMP_6.0.1): Add omp_target_memset{,_async}. + * libgomp.texi (Device Memory Routines): Document them. + * omp.h.in (omp_target_memset, omp_target_memset_async): Declare. + * omp_lib.f90.in (omp_target_memset, omp_target_memset_async): + Add interfaces. + * omp_lib.h.in (omp_target_memset, omp_target_memset_async): Likewise. + * plugin/cuda-lib.def: Add cuMemsetD8. + * plugin/plugin-gcn.c (struct hsa_runtime_fn_info): Add + hsa_amd_memory_fill_fn. + (init_hsa_runtime_functions): DLSYM_OPT_FN load it. + (GOMP_OFFLOAD_memset): New. + * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memset): New. + * target.c (omp_target_memset_int, omp_target_memset, + omp_target_memset_async_helper, omp_target_memset_async): New. + (gomp_load_plugin_for_device): Add DLSYM (memset). + * testsuite/libgomp.c-c++-common/omp_target_memset.c: New test. + * testsuite/libgomp.c-c++-common/omp_target_memset-2.c: New test. + * testsuite/libgomp.c-c++-common/omp_target_memset-3.c: New test. + * testsuite/libgomp.fortran/omp_target_memset.f90: New test. + * testsuite/libgomp.fortran/omp_target_memset-2.f90: New test. + +2025-06-06 Tobias Burnus <tburnus@baylibre.com> + + Backported from master: + 2025-06-06 Tobias Burnus <tburnus@baylibre.com> + Sandra Loosemore <sloosemore@baylibre.com> + + * libgomp.texi (omp_get_num_devices, omp_get_intrinsic_device): + Document builtin handling. + +2025-06-06 Tobias Burnus <tburnus@baylibre.com> + + Backported from master: + 2025-06-06 Tobias Burnus <tburnus@baylibre.com> + + PR target/120530 + * testsuite/libgomp.c/target-map-zero-sized-3.c (main): Add missing + map clause; remove unused variable. + 2025-05-30 Thomas Schwinge <tschwinge@baylibre.com> * testsuite/libgomp.c++/target-flex-300.C: XFAIL. diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 3c7741b..d0bcc23 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -179,6 +179,7 @@ extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *, size_t, size_t, size_t, size_t, size_t, const void *, size_t, size_t, size_t, size_t, size_t); +extern bool GOMP_OFFLOAD_memset (int, void *, int, size_t); extern bool GOMP_OFFLOAD_can_run (void *); extern void GOMP_OFFLOAD_run (int, void *, void *, void **); extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 571ac62c..465f7c1 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1441,9 +1441,10 @@ struct gomp_device_descr __typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func; __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func; __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func; + __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func; __typeof (GOMP_OFFLOAD_memcpy2d) *memcpy2d_func; __typeof (GOMP_OFFLOAD_memcpy3d) *memcpy3d_func; - __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func; + __typeof (GOMP_OFFLOAD_memset) *memset_func; __typeof (GOMP_OFFLOAD_can_run) *can_run_func; __typeof (GOMP_OFFLOAD_run) *run_func; __typeof (GOMP_OFFLOAD_async_run) *async_run_func; diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map index bc2de6b..a6c523b 100644 --- a/libgomp/libgomp.map +++ b/libgomp/libgomp.map @@ -454,6 +454,12 @@ GOMP_6.0 { omp_get_uid_from_device_8_; } GOMP_5.1.3; +GOMP_6.0.1 { + global: + omp_target_memset; + omp_target_memset_async; +} GOMP_6.0; + OACC_2.0 { global: acc_get_num_devices; diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index e1b70b0..91259cc 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -603,7 +603,7 @@ to address of matching mapped list item per 5.1, Sect. 2.21.7.2 @tab N @tab @code{omp_get_device_teams_thread_limit}, and @code{omp_set_device_teams_thread_limit} routines @tab N @tab @item @code{omp_target_memset} and @code{omp_target_memset_async} routines - @tab N @tab + @tab Y @tab @item Fortran version of the interop runtime routines @tab Y @tab @item Routines for obtaining memory spaces/allocators for shared/device memory @tab N @tab @@ -1802,6 +1802,11 @@ Returns the number of available non-host devices. The effect of running this routine in a @code{target} region is unspecified. +Note that in GCC the function is marked pure, i.e. as returning always the +same number. When GCC was not configured to support offloading, it is replaced +by zero; compile with @option{-fno-builtin-omp_get_num_devices} if a run-time +function is desired. + @item @emph{C/C++}: @multitable @columnfractions .20 .80 @item @emph{Prototype}: @tab @code{int omp_get_num_devices(void);} @@ -1812,6 +1817,9 @@ The effect of running this routine in a @code{target} region is unspecified. @item @emph{Interface}: @tab @code{integer function omp_get_num_devices()} @end multitable +@item @emph{See also}: +@ref{omp_get_initial_device} + @item @emph{Reference}: @uref{https://www.openmp.org, OpenMP specification v4.5}, Section 3.2.31. @end table @@ -1950,6 +1958,12 @@ the value of @code{omp_initial_device}. The effect of running this routine in a @code{target} region is unspecified. +Note that GCC inlines this function unless you compile with +@option{-fno-builtin-omp_get_initial_device}. If GCC was not configured to +support offloading, it expands to constant zero; in non-host code it expands +to @code{omp_initial_device}; and otherwise it is replaced with a call to +@code{omp_get_num_devices}. + @item @emph{C/C++} @multitable @columnfractions .20 .80 @item @emph{Prototype}: @tab @code{int omp_get_initial_device(void);} @@ -1984,8 +1998,8 @@ pointers on devices. They have C linkage and do not throw exceptions. * omp_target_memcpy_async:: Copy data between devices asynchronously * omp_target_memcpy_rect:: Copy a subvolume of data between devices * omp_target_memcpy_rect_async:: Copy a subvolume of data between devices asynchronously -@c * omp_target_memset:: <fixme>/TR12 -@c * omp_target_memset_async:: <fixme>/TR12 +* omp_target_memset:: Set bytes in device memory +* omp_target_memset_async:: Set bytes in device memory asynchronously * omp_target_associate_ptr:: Associate a device pointer with a host pointer * omp_target_disassociate_ptr:: Remove device--host pointer association * omp_get_mapped_ptr:: Return device pointer to a host pointer @@ -2398,6 +2412,98 @@ the initial device. @end table +@node omp_target_memset +@subsection @code{omp_target_memset} -- Set bytes in device memory +@table @asis +@item @emph{Description}: +This routine fills memory on the device identified by device number +@var{device_num}. Starting from the device address @var{ptr}, the first +@var{count} bytes are set to the value @var{val}, converted to +@code{unsigned char}. If @var{count} is zero, the routine has no effect; +if @var{ptr} is @code{NULL}, the behavior is unspecified. The function +returns @var{ptr}. + +The @var{device_num} must be a conforming device number and @var{ptr} must be +a valid device pointer for that device. Running this routine in a +@code{target} region except on the initial device is not supported. + +@item @emph{C/C++} +@multitable @columnfractions .20 .80 +@item @emph{Prototype}: @tab @code{void *omp_target_memcpy(void *ptr,} +@item @tab @code{ int val,} +@item @tab @code{ size_t count,} +@item @tab @code{ int device_num)} +@end multitable + +@item @emph{Fortran}: +@multitable @columnfractions .20 .80 +@item @emph{Interface}: @tab @code{type(c_ptr) function omp_target_memset( &} +@item @tab @code{ ptr, val, count, device_num) bind(C)} +@item @tab @code{use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int} +@item @tab @code{type(c_ptr), value :: ptr} +@item @tab @code{integer(c_size_t), value :: count} +@item @tab @code{integer(c_int), value :: val, device_num} +@end multitable + +@item @emph{See also}: +@ref{omp_target_memset_async} + +@item @emph{Reference}: +@uref{https://www.openmp.org, OpenMP specification v6.0}, Section 25.8.1 +@end table + + + +@node omp_target_memset_async +@subsection @code{omp_target_memset} -- Set bytes in device memory asynchronously +@table @asis +@item @emph{Description}: +This routine fills memory on the device identified by device number +@var{device_num}. Starting from the device address @var{ptr}, the first +@var{count} bytes are set to the value @var{val}, converted to +@code{unsigned char}. If @var{count} is zero, the routine has no effect; +if @var{ptr} is @code{NULL}, the behavior is unspecified. Task dependence +is expressed by passing an array of depend objects to @var{depobj_list}, where +the number of array elements is passed as @var{depobj_count}; if the count is +zero, the @var{depobj_list} argument is ignored. In C++ and Fortran, the +@var{depobj_list} argument can also be omitted in that case. The function +returns @var{ptr}. + +The @var{device_num} must be a conforming device number and @var{ptr} must be +a valid device pointer for that device. Running this routine in a +@code{target} region except on the initial device is not supported. + +@item @emph{C/C++} +@multitable @columnfractions .20 .80 +@item @emph{Prototype}: @tab @code{void *omp_target_memcpy_async(void *ptr,} +@item @tab @code{ int val,} +@item @tab @code{ size_t count,} +@item @tab @code{ int device_num,} +@item @tab @code{ int depobj_count,} +@item @tab @code{ omp_depend_t *depobj_list)} +@end multitable + +@item @emph{Fortran}: +@multitable @columnfractions .20 .80 +@item @emph{Interface}: @tab @code{type(c_ptr) function omp_target_memset_async( &} +@item @tab @code{ ptr, val, count, device_num, &} +@item @tab @code{ depobj_count, depobj_list) bind(C)} +@item @tab @code{use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int} +@item @tab @code{type(c_ptr), value :: ptr} +@item @tab @code{integer(c_size_t), value :: count} +@item @tab @code{integer(c_int), value :: val, device_num, depobj_count} +@item @tab @code{integer(omp_depend_kind), optional :: depobj_list(*)} +@end multitable + + +@item @emph{See also}: +@ref{omp_target_memset} + +@item @emph{Reference}: +@uref{https://www.openmp.org, OpenMP specification v6.0}, Section 25.8.2 +@end table + + @node omp_target_associate_ptr @subsection @code{omp_target_associate_ptr} -- Associate a device pointer with a host pointer diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in index 8d17db1..4f2bc46 100644 --- a/libgomp/omp.h.in +++ b/libgomp/omp.h.in @@ -347,6 +347,10 @@ extern int omp_target_memcpy_rect_async (void *, const void *, __SIZE_TYPE__, const __SIZE_TYPE__ *, int, int, int, omp_depend_t * __GOMP_DEFAULT_NULL) __GOMP_NOTHROW; +extern void *omp_target_memset (void *, int, __SIZE_TYPE__, int) __GOMP_NOTHROW; +extern void *omp_target_memset_async (void *, int, __SIZE_TYPE__, int, + int, omp_depend_t * __GOMP_DEFAULT_NULL) + __GOMP_NOTHROW; extern int omp_target_associate_ptr (const void *, const void *, __SIZE_TYPE__, __SIZE_TYPE__, int) __GOMP_NOTHROW; extern int omp_target_disassociate_ptr (const void *, int) __GOMP_NOTHROW; diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in index cb6b95f..ce866c0 100644 --- a/libgomp/omp_lib.f90.in +++ b/libgomp/omp_lib.f90.in @@ -904,6 +904,29 @@ end interface interface + function omp_target_memset (ptr, val, count, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + type(c_ptr) :: omp_target_memset + type(c_ptr), value :: ptr + integer(c_size_t), value :: count + integer(c_int), value :: val, device_num + end function omp_target_memset + end interface + + interface + function omp_target_memset_async (ptr, val, count, device_num, & + depobj_count, depobj_list) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + import :: omp_depend_kind + type(c_ptr) :: omp_target_memset_async + type(c_ptr), value :: ptr + integer(c_size_t), value :: count + integer(c_int), value :: val, device_num, depobj_count + integer(omp_depend_kind), optional :: depobj_list(*) + end function omp_target_memset_async + end interface + + interface function omp_target_associate_ptr (host_ptr, device_ptr, size, & device_offset, device_num) bind(c) use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int diff --git a/libgomp/omp_lib.h.in b/libgomp/omp_lib.h.in index f7af5ff..9047095 100644 --- a/libgomp/omp_lib.h.in +++ b/libgomp/omp_lib.h.in @@ -505,6 +505,31 @@ end interface interface + function omp_target_memset (ptr, val, count, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + type(c_ptr) omp_target_memset + type(c_ptr), value :: ptr + integer(c_size_t), value :: count + integer(c_int), value :: val, device_num + end function omp_target_memset + end interface + + interface + function omp_target_memset_async (ptr, val, count, device_num, & + & depobj_count, depobj_list) & + & bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + import :: omp_depend_kind + type(c_ptr) :: omp_target_memset_async + type(c_ptr), value :: ptr + integer(c_size_t), value :: count + integer(c_int), value :: val, device_num, depobj_count + integer(omp_depend_kind), optional :: depobj_list(*) + end function omp_target_memset_async + end interface + + + interface function omp_target_associate_ptr (host_ptr, device_ptr, size, & & device_offset, device_num) & & bind(c) diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index eb562ac..7f4ddcc 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuMemcpyHtoDAsync) CUDA_ONE_CALL (cuMemcpy2D) CUDA_ONE_CALL (cuMemcpy2DUnaligned) CUDA_ONE_CALL (cuMemcpy3D) +CUDA_ONE_CALL (cuMemsetD8) CUDA_ONE_CALL (cuMemFree) CUDA_ONE_CALL (cuMemFreeHost) CUDA_ONE_CALL (cuMemGetAddressRange) diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index f823b27..ae358f5 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -208,6 +208,8 @@ struct hsa_runtime_fn_info hsa_status_t (*hsa_code_object_deserialize_fn) (void *serialized_code_object, size_t serialized_code_object_size, const char *options, hsa_code_object_t *code_object); + hsa_status_t (*hsa_amd_memory_fill_fn)(void *ptr, uint32_t value, + size_t count); hsa_status_t (*hsa_amd_memory_lock_fn) (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent, void **agent_ptr); @@ -1456,6 +1458,7 @@ init_hsa_runtime_functions (void) DLSYM_FN (hsa_signal_load_acquire) DLSYM_FN (hsa_queue_destroy) DLSYM_FN (hsa_code_object_deserialize) + DLSYM_OPT_FN (hsa_amd_memory_fill) DLSYM_OPT_FN (hsa_amd_memory_lock) DLSYM_OPT_FN (hsa_amd_memory_unlock) DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect) @@ -4437,6 +4440,83 @@ init_hip_runtime_functions (void) return true; } +bool +GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + + /* A memset feature is only provided via hsa_amd_memory_fill; while it + is fast, it is an HSA extension and it has two requirements: The memory + must be aligned to multiples of 4 bytes - and, by construction, only + multiples of 4 bytes can be filled (uint32_t value argument). + + This means: Either not using that function or up to three function calls: + - copy 1 to 3 bytes to get alignment (hsa_memory_copy), if unaligned + - call hsa_amd_memory_fill + - copy remaining 1 to 3 bytes (hsa_memory_copy), if after alignment + count is not a multiple of 4 bytes. + + Having more than one function call is only profitable if there is + enough data to process; see below for the used heuristic values. */ + + uint8_t v8 = (uint8_t) val; + size_t before = (4 - (uintptr_t) ptr % 4) % 4; /* 0 to 3 bytes. */ + size_t tail = (count - before) % 4; /* 0 to 3 bytes. */ + + /* Heuristic */ + enum { + /* Prefer alloca to malloc up to ... */ + alloca_size = 256, /* bytes */ + /* Call hsa_amd_memory_fill also when two copy calls are required. */ + always_use_fill = 256*1024, /* bytes */ + /* Call hsa_amd_memory_fill also when on copy call is required. */ + use_fill_one_copy = (128+64)*1024 /* bytes */ + }; + + /* Do not call hsa_amd_memory_fill when any of the following conditions + is true. Note that it is always preferred if available and + before == tail == 0. */ + if (__builtin_expect (!hsa_fns.hsa_amd_memory_fill_fn, 0) + || (before && tail && count < always_use_fill) + || ((before || tail) && count < use_fill_one_copy)) + before = count; + + /* Copy call for alignment - or all data, if condition above is true. */ + if (before) + { + void *data; + if (before > alloca_size) + data = malloc (before * sizeof (uint8_t)); + else + data = alloca (before * sizeof (uint8_t)); + memset (data, val, before); + status = hsa_fns.hsa_memory_copy_fn (ptr, data, before); + if (before > alloca_size) + free (data); + if (data == 0 || status != HSA_STATUS_SUCCESS) + goto fail; + count -= before; + } + + if (count == 0) + return true; + + ptr += before; + + uint32_t values = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24); + status = hsa_fns.hsa_amd_memory_fill_fn (ptr, values, count / 4); + if (tail && status == HSA_STATUS_SUCCESS) + { + ptr += count - tail; + status = hsa_fns.hsa_memory_copy_fn (ptr, &values, tail); + } + if (status == HSA_STATUS_SUCCESS) + return true; + +fail: + GOMP_PLUGIN_error ("memory set failed"); + return false; +} void GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord, diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 712c8b7..90c5916 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -2339,6 +2339,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, } bool +GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count) +{ + if (!nvptx_attach_host_thread_to_device (ord)) + return false; + CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count); + return true; +} + +bool GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src, size_t n, struct goacc_asyncqueue *aq) { diff --git a/libgomp/target.c b/libgomp/target.c index 4ad803a..1932e2a 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -5740,6 +5740,88 @@ omp_target_memcpy_rect_async (void *dst, const void *src, size_t element_size, return 0; } +static void +omp_target_memset_int (void *ptr, int val, size_t count, + struct gomp_device_descr *devicep) +{ + if (__builtin_expect (count == 0, 0)) + return; + if (devicep == NULL) + { + memset (ptr, val, count); + return; + } + + gomp_mutex_lock (&devicep->lock); + int ret = devicep->memset_func (devicep->target_id, ptr, val, count); + gomp_mutex_unlock (&devicep->lock); + if (!ret) + gomp_fatal ("omp_target_memset failed"); +} + +void* +omp_target_memset (void *ptr, int val, size_t count, int device_num) +{ + struct gomp_device_descr *devicep; + if (device_num == omp_initial_device + || device_num == gomp_get_num_devices () + || (devicep = resolve_device (device_num, false)) == NULL + || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + devicep = NULL; + + omp_target_memset_int (ptr, val, count, devicep); + return ptr; +} + +typedef struct +{ + void *ptr; + size_t count; + struct gomp_device_descr *devicep; + int val; +} omp_target_memset_data; + +static void +omp_target_memset_async_helper (void *args) +{ + omp_target_memset_data *a = args; + omp_target_memset_int (a->ptr, a->val, a->count, a->devicep); +} + +void* +omp_target_memset_async (void *ptr, int val, size_t count, int device_num, + int depobj_count, omp_depend_t *depobj_list) +{ + void *depend[depobj_count + 5]; + struct gomp_device_descr *devicep; + unsigned flags = 0; + int i; + + if (device_num == omp_initial_device + || device_num == gomp_get_num_devices () + || (devicep = resolve_device (device_num, false)) == NULL + || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + devicep = NULL; + + omp_target_memset_data s = {.ptr = ptr, .val = val, .count = count, + .devicep = devicep}; + if (depobj_count > 0 && depobj_list != NULL) + { + flags |= GOMP_TASK_FLAG_DEPEND; + depend[0] = 0; + depend[1] = (void *) (uintptr_t) depobj_count; + depend[2] = depend[3] = depend[4] = 0; + for (i = 0; i < depobj_count; ++i) + depend[i + 5] = &depobj_list[i]; + } + + GOMP_task (omp_target_memset_async_helper, &s, NULL, sizeof (s), + __alignof__ (s), true, flags, depend, 0, NULL); + return ptr; +} + int omp_target_associate_ptr (const void *host_ptr, const void *device_ptr, size_t size, size_t device_offset, int device_num) @@ -6307,6 +6389,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM_OPT (async_run, async_run); DLSYM_OPT (can_run, can_run); DLSYM (dev2dev); + DLSYM (memset); } if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200) { diff --git a/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c new file mode 100644 index 0000000..b36d2f5 --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c @@ -0,0 +1,62 @@ +// PR libgomp/120444 +// Async version + +#include <omp.h> + +int main() +{ + #pragma omp parallel for + for (int dev = omp_initial_device; dev <= omp_get_num_devices (); dev++) + { + char *ptr = (char *) omp_target_alloc (sizeof(int) * 1024, dev); + + omp_depend_t dep; + #pragma omp depobj(dep) depend(inout: ptr) + + /* Play also around with the alignment - as hsa_amd_memory_fill operates + on multiples of 4 bytes (uint32_t). */ + + for (int start = 0; start < 32; start++) + for (int tail = 0; tail < 32; tail++) + { + unsigned char val = '0' + start + tail; +#if __cplusplus + void *ptr2 = omp_target_memset_async (ptr + start, val, + 1024 - start - tail, dev, 0); +#else + void *ptr2 = omp_target_memset_async (ptr + start, val, + 1024 - start - tail, dev, 0, nullptr); +#endif + if (ptr + start != ptr2) + __builtin_abort (); + + #pragma omp taskwait + + #pragma omp target device(dev) is_device_ptr(ptr) depend(depobj: dep) nowait + for (int i = start; i < 1024 - start - tail; i++) + { + if (ptr[i] != val) + __builtin_abort (); + ptr[i] += 2; + } + + omp_target_memset_async (ptr + start, val + 3, + 1024 - start - tail, dev, 1, &dep); + + #pragma omp target device(dev) is_device_ptr(ptr) depend(depobj: dep) nowait + for (int i = start; i < 1024 - start - tail; i++) + { + if (ptr[i] != val + 3) + __builtin_abort (); + ptr[i] += 1; + } + + omp_target_memset_async (ptr + start, val - 3, + 1024 - start - tail, dev, 1, &dep); + + #pragma omp taskwait depend (depobj: dep) + } + #pragma omp depobj(dep) destroy + omp_target_free (ptr, dev); + } +} diff --git a/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-3.c b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-3.c new file mode 100644 index 0000000..6f25204 --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-3.c @@ -0,0 +1,80 @@ +#include <stddef.h> +#include <stdint.h> +#include <omp.h> + +#define MIN(x,y) ((x) < (y) ? x : y) + +enum { N = 524288 + 8 }; + +static void +init_val (int8_t *ptr, int val, size_t count) +{ + #pragma omp target is_device_ptr(ptr) firstprivate(val, count) + __builtin_memset (ptr, val, count); +} + +static void +check_val (int8_t *ptr, int val, size_t count) +{ + if (count == 0) + return; + #pragma omp target is_device_ptr(ptr) firstprivate(val, count) + for (size_t i = 0; i < count; i++) + if (ptr[i] != val) __builtin_abort (); +} + +static void +test_it (void *ptr, int lshift, size_t count) +{ + if (N < count + lshift) __builtin_abort (); + if (lshift >= 4) __builtin_abort (); + ptr += lshift; + + init_val (ptr, 'z', MIN (count + 32, N - lshift)); + + omp_target_memset (ptr, '1', count, omp_get_default_device()); + + check_val (ptr, '1', count); + check_val (ptr + count, 'z', MIN (32, N - lshift - count)); +} + + +int main() +{ + size_t size; + void *ptr = omp_target_alloc (N + 3, omp_get_default_device()); + ptr += (4 - (uintptr_t) ptr % 4) % 4; + if ((uintptr_t) ptr % 4 != 0) __builtin_abort (); + + test_it (ptr, 0, 1); + test_it (ptr, 3, 1); + test_it (ptr, 0, 4); + test_it (ptr, 3, 4); + test_it (ptr, 0, 5); + test_it (ptr, 3, 5); + test_it (ptr, 0, 6); + test_it (ptr, 3, 6); + + for (int i = 1; i <= 9; i++) + { + switch (i) + { + case 1: size = 16; break; // = 2^4 bytes + case 2: size = 32; break; // = 2^5 bytes + case 3: size = 64; break; // = 2^7 bytes + case 4: size = 128; break; // = 2^7 bytes + case 5: size = 256; break; // = 2^8 bytes + case 6: size = 512; break; // = 2^9 bytes + case 7: size = 65536; break; // = 2^16 bytes + case 8: size = 262144; break; // = 2^18 bytes + case 9: size = 524288; break; // = 2^20 bytes + default: __builtin_abort (); + } + test_it (ptr, 0, size); + test_it (ptr, 3, size); + test_it (ptr, 0, size + 1); + test_it (ptr, 3, size + 1); + test_it (ptr, 3, size + 2); + } + omp_target_free (ptr, omp_get_default_device()); +} diff --git a/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c new file mode 100644 index 0000000..01909f8 --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c @@ -0,0 +1,62 @@ +// PR libgomp/120444 + +#include <omp.h> + +int main() +{ + for (int dev = omp_initial_device; dev < omp_get_num_devices (); dev++) + { + char *ptr = (char *) omp_target_alloc (sizeof(int) * 1024, dev); + + /* Play also around with the alignment - as hsa_amd_memory_fill operates + on multiples of 4 bytes (uint32_t). */ + + for (int start = 0; start < 32; start++) + for (int tail = 0; tail < 32; tail++) + { + unsigned char val = '0' + start + tail; + void *ptr2 = omp_target_memset (ptr + start, val, + 1024 - start - tail, dev); + if (ptr + start != ptr2) + __builtin_abort (); + + #pragma omp target device(dev) is_device_ptr(ptr) + for (int i = start; i < 1024 - start - tail; i++) + if (ptr[i] != val) + __builtin_abort (); + + } + + /* Check 'small' values for correctness. */ + + for (int start = 0; start < 32; start++) + for (int size = 0; size <= 64 + 32; size++) + { + omp_target_memset (ptr, 'a' - 2, 1024, dev); + + unsigned char val = '0' + start + size % 32; + void *ptr2 = omp_target_memset (ptr + start, val, size, dev); + + if (ptr + start != ptr2) + __builtin_abort (); + + if (size == 0) + continue; + + #pragma omp target device(dev) is_device_ptr(ptr) + { + for (int i = 0; i < start; i++) + if (ptr[i] != 'a' - 2) + __builtin_abort (); + for (int i = start; i < start + size; i++) + if (ptr[i] != val) + __builtin_abort (); + for (int i = start + size + 1; i < 1024; i++) + if (ptr[i] != 'a' - 2) + __builtin_abort (); + } + } + + omp_target_free (ptr, dev); + } +} diff --git a/libgomp/testsuite/libgomp.c/declare-variant-4-gfx942.c b/libgomp/testsuite/libgomp.c/declare-variant-4-gfx942.c new file mode 100644 index 0000000..d1df550 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/declare-variant-4-gfx942.c @@ -0,0 +1,8 @@ +/* { dg-do link { target { offload_target_amdgcn } } } */ +/* { dg-additional-options -foffload=amdgcn-amdhsa } */ +/* { dg-additional-options -foffload=-march=gfx942 } */ +/* { dg-additional-options "-foffload=-fdump-tree-optimized" } */ + +#include "declare-variant-4.h" + +/* { dg-final { only_for_offload_target amdgcn-amdhsa scan-offload-tree-dump "= gfx942 \\(\\);" "optimized" } } */ diff --git a/libgomp/testsuite/libgomp.c/declare-variant-4.h b/libgomp/testsuite/libgomp.c/declare-variant-4.h index 53788d2..2257f4c 100644 --- a/libgomp/testsuite/libgomp.c/declare-variant-4.h +++ b/libgomp/testsuite/libgomp.c/declare-variant-4.h @@ -37,6 +37,13 @@ gfx90c (void) __attribute__ ((noipa)) int +gfx942 (void) +{ + return 0x942; +} + +__attribute__ ((noipa)) +int gfx1030 (void) { return 0x1030; @@ -68,6 +75,7 @@ gfx1103 (void) #pragma omp declare variant(gfx908) match(device = {isa("gfx908")}) #pragma omp declare variant(gfx90a) match(device = {isa("gfx90a")}) #pragma omp declare variant(gfx90c) match(device = {isa("gfx90c")}) +#pragma omp declare variant(gfx942) match(device = {isa("gfx942")}) #pragma omp declare variant(gfx1030) match(device = {isa("gfx1030")}) #pragma omp declare variant(gfx1036) match(device = {isa("gfx1036")}) #pragma omp declare variant(gfx1100) match(device = {isa("gfx1100")}) diff --git a/libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c b/libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c index f968bd3..580c6ad 100644 --- a/libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c +++ b/libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c @@ -1,7 +1,7 @@ int main () { - int i, n, n2; + int i, n; int data[] = {1,2}; struct S { int **ptrset; @@ -33,16 +33,17 @@ main () i = 1; n = 0; - n2 = 2; + #pragma omp target enter data map(data) #pragma omp target enter data map(sptr1[:1], sptr1->ptrset[:3], sptr1->ptrset2[:3]) #pragma omp target enter data map(sptr1->ptrset[i][:n], sptr1->ptrset2[i][:n]) - #pragma omp target + #pragma omp target map(sptr1->ptrset[i][:n], sptr1->ptrset2[i][:n]) if (sptr1->ptrset2[1][0] != 1 || sptr1->ptrset2[1][1] != 2) __builtin_abort (); #pragma omp target exit data map(sptr1->ptrset[i][:n], sptr1->ptrset2[i][:n]) #pragma omp target exit data map(sptr1[:1], sptr1->ptrset[:3], sptr1->ptrset2[:3]) + #pragma omp target exit data map(data) __builtin_free (s1.ptrset); __builtin_free (s1.ptrset2); diff --git a/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90 b/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90 new file mode 100644 index 0000000..2641086 --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90 @@ -0,0 +1,67 @@ +! PR libgomp/120444 +! Async version + +use omp_lib +use iso_c_binding +implicit none (type, external) +integer(c_int) :: dev + +!$omp parallel do +do dev = omp_initial_device, omp_get_num_devices () +block + integer(c_int) :: i, val, start, tail + type(c_ptr) :: ptr, ptr2, tmpptr + integer(c_int8_t), pointer, contiguous :: fptr(:) + integer(c_intptr_t) :: intptr + integer(c_size_t), parameter :: count = 1024 + integer(omp_depend_kind) :: dep(1) + + ptr = omp_target_alloc (count, dev) + + !$omp depobj(dep(1)) depend(inout: ptr) + + ! Play also around with the alignment - as hsa_amd_memory_fill operates + ! on multiples of 4 bytes (c_int32_t) + + do start = 0, 31 + do tail = 0, 31 + val = iachar('0') + start + tail + + tmpptr = transfer (transfer (ptr, intptr) + start, tmpptr) + ptr2 = omp_target_memset_async (tmpptr, val, count - start - tail, dev, 0) + + if (.not. c_associated (tmpptr, ptr2)) stop 1 + + !$omp taskwait + + !$omp target device(dev) is_device_ptr(ptr) depend(depobj: dep(1)) nowait + do i = 1 + start, int(count, c_int) - start - tail + call c_f_pointer (ptr, fptr, [count]) + if (fptr(i) /= int (val, c_int8_t)) stop 2 + fptr(i) = fptr(i) + 2_c_int8_t + end do + !$omp end target + + ptr2 = omp_target_memset_async (tmpptr, val + 3, & + count - start - tail, dev, 1, dep) + + !$omp target device(dev) is_device_ptr(ptr) depend(depobj: dep(1)) nowait + do i = 1 + start, int(count, c_int) - start - tail + call c_f_pointer (ptr, fptr, [count]) + if (fptr(i) /= int (val + 3, c_int8_t)) stop 3 + fptr(i) = fptr(i) - 1_c_int8_t + end do + !$omp end target + + ptr2 = omp_target_memset_async (tmpptr, val - 3, & + count - start - tail, dev, 1, dep) + + !$omp taskwait depend (depobj: dep(1)) + end do + end do + + !$omp depobj(dep(1)) destroy + call omp_target_free (ptr, dev); +end block +end do +end diff --git a/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90 b/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90 new file mode 100644 index 0000000..1ee184a --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90 @@ -0,0 +1,39 @@ +! PR libgomp/120444 + +use omp_lib +use iso_c_binding +implicit none (type, external) + +integer(c_int) :: dev, i, val, start, tail +type(c_ptr) :: ptr, ptr2, tmpptr +integer(c_int8_t), pointer, contiguous :: fptr(:) +integer(c_intptr_t) :: intptr +integer(c_size_t), parameter :: count = 1024 + +do dev = omp_initial_device, omp_get_num_devices () + ptr = omp_target_alloc (count, dev) + + ! Play also around with the alignment - as hsa_amd_memory_fill operates + ! on multiples of 4 bytes (c_int32_t) + + do start = 0, 31 + do tail = 0, 31 + val = iachar('0') + start + tail + + tmpptr = transfer (transfer (ptr, intptr) + start, tmpptr) + ptr2 = omp_target_memset (tmpptr, val, count - start - tail, dev) + + if (.not. c_associated (tmpptr, ptr2)) stop 1 + + !$omp target device(dev) is_device_ptr(ptr) + do i = 1 + start, int(count, c_int) - start - tail + call c_f_pointer (ptr, fptr, [count]) + if (fptr(i) /= int (val, c_int8_t)) stop 2 + end do + !$omp end target + end do + end do + + call omp_target_free (ptr, dev); +end do +end |