aboutsummaryrefslogtreecommitdiff
path: root/libgomp
diff options
context:
space:
mode:
Diffstat (limited to 'libgomp')
-rw-r--r--libgomp/ChangeLog.omp55
-rw-r--r--libgomp/libgomp-plugin.h1
-rw-r--r--libgomp/libgomp.h3
-rw-r--r--libgomp/libgomp.map6
-rw-r--r--libgomp/libgomp.texi112
-rw-r--r--libgomp/omp.h.in4
-rw-r--r--libgomp/omp_lib.f90.in23
-rw-r--r--libgomp/omp_lib.h.in25
-rw-r--r--libgomp/plugin/cuda-lib.def1
-rw-r--r--libgomp/plugin/plugin-gcn.c80
-rw-r--r--libgomp/plugin/plugin-nvptx.c9
-rw-r--r--libgomp/target.c83
-rw-r--r--libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c62
-rw-r--r--libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-3.c80
-rw-r--r--libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c62
-rw-r--r--libgomp/testsuite/libgomp.c/declare-variant-4-gfx942.c8
-rw-r--r--libgomp/testsuite/libgomp.c/declare-variant-4.h8
-rw-r--r--libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c7
-rw-r--r--libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f9067
-rw-r--r--libgomp/testsuite/libgomp.fortran/omp_target_memset.f9039
20 files changed, 728 insertions, 7 deletions
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index 5ecab6d..2bf31a9 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,3 +1,58 @@
+2025-06-10 Tobias Burnus <tburnus@baylibre.com>
+
+ Backported from master:
+ 2025-06-10 Tobias Burnus <tburnus@baylibre.com>
+
+ * testsuite/libgomp.c/declare-variant-4.h (gfx942): New variant function.
+ * testsuite/libgomp.c/declare-variant-4-gfx942.c: New test.
+
+2025-06-10 Tobias Burnus <tburnus@baylibre.com>
+
+ Backported from master:
+ 2025-06-02 Tobias Burnus <tburnus@baylibre.com>
+
+ PR libgomp/120444
+ * libgomp-plugin.h (GOMP_OFFLOAD_memset): Declare.
+ * libgomp.h (struct gomp_device_descr): Add memset_func.
+ * libgomp.map (GOMP_6.0.1): Add omp_target_memset{,_async}.
+ * libgomp.texi (Device Memory Routines): Document them.
+ * omp.h.in (omp_target_memset, omp_target_memset_async): Declare.
+ * omp_lib.f90.in (omp_target_memset, omp_target_memset_async):
+ Add interfaces.
+ * omp_lib.h.in (omp_target_memset, omp_target_memset_async): Likewise.
+ * plugin/cuda-lib.def: Add cuMemsetD8.
+ * plugin/plugin-gcn.c (struct hsa_runtime_fn_info): Add
+ hsa_amd_memory_fill_fn.
+ (init_hsa_runtime_functions): DLSYM_OPT_FN load it.
+ (GOMP_OFFLOAD_memset): New.
+ * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memset): New.
+ * target.c (omp_target_memset_int, omp_target_memset,
+ omp_target_memset_async_helper, omp_target_memset_async): New.
+ (gomp_load_plugin_for_device): Add DLSYM (memset).
+ * testsuite/libgomp.c-c++-common/omp_target_memset.c: New test.
+ * testsuite/libgomp.c-c++-common/omp_target_memset-2.c: New test.
+ * testsuite/libgomp.c-c++-common/omp_target_memset-3.c: New test.
+ * testsuite/libgomp.fortran/omp_target_memset.f90: New test.
+ * testsuite/libgomp.fortran/omp_target_memset-2.f90: New test.
+
+2025-06-06 Tobias Burnus <tburnus@baylibre.com>
+
+ Backported from master:
+ 2025-06-06 Tobias Burnus <tburnus@baylibre.com>
+ Sandra Loosemore <sloosemore@baylibre.com>
+
+ * libgomp.texi (omp_get_num_devices, omp_get_intrinsic_device):
+ Document builtin handling.
+
+2025-06-06 Tobias Burnus <tburnus@baylibre.com>
+
+ Backported from master:
+ 2025-06-06 Tobias Burnus <tburnus@baylibre.com>
+
+ PR target/120530
+ * testsuite/libgomp.c/target-map-zero-sized-3.c (main): Add missing
+ map clause; remove unused variable.
+
2025-05-30 Thomas Schwinge <tschwinge@baylibre.com>
* testsuite/libgomp.c++/target-flex-300.C: XFAIL.
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 3c7741b..d0bcc23 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -179,6 +179,7 @@ extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *,
size_t, size_t, size_t, size_t, size_t,
const void *, size_t, size_t, size_t, size_t,
size_t);
+extern bool GOMP_OFFLOAD_memset (int, void *, int, size_t);
extern bool GOMP_OFFLOAD_can_run (void *);
extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 571ac62c..465f7c1 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1441,9 +1441,10 @@ struct gomp_device_descr
__typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
__typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
+ __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
__typeof (GOMP_OFFLOAD_memcpy2d) *memcpy2d_func;
__typeof (GOMP_OFFLOAD_memcpy3d) *memcpy3d_func;
- __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
+ __typeof (GOMP_OFFLOAD_memset) *memset_func;
__typeof (GOMP_OFFLOAD_can_run) *can_run_func;
__typeof (GOMP_OFFLOAD_run) *run_func;
__typeof (GOMP_OFFLOAD_async_run) *async_run_func;
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index bc2de6b..a6c523b 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -454,6 +454,12 @@ GOMP_6.0 {
omp_get_uid_from_device_8_;
} GOMP_5.1.3;
+GOMP_6.0.1 {
+ global:
+ omp_target_memset;
+ omp_target_memset_async;
+} GOMP_6.0;
+
OACC_2.0 {
global:
acc_get_num_devices;
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index e1b70b0..91259cc 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -603,7 +603,7 @@ to address of matching mapped list item per 5.1, Sect. 2.21.7.2 @tab N @tab
@code{omp_get_device_teams_thread_limit}, and
@code{omp_set_device_teams_thread_limit} routines @tab N @tab
@item @code{omp_target_memset} and @code{omp_target_memset_async} routines
- @tab N @tab
+ @tab Y @tab
@item Fortran version of the interop runtime routines @tab Y @tab
@item Routines for obtaining memory spaces/allocators for shared/device memory
@tab N @tab
@@ -1802,6 +1802,11 @@ Returns the number of available non-host devices.
The effect of running this routine in a @code{target} region is unspecified.
+Note that in GCC the function is marked pure, i.e. as returning always the
+same number. When GCC was not configured to support offloading, it is replaced
+by zero; compile with @option{-fno-builtin-omp_get_num_devices} if a run-time
+function is desired.
+
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{int omp_get_num_devices(void);}
@@ -1812,6 +1817,9 @@ The effect of running this routine in a @code{target} region is unspecified.
@item @emph{Interface}: @tab @code{integer function omp_get_num_devices()}
@end multitable
+@item @emph{See also}:
+@ref{omp_get_initial_device}
+
@item @emph{Reference}:
@uref{https://www.openmp.org, OpenMP specification v4.5}, Section 3.2.31.
@end table
@@ -1950,6 +1958,12 @@ the value of @code{omp_initial_device}.
The effect of running this routine in a @code{target} region is unspecified.
+Note that GCC inlines this function unless you compile with
+@option{-fno-builtin-omp_get_initial_device}. If GCC was not configured to
+support offloading, it expands to constant zero; in non-host code it expands
+to @code{omp_initial_device}; and otherwise it is replaced with a call to
+@code{omp_get_num_devices}.
+
@item @emph{C/C++}
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{int omp_get_initial_device(void);}
@@ -1984,8 +1998,8 @@ pointers on devices. They have C linkage and do not throw exceptions.
* omp_target_memcpy_async:: Copy data between devices asynchronously
* omp_target_memcpy_rect:: Copy a subvolume of data between devices
* omp_target_memcpy_rect_async:: Copy a subvolume of data between devices asynchronously
-@c * omp_target_memset:: <fixme>/TR12
-@c * omp_target_memset_async:: <fixme>/TR12
+* omp_target_memset:: Set bytes in device memory
+* omp_target_memset_async:: Set bytes in device memory asynchronously
* omp_target_associate_ptr:: Associate a device pointer with a host pointer
* omp_target_disassociate_ptr:: Remove device--host pointer association
* omp_get_mapped_ptr:: Return device pointer to a host pointer
@@ -2398,6 +2412,98 @@ the initial device.
@end table
+@node omp_target_memset
+@subsection @code{omp_target_memset} -- Set bytes in device memory
+@table @asis
+@item @emph{Description}:
+This routine fills memory on the device identified by device number
+@var{device_num}. Starting from the device address @var{ptr}, the first
+@var{count} bytes are set to the value @var{val}, converted to
+@code{unsigned char}. If @var{count} is zero, the routine has no effect;
+if @var{ptr} is @code{NULL}, the behavior is unspecified. The function
+returns @var{ptr}.
+
+The @var{device_num} must be a conforming device number and @var{ptr} must be
+a valid device pointer for that device. Running this routine in a
+@code{target} region except on the initial device is not supported.
+
+@item @emph{C/C++}
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void *omp_target_memcpy(void *ptr,}
+@item @tab @code{ int val,}
+@item @tab @code{ size_t count,}
+@item @tab @code{ int device_num)}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{type(c_ptr) function omp_target_memset( &}
+@item @tab @code{ ptr, val, count, device_num) bind(C)}
+@item @tab @code{use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int}
+@item @tab @code{type(c_ptr), value :: ptr}
+@item @tab @code{integer(c_size_t), value :: count}
+@item @tab @code{integer(c_int), value :: val, device_num}
+@end multitable
+
+@item @emph{See also}:
+@ref{omp_target_memset_async}
+
+@item @emph{Reference}:
+@uref{https://www.openmp.org, OpenMP specification v6.0}, Section 25.8.1
+@end table
+
+
+
+@node omp_target_memset_async
+@subsection @code{omp_target_memset} -- Set bytes in device memory asynchronously
+@table @asis
+@item @emph{Description}:
+This routine fills memory on the device identified by device number
+@var{device_num}. Starting from the device address @var{ptr}, the first
+@var{count} bytes are set to the value @var{val}, converted to
+@code{unsigned char}. If @var{count} is zero, the routine has no effect;
+if @var{ptr} is @code{NULL}, the behavior is unspecified. Task dependence
+is expressed by passing an array of depend objects to @var{depobj_list}, where
+the number of array elements is passed as @var{depobj_count}; if the count is
+zero, the @var{depobj_list} argument is ignored. In C++ and Fortran, the
+@var{depobj_list} argument can also be omitted in that case. The function
+returns @var{ptr}.
+
+The @var{device_num} must be a conforming device number and @var{ptr} must be
+a valid device pointer for that device. Running this routine in a
+@code{target} region except on the initial device is not supported.
+
+@item @emph{C/C++}
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void *omp_target_memcpy_async(void *ptr,}
+@item @tab @code{ int val,}
+@item @tab @code{ size_t count,}
+@item @tab @code{ int device_num,}
+@item @tab @code{ int depobj_count,}
+@item @tab @code{ omp_depend_t *depobj_list)}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{type(c_ptr) function omp_target_memset_async( &}
+@item @tab @code{ ptr, val, count, device_num, &}
+@item @tab @code{ depobj_count, depobj_list) bind(C)}
+@item @tab @code{use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int}
+@item @tab @code{type(c_ptr), value :: ptr}
+@item @tab @code{integer(c_size_t), value :: count}
+@item @tab @code{integer(c_int), value :: val, device_num, depobj_count}
+@item @tab @code{integer(omp_depend_kind), optional :: depobj_list(*)}
+@end multitable
+
+
+@item @emph{See also}:
+@ref{omp_target_memset}
+
+@item @emph{Reference}:
+@uref{https://www.openmp.org, OpenMP specification v6.0}, Section 25.8.2
+@end table
+
+
@node omp_target_associate_ptr
@subsection @code{omp_target_associate_ptr} -- Associate a device pointer with a host pointer
diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in
index 8d17db1..4f2bc46 100644
--- a/libgomp/omp.h.in
+++ b/libgomp/omp.h.in
@@ -347,6 +347,10 @@ extern int omp_target_memcpy_rect_async (void *, const void *, __SIZE_TYPE__,
const __SIZE_TYPE__ *, int, int, int,
omp_depend_t * __GOMP_DEFAULT_NULL)
__GOMP_NOTHROW;
+extern void *omp_target_memset (void *, int, __SIZE_TYPE__, int) __GOMP_NOTHROW;
+extern void *omp_target_memset_async (void *, int, __SIZE_TYPE__, int,
+ int, omp_depend_t * __GOMP_DEFAULT_NULL)
+ __GOMP_NOTHROW;
extern int omp_target_associate_ptr (const void *, const void *, __SIZE_TYPE__,
__SIZE_TYPE__, int) __GOMP_NOTHROW;
extern int omp_target_disassociate_ptr (const void *, int) __GOMP_NOTHROW;
diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in
index cb6b95f..ce866c0 100644
--- a/libgomp/omp_lib.f90.in
+++ b/libgomp/omp_lib.f90.in
@@ -904,6 +904,29 @@
end interface
interface
+ function omp_target_memset (ptr, val, count, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ type(c_ptr) :: omp_target_memset
+ type(c_ptr), value :: ptr
+ integer(c_size_t), value :: count
+ integer(c_int), value :: val, device_num
+ end function omp_target_memset
+ end interface
+
+ interface
+ function omp_target_memset_async (ptr, val, count, device_num, &
+ depobj_count, depobj_list) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ import :: omp_depend_kind
+ type(c_ptr) :: omp_target_memset_async
+ type(c_ptr), value :: ptr
+ integer(c_size_t), value :: count
+ integer(c_int), value :: val, device_num, depobj_count
+ integer(omp_depend_kind), optional :: depobj_list(*)
+ end function omp_target_memset_async
+ end interface
+
+ interface
function omp_target_associate_ptr (host_ptr, device_ptr, size, &
device_offset, device_num) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
diff --git a/libgomp/omp_lib.h.in b/libgomp/omp_lib.h.in
index f7af5ff..9047095 100644
--- a/libgomp/omp_lib.h.in
+++ b/libgomp/omp_lib.h.in
@@ -505,6 +505,31 @@
end interface
interface
+ function omp_target_memset (ptr, val, count, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ type(c_ptr) omp_target_memset
+ type(c_ptr), value :: ptr
+ integer(c_size_t), value :: count
+ integer(c_int), value :: val, device_num
+ end function omp_target_memset
+ end interface
+
+ interface
+ function omp_target_memset_async (ptr, val, count, device_num, &
+ & depobj_count, depobj_list) &
+ & bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ import :: omp_depend_kind
+ type(c_ptr) :: omp_target_memset_async
+ type(c_ptr), value :: ptr
+ integer(c_size_t), value :: count
+ integer(c_int), value :: val, device_num, depobj_count
+ integer(omp_depend_kind), optional :: depobj_list(*)
+ end function omp_target_memset_async
+ end interface
+
+
+ interface
function omp_target_associate_ptr (host_ptr, device_ptr, size, &
& device_offset, device_num) &
& bind(c)
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index eb562ac..7f4ddcc 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuMemcpyHtoDAsync)
CUDA_ONE_CALL (cuMemcpy2D)
CUDA_ONE_CALL (cuMemcpy2DUnaligned)
CUDA_ONE_CALL (cuMemcpy3D)
+CUDA_ONE_CALL (cuMemsetD8)
CUDA_ONE_CALL (cuMemFree)
CUDA_ONE_CALL (cuMemFreeHost)
CUDA_ONE_CALL (cuMemGetAddressRange)
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index f823b27..ae358f5 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -208,6 +208,8 @@ struct hsa_runtime_fn_info
hsa_status_t (*hsa_code_object_deserialize_fn)
(void *serialized_code_object, size_t serialized_code_object_size,
const char *options, hsa_code_object_t *code_object);
+ hsa_status_t (*hsa_amd_memory_fill_fn)(void *ptr, uint32_t value,
+ size_t count);
hsa_status_t (*hsa_amd_memory_lock_fn)
(void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
void **agent_ptr);
@@ -1456,6 +1458,7 @@ init_hsa_runtime_functions (void)
DLSYM_FN (hsa_signal_load_acquire)
DLSYM_FN (hsa_queue_destroy)
DLSYM_FN (hsa_code_object_deserialize)
+ DLSYM_OPT_FN (hsa_amd_memory_fill)
DLSYM_OPT_FN (hsa_amd_memory_lock)
DLSYM_OPT_FN (hsa_amd_memory_unlock)
DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
@@ -4437,6 +4440,83 @@ init_hip_runtime_functions (void)
return true;
}
+bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+ hsa_status_t status = HSA_STATUS_SUCCESS;
+
+ /* A memset feature is only provided via hsa_amd_memory_fill; while it
+ is fast, it is an HSA extension and it has two requirements: The memory
+ must be aligned to multiples of 4 bytes - and, by construction, only
+ multiples of 4 bytes can be filled (uint32_t value argument).
+
+ This means: Either not using that function or up to three function calls:
+ - copy 1 to 3 bytes to get alignment (hsa_memory_copy), if unaligned
+ - call hsa_amd_memory_fill
+ - copy remaining 1 to 3 bytes (hsa_memory_copy), if after alignment
+ count is not a multiple of 4 bytes.
+
+ Having more than one function call is only profitable if there is
+ enough data to process; see below for the used heuristic values. */
+
+ uint8_t v8 = (uint8_t) val;
+ size_t before = (4 - (uintptr_t) ptr % 4) % 4; /* 0 to 3 bytes. */
+ size_t tail = (count - before) % 4; /* 0 to 3 bytes. */
+
+ /* Heuristic */
+ enum {
+ /* Prefer alloca to malloc up to ... */
+ alloca_size = 256, /* bytes */
+ /* Call hsa_amd_memory_fill also when two copy calls are required. */
+ always_use_fill = 256*1024, /* bytes */
+ /* Call hsa_amd_memory_fill also when on copy call is required. */
+ use_fill_one_copy = (128+64)*1024 /* bytes */
+ };
+
+ /* Do not call hsa_amd_memory_fill when any of the following conditions
+ is true. Note that it is always preferred if available and
+ before == tail == 0. */
+ if (__builtin_expect (!hsa_fns.hsa_amd_memory_fill_fn, 0)
+ || (before && tail && count < always_use_fill)
+ || ((before || tail) && count < use_fill_one_copy))
+ before = count;
+
+ /* Copy call for alignment - or all data, if condition above is true. */
+ if (before)
+ {
+ void *data;
+ if (before > alloca_size)
+ data = malloc (before * sizeof (uint8_t));
+ else
+ data = alloca (before * sizeof (uint8_t));
+ memset (data, val, before);
+ status = hsa_fns.hsa_memory_copy_fn (ptr, data, before);
+ if (before > alloca_size)
+ free (data);
+ if (data == 0 || status != HSA_STATUS_SUCCESS)
+ goto fail;
+ count -= before;
+ }
+
+ if (count == 0)
+ return true;
+
+ ptr += before;
+
+ uint32_t values = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24);
+ status = hsa_fns.hsa_amd_memory_fill_fn (ptr, values, count / 4);
+ if (tail && status == HSA_STATUS_SUCCESS)
+ {
+ ptr += count - tail;
+ status = hsa_fns.hsa_memory_copy_fn (ptr, &values, tail);
+ }
+ if (status == HSA_STATUS_SUCCESS)
+ return true;
+
+fail:
+ GOMP_PLUGIN_error ("memory set failed");
+ return false;
+}
void
GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord,
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 712c8b7..90c5916 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -2339,6 +2339,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
}
bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+ if (!nvptx_attach_host_thread_to_device (ord))
+ return false;
+ CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count);
+ return true;
+}
+
+bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
size_t n, struct goacc_asyncqueue *aq)
{
diff --git a/libgomp/target.c b/libgomp/target.c
index 4ad803a..1932e2a 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -5740,6 +5740,88 @@ omp_target_memcpy_rect_async (void *dst, const void *src, size_t element_size,
return 0;
}
+static void
+omp_target_memset_int (void *ptr, int val, size_t count,
+ struct gomp_device_descr *devicep)
+{
+ if (__builtin_expect (count == 0, 0))
+ return;
+ if (devicep == NULL)
+ {
+ memset (ptr, val, count);
+ return;
+ }
+
+ gomp_mutex_lock (&devicep->lock);
+ int ret = devicep->memset_func (devicep->target_id, ptr, val, count);
+ gomp_mutex_unlock (&devicep->lock);
+ if (!ret)
+ gomp_fatal ("omp_target_memset failed");
+}
+
+void*
+omp_target_memset (void *ptr, int val, size_t count, int device_num)
+{
+ struct gomp_device_descr *devicep;
+ if (device_num == omp_initial_device
+ || device_num == gomp_get_num_devices ()
+ || (devicep = resolve_device (device_num, false)) == NULL
+ || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ devicep = NULL;
+
+ omp_target_memset_int (ptr, val, count, devicep);
+ return ptr;
+}
+
+typedef struct
+{
+ void *ptr;
+ size_t count;
+ struct gomp_device_descr *devicep;
+ int val;
+} omp_target_memset_data;
+
+static void
+omp_target_memset_async_helper (void *args)
+{
+ omp_target_memset_data *a = args;
+ omp_target_memset_int (a->ptr, a->val, a->count, a->devicep);
+}
+
+void*
+omp_target_memset_async (void *ptr, int val, size_t count, int device_num,
+ int depobj_count, omp_depend_t *depobj_list)
+{
+ void *depend[depobj_count + 5];
+ struct gomp_device_descr *devicep;
+ unsigned flags = 0;
+ int i;
+
+ if (device_num == omp_initial_device
+ || device_num == gomp_get_num_devices ()
+ || (devicep = resolve_device (device_num, false)) == NULL
+ || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ devicep = NULL;
+
+ omp_target_memset_data s = {.ptr = ptr, .val = val, .count = count,
+ .devicep = devicep};
+ if (depobj_count > 0 && depobj_list != NULL)
+ {
+ flags |= GOMP_TASK_FLAG_DEPEND;
+ depend[0] = 0;
+ depend[1] = (void *) (uintptr_t) depobj_count;
+ depend[2] = depend[3] = depend[4] = 0;
+ for (i = 0; i < depobj_count; ++i)
+ depend[i + 5] = &depobj_list[i];
+ }
+
+ GOMP_task (omp_target_memset_async_helper, &s, NULL, sizeof (s),
+ __alignof__ (s), true, flags, depend, 0, NULL);
+ return ptr;
+}
+
int
omp_target_associate_ptr (const void *host_ptr, const void *device_ptr,
size_t size, size_t device_offset, int device_num)
@@ -6307,6 +6389,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM_OPT (async_run, async_run);
DLSYM_OPT (can_run, can_run);
DLSYM (dev2dev);
+ DLSYM (memset);
}
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
{
diff --git a/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c
new file mode 100644
index 0000000..b36d2f5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c
@@ -0,0 +1,62 @@
+// PR libgomp/120444
+// Async version
+
+#include <omp.h>
+
+int main()
+{
+ #pragma omp parallel for
+ for (int dev = omp_initial_device; dev <= omp_get_num_devices (); dev++)
+ {
+ char *ptr = (char *) omp_target_alloc (sizeof(int) * 1024, dev);
+
+ omp_depend_t dep;
+ #pragma omp depobj(dep) depend(inout: ptr)
+
+ /* Play also around with the alignment - as hsa_amd_memory_fill operates
+ on multiples of 4 bytes (uint32_t). */
+
+ for (int start = 0; start < 32; start++)
+ for (int tail = 0; tail < 32; tail++)
+ {
+ unsigned char val = '0' + start + tail;
+#if __cplusplus
+ void *ptr2 = omp_target_memset_async (ptr + start, val,
+ 1024 - start - tail, dev, 0);
+#else
+ void *ptr2 = omp_target_memset_async (ptr + start, val,
+ 1024 - start - tail, dev, 0, nullptr);
+#endif
+ if (ptr + start != ptr2)
+ __builtin_abort ();
+
+ #pragma omp taskwait
+
+ #pragma omp target device(dev) is_device_ptr(ptr) depend(depobj: dep) nowait
+ for (int i = start; i < 1024 - start - tail; i++)
+ {
+ if (ptr[i] != val)
+ __builtin_abort ();
+ ptr[i] += 2;
+ }
+
+ omp_target_memset_async (ptr + start, val + 3,
+ 1024 - start - tail, dev, 1, &dep);
+
+ #pragma omp target device(dev) is_device_ptr(ptr) depend(depobj: dep) nowait
+ for (int i = start; i < 1024 - start - tail; i++)
+ {
+ if (ptr[i] != val + 3)
+ __builtin_abort ();
+ ptr[i] += 1;
+ }
+
+ omp_target_memset_async (ptr + start, val - 3,
+ 1024 - start - tail, dev, 1, &dep);
+
+ #pragma omp taskwait depend (depobj: dep)
+ }
+ #pragma omp depobj(dep) destroy
+ omp_target_free (ptr, dev);
+ }
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-3.c b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-3.c
new file mode 100644
index 0000000..6f25204
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-3.c
@@ -0,0 +1,80 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <omp.h>
+
+#define MIN(x,y) ((x) < (y) ? x : y)
+
+enum { N = 524288 + 8 };
+
+static void
+init_val (int8_t *ptr, int val, size_t count)
+{
+ #pragma omp target is_device_ptr(ptr) firstprivate(val, count)
+ __builtin_memset (ptr, val, count);
+}
+
+static void
+check_val (int8_t *ptr, int val, size_t count)
+{
+ if (count == 0)
+ return;
+ #pragma omp target is_device_ptr(ptr) firstprivate(val, count)
+ for (size_t i = 0; i < count; i++)
+ if (ptr[i] != val) __builtin_abort ();
+}
+
+static void
+test_it (void *ptr, int lshift, size_t count)
+{
+ if (N < count + lshift) __builtin_abort ();
+ if (lshift >= 4) __builtin_abort ();
+ ptr += lshift;
+
+ init_val (ptr, 'z', MIN (count + 32, N - lshift));
+
+ omp_target_memset (ptr, '1', count, omp_get_default_device());
+
+ check_val (ptr, '1', count);
+ check_val (ptr + count, 'z', MIN (32, N - lshift - count));
+}
+
+
+int main()
+{
+ size_t size;
+ void *ptr = omp_target_alloc (N + 3, omp_get_default_device());
+ ptr += (4 - (uintptr_t) ptr % 4) % 4;
+ if ((uintptr_t) ptr % 4 != 0) __builtin_abort ();
+
+ test_it (ptr, 0, 1);
+ test_it (ptr, 3, 1);
+ test_it (ptr, 0, 4);
+ test_it (ptr, 3, 4);
+ test_it (ptr, 0, 5);
+ test_it (ptr, 3, 5);
+ test_it (ptr, 0, 6);
+ test_it (ptr, 3, 6);
+
+ for (int i = 1; i <= 9; i++)
+ {
+ switch (i)
+ {
+ case 1: size = 16; break; // = 2^4 bytes
+ case 2: size = 32; break; // = 2^5 bytes
+ case 3: size = 64; break; // = 2^7 bytes
+ case 4: size = 128; break; // = 2^7 bytes
+ case 5: size = 256; break; // = 2^8 bytes
+ case 6: size = 512; break; // = 2^9 bytes
+ case 7: size = 65536; break; // = 2^16 bytes
+ case 8: size = 262144; break; // = 2^18 bytes
+ case 9: size = 524288; break; // = 2^20 bytes
+ default: __builtin_abort ();
+ }
+ test_it (ptr, 0, size);
+ test_it (ptr, 3, size);
+ test_it (ptr, 0, size + 1);
+ test_it (ptr, 3, size + 1);
+ test_it (ptr, 3, size + 2);
+ }
+ omp_target_free (ptr, omp_get_default_device());
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c
new file mode 100644
index 0000000..01909f8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c
@@ -0,0 +1,62 @@
+// PR libgomp/120444
+
+#include <omp.h>
+
+int main()
+{
+ for (int dev = omp_initial_device; dev < omp_get_num_devices (); dev++)
+ {
+ char *ptr = (char *) omp_target_alloc (sizeof(int) * 1024, dev);
+
+ /* Play also around with the alignment - as hsa_amd_memory_fill operates
+ on multiples of 4 bytes (uint32_t). */
+
+ for (int start = 0; start < 32; start++)
+ for (int tail = 0; tail < 32; tail++)
+ {
+ unsigned char val = '0' + start + tail;
+ void *ptr2 = omp_target_memset (ptr + start, val,
+ 1024 - start - tail, dev);
+ if (ptr + start != ptr2)
+ __builtin_abort ();
+
+ #pragma omp target device(dev) is_device_ptr(ptr)
+ for (int i = start; i < 1024 - start - tail; i++)
+ if (ptr[i] != val)
+ __builtin_abort ();
+
+ }
+
+ /* Check 'small' values for correctness. */
+
+ for (int start = 0; start < 32; start++)
+ for (int size = 0; size <= 64 + 32; size++)
+ {
+ omp_target_memset (ptr, 'a' - 2, 1024, dev);
+
+ unsigned char val = '0' + start + size % 32;
+ void *ptr2 = omp_target_memset (ptr + start, val, size, dev);
+
+ if (ptr + start != ptr2)
+ __builtin_abort ();
+
+ if (size == 0)
+ continue;
+
+ #pragma omp target device(dev) is_device_ptr(ptr)
+ {
+ for (int i = 0; i < start; i++)
+ if (ptr[i] != 'a' - 2)
+ __builtin_abort ();
+ for (int i = start; i < start + size; i++)
+ if (ptr[i] != val)
+ __builtin_abort ();
+ for (int i = start + size + 1; i < 1024; i++)
+ if (ptr[i] != 'a' - 2)
+ __builtin_abort ();
+ }
+ }
+
+ omp_target_free (ptr, dev);
+ }
+}
diff --git a/libgomp/testsuite/libgomp.c/declare-variant-4-gfx942.c b/libgomp/testsuite/libgomp.c/declare-variant-4-gfx942.c
new file mode 100644
index 0000000..d1df550
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/declare-variant-4-gfx942.c
@@ -0,0 +1,8 @@
+/* { dg-do link { target { offload_target_amdgcn } } } */
+/* { dg-additional-options -foffload=amdgcn-amdhsa } */
+/* { dg-additional-options -foffload=-march=gfx942 } */
+/* { dg-additional-options "-foffload=-fdump-tree-optimized" } */
+
+#include "declare-variant-4.h"
+
+/* { dg-final { only_for_offload_target amdgcn-amdhsa scan-offload-tree-dump "= gfx942 \\(\\);" "optimized" } } */
diff --git a/libgomp/testsuite/libgomp.c/declare-variant-4.h b/libgomp/testsuite/libgomp.c/declare-variant-4.h
index 53788d2..2257f4c 100644
--- a/libgomp/testsuite/libgomp.c/declare-variant-4.h
+++ b/libgomp/testsuite/libgomp.c/declare-variant-4.h
@@ -37,6 +37,13 @@ gfx90c (void)
__attribute__ ((noipa))
int
+gfx942 (void)
+{
+ return 0x942;
+}
+
+__attribute__ ((noipa))
+int
gfx1030 (void)
{
return 0x1030;
@@ -68,6 +75,7 @@ gfx1103 (void)
#pragma omp declare variant(gfx908) match(device = {isa("gfx908")})
#pragma omp declare variant(gfx90a) match(device = {isa("gfx90a")})
#pragma omp declare variant(gfx90c) match(device = {isa("gfx90c")})
+#pragma omp declare variant(gfx942) match(device = {isa("gfx942")})
#pragma omp declare variant(gfx1030) match(device = {isa("gfx1030")})
#pragma omp declare variant(gfx1036) match(device = {isa("gfx1036")})
#pragma omp declare variant(gfx1100) match(device = {isa("gfx1100")})
diff --git a/libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c b/libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c
index f968bd3..580c6ad 100644
--- a/libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c
+++ b/libgomp/testsuite/libgomp.c/target-map-zero-sized-3.c
@@ -1,7 +1,7 @@
int
main ()
{
- int i, n, n2;
+ int i, n;
int data[] = {1,2};
struct S {
int **ptrset;
@@ -33,16 +33,17 @@ main ()
i = 1;
n = 0;
- n2 = 2;
+ #pragma omp target enter data map(data)
#pragma omp target enter data map(sptr1[:1], sptr1->ptrset[:3], sptr1->ptrset2[:3])
#pragma omp target enter data map(sptr1->ptrset[i][:n], sptr1->ptrset2[i][:n])
- #pragma omp target
+ #pragma omp target map(sptr1->ptrset[i][:n], sptr1->ptrset2[i][:n])
if (sptr1->ptrset2[1][0] != 1 || sptr1->ptrset2[1][1] != 2)
__builtin_abort ();
#pragma omp target exit data map(sptr1->ptrset[i][:n], sptr1->ptrset2[i][:n])
#pragma omp target exit data map(sptr1[:1], sptr1->ptrset[:3], sptr1->ptrset2[:3])
+ #pragma omp target exit data map(data)
__builtin_free (s1.ptrset);
__builtin_free (s1.ptrset2);
diff --git a/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90 b/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90
new file mode 100644
index 0000000..2641086
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90
@@ -0,0 +1,67 @@
+! PR libgomp/120444
+! Async version
+
+use omp_lib
+use iso_c_binding
+implicit none (type, external)
+integer(c_int) :: dev
+
+!$omp parallel do
+do dev = omp_initial_device, omp_get_num_devices ()
+block
+ integer(c_int) :: i, val, start, tail
+ type(c_ptr) :: ptr, ptr2, tmpptr
+ integer(c_int8_t), pointer, contiguous :: fptr(:)
+ integer(c_intptr_t) :: intptr
+ integer(c_size_t), parameter :: count = 1024
+ integer(omp_depend_kind) :: dep(1)
+
+ ptr = omp_target_alloc (count, dev)
+
+ !$omp depobj(dep(1)) depend(inout: ptr)
+
+ ! Play also around with the alignment - as hsa_amd_memory_fill operates
+ ! on multiples of 4 bytes (c_int32_t)
+
+ do start = 0, 31
+ do tail = 0, 31
+ val = iachar('0') + start + tail
+
+ tmpptr = transfer (transfer (ptr, intptr) + start, tmpptr)
+ ptr2 = omp_target_memset_async (tmpptr, val, count - start - tail, dev, 0)
+
+ if (.not. c_associated (tmpptr, ptr2)) stop 1
+
+ !$omp taskwait
+
+ !$omp target device(dev) is_device_ptr(ptr) depend(depobj: dep(1)) nowait
+ do i = 1 + start, int(count, c_int) - start - tail
+ call c_f_pointer (ptr, fptr, [count])
+ if (fptr(i) /= int (val, c_int8_t)) stop 2
+ fptr(i) = fptr(i) + 2_c_int8_t
+ end do
+ !$omp end target
+
+ ptr2 = omp_target_memset_async (tmpptr, val + 3, &
+ count - start - tail, dev, 1, dep)
+
+ !$omp target device(dev) is_device_ptr(ptr) depend(depobj: dep(1)) nowait
+ do i = 1 + start, int(count, c_int) - start - tail
+ call c_f_pointer (ptr, fptr, [count])
+ if (fptr(i) /= int (val + 3, c_int8_t)) stop 3
+ fptr(i) = fptr(i) - 1_c_int8_t
+ end do
+ !$omp end target
+
+ ptr2 = omp_target_memset_async (tmpptr, val - 3, &
+ count - start - tail, dev, 1, dep)
+
+ !$omp taskwait depend (depobj: dep(1))
+ end do
+ end do
+
+ !$omp depobj(dep(1)) destroy
+ call omp_target_free (ptr, dev);
+end block
+end do
+end
diff --git a/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90 b/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90
new file mode 100644
index 0000000..1ee184a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90
@@ -0,0 +1,39 @@
+! PR libgomp/120444
+
+use omp_lib
+use iso_c_binding
+implicit none (type, external)
+
+integer(c_int) :: dev, i, val, start, tail
+type(c_ptr) :: ptr, ptr2, tmpptr
+integer(c_int8_t), pointer, contiguous :: fptr(:)
+integer(c_intptr_t) :: intptr
+integer(c_size_t), parameter :: count = 1024
+
+do dev = omp_initial_device, omp_get_num_devices ()
+ ptr = omp_target_alloc (count, dev)
+
+ ! Play also around with the alignment - as hsa_amd_memory_fill operates
+ ! on multiples of 4 bytes (c_int32_t)
+
+ do start = 0, 31
+ do tail = 0, 31
+ val = iachar('0') + start + tail
+
+ tmpptr = transfer (transfer (ptr, intptr) + start, tmpptr)
+ ptr2 = omp_target_memset (tmpptr, val, count - start - tail, dev)
+
+ if (.not. c_associated (tmpptr, ptr2)) stop 1
+
+ !$omp target device(dev) is_device_ptr(ptr)
+ do i = 1 + start, int(count, c_int) - start - tail
+ call c_f_pointer (ptr, fptr, [count])
+ if (fptr(i) /= int (val, c_int8_t)) stop 2
+ end do
+ !$omp end target
+ end do
+ end do
+
+ call omp_target_free (ptr, dev);
+end do
+end