diff options
Diffstat (limited to 'libgomp')
40 files changed, 2610 insertions, 23 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 096e17b..49a62d4 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,53 @@ +2025-04-24 Tobias Burnus <tburnus@baylibre.com> + + * testsuite/lib/libgomp.exp + (check_effective_target_gomp_hip_header_nvidia): Compile with + "-Wno-deprecated-declarations". + * testsuite/libgomp.c/interop-hip-nvidia-full.c: Likewise. + * testsuite/libgomp.c/interop-hipblas-nvidia-full.c: Likewise. + * testsuite/libgomp.c/interop-hipblas.h: Add workarounds + when using the HIP headers with __HIP_PLATFORM_NVIDIA__. + +2025-04-24 Tobias Burnus <tburnus@baylibre.com> + + * testsuite/lib/libgomp.exp (check_effective_target_openacc_cublas, + check_effective_target_openacc_cudart): Update description as + the check requires more. + (check_effective_target_openacc_libcuda, + check_effective_target_openacc_libcublas, + check_effective_target_openacc_libcudart, + check_effective_target_gomp_hip_header_amd, + check_effective_target_gomp_hip_header_nvidia, + check_effective_target_gomp_hipfort_module, + check_effective_target_gomp_libamdhip64, + check_effective_target_gomp_libhipblas): New. + * testsuite/libgomp.c-c++-common/interop-2.c: New test. + * testsuite/libgomp.c/interop-cublas-full.c: New test. + * testsuite/libgomp.c/interop-cublas-libonly.c: New test. + * testsuite/libgomp.c/interop-cuda-full.c: New test. + * testsuite/libgomp.c/interop-cuda-libonly.c: New test. + * testsuite/libgomp.c/interop-hip-amd-full.c: New test. + * testsuite/libgomp.c/interop-hip-amd-no-hip-header.c: New test. + * testsuite/libgomp.c/interop-hip-nvidia-full.c: New test. + * testsuite/libgomp.c/interop-hip-nvidia-no-headers.c: New test. + * testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c: New test. + * testsuite/libgomp.c/interop-hip.h: New test. + * testsuite/libgomp.c/interop-hipblas-amd-full.c: New test. + * testsuite/libgomp.c/interop-hipblas-amd-no-hip-header.c: New test. + * testsuite/libgomp.c/interop-hipblas-nvidia-full.c: New test. + * testsuite/libgomp.c/interop-hipblas-nvidia-no-headers.c: New test. + * testsuite/libgomp.c/interop-hipblas-nvidia-no-hip-header.c: New test. + * testsuite/libgomp.c/interop-hipblas.h: New test. + * testsuite/libgomp.fortran/interop-hip-amd-full.F90: New test. + * testsuite/libgomp.fortran/interop-hip-amd-no-module.F90: New test. + * testsuite/libgomp.fortran/interop-hip-nvidia-full.F90: New test. + * testsuite/libgomp.fortran/interop-hip-nvidia-no-module.F90: New test. + * testsuite/libgomp.fortran/interop-hip.h: New test. + +2025-04-23 Tobias Burnus <tburnus@baylibre.com> + + * testsuite/libgomp.fortran/target-enter-data-8.f90: New test. + 2025-04-17 Jakub Jelinek <jakub@redhat.com> PR libgomp/119849 diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am index e3202ae..19479ae 100644 --- a/libgomp/Makefile.am +++ b/libgomp/Makefile.am @@ -70,7 +70,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \ target.c splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \ oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \ priority_queue.c affinity-fmt.c teams.c allocator.c oacc-profiling.c \ - oacc-target.c target-indirect.c + oacc-target.c target-indirect.c target-cxa-dso-dtor.c include $(top_srcdir)/plugin/Makefrag.am diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in index 2a0a842..6d22b3d 100644 --- a/libgomp/Makefile.in +++ b/libgomp/Makefile.in @@ -219,7 +219,8 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \ oacc-parallel.lo oacc-host.lo oacc-init.lo oacc-mem.lo \ oacc-async.lo oacc-plugin.lo oacc-cuda.lo priority_queue.lo \ affinity-fmt.lo teams.lo allocator.lo oacc-profiling.lo \ - oacc-target.lo target-indirect.lo $(am__objects_1) + oacc-target.lo target-indirect.lo target-cxa-dso-dtor.lo \ + $(am__objects_1) libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) AM_V_P = $(am__v_P_@AM_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) @@ -552,7 +553,8 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \ oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \ oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ affinity-fmt.c teams.c allocator.c oacc-profiling.c \ - oacc-target.c target-indirect.c $(am__append_3) + oacc-target.c target-indirect.c target-cxa-dso-dtor.c \ + $(am__append_3) # Nvidia PTX OpenACC plugin. @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) @@ -780,6 +782,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/splay-tree.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target-cxa-dso-dtor.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target-indirect.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@ diff --git a/libgomp/config/accel/target-cxa-dso-dtor.c b/libgomp/config/accel/target-cxa-dso-dtor.c new file mode 100644 index 0000000..e40a5f0 --- /dev/null +++ b/libgomp/config/accel/target-cxa-dso-dtor.c @@ -0,0 +1,62 @@ +/* Host/device compatibility: Itanium C++ ABI, DSO Object Destruction API + + Copyright (C) 2025 Free Software Foundation, Inc. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#include "libgomp.h" + +extern void __cxa_finalize (void *); + +/* See <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#dso-dtor>. + + Even if the device is '!DEFAULT_USE_CXA_ATEXIT', we may see '__cxa_atexit' + calls, referencing '__dso_handle', via a 'DEFAULT_USE_CXA_ATEXIT' host. + '__cxa_atexit' is provided by newlib, but use of '__dso_handle' for nvptx + results in 'ld' error: + + unresolved symbol __dso_handle + collect2: error: ld returned 1 exit status + nvptx mkoffload: fatal error: [...]/x86_64-pc-linux-gnu-accel-nvptx-none-gcc returned 1 exit status + + ..., or for GCN get an implicit definition (running with + '--trace-symbol=__dso_handle'): + + ./a.xamdgcn-amdhsa.mkoffload.hsaco-a.xamdgcn-amdhsa.mkoffload.2.o: reference to __dso_handle + <internal>: definition of __dso_handle + + ..., which might be fine, but let's just make it explicit. */ + +/* There are no DSOs; this is the main program. */ +attribute_hidden void * const __dso_handle = 0; + +/* If this file gets linked in, that means that '__dso_handle' has been + referenced (for '__cxa_atexit'), and in that case, we also have to run + '__cxa_finalize'. Make that happen by overriding the weak libgcc dummy + function '__GCC_offload___cxa_finalize'. */ + +void +__GCC_offload___cxa_finalize (void *dso_handle) +{ + __cxa_finalize (dso_handle); +} diff --git a/libgomp/target-cxa-dso-dtor.c b/libgomp/target-cxa-dso-dtor.c new file mode 100644 index 0000000..d1a898d --- /dev/null +++ b/libgomp/target-cxa-dso-dtor.c @@ -0,0 +1,3 @@ +/* Host/device compatibility: Itanium C++ ABI, DSO Object Destruction API */ + +/* Nothing needed here. */ diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp index bc38e3c..54f2f708 100644 --- a/libgomp/testsuite/lib/libgomp.exp +++ b/libgomp/testsuite/lib/libgomp.exp @@ -553,7 +553,23 @@ int main() { } } "-lcuda" ] } -# Return 1 if cublas_v2.h and -lcublas are available. +# Return 1 if -lcuda is available (header not required). + +proc check_effective_target_openacc_libcuda { } { + return [check_no_compiler_messages openacc_libcuda executable { +typedef enum { CUDA_SUCCESS } CUresult; +typedef int CUdevice; +CUresult cuDeviceGet (CUdevice *, int); +int main() { + CUdevice dev; + CUresult r = cuDeviceGet (&dev, 0); + if (r != CUDA_SUCCESS) + return 1; + return 0; +} } "-lcuda" ] +} + +# Return 1 if cublas_v2.h, cuda.h, -lcublas and -lcuda are available. proc check_effective_target_openacc_cublas { } { return [check_no_compiler_messages openacc_cublas executable { @@ -573,7 +589,25 @@ int main() { } } "-lcuda -lcublas" ] } -# Return 1 if cuda_runtime_api.h and -lcudart are available. +# Return 1 if -lcublas is available header not required). + +proc check_effective_target_openacc_libcublas { } { + return [check_no_compiler_messages openacc_libcublas executable { +typedef enum { CUBLAS_STATUS_SUCCESS } cublasStatus_t; +typedef struct cublasContext* cublasHandle_t; +#define cublasCreate cublasCreate_v2 +cublasStatus_t cublasCreate_v2 (cublasHandle_t *); +int main() { + cublasStatus_t s; + cublasHandle_t h; + s = cublasCreate (&h); + if (s != CUBLAS_STATUS_SUCCESS) + return 1; + return 0; +} } "-lcublas" ] +} + +# Return 1 if cuda_runtime_api.h, cuda.h, -lcuda and -lcudart are available. proc check_effective_target_openacc_cudart { } { return [check_no_compiler_messages openacc_cudart executable { @@ -592,3 +626,98 @@ int main() { return 0; } } "-lcuda -lcudart" ] } + +# Return 1 if -lcudart is available (no header required). + +proc check_effective_target_openacc_libcudart { } { + return [check_no_compiler_messages openacc_libcudart executable { +typedef int cudaError_t; +cudaError_t cudaGetDevice(int *); +enum { cudaSuccess }; +int main() { + cudaError_t e; + int devn; + e = cudaGetDevice (&devn); + if (e != cudaSuccess) + return 1; + return 0; +} } "-lcudart" ] +} + +# Return 1 if hip.h is available (no link check; AMD platform). + +proc check_effective_target_gomp_hip_header_amd { } { + return [check_no_compiler_messages gomp_hip_header_amd assembly { +#define __HIP_PLATFORM_AMD__ +#include <hip/hip_runtime_api.h> +int main() { + hipDevice_t dev; + hipError_t r = hipDeviceGet (&dev, 0); + if (r != hipSuccess) + return 1; + return 0; +} }] +} + +# Return 1 if hip.h is available (no link check; Nvidia/CUDA platform). + +proc check_effective_target_gomp_hip_header_nvidia { } { + return [check_no_compiler_messages gomp_hip_header_nvidia assembly { +#define __HIP_PLATFORM_NVIDIA__ +#include <hip/hip_runtime_api.h> +int main() { + hipDevice_t dev; + hipError_t r = hipDeviceGet (&dev, 0); + if (r != hipSuccess) + return 1; + return 0; +} } "-Wno-deprecated-declarations"] +} + +# Return 1 if the Fortran hipfort module is available (no link check) + +proc check_effective_target_gomp_hipfort_module { } { + return [check_no_compiler_messages gomp_hipfort_module assembly { +! Fortran +use hipfort +implicit none +integer(kind(hipSuccess)) :: r +integer(c_int) :: dev +r = hipDeviceGet (dev, 0) +if (r /= hipSuccess) error stop +end +}] +} + +# Return 1 if AMD HIP's -lamdhip64 is available (no header required). + +proc check_effective_target_gomp_libamdhip64 { } { + return [check_no_compiler_messages gomp_libamdhip64 executable { +typedef int hipError_t; +typedef int hipDevice_t; +enum { hipSuccess = 0 }; +hipError_t hipDeviceGet(hipDevice_t*, int); +int main() { + hipDevice_t dev; + hipError_t r = hipDeviceGet (&dev, 0); + if (r != hipSuccess) + return 1; + return 0; +} } "-lamdhip64" ] +} + +# Return 1 if AMD HIP's -lamdhip64 is available (no header required). + +proc check_effective_target_gomp_libhipblas { } { + return [check_no_compiler_messages gomp_libhipblas executable { +typedef enum { HIPBLAS_STATUS_SUCCESS = 0 } hipblasStatus_t; +typedef void* hipblasHandle_t; +hipblasStatus_t hipblasCreate (hipblasHandle_t*); +int main() { + hipblasHandle_t handle; + hipblasStatus_t stat = hipblasCreate (&handle); + if (stat != HIPBLAS_STATUS_SUCCESS) + return 1; + return 0; +} } "-lhipblas" ] +} diff --git a/libgomp/testsuite/libgomp.c++/target-cdtor-1.C b/libgomp/testsuite/libgomp.c++/target-cdtor-1.C new file mode 100644 index 0000000..ecb029e --- /dev/null +++ b/libgomp/testsuite/libgomp.c++/target-cdtor-1.C @@ -0,0 +1,104 @@ +/* Offloaded C++ objects construction and destruction. */ + +/* { dg-additional-options -fdump-tree-optimized-raw-asmname } + { dg-additional-options -foffload-options=-fdump-tree-optimized-raw-asmname } */ + +#include <omp.h> +#include <vector> + +#pragma omp declare target + +struct S +{ + int x; + + S() + : x(-1) + { + __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device()); + } + S(int x) + : x(x) + { + __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device()); + } + ~S() + { + __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device()); + } +}; + +#pragma omp end declare target + +S sH1(7); + +#pragma omp declare target + +S sHD1(5); + +std::vector<S> svHD1(2); + +#pragma omp end declare target + +S sH2(3); + +int main() +{ + int c = 0; + + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + +#pragma omp target map(c) + { + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + } + +#pragma omp target map(c) + { + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + } + + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + + return 0; +} + +/* Verify '__cxa_atexit' calls. + + For the host, there are four expected calls: + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, } 4 optimized { target cxa_atexit } } } + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sH1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZNSt6vectorI1SSaIS0_EED1Ev, \&svHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sH2, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + + For the device, there are two expected calls: + { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, } 2 optimized { target cxa_atexit } } } + { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZNSt6vectorI1SSaIS0_EED1Ev, \&svHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } +*/ + +/* C++ objects are constructed in order of appearance (..., and destructed in reverse order). + { dg-output {S, 7, 1[\r\n]+} } + { dg-output {S, 5, 1[\r\n]+} } + { dg-output {S, -1, 1[\r\n]+} } + { dg-output {S, -1, 1[\r\n]+} } + { dg-output {S, 3, 1[\r\n]+} } + { dg-output {main:1, 1[\r\n]+} } + { dg-output {S, 5, 0[\r\n]+} { target offload_device } } + { dg-output {S, -1, 0[\r\n]+} { target offload_device } } + { dg-output {S, -1, 0[\r\n]+} { target offload_device } } + { dg-output {main:2, 1[\r\n]+} { target { ! offload_device } } } + { dg-output {main:2, 0[\r\n]+} { target offload_device } } + { dg-output {main:3, 1[\r\n]+} { target { ! offload_device } } } + { dg-output {main:3, 0[\r\n]+} { target offload_device } } + { dg-output {main:4, 1[\r\n]+} } + { dg-output {~S, -1, 0[\r\n]+} { target offload_device } } + { dg-output {~S, -1, 0[\r\n]+} { target offload_device } } + { dg-output {~S, 5, 0[\r\n]+} { target offload_device } } + { dg-output {~S, 3, 1[\r\n]+} } + { dg-output {~S, -1, 1[\r\n]+} } + { dg-output {~S, -1, 1[\r\n]+} } + { dg-output {~S, 5, 1[\r\n]+} } + { dg-output {~S, 7, 1[\r\n]+} } +*/ diff --git a/libgomp/testsuite/libgomp.c++/target-cdtor-2.C b/libgomp/testsuite/libgomp.c++/target-cdtor-2.C new file mode 100644 index 0000000..75e48ca --- /dev/null +++ b/libgomp/testsuite/libgomp.c++/target-cdtor-2.C @@ -0,0 +1,140 @@ +/* Offloaded 'constructor' and 'destructor' functions, and C++ objects construction and destruction. */ + +/* { dg-require-effective-target init_priority } */ + +/* { dg-additional-options -fdump-tree-optimized-raw-asmname } + { dg-additional-options -foffload-options=-fdump-tree-optimized-raw-asmname } */ + +#include <omp.h> +#include <vector> + +#pragma omp declare target + +struct S +{ + int x; + + S() + : x(-1) + { + __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device()); + } + S(int x) + : x(x) + { + __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device()); + } + ~S() + { + __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device()); + } +}; + +#pragma omp end declare target + +S sH1 __attribute__((init_priority(1500))) (7); + +#pragma omp declare target + +S sHD1 __attribute__((init_priority(2000))) (5); + +std::vector<S> svHD1 __attribute__((init_priority(1000))) (2); + +static void +__attribute__((constructor(20000))) +initDH1() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +static void +__attribute__((destructor(20000))) +finiDH1() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +#pragma omp end declare target + +S sH2 __attribute__((init_priority(500))) (3); + +static void +__attribute__((constructor(10000))) +initH1() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +static void +__attribute__((destructor(10000))) +finiH1() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +int main() +{ + int c = 0; + + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + +#pragma omp target map(c) + { + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + } + +#pragma omp target map(c) + { + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + } + + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + + return 0; +} + +/* Verify '__cxa_atexit' calls. + + For the host, there are four expected calls: + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, } 4 optimized { target cxa_atexit } } } + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sH1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZNSt6vectorI1SSaIS0_EED1Ev, \&svHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sH2, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + + For the device, there are two expected calls: + { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, } 2 optimized { target cxa_atexit } } } + { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } + { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZNSt6vectorI1SSaIS0_EED1Ev, \&svHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } } +*/ + +/* Defined order in which 'constructor' functions, and 'destructor' functions are run, and C++ objects are constructed (..., and destructed in reverse order). + { dg-output {S, 3, 1[\r\n]+} } + { dg-output {S, -1, 1[\r\n]+} } + { dg-output {S, -1, 1[\r\n]+} } + { dg-output {S, 7, 1[\r\n]+} } + { dg-output {S, 5, 1[\r\n]+} } + { dg-output {initH1, 1[\r\n]+} } + { dg-output {initDH1, 1[\r\n]+} } + { dg-output {main:1, 1[\r\n]+} } + { dg-output {S, -1, 0[\r\n]+} { target offload_device } } + { dg-output {S, -1, 0[\r\n]+} { target offload_device } } + { dg-output {S, 5, 0[\r\n]+} { target offload_device } } + { dg-output {initDH1, 0[\r\n]+} { target offload_device } } + { dg-output {main:2, 1[\r\n]+} { target { ! offload_device } } } + { dg-output {main:2, 0[\r\n]+} { target offload_device } } + { dg-output {main:3, 1[\r\n]+} { target { ! offload_device } } } + { dg-output {main:3, 0[\r\n]+} { target offload_device } } + { dg-output {main:4, 1[\r\n]+} } + { dg-output {~S, 5, 0[\r\n]+} { target offload_device } } + { dg-output {~S, -1, 0[\r\n]+} { target offload_device } } + { dg-output {~S, -1, 0[\r\n]+} { target offload_device } } + { dg-output {finiDH1, 0[\r\n]+} { target offload_device } } + { dg-output {~S, 5, 1[\r\n]+} } + { dg-output {~S, 7, 1[\r\n]+} } + { dg-output {~S, -1, 1[\r\n]+} } + { dg-output {~S, -1, 1[\r\n]+} } + { dg-output {~S, 3, 1[\r\n]+} } + { dg-output {finiDH1, 1[\r\n]+} } + { dg-output {finiH1, 1[\r\n]+} } +*/ diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C index 3cdedf4..d4dccf1 100644 --- a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C +++ b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C @@ -14,8 +14,10 @@ /* In this specific C++ arrangement, distilled from PR118794, GCC synthesizes '__builtin_eh_pointer', '__builtin_unwind_resume' calls as dead code in 'f': - { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } } - { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } } + { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } } + { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } } + ..., just 'targetm.arm_eabi_unwinder' is different: + { dg-final { scan-tree-dump-times {gimple_call <__builtin_cxa_end_cleanup, } 1 optimized { target { arm_eabi || tic6x-*-* } } } } { dg-final { only_for_offload_target amdgcn-amdhsa scan-offload-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } } { dg-final { only_for_offload_target amdgcn-amdhsa scan-offload-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } } Given '-O0' and '-foffload-options=-mno-fake-exceptions', offload compilation fails: diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C index ef996cf..724e34b 100644 --- a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C +++ b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C @@ -14,8 +14,10 @@ /* In this specific C++ arrangement, distilled from PR118794, GCC synthesizes '__builtin_eh_pointer', '__builtin_unwind_resume' calls as dead code in 'f': - { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } } - { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } } + { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } } + { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } } + ..., just 'targetm.arm_eabi_unwinder' is different: + { dg-final { scan-tree-dump-times {gimple_call <__builtin_cxa_end_cleanup, } 1 optimized { target { arm_eabi || tic6x-*-* } } } } { dg-final { only_for_offload_target nvptx-none scan-offload-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } } { dg-final { only_for_offload_target nvptx-none scan-offload-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } } Given '-O0' and '-foffload-options=-mno-fake-exceptions', offload compilation fails: diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1.C b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1.C index 24e3d07..24eb7a5 100644 --- a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1.C +++ b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1.C @@ -51,7 +51,9 @@ int main() /* In this specific C++ arrangement, distilled from PR118794, GCC synthesizes '__builtin_eh_pointer', '__builtin_unwind_resume' calls as dead code in 'f': - { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } } - { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } } + { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } } + { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } } + ..., just 'targetm.arm_eabi_unwinder' is different: + { dg-final { scan-tree-dump-times {gimple_call <__builtin_cxa_end_cleanup, } 1 optimized { target { arm_eabi || tic6x-*-* } } } } { dg-final { scan-offload-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } } { dg-final { scan-offload-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } } */ diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-throw-1.C b/libgomp/testsuite/libgomp.c++/target-exceptions-throw-1.C index 2467061..a4e7a10 100644 --- a/libgomp/testsuite/libgomp.c++/target-exceptions-throw-1.C +++ b/libgomp/testsuite/libgomp.c++/target-exceptions-throw-1.C @@ -4,9 +4,6 @@ { dg-additional-options -fexceptions } */ /* { dg-additional-options -fdump-tree-optimized-raw } { dg-additional-options -foffload-options=-fdump-tree-optimized-raw } */ -/* { dg-bogus {Size expression must be absolute\.} PR119737 { target offload_target_amdgcn xfail *-*-* } 0 } - { dg-ice PR119737 { offload_target_amdgcn } } - { dg-excess-errors {'mkoffload' failures etc.} { xfail offload_target_amdgcn } } */ #include "../libgomp.oacc-c++/exceptions-throw-1.C" diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-throw-2.C b/libgomp/testsuite/libgomp.c++/target-exceptions-throw-2.C index e85e6c3..97f4845 100644 --- a/libgomp/testsuite/libgomp.c++/target-exceptions-throw-2.C +++ b/libgomp/testsuite/libgomp.c++/target-exceptions-throw-2.C @@ -4,9 +4,6 @@ { dg-additional-options -fexceptions } */ /* { dg-additional-options -fdump-tree-optimized-raw } { dg-additional-options -foffload-options=-fdump-tree-optimized-raw } */ -/* { dg-bogus {Size expression must be absolute\.} PR119737 { target offload_target_amdgcn xfail *-*-* } 0 } - { dg-ice PR119737 { offload_target_amdgcn } } - { dg-excess-errors {'mkoffload' failures etc.} { xfail offload_target_amdgcn } } */ #include "../libgomp.oacc-c++/exceptions-throw-2.C" diff --git a/libgomp/testsuite/libgomp.c-c++-common/interop-2.c b/libgomp/testsuite/libgomp.c-c++-common/interop-2.c new file mode 100644 index 0000000..a7526dc --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/interop-2.c @@ -0,0 +1,129 @@ +/* { dg-do run } */ +/* { dg-additional-options "-lm" } */ + +/* Note: At the time this program was written, Nvptx was not asynchronous + enough to trigger the issue (with a 'nowait' added); however, one + AMD GPUs, it triggered. */ + +/* Test whether nowait / dependency is handled correctly. + Motivated by OpenMP_VV's 5.1/interop/test_interop_target.c + + The code actually only creates a streaming object without actually using it, + except for dependency tracking. + + Note that there is a difference between having a steaming (targetsync) object + and not (= omp_interop_none); at least if one assumes that omp_interop_none + does not include 'targetsync' as (effective) interop type - in that case, + 'nowait' has no effect and the 'depend' is active as included task, otherwise + the code continues with the depend being active only for the about to be + destroyed or used thread. + + The OpenMP spec states (here 6.0): + "If the interop-type set includes 'targetsync', an empty mergeable task is + generated. If the 'nowait' clause is not present on the construct then + the task is also an included task. If the interop-type set does not + include 'targetsync', the 'nowait' clause has no effect. Any depend + clauses that are present on the construct apply to the generated task. */ + +#include <omp.h> + +void +test_async (const int dev) +{ + constexpr int N = 2048; + constexpr int ulp = 4; + constexpr double M_PI = 2.0 * __builtin_acos (0.0); + omp_interop_t obj1, obj2; + double A[N] = { }; + int B[N] = { }; + + /* Create interop object. */ + #pragma omp interop device(dev) init(targetsync : obj1, obj2) + + if (dev == omp_initial_device || dev == omp_get_num_devices ()) + { + if (obj1 != omp_interop_none || obj2 != omp_interop_none) + __builtin_abort (); + } + else + { + if (obj1 == omp_interop_none || obj2 == omp_interop_none) + __builtin_abort (); + } + + /* DOUBLE */ + + /* Now in the background update it, slowly enough that the + code afterwards is reached while still running asynchronously. + As OpenMP_VV's Issue #863 shows, the overhead is high enough to + fail even when only doing an atomic integer increment. */ + + #pragma omp target device(dev) map(A) depend(out: A[:N]) nowait + for (int i = 0; i < N; i++) + #pragma omp atomic update + A[i] += __builtin_sin (2*i*M_PI/N); + + /* DESTROY take care of the dependeny such that ... */ + + if (obj1 == omp_interop_none) + { + // Same as below as 'nowait' is ignored. + #pragma omp interop destroy(obj1) depend(in: A[:N]) nowait + } + else + { + #pragma omp interop destroy(obj1) depend(in: A[:N]) + } + + /* ... this code is only executed once the dependency as been fulfilled. */ + + /* Check the value - part I: quick, avoid A[0] == sin(0) = 0. */ + for (int i = 1; i < N; i++) + if (A[i] == 0.0) + __builtin_abort (); + + /* Check the value - part II: throughly */ + for (int i = 0; i < N; i++) + { + double x = A[i]; + double y = __builtin_sin (2*i*M_PI/N); + if (__builtin_fabs (x - y) > ulp * __builtin_fabs (x+y) * __DBL_EPSILON__) + __builtin_abort (); + } + + /* Integer */ + + #pragma omp target device(dev) map(B) depend(out: B[:N]) nowait + for (int i = 0; i < N; i++) + #pragma omp atomic update + B[i] += 42; + + /* Same - but using USE. */ + if (obj2 == omp_interop_none) + { + // Same as below as 'nowait' is ignored. + #pragma omp interop use(obj2) depend(in: B[:N]) nowait + } + else + { + #pragma omp interop use(obj2) depend(in: B[:N]) + } + + for (int i = 0; i < N; i++) + if (B[i] != 42) + __builtin_abort (); + + #pragma omp interop destroy(obj2) +} + +int +main () +{ + int ndev = omp_get_num_devices (); + + for (int dev = 0; dev <= ndev; dev++) + test_async (dev); + test_async (omp_initial_device); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-cdtor-1.c b/libgomp/testsuite/libgomp.c-c++-common/target-cdtor-1.c new file mode 100644 index 0000000..e6099cf --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/target-cdtor-1.c @@ -0,0 +1,89 @@ +/* Offloaded 'constructor' and 'destructor' functions. */ + +#include <omp.h> + +#pragma omp declare target + +static void +__attribute__((constructor)) +initHD1() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +static void +__attribute__((constructor)) +initHD2() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +static void +__attribute__((destructor)) +finiHD1() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +static void +__attribute__((destructor)) +finiHD2() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +#pragma omp end declare target + +static void +__attribute__((constructor)) +initH1() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +static void +__attribute__((destructor)) +finiH2() +{ + __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device()); +} + +int main() +{ + int c = 0; + + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + +#pragma omp target map(c) + { + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + } + +#pragma omp target map(c) + { + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + } + + __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device()); + + return 0; +} + +/* The order is undefined, in which same-priority 'constructor' functions, and 'destructor' functions are run. + { dg-output {init[^,]+, 1[\r\n]+} } + { dg-output {init[^,]+, 1[\r\n]+} } + { dg-output {init[^,]+, 1[\r\n]+} } + { dg-output {main:1, 1[\r\n]+} } + { dg-output {initHD[^,]+, 0[\r\n]+} { target offload_device } } + { dg-output {initHD[^,]+, 0[\r\n]+} { target offload_device } } + { dg-output {main:2, 1[\r\n]+} { target { ! offload_device } } } + { dg-output {main:2, 0[\r\n]+} { target offload_device } } + { dg-output {main:3, 1[\r\n]+} { target { ! offload_device } } } + { dg-output {main:3, 0[\r\n]+} { target offload_device } } + { dg-output {main:4, 1[\r\n]+} } + { dg-output {finiHD[^,]+, 0[\r\n]+} { target offload_device } } + { dg-output {finiHD[^,]+, 0[\r\n]+} { target offload_device } } + { dg-output {fini[^,]+, 1[\r\n]+} } + { dg-output {fini[^,]+, 1[\r\n]+} } + { dg-output {fini[^,]+, 1[\r\n]+} } +*/ diff --git a/libgomp/testsuite/libgomp.c/interop-cublas-full.c b/libgomp/testsuite/libgomp.c/interop-cublas-full.c new file mode 100644 index 0000000..2df5277 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-cublas-full.c @@ -0,0 +1,176 @@ +/* { dg-require-effective-target openacc_cublas } */ +/* { dg-additional-options "-lcublas" } */ + +/* NOTE: This file is also included by libgomp.c-c++-common/interop-cudablas-libonly.c + to test the fallback version. */ + +/* Check whether cuBlas' daxpy works with an interop object. + daxpy(N, DA, DX, INCX, DY, INCY) + calculates (for DX = DY = 1): + DY(1:N) = DY(1:N) + DA * DX(1:N) + and otherwise N array elements, taking every INCX-th or INCY-th one, repectively. + +Based on the interop example in OpenMP's example document */ + +/* Minimal check whether CUDA works - by checking whether the API routines + seem to work. This includes a fallback if the header is not + available. */ + +#include <assert.h> +#include <omp.h> +#include "../libgomp.c-c++-common/on_device_arch.h" + + +#if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER) + #include <cuda.h> + #include <cudaTypedefs.h> + #include <cuda_runtime.h> + #include <cublas_v2.h> + +#else + /* Add a poor man's fallback declaration. */ + #if USE_CUDA_FALLBACK_HEADER + // Don't warn. + #elif !__has_include(<cuda.h>) + #warning "Using GCC's cuda.h as fallback for cuda.h" + #elif !__has_include(<cudaTypedefs.h>) + #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" + #elif !__has_include(<cuda_runtime.h>) + #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" + #else + #warning "Using GCC's cuda.h as fallback for cublas_v2.h" + #endif + #include "../../../include/cuda/cuda.h" + + typedef enum { + CUBLAS_STATUS_SUCCESS = 0, + } cublasStatus_t; + + typedef CUstream cudaStream_t; + typedef struct cublasContext* cublasHandle_t; + + #define cublasCreate cublasCreate_v2 + cublasStatus_t cublasCreate_v2 (cublasHandle_t *); + + #define cublasSetStream cublasSetStream_v2 + cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t); + + #define cublasDaxpy cublasDaxpy_v2 + cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double*, const double*, int, double*, int); +#endif + +static int used_variant = 0; + +void +run_cuBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj) +{ + used_variant = 1; + + omp_interop_rc_t res; + cublasStatus_t stat; + + omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res); + assert (res == omp_irc_success && fr == omp_ifr_cuda); + + cudaStream_t stream = (cudaStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + cublasHandle_t handle; + stat = cublasCreate (&handle); + assert (stat == CUBLAS_STATUS_SUCCESS); + + stat = cublasSetStream (handle, stream); + assert (stat == CUBLAS_STATUS_SUCCESS); + + /* 'da' can be in host or device space, 'dx' and 'dy' must be in device space. */ + stat = cublasDaxpy (handle, n, &da, dx, 1, dy, 1) ; + assert (stat == CUBLAS_STATUS_SUCCESS); +} + + +#pragma omp declare variant(run_cuBlasdaxpy) \ + match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \ + adjust_args(need_device_ptr : dx, dy) \ + append_args(interop(targetsync, prefer_type("cuda"))) + +void +run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy) +{ + used_variant = 2; + + if (incx == 1 && incy == 1) + #pragma omp simd + for (int i = 0; i < n; i++) + dy[i] += da * dx[i]; + else + { + int ix = 0; + int iy = 0; + for (int i = 0; i < n; i++) + { + dy[iy] += da * dx[ix]; + ix += incx; + iy += incy; + } + } +} + + +void +run_test (int dev) +{ + constexpr int N = 1024; + + // A = {1,2,...,N} + // B = {-1, -2, ..., N} + // B' = daxpy (N, 3, A, incx=1, B, incy=1) + // = B + 3*A + // -> B' = {0, 2, 4, 6, ... } + + double A[N], B[N]; + double factor = 3.0; + for (int i = 0; i < N; i++) + { + A[i] = i; + B[i] = -i; + } + + if (dev != omp_initial_device && dev != omp_get_num_devices ()) + { + #pragma omp target enter data device(dev) map(A, B) + } + + used_variant = 99; + #pragma omp dispatch device(dev) + run_daxpy (N, factor, A, 1, B, 1); + + if (dev != omp_initial_device && dev != omp_get_num_devices ()) + { + #pragma omp target exit data device(dev) map(release: A) map(from: B) + + int tmp = omp_get_default_device (); + omp_set_default_device (dev); + if (on_device_arch_nvptx ()) + assert (used_variant == 1); + else + assert (used_variant == 2); + omp_set_default_device (tmp); + } + else + assert (used_variant == 2); + + for (int i = 0; i < N; i++) + assert (B[i] == 2*i); +} + +int +main () +{ + int ndev = omp_get_num_devices (); + + for (int dev = 0; dev <= ndev; dev++) + run_test (dev); + run_test (omp_initial_device); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/interop-cublas-libonly.c b/libgomp/testsuite/libgomp.c/interop-cublas-libonly.c new file mode 100644 index 0000000..89c0652 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-cublas-libonly.c @@ -0,0 +1,7 @@ +/* { dg-require-effective-target openacc_libcublas } */ +/* { dg-additional-options "-lcublas" } */ + +/* Same as interop-cudablas-full.c, but also works if the header is not available. */ + +#define USE_CUDA_FALLBACK_HEADER 1 +#include "interop-cublas-full.c" diff --git a/libgomp/testsuite/libgomp.c/interop-cuda-full.c b/libgomp/testsuite/libgomp.c/interop-cuda-full.c new file mode 100644 index 0000000..38aa6b1 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-cuda-full.c @@ -0,0 +1,159 @@ +/* { dg-require-effective-target openacc_cuda } */ +/* { dg-require-effective-target openacc_cudart } */ +/* { dg-additional-options "-lcuda -lcudart" } */ + +/* NOTE: This file is also included by libgomp.c-c++-common/interop-cuda-libonly.c + to test the fallback version, which defines USE_CUDA_FALLBACK_HEADER. */ + +/* Minimal check whether CUDA works - by checking whether the API routines + seem to work. This includes a fallback if the header is not + available. */ + +#include <assert.h> +#include <omp.h> + +#if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && !defined(USE_CUDA_FALLBACK_HEADER) + #include <cuda.h> + #include <cudaTypedefs.h> + #include <cuda_runtime.h> + +#else + /* Add a poor man's fallback declaration. */ + #if USE_CUDA_FALLBACK_HEADER + // Don't warn. + #elif !__has_include(<cuda.h>) + #warning "Using GCC's cuda.h as fallback for cuda.h" + #elif !__has_include(<cudaTypedefs.h>) + #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" + #else + #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" + #endif + #include "../../../include/cuda/cuda.h" + + typedef int cudaError_t; + typedef CUstream cudaStream_t; + enum { + cudaSuccess = 0 + }; + + enum cudaDeviceAttr { + cudaDevAttrClockRate = 13, + cudaDevAttrMaxGridDimX = 5 + }; + + cudaError_t cudaDeviceGetAttribute (int *, enum cudaDeviceAttr, int); + cudaError_t cudaStreamQuery(cudaStream_t); + CUresult cuCtxGetApiVersion(CUcontext, unsigned int *); + CUresult cuStreamGetCtx (CUstream, CUcontext *); +#endif + +int +main () +{ + int ivar; + unsigned uvar; + omp_interop_rc_t res; + omp_interop_t obj_cuda = omp_interop_none; + omp_interop_t obj_cuda_driver = omp_interop_none; + cudaError_t cuda_err; + CUresult cu_err; + + #pragma omp interop init(target, targetsync, prefer_type("cuda") : obj_cuda) \ + init(target, targetsync, prefer_type("cuda_driver") : obj_cuda_driver) \ + + omp_interop_fr_t fr = (omp_interop_fr_t) omp_get_interop_int (obj_cuda, omp_ipr_fr_id, &res); + assert (res == omp_irc_success); + assert (fr == omp_ifr_cuda); + + fr = (omp_interop_fr_t) omp_get_interop_int (obj_cuda_driver, omp_ipr_fr_id, &res); + assert (res == omp_irc_success); + assert (fr == omp_ifr_cuda_driver); + + ivar = (int) omp_get_interop_int (obj_cuda, omp_ipr_vendor, &res); + assert (res == omp_irc_success); + assert (ivar == 11); + + ivar = (int) omp_get_interop_int (obj_cuda_driver, omp_ipr_vendor, &res); + assert (res == omp_irc_success); + assert (ivar == 11); + + + /* Check whether the omp_ipr_device -> cudaDevice_t yields a valid device. */ + + CUdevice cu_dev = (int) omp_get_interop_int (obj_cuda_driver, omp_ipr_device, &res); + assert (res == omp_irc_success); + + /* Assume a clock size is available and > 1 GHz; value is in kHz. */ + cu_err = cuDeviceGetAttribute (&ivar, cudaDevAttrClockRate, cu_dev); + assert (cu_err == CUDA_SUCCESS); + assert (ivar > 1000000 /* kHz */); + + /* Assume that the MaxGridDimX is available and > 1024. */ + cu_err = cuDeviceGetAttribute (&ivar, cudaDevAttrMaxGridDimX, cu_dev); + assert (cu_err == CUDA_SUCCESS); + assert (ivar > 1024); + + int cuda_dev = (int) omp_get_interop_int (obj_cuda, omp_ipr_device, &res); + assert (res == omp_irc_success); + assert (cuda_dev == (CUdevice) cu_dev); // Assume they are the same ... + + /* Assume a clock size is available and > 1 GHz; value is in kHz. */ + cuda_err = cudaDeviceGetAttribute (&ivar, cudaDevAttrClockRate, cuda_dev); + assert (cuda_err == cudaSuccess); + assert (ivar > 1000000 /* kHz */); + + /* Assume that the MaxGridDimX is available and > 1024. */ + cuda_err = cudaDeviceGetAttribute (&ivar, cudaDevAttrMaxGridDimX, cuda_dev); + assert (cuda_err == cudaSuccess); + assert (ivar > 1024); + + + + + /* Check whether the omp_ipr_device_context -> CUcontext yields a context. */ + + CUcontext cu_ctx = (CUcontext) omp_get_interop_ptr (obj_cuda_driver, omp_ipr_device_context, &res); + assert (res == omp_irc_success); + + /* Assume API Version > 0 for Nvidia, cudaErrorNotSupported for AMD. */ + uvar = 99; + cu_err = cuCtxGetApiVersion (cu_ctx, &uvar); + assert (cu_err == CUDA_SUCCESS); + assert (uvar > 0); + + + /* Check whether the omp_ipr_targetsync -> cudaStream_t yields a stream. */ + + cudaStream_t cuda_sm = (cudaStream_t) omp_get_interop_ptr (obj_cuda, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + CUstream cu_sm = (cudaStream_t) omp_get_interop_ptr (obj_cuda_driver, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + assert ((void*) cu_sm != (void*) cuda_sm); // Type compatible but should have created two streams + + int dev_stream = 99; +#if CUDA_VERSION >= 12080 + cuda_err = cudaStreamGetDevice (cuda_sm, &dev_stream); + assert (cuda_err == cudaSuccess); +#else + cu_err = cuStreamGetCtx (cu_sm, &cu_ctx) != CUDA_SUCCESS; + if (cu_err == CUDA_SUCCESS) + cuda_err = cuCtxPushCurrent (cu_ctx) != CUDA_SUCCESS; + if (cu_err == CUDA_SUCCESS) + cuda_err = cuCtxGetDevice (&dev_stream) != CUDA_SUCCESS; + if (cu_err == CUDA_SUCCESS) + cu_err = cuCtxPopCurrent (&cu_ctx) != CUDA_SUCCESS; + assert (cu_err == CUDA_SUCCESS); +#endif + assert (dev_stream == cuda_dev); + + /* All jobs should have been completed (as there were none none) */ + cuda_err = cudaStreamQuery (cuda_sm); + assert (cuda_err == cudaSuccess); + + cu_err = cuStreamQuery (cu_sm); + assert (cu_err == CUDA_SUCCESS); + + #pragma omp interop destroy(obj_cuda, obj_cuda_driver) +} diff --git a/libgomp/testsuite/libgomp.c/interop-cuda-libonly.c b/libgomp/testsuite/libgomp.c/interop-cuda-libonly.c new file mode 100644 index 0000000..17cbb15 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-cuda-libonly.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target openacc_libcudart } */ +/* { dg-require-effective-target openacc_libcuda } */ +/* { dg-additional-options "-lcuda -lcudart" } */ + +/* Same as interop-cuda-full.c, but also works if the header is not available. */ + +#define USE_CUDA_FALLBACK_HEADER 1 +#include "interop-cuda-full.c" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-amd-full.c b/libgomp/testsuite/libgomp.c/interop-hip-amd-full.c new file mode 100644 index 0000000..d7725fc --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-amd-full.c @@ -0,0 +1,7 @@ +/* { dg-require-effective-target gomp_hip_header_amd } */ +/* { dg-require-effective-target gomp_libamdhip64 } */ +/* { dg-additional-options "-lamdhip64" } */ + +#define __HIP_PLATFORM_AMD__ 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-amd-no-hip-header.c b/libgomp/testsuite/libgomp.c/interop-hip-amd-no-hip-header.c new file mode 100644 index 0000000..2584537 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-amd-no-hip-header.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target gomp_libamdhip64 } */ +/* { dg-additional-options "-lamdhip64" } */ + +#define __HIP_PLATFORM_AMD__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-nvidia-full.c b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-full.c new file mode 100644 index 0000000..79af47d --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-full.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target openacc_cudart } */ +/* { dg-require-effective-target openacc_cuda } */ +/* { dg-require-effective-target gomp_hip_header_nvidia } */ +/* { dg-additional-options "-lcuda -lcudart -Wno-deprecated-declarations" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-headers.c b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-headers.c new file mode 100644 index 0000000..4586398 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-headers.c @@ -0,0 +1,10 @@ +/* { dg-require-effective-target openacc_libcudart } */ +/* { dg-require-effective-target openacc_libcuda } */ +/* { dg-additional-options "-lcuda -lcudart" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 +#define USE_CUDA_FALLBACK_HEADER 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c new file mode 100644 index 0000000..4186984 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c @@ -0,0 +1,9 @@ +/* { dg-require-effective-target openacc_cudart } */ +/* { dg-require-effective-target openacc_cuda } */ +/* { dg-additional-options "-lcuda -lcudart" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hip.h b/libgomp/testsuite/libgomp.c/interop-hip.h new file mode 100644 index 0000000..20a1ccb --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hip.h @@ -0,0 +1,234 @@ +/* Minimal check whether HIP works - by checking whether the API routines + seem to work. This includes various fallbacks if the header is not + available. */ + +#include <assert.h> +#include <omp.h> + +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) + #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" +#endif + +#if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) + #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" +#endif + +#if __has_include(<hip/hip_runtime_api.h>) && !defined(USE_HIP_FALLBACK_HEADER) + #include <hip/hip_runtime_api.h> + +#elif defined(__HIP_PLATFORM_AMD__) + /* Add a poor man's fallback declaration. */ + #if !defined(USE_HIP_FALLBACK_HEADER) + #warning "Using fallback declaration for <hip/hip_runtime_api.h> for __HIP_PLATFORM_AMD__" + #endif + + typedef struct ihipStream_t* hipStream_t; + typedef struct ihipCtx_t* hipCtx_t; + typedef int hipError_t; + typedef int hipDevice_t; + enum { + hipSuccess = 0, + hipErrorNotSupported = 801 + }; + + typedef enum hipDeviceAttribute_t { + hipDeviceAttributeClockRate = 5, + hipDeviceAttributeMaxGridDimX = 29 + } hipDeviceAttribute_t; + + hipError_t hipDeviceGetAttribute (int *, hipDeviceAttribute_t, hipDevice_t); + hipError_t hipCtxGetApiVersion (hipCtx_t, int *); + hipError_t hipStreamGetDevice (hipStream_t, hipDevice_t *); + hipError_t hipStreamQuery (hipStream_t); + +#elif defined(__HIP_PLATFORM_NVIDIA__) + /* Add a poor man's fallback declaration. */ + #if !defined(USE_HIP_FALLBACK_HEADER) + #warning "Using fallback declaration for <hip/hip_runtime_api.h> for __HIP_PLATFORM_NVIDIA__" + #endif + + #if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && !defined(USE_CUDA_FALLBACK_HEADER) + #include <cuda.h> + #include <cudaTypedefs.h> + #include <cuda_runtime.h> + #else + #if defined(USE_CUDA_FALLBACK_HEADER) + // no warning + #elif !__has_include(<cuda.h>) + #warning "Using GCC's cuda.h as fallback for cuda.h" + #elif !__has_include(<cudaTypedefs.h>) + #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" + #else + #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" + #endif + + #include "../../../include/cuda/cuda.h" + + typedef int cudaError_t; + enum { + cudaSuccess = 0 + }; + + enum cudaDeviceAttr { + cudaDevAttrClockRate = 13, + cudaDevAttrMaxGridDimX = 5 + }; + + cudaError_t cudaDeviceGetAttribute (int *, enum cudaDeviceAttr, int); + CUresult cuCtxGetApiVersion(CUcontext, unsigned int *); + CUresult cuStreamGetCtx (CUstream, CUcontext *); + #endif + + typedef CUstream hipStream_t; + typedef CUcontext hipCtx_t; + typedef CUdevice hipDevice_t; + + typedef int hipError_t; + typedef int hipDevice_t; + enum { + hipSuccess = 0, + hipErrorNotSupported = 801 + }; + + + typedef enum hipDeviceAttribute_t { + hipDeviceAttributeClockRate = 5, + hipDeviceAttributeMaxGridDimX = 29 + } hipDeviceAttribute_t; + + inline static hipError_t + hipDeviceGetAttribute (int *ival, hipDeviceAttribute_t attr, hipDevice_t dev) + { + enum cudaDeviceAttr cuattr; + switch (attr) + { + case hipDeviceAttributeClockRate: + cuattr = cudaDevAttrClockRate; + break; + case hipDeviceAttributeMaxGridDimX: + cuattr = cudaDevAttrMaxGridDimX; + break; + default: + assert (0); + } + return cudaDeviceGetAttribute (ival, cuattr, dev) != cudaSuccess; + } + + inline static hipError_t + hipCtxGetApiVersion (hipCtx_t ctx, int *ver) + { + unsigned uver; + hipError_t err; + err = cuCtxGetApiVersion (ctx, &uver) != CUDA_SUCCESS; + *ver = (int) uver; + return err; + } + + inline static hipError_t + hipStreamGetDevice (hipStream_t stream, hipDevice_t *dev) + { +#if CUDA_VERSION >= 12080 + return cudaStreamGetDevice (stream, dev); +#else + hipError_t err; + CUcontext ctx; + err = cuStreamGetCtx (stream, &ctx) != CUDA_SUCCESS; + if (err == hipSuccess) + err = cuCtxPushCurrent (ctx) != CUDA_SUCCESS; + if (err == hipSuccess) + err = cuCtxGetDevice (dev) != CUDA_SUCCESS; + if (err == hipSuccess) + err = cuCtxPopCurrent (&ctx) != CUDA_SUCCESS; + return err; +#endif + } + + inline static hipError_t + hipStreamQuery (hipStream_t stream) + { + return cuStreamQuery (stream) != CUDA_SUCCESS; + } + +#else + #error "should be unreachable" +#endif + +int +main () +{ + int ivar; + omp_interop_rc_t res; + omp_interop_t obj = omp_interop_none; + hipError_t hip_err; + + #pragma omp interop init(target, targetsync, prefer_type("hip") : obj) + + omp_interop_fr_t fr = (omp_interop_fr_t) omp_get_interop_int (obj, omp_ipr_fr_id, &res); + assert (res == omp_irc_success); + assert (fr == omp_ifr_hip); + + ivar = (int) omp_get_interop_int (obj, omp_ipr_vendor, &res); + assert (res == omp_irc_success); + int vendor_is_amd = ivar == 1; + #if defined(__HIP_PLATFORM_AMD__) + assert (ivar == 1); + #elif defined(__HIP_PLATFORM_NVIDIA__) + assert (ivar == 11); + #else + assert (0); + #endif + + + /* Check whether the omp_ipr_device -> hipDevice_t yields a valid device. */ + + hipDevice_t hip_dev = (int) omp_get_interop_int (obj, omp_ipr_device, &res); + assert (res == omp_irc_success); + + /* Assume a clock size is available and > 1 GHz; value is in kHz. */ + hip_err = hipDeviceGetAttribute (&ivar, hipDeviceAttributeClockRate, hip_dev); + assert (hip_err == hipSuccess); + assert (ivar > 1000000 /* kHz */); + + /* Assume that the MaxGridDimX is available and > 1024. */ + hip_err = hipDeviceGetAttribute (&ivar, hipDeviceAttributeMaxGridDimX, hip_dev); + assert (hip_err == hipSuccess); + assert (ivar > 1024); + + + /* Check whether the omp_ipr_device_context -> hipCtx_t yields a context. */ + + hipCtx_t hip_ctx = (hipCtx_t) omp_get_interop_ptr (obj, omp_ipr_device_context, &res); + assert (res == omp_irc_success); + + /* Assume API Version > 0 for Nvidia, hipErrorNotSupported for AMD. */ + ivar = -99; + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" + hip_err = hipCtxGetApiVersion (hip_ctx, &ivar); + #pragma GCC diagnostic pop + + if (vendor_is_amd) + assert (hip_err == hipErrorNotSupported && ivar == -99); + else + { + assert (hip_err == hipSuccess); + assert (ivar > 0); + } + + + /* Check whether the omp_ipr_targetsync -> hipStream_t yields a stream. */ + + hipStream_t hip_sm = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + hipDevice_t dev_stream = 99; + hip_err = hipStreamGetDevice (hip_sm, &dev_stream); + assert (hip_err == hipSuccess); + assert (dev_stream == hip_dev); + + /* All jobs should have been completed (as there were none none) */ + hip_err = hipStreamQuery (hip_sm); + assert (hip_err == hipSuccess); + + #pragma omp interop destroy(obj) +} diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-amd-full.c b/libgomp/testsuite/libgomp.c/interop-hipblas-amd-full.c new file mode 100644 index 0000000..53c05bd --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-amd-full.c @@ -0,0 +1,7 @@ +/* { dg-require-effective-target gomp_hip_header_amd } */ +/* { dg-require-effective-target gomp_libhipblas } */ +/* { dg-additional-options "-lhipblas" } */ + +#define __HIP_PLATFORM_AMD__ 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-amd-no-hip-header.c b/libgomp/testsuite/libgomp.c/interop-hipblas-amd-no-hip-header.c new file mode 100644 index 0000000..0ea3133 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-amd-no-hip-header.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target gomp_libhipblas } */ +/* { dg-additional-options "-lhipblas" } */ + +#define __HIP_PLATFORM_AMD__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-full.c b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-full.c new file mode 100644 index 0000000..ed428c6 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-full.c @@ -0,0 +1,7 @@ +/* { dg-require-effective-target openacc_cublas } */ +/* { dg-require-effective-target gomp_hip_header_nvidia } */ +/* { dg-additional-options "-lcublas -Wno-deprecated-declarations" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-headers.c b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-headers.c new file mode 100644 index 0000000..1a31b30 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-headers.c @@ -0,0 +1,9 @@ +/* { dg-require-effective-target openacc_libcublas } */ +/* { dg-additional-options "-lcublas" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 +#define USE_CUDA_FALLBACK_HEADER 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-hip-header.c b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-hip-header.c new file mode 100644 index 0000000..f85c13b --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas-nvidia-no-hip-header.c @@ -0,0 +1,8 @@ +/* { dg-require-effective-target openacc_cublas } */ +/* { dg-additional-options "-lcublas" } */ + +#define __HIP_PLATFORM_NVIDIA__ 1 + +#define USE_HIP_FALLBACK_HEADER 1 + +#include "interop-hipblas.h" diff --git a/libgomp/testsuite/libgomp.c/interop-hipblas.h b/libgomp/testsuite/libgomp.c/interop-hipblas.h new file mode 100644 index 0000000..d7cb174 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hipblas.h @@ -0,0 +1,240 @@ +/* Check whether hipBlas' daxpy works with an interop object. + daxpy(N, DA, DX, INCX, DY, INCY) + calculates (for DX = DY = 1): + DY(1:N) = DY(1:N) + DA * DX(1:N) + and otherwise N array elements, taking every INCX-th or INCY-th one, repectively. + +Based on the interop example in OpenMP's example document */ + +/* Minimal check whether HIP works - by checking whether the API routines + seem to work. This includes a fallback if the header is not + available. */ + +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) + #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" +#endif + +#if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) + #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" +#endif + + +#include <assert.h> +#include <omp.h> +#include "../libgomp.c-c++-common/on_device_arch.h" + + +#if __has_include(<hipblas/hipblas.h>) && (__has_include(<library_types.h>) || !defined(__HIP_PLATFORM_NVIDIA__)) && !defined(USE_HIP_FALLBACK_HEADER) + #ifdef __HIP_PLATFORM_NVIDIA__ + /* There seems to be an issue with hip/library_types.h including + CUDA's "library_types.h". Include CUDA's one explicitly here. + Could possibly worked around by using -isystem vs. -I. */ + #include <library_types.h> + + /* For some reasons, the following symbols do not seem to get + mapped from HIP to CUDA, causing link errors. */ + #define hipblasSetStream cublasSetStream_v2 + #define hipblasDaxpy cublasDaxpy_v2 + #define hipblasCreate cublasCreate_v2 + #endif + #include <hipblas/hipblas.h> + +#elif defined(__HIP_PLATFORM_AMD__) + /* Add a poor man's fallback declaration. */ + #if !defined(USE_HIP_FALLBACK_HEADER) + #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_AMD__" + #endif + + typedef enum + { + HIPBLAS_STATUS_SUCCESS = 0 + + } hipblasStatus_t; + + typedef struct ihipStream_t* hipStream_t; + typedef void* hipblasHandle_t; + + hipblasStatus_t hipblasCreate (hipblasHandle_t*); + hipblasStatus_t hipblasSetStream (hipblasHandle_t, hipStream_t); + hipblasStatus_t hipblasDaxpy (hipblasHandle_t, int, const double*, const double*, int, double*, int); + +#else + /* Add a poor man's fallback declaration. */ + #if !defined(USE_HIP_FALLBACK_HEADER) + #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_NVIDA__" + #endif + + #if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER) + #include <cuda.h> + #include <cudaTypedefs.h> + #include <cuda_runtime.h> + #include <cublas_v2.h> + + #else + /* Add a poor man's fallback declaration. */ + #if defined(USE_CUDA_FALLBACK_HEADER) + // no warning + #elif !__has_include(<cuda.h>) + #warning "Using GCC's cuda.h as fallback for cuda.h" + #elif !__has_include(<cudaTypedefs.h>) + #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" + #elif !__has_include(<cuda_runtime.h>) + #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" + #else + #warning "Using GCC's cuda.h as fallback for cublas_v2.h" + #endif + #include "../../../include/cuda/cuda.h" + + typedef enum { + CUBLAS_STATUS_SUCCESS = 0, + } cublasStatus_t; + + typedef CUstream cudaStream_t; + typedef struct cublasContext* cublasHandle_t; + + #define cublasCreate cublasCreate_v2 + cublasStatus_t cublasCreate_v2 (cublasHandle_t *); + + #define cublasSetStream cublasSetStream_v2 + cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t); + + #define cublasDaxpy cublasDaxpy_v2 + cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double*, const double*, int, double*, int); + #endif + + #define HIPBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS + #define hipblasStatus_t cublasStatus_t + #define hipStream_t cudaStream_t + #define hipblasHandle_t cublasHandle_t + #define hipblasCreate cublasCreate + #define hipblasSetStream cublasSetStream + #define hipblasDaxpy cublasDaxpy +#endif + +static int used_variant = 0; + +void +run_hipBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj) +{ + used_variant = 1; + + omp_interop_rc_t res; + hipblasStatus_t stat; + + omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res); + assert (res == omp_irc_success && fr == omp_ifr_hip); + + hipStream_t stream = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res); + assert (res == omp_irc_success); + + hipblasHandle_t handle; + stat = hipblasCreate (&handle); + assert (stat == HIPBLAS_STATUS_SUCCESS); + + stat = hipblasSetStream (handle, stream); + assert (stat == HIPBLAS_STATUS_SUCCESS); + + /* 'da' can be in host or device space, 'dx' and 'dy' must be in device space. */ + stat = hipblasDaxpy (handle, n, &da, dx, 1, dy, 1) ; + assert (stat == HIPBLAS_STATUS_SUCCESS); +} + +#if defined(__HIP_PLATFORM_AMD__) +#pragma omp declare variant(run_hipBlasdaxpy) \ + match(construct={dispatch}, target_device={kind(nohost), arch("amdgcn")}) \ + adjust_args(need_device_ptr : dx, dy) \ + append_args(interop(targetsync, prefer_type("hip"))) +#elif defined(__HIP_PLATFORM_NVIDIA__) +#pragma omp declare variant(run_hipBlasdaxpy) \ + match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \ + adjust_args(need_device_ptr : dx, dy) \ + append_args(interop(targetsync, prefer_type("hip"))) +#else + #error "wrong platform" +#endif + +void +run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy) +{ + used_variant = 2; + + if (incx == 1 && incy == 1) + #pragma omp simd + for (int i = 0; i < n; i++) + dy[i] += da * dx[i]; + else + { + int ix = 0; + int iy = 0; + for (int i = 0; i < n; i++) + { + dy[iy] += da * dx[ix]; + ix += incx; + iy += incy; + } + } +} + + +void +run_test (int dev) +{ + constexpr int N = 1024; + + // A = {1,2,...,N} + // B = {-1, -2, ..., N} + // B' = daxpy (N, 3, A, incx=1, B, incy=1) + // = B + 3*A + // -> B' = {0, 2, 4, 6, ... } + + double A[N], B[N]; + double factor = 3.0; + for (int i = 0; i < N; i++) + { + A[i] = i; + B[i] = -i; + } + + if (dev != omp_initial_device && dev != omp_get_num_devices ()) + { + #pragma omp target enter data device(dev) map(A, B) + } + + used_variant = 99; + #pragma omp dispatch device(dev) + run_daxpy (N, factor, A, 1, B, 1); + + if (dev != omp_initial_device && dev != omp_get_num_devices ()) + { + #pragma omp target exit data device(dev) map(release: A) map(from: B) + + int tmp = omp_get_default_device (); + omp_set_default_device (dev); +#if defined(__HIP_PLATFORM_AMD__) + if (on_device_arch_gcn ()) +#else + if (on_device_arch_nvptx ()) +#endif + assert (used_variant == 1); + else + assert (used_variant == 2); + omp_set_default_device (tmp); + } + else + assert (used_variant == 2); + + for (int i = 0; i < N; i++) + assert (B[i] == 2*i); +} + +int +main () +{ + int ndev = omp_get_num_devices (); + + for (int dev = 0; dev <= ndev; dev++) + run_test (dev); + run_test (omp_initial_device); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/interop-hsa.c b/libgomp/testsuite/libgomp.c/interop-hsa.c new file mode 100644 index 0000000..cf8bc90 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/interop-hsa.c @@ -0,0 +1,203 @@ +/* { dg-additional-options "-ldl" } */ +/* { dg-require-effective-target offload_device_gcn } */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <omp.h> +#include <assert.h> +#include <dlfcn.h> +#include "../../../include/hsa.h" +#include "../../config/gcn/libgomp-gcn.h" + +#define STACKSIZE (100 * 1024) +#define HEAPSIZE (10 * 1024 * 1024) +#define ARENASIZE HEAPSIZE + +/* This code fragment must be optimized or else the host-fallback kernel has + * invalid ASM inserts. The rest of the file can be compiled safely at -O0. */ +#pragma omp declare target +uintptr_t __attribute__((optimize("O1"))) +get_kernel_ptr () +{ + uintptr_t val; + if (!omp_is_initial_device ()) + /* "main._omp_fn.0" is the name GCC gives the first OpenMP target + * region in the "main" function. + * The ".kd" suffix is added by the LLVM assembler when it creates the + * kernel meta-data, and this is what we need to launch a kernel. */ + asm ("s_getpc_b64 %0\n\t" + "s_add_u32 %L0, %L0, main._omp_fn.0.kd@rel32@lo+4\n\t" + "s_addc_u32 %H0, %H0, main._omp_fn.0.kd@rel32@hi+4" + : "=Sg"(val)); + return val; +} +#pragma omp end declare target + +int +main(int argc, char** argv) +{ + + /* Load the HSA runtime DLL. */ + void *hsalib = dlopen ("libhsa-runtime64.so.1", RTLD_LAZY); + assert (hsalib); + + hsa_status_t (*hsa_signal_create) (hsa_signal_value_t initial_value, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_t *signal) + = dlsym (hsalib, "hsa_signal_create"); + assert (hsa_signal_create); + + uint64_t (*hsa_queue_load_write_index_relaxed) (const hsa_queue_t *queue) + = dlsym (hsalib, "hsa_queue_load_write_index_relaxed"); + assert (hsa_queue_load_write_index_relaxed); + + void (*hsa_signal_store_relaxed) (hsa_signal_t signal, + hsa_signal_value_t value) + = dlsym (hsalib, "hsa_signal_store_relaxed"); + assert (hsa_signal_store_relaxed); + + hsa_signal_value_t (*hsa_signal_wait_relaxed) (hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint) + = dlsym (hsalib, "hsa_signal_wait_relaxed"); + assert (hsa_signal_wait_relaxed); + + void (*hsa_queue_store_write_index_relaxed) (const hsa_queue_t *queue, + uint64_t value) + = dlsym (hsalib, "hsa_queue_store_write_index_relaxed"); + assert (hsa_queue_store_write_index_relaxed); + + hsa_status_t (*hsa_signal_destroy) (hsa_signal_t signal) + = dlsym (hsalib, "hsa_signal_destroy"); + assert (hsa_signal_destroy); + + /* Set up the device data environment. */ + int test_data_value = 0; +#pragma omp target enter data map(test_data_value) + + /* Get the interop details. */ + int device_num = omp_get_default_device(); + hsa_agent_t *gpu_agent; + hsa_queue_t *hsa_queue = NULL; + + omp_interop_t interop = omp_interop_none; +#pragma omp interop init(target, targetsync, prefer_type("hsa"): interop) device(device_num) + assert (interop != omp_interop_none); + + omp_interop_rc_t retcode; + omp_interop_fr_t fr = omp_get_interop_int (interop, omp_ipr_fr_id, &retcode); + assert (retcode == omp_irc_success); + assert (fr == omp_ifr_hsa); + + gpu_agent = omp_get_interop_ptr(interop, omp_ipr_device, &retcode); + assert (retcode == omp_irc_success); + + hsa_queue = omp_get_interop_ptr(interop, omp_ipr_targetsync, &retcode); + assert (retcode == omp_irc_success); + assert (hsa_queue); + + /* Call an offload kernel via OpenMP/libgomp. + * + * This kernel serves two purposes: + * 1) Lookup the device-side load-address of itself (thus avoiding the + * need to access the libgomp internals). + * 2) Count how many times it is called. + * We then call it once using OpenMP, and once manually, and check + * the counter reads "2". */ + uint64_t kernel_object = 0; +#pragma omp target map(from:kernel_object) map(present,alloc:test_data_value) + { + kernel_object = get_kernel_ptr (); + ++test_data_value; + } + + assert (kernel_object != 0); + + /* Configure the same kernel to run again, using HSA manually this time. */ + hsa_status_t status; + hsa_signal_t signal; + status = hsa_signal_create(1, 0, NULL, &signal); + assert (status == HSA_STATUS_SUCCESS); + + /* The kernel is built by GCC for OpenMP, so we need to pass the same + * data pointers that libgomp would pass in. */ + struct { + uintptr_t test_data_value; + uintptr_t kernel_object; + } tgtaddrs; + +#pragma omp target data use_device_addr(test_data_value) + { + tgtaddrs.test_data_value = (uintptr_t)&test_data_value; + tgtaddrs.kernel_object = (uintptr_t)omp_target_alloc (8, device_num); + } + + /* We also need to duplicate the launch ABI used by plugin-gcn.c. */ + struct kernargs_abi args; /* From libgomp-gcn.h. */ + args.dummy1 = (int64_t)&tgtaddrs; + args.out_ptr = (int64_t)malloc (sizeof (struct output)); /* Host side. */ + args.heap_ptr = (int64_t)omp_target_alloc (HEAPSIZE, device_num); + args.arena_ptr = (int64_t)omp_target_alloc (ARENASIZE, device_num); + args.stack_ptr = (int64_t)omp_target_alloc (STACKSIZE, device_num); + args.arena_size_per_team = ARENASIZE; + args.stack_size_per_thread = STACKSIZE; + + /* Build the HSA dispatch packet, and insert it into the queue. */ + uint64_t packet_id = hsa_queue_load_write_index_relaxed (hsa_queue); + const uint32_t queueMask = hsa_queue->size - 1; + hsa_kernel_dispatch_packet_t *dispatch_packet = + &(((hsa_kernel_dispatch_packet_t *) + (hsa_queue->base_address))[packet_id & queueMask]); + + dispatch_packet->setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + dispatch_packet->workgroup_size_x = 1; + dispatch_packet->workgroup_size_y = 64; + dispatch_packet->workgroup_size_z = 1; + dispatch_packet->grid_size_x = 1; + dispatch_packet->grid_size_y = 64; + dispatch_packet->grid_size_z = 1; + dispatch_packet->completion_signal = signal; + dispatch_packet->kernel_object = kernel_object; + dispatch_packet->kernarg_address = &args; + dispatch_packet->private_segment_size = 0; + dispatch_packet->group_segment_size = 1536; + + uint16_t header = 0; + header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + + /* Finish writing the packet header with an atomic release. */ + __atomic_store_n((uint16_t*)dispatch_packet, header, __ATOMIC_RELEASE); + + hsa_queue_store_write_index_relaxed (hsa_queue, packet_id + 1); + + ;/* Run the kernel and wait for it to complete. */ + hsa_signal_store_relaxed(hsa_queue->doorbell_signal, packet_id); + while (hsa_signal_wait_relaxed(signal, HSA_SIGNAL_CONDITION_LT, 1, + UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) + ; + + /* Clean up HSA. */ + hsa_signal_destroy(signal); + free ((void*)args.out_ptr); + omp_target_free ((void*)args.heap_ptr, device_num); + omp_target_free ((void*)args.arena_ptr, device_num); + omp_target_free ((void*)args.stack_ptr, device_num); + omp_target_free ((void*)tgtaddrs.kernel_object, device_num); + + /* Clean up OpenMP. */ + #pragma omp interop destroy(interop) + + /* Bring the data back from the device. */ +#pragma omp target exit data map(test_data_value) + + /* Ensure the kernel was called twice. Once by OpenMP, once by HSA. */ + assert (test_data_value == 2); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.fortran/interop-hip-amd-full.F90 b/libgomp/testsuite/libgomp.fortran/interop-hip-amd-full.F90 new file mode 100644 index 0000000..bbd49dd --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/interop-hip-amd-full.F90 @@ -0,0 +1,7 @@ +! { dg-require-effective-target gomp_hipfort_module } +! { dg-require-effective-target gomp_libamdhip64 } +! { dg-additional-options "-lamdhip64" } + +#define HAVE_HIPFORT 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.fortran/interop-hip-amd-no-module.F90 b/libgomp/testsuite/libgomp.fortran/interop-hip-amd-no-module.F90 new file mode 100644 index 0000000..0afec83 --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/interop-hip-amd-no-module.F90 @@ -0,0 +1,6 @@ +! { dg-require-effective-target gomp_libamdhip64 } +! { dg-additional-options "-lamdhip64" } + +#define USE_HIP_FALLBACK_MODULE 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.fortran/interop-hip-nvidia-full.F90 b/libgomp/testsuite/libgomp.fortran/interop-hip-nvidia-full.F90 new file mode 100644 index 0000000..cef592f --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/interop-hip-nvidia-full.F90 @@ -0,0 +1,9 @@ +! { dg-require-effective-target gomp_hipfort_module } +! { dg-require-effective-target openacc_cudart } +! { dg-require-effective-target openacc_cuda } +! { dg-additional-options "-lcuda -lcudart" } + +#define HAVE_HIPFORT 1 +#define USE_CUDA_NAMES 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.fortran/interop-hip-nvidia-no-module.F90 b/libgomp/testsuite/libgomp.fortran/interop-hip-nvidia-no-module.F90 new file mode 100644 index 0000000..c1ef29d --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/interop-hip-nvidia-no-module.F90 @@ -0,0 +1,8 @@ +! { dg-require-effective-target openacc_libcudart } +! { dg-require-effective-target openacc_libcuda } +! { dg-additional-options "-lcuda -lcudart" } + +#define USE_CUDA_NAMES 1 +#define USE_HIP_FALLBACK_MODULE 1 + +#include "interop-hip.h" diff --git a/libgomp/testsuite/libgomp.fortran/interop-hip.h b/libgomp/testsuite/libgomp.fortran/interop-hip.h new file mode 100644 index 0000000..753ccce --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/interop-hip.h @@ -0,0 +1,214 @@ +! Minimal check whether HIP works - by checking whether the API routines +! seem to work. This includes a fallback if hipfort is not available + +#ifndef HAVE_HIPFORT +#ifndef USE_HIP_FALLBACK_MODULE +#if USE_CUDA_NAMES +#warning "Using fallback implementation for module hipfort as HAVE_HIPFORT is undefined (for NVIDA/CUDA)" +#else +#warning "Using fallback implementation for module hipfort as HAVE_HIPFORT is undefined - assume AMD as USE_CUDA_NAMES is unset" +#endif +#endif +module hipfort ! Minimal implementation for the testsuite + implicit none + + enum, bind(c) + enumerator :: hipSuccess = 0 + enumerator :: hipErrorNotSupported = 801 + end enum + + enum, bind(c) + enumerator :: hipDeviceAttributeClockRate = 5 + enumerator :: hipDeviceAttributeMaxGridDimX = 29 + end enum + + interface + integer(kind(hipSuccess)) function hipDeviceGetAttribute (ip, attr, dev) & +#if USE_CUDA_NAMES + bind(c, name="cudaDeviceGetAttribute") +#else + bind(c, name="hipDeviceGetAttribute") +#endif + use iso_c_binding, only: c_ptr, c_int + import + implicit none + type(c_ptr), value :: ip + integer(kind(hipDeviceAttributeClockRate)), value :: attr + integer(c_int), value :: dev + end + + integer(kind(hipSuccess)) function hipCtxGetApiVersion (ctx, ip) & +#if USE_CUDA_NAMES + bind(c, name="cudaCtxGetApiVersion") +#else + bind(c, name="hipCtxGetApiVersion") +#endif + use iso_c_binding, only: c_ptr + import + implicit none + type(c_ptr), value :: ctx, ip + end + + integer(kind(hipSuccess)) function hipStreamQuery (stream) & +#if USE_CUDA_NAMES + bind(c, name="cudaStreamQuery") +#else + bind(c, name="hipStreamQuery") +#endif + use iso_c_binding, only: c_ptr + import + implicit none + type(c_ptr), value :: stream + end + + integer(kind(hipSuccess)) function hipStreamGetFlags (stream, flags) & +#if USE_CUDA_NAMES + bind(c, name="cudaStreamGetFlags") +#else + bind(c, name="hipStreamGetFlags") +#endif + use iso_c_binding, only: c_ptr + import + implicit none + type(c_ptr), value :: stream + type(c_ptr), value :: flags + end + end interface +end module +#endif + +program main + use iso_c_binding, only: c_ptr, c_int, c_loc + use omp_lib + use hipfort + implicit none (type, external) + +! Only supported since CUDA 12.8 - skip for better compatibility +! ! Manally implement hipStreamGetDevice as hipfort misses it +! ! -> https://github.com/ROCm/hipfort/issues/238 +! interface +! integer(kind(hipSuccess)) function my_hipStreamGetDevice(stream, dev) & +!#if USE_CUDA_NAMES +! bind(c, name="cudaStreamGetDevice") +!#else +! bind(c, name="hipStreamGetDevice") +!#endif +! use iso_c_binding, only: c_ptr, c_int +! import +! implicit none +! type(c_ptr), value :: stream +! integer(c_int) :: dev +! end +! end interface + + integer(c_int), target :: ivar + integer(omp_interop_rc_kind) :: res + integer(omp_interop_kind) :: obj + integer(omp_interop_fr_kind) :: fr + integer(kind(hipSuccess)) :: hip_err + integer(c_int) :: hip_dev, dev_stream + type(c_ptr) :: hip_ctx, hip_sm + + logical :: vendor_is_amd + + obj = omp_interop_none + + !$omp interop init(target, targetsync, prefer_type("hip") : obj) + + fr = omp_get_interop_int (obj, omp_ipr_fr_id, res) + if (res /= omp_irc_success) error stop 1 + if (fr /= omp_ifr_hip) error stop 1 + + ivar = omp_get_interop_int (obj, omp_ipr_vendor, res) + if (ivar == 1) then ! AMD + vendor_is_amd = .true. + else if (ivar == 11) then ! Nvidia + vendor_is_amd = .false. + else + error stop 1 ! Unknown + endif +#if USE_CUDA_NAMES + if (vendor_is_amd) error stop 1 +#else + if (.not. vendor_is_amd) error stop 1 +#endif + + ! Check whether the omp_ipr_device -> hipDevice_t yields a valid device. + + hip_dev = omp_get_interop_int (obj, omp_ipr_device, res) + if (res /= omp_irc_success) error stop 1 + +! AMD messed up in Fortran with the attribute handling, missing the +! translation table it has for C. +block + enum, bind(c) + enumerator :: cudaDevAttrClockRate = 13 + enumerator :: cudaDevAttrMaxGridDimX = 5 + end enum + + ! Assume a clock size is available and > 1 GHz; value is in kHz. + ! c_loc is completely bogus, but as AMD messed up the interface ... + ! Cf. https://github.com/ROCm/hipfort/issues/239 +if (vendor_is_amd) then + hip_err = hipDeviceGetAttribute (c_loc(ivar), hipDeviceAttributeClockRate, hip_dev) +else + hip_err = hipDeviceGetAttribute (c_loc(ivar), cudaDevAttrClockRate, hip_dev) +endif + if (hip_err /= hipSuccess) error stop 1 + if (ivar <= 1000000) error stop 1 ! in kHz + + ! Assume that the MaxGridDimX is available and > 1024 + ! c_loc is completely bogus, but as AMD messed up the interface ... + ! Cf. https://github.com/ROCm/hipfort/issues/239 +if (vendor_is_amd) then + hip_err = hipDeviceGetAttribute (c_loc(ivar), hipDeviceAttributeMaxGridDimX, hip_dev) +else + hip_err = hipDeviceGetAttribute (c_loc(ivar), cudaDevAttrMaxGridDimX, hip_dev) +endif + if (hip_err /= hipSuccess) error stop 1 + if (ivar <= 1024) error stop 1 +end block + + + ! Check whether the omp_ipr_device_context -> hipCtx_t yields a context. + + hip_ctx = omp_get_interop_ptr (obj, omp_ipr_device_context, res) + if (res /= omp_irc_success) error stop 1 + +! ! Assume API Version > 0 for Nvidia, hipErrorNotSupported for AMD. */ +! ivar = -99 +! ! AMD deprectated hipCtxGetApiVersion (in C/C++) +! hip_err = hipCtxGetApiVersion (hip_ctx, c_loc(ivar)) +! +! if (vendor_is_amd) then +! if (hip_err /= hipErrorNotSupported .or. ivar /= -99) error stop 1 +! else +! if (hip_err /= hipSuccess) error stop 1 +! if (ivar <= 0) error stop 1 +! end if + + + ! Check whether the omp_ipr_targetsync -> hipStream_t yields a stream. + + hip_sm = omp_get_interop_ptr (obj, omp_ipr_targetsync, res) + if (res /= omp_irc_success) error stop 1 + +! Skip as this is only in CUDA 12.8 +! dev_stream = 99 +! ! Not (yet) implemented: https://github.com/ROCm/hipfort/issues/238 +! ! hip_err = hipStreamGetDevice (hip_sm, dev_stream) +! hip_err = my_hipStreamGetDevice (hip_sm, dev_stream) +! if (hip_err /= hipSuccess) error stop 1 +! if (dev_stream /= hip_dev) error stop 1 + + ! Get flags of the stream + hip_err = hipStreamGetFlags (hip_sm, c_loc (ivar)) + if (hip_err /= hipSuccess) error stop 1 + ! Accept any value + + ! All jobs should have been completed (as there were none none) + hip_err = hipStreamQuery (hip_sm) + if (hip_err /= hipSuccess) error stop 1 + + !$omp interop destroy(obj) +end diff --git a/libgomp/testsuite/libgomp.fortran/target-enter-data-8.f90 b/libgomp/testsuite/libgomp.fortran/target-enter-data-8.f90 new file mode 100644 index 0000000..c6d671c --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/target-enter-data-8.f90 @@ -0,0 +1,532 @@ +! { dg-additional-options "-cpp" } + +! FIXME: Some tests do not work yet. Those are for now in '#if 0' + +! Check that 'map(alloc:' properly works with +! - deferred-length character strings +! - arrays with array descriptors +! For those, the array descriptor / string length must be mapped with 'to:' + +program main +implicit none + +type t + integer :: ic(2:5), ic2 + character(len=11) :: ccstr(3:4), ccstr2 + character(len=11,kind=4) :: cc4str(3:7), cc4str2 + integer, pointer :: pc(:), pc2 + character(len=:), pointer :: pcstr(:), pcstr2 + character(len=:,kind=4), pointer :: pc4str(:), pc4str2 +end type t + +type(t) :: dt + +integer :: ii(5), ii2 +character(len=11) :: clstr(-1:1), clstr2 +character(len=11,kind=4) :: cl4str(0:3), cl4str2 +integer, pointer :: ip(:), ip2 +integer, allocatable :: ia(:), ia2 +character(len=:), pointer :: pstr(:), pstr2 +character(len=:), allocatable :: astr(:), astr2 +character(len=:,kind=4), pointer :: p4str(:), p4str2 +character(len=:,kind=4), allocatable :: a4str(:), a4str2 + + +allocate(dt%pc(5), dt%pc2) +allocate(character(len=2) :: dt%pcstr(2)) +allocate(character(len=4) :: dt%pcstr2) + +allocate(character(len=3,kind=4) :: dt%pc4str(2:3)) +allocate(character(len=5,kind=4) :: dt%pc4str2) + +allocate(ip(5), ip2, ia(8), ia2) +allocate(character(len=2) :: pstr(-2:0)) +allocate(character(len=4) :: pstr2) +allocate(character(len=6) :: astr(3:5)) +allocate(character(len=8) :: astr2) + +allocate(character(len=3,kind=4) :: p4str(2:4)) +allocate(character(len=5,kind=4) :: p4str2) +allocate(character(len=7,kind=4) :: a4str(-2:3)) +allocate(character(len=9,kind=4) :: a4str2) + + +! integer :: ic(2:5), ic2 + +!$omp target enter data map(alloc: dt%ic) +!$omp target map(alloc: dt%ic) + if (size(dt%ic) /= 4) error stop + if (lbound(dt%ic, 1) /= 2) error stop + if (ubound(dt%ic, 1) /= 5) error stop + dt%ic = [22, 33, 44, 55] +!$omp end target +!$omp target exit data map(from: dt%ic) +if (size(dt%ic) /= 4) error stop +if (lbound(dt%ic, 1) /= 2) error stop +if (ubound(dt%ic, 1) /= 5) error stop +if (any (dt%ic /= [22, 33, 44, 55])) error stop + +!$omp target enter data map(alloc: dt%ic2) +!$omp target map(alloc: dt%ic2) + dt%ic2 = 42 +!$omp end target +!$omp target exit data map(from: dt%ic2) +if (dt%ic2 /= 42) error stop + + +! character(len=11) :: ccstr(3:4), ccstr2 + +!$omp target enter data map(alloc: dt%ccstr) +!$omp target map(alloc: dt%ccstr) + if (len(dt%ccstr) /= 11) error stop + if (size(dt%ccstr) /= 2) error stop + if (lbound(dt%ccstr, 1) /= 3) error stop + if (ubound(dt%ccstr, 1) /= 4) error stop + dt%ccstr = ["12345678901", "abcdefghijk"] +!$omp end target +!$omp target exit data map(from: dt%ccstr) +if (len(dt%ccstr) /= 11) error stop +if (size(dt%ccstr) /= 2) error stop +if (lbound(dt%ccstr, 1) /= 3) error stop +if (ubound(dt%ccstr, 1) /= 4) error stop +if (any (dt%ccstr /= ["12345678901", "abcdefghijk"])) error stop + +!$omp target enter data map(alloc: dt%ccstr2) +!$omp target map(alloc: dt%ccstr2) + if (len(dt%ccstr2) /= 11) error stop + dt%ccstr2 = "ABCDEFGHIJK" +!$omp end target +!$omp target exit data map(from: dt%ccstr2) +if (len(dt%ccstr2) /= 11) error stop +if (dt%ccstr2 /= "ABCDEFGHIJK") error stop + + +! character(len=11,kind=4) :: cc4str(3:7), cc4str2 + +#if 0 +! Value check fails +!$omp target map(alloc: dt%cc4str) + if (len(dt%cc4str) /= 11) error stop + if (size(dt%cc4str) /= 5) error stop + if (lbound(dt%cc4str, 1) /= 3) error stop + if (ubound(dt%cc4str, 1) /= 7) error stop + dt%cc4str = [4_"12345678901", 4_"abcdefghijk", & + 4_"qerftcea6ds", 4_"a1f9g37ga4.", & + 4_"45ngwj56sj2"] +!$omp end target +!$omp target exit data map(from: dt%cc4str) +if (len(dt%cc4str) /= 11) error stop +if (size(dt%cc4str) /= 5) error stop +if (lbound(dt%cc4str, 1) /= 3) error stop +if (ubound(dt%cc4str, 1) /= 7) error stop +if (dt%cc4str(3) /= 4_"12345678901") error stop +if (dt%cc4str(4) /= 4_"abcdefghijk") error stop +if (dt%cc4str(5) /= 4_"qerftcea6ds") error stop +if (dt%cc4str(6) /= 4_"a1f9g37ga4.") error stop +if (dt%cc4str(7) /= 4_"45ngwj56sj2") error stop +#endif + +!$omp target enter data map(alloc: dt%cc4str2) +!$omp target map(alloc: dt%cc4str2) + if (len(dt%cc4str2) /= 11) error stop + dt%cc4str2 = 4_"ABCDEFGHIJK" +!$omp end target +!$omp target exit data map(from: dt%cc4str2) +if (len(dt%cc4str2) /= 11) error stop +if (dt%cc4str2 /= 4_"ABCDEFGHIJK") error stop + + +! integer, pointer :: pc(:), pc2 +! allocate(dt%pc(5), dt%pc2) + +!$omp target enter data map(alloc: dt%pc) +!$omp target map(alloc: dt%pc) + if (.not. associated(dt%pc)) error stop + if (size(dt%pc) /= 5) error stop + if (lbound(dt%pc, 1) /= 1) error stop + if (ubound(dt%pc, 1) /= 5) error stop + dt%pc = [11, 22, 33, 44, 55] +!$omp end target +!$omp target exit data map(from: dt%pc) +if (.not. associated(dt%pc)) error stop +if (size(dt%pc) /= 5) error stop +if (lbound(dt%pc, 1) /= 1) error stop +if (ubound(dt%pc, 1) /= 5) error stop +if (any (dt%pc /= [11, 22, 33, 44, 55])) error stop + +!$omp target enter data map(alloc: dt%pc2) +!$omp target map(alloc: dt%pc2) + if (.not. associated(dt%pc2)) error stop + dt%pc2 = 99 +!$omp end target +!$omp target exit data map(from: dt%pc2) +if (dt%pc2 /= 99) error stop +if (.not. associated(dt%pc2)) error stop + + +! character(len=:), pointer :: pcstr(:), pcstr2 +! allocate(character(len=2) :: dt%pcstr(2)) +! allocate(character(len=4) :: dt%pcstr2) + +!$omp target enter data map(alloc: dt%pcstr) +!$omp target map(alloc: dt%pcstr) + if (.not. associated(dt%pcstr)) error stop + if (len(dt%pcstr) /= 2) error stop + if (size(dt%pcstr) /= 2) error stop + if (lbound(dt%pcstr, 1) /= 1) error stop + if (ubound(dt%pcstr, 1) /= 2) error stop + dt%pcstr = ["01", "jk"] +!$omp end target +!$omp target exit data map(from: dt%pcstr) +if (.not. associated(dt%pcstr)) error stop +if (len(dt%pcstr) /= 2) error stop +if (size(dt%pcstr) /= 2) error stop +if (lbound(dt%pcstr, 1) /= 1) error stop +if (ubound(dt%pcstr, 1) /= 2) error stop +if (any (dt%pcstr /= ["01", "jk"])) error stop + + +!$omp target enter data map(alloc: dt%pcstr2) +!$omp target map(alloc: dt%pcstr2) + if (.not. associated(dt%pcstr2)) error stop + if (len(dt%pcstr2) /= 4) error stop + dt%pcstr2 = "HIJK" +!$omp end target +!$omp target exit data map(from: dt%pcstr2) +if (.not. associated(dt%pcstr2)) error stop +if (len(dt%pcstr2) /= 4) error stop +if (dt%pcstr2 /= "HIJK") error stop + + +! character(len=:,kind=4), pointer :: pc4str(:), pc4str2 +! allocate(character(len=3,kind=4) :: dt%pc4str(2:3)) +! allocate(character(len=5,kind=4) :: dt%pc4str2) + +!$omp target enter data map(alloc: dt%pc4str) +!$omp target map(alloc: dt%pc4str) + if (.not. associated(dt%pc4str)) error stop + if (len(dt%pc4str) /= 3) error stop + if (size(dt%pc4str) /= 2) error stop + if (lbound(dt%pc4str, 1) /= 2) error stop + if (ubound(dt%pc4str, 1) /= 3) error stop + dt%pc4str = [4_"456", 4_"tzu"] +!$omp end target +!$omp target exit data map(from: dt%pc4str) +if (.not. associated(dt%pc4str)) error stop +if (len(dt%pc4str) /= 3) error stop +if (size(dt%pc4str) /= 2) error stop +if (lbound(dt%pc4str, 1) /= 2) error stop +if (ubound(dt%pc4str, 1) /= 3) error stop +if (dt%pc4str(2) /= 4_"456") error stop +if (dt%pc4str(3) /= 4_"tzu") error stop + +!$omp target enter data map(alloc: dt%pc4str2) +!$omp target map(alloc: dt%pc4str2) + if (.not. associated(dt%pc4str2)) error stop + if (len(dt%pc4str2) /= 5) error stop + dt%pc4str2 = 4_"98765" +!$omp end target +!$omp target exit data map(from: dt%pc4str2) +if (.not. associated(dt%pc4str2)) error stop +if (len(dt%pc4str2) /= 5) error stop +if (dt%pc4str2 /= 4_"98765") error stop + + +! integer :: ii(5), ii2 + +!$omp target enter data map(alloc: ii) +!$omp target map(alloc: ii) + if (size(ii) /= 5) error stop + if (lbound(ii, 1) /= 1) error stop + if (ubound(ii, 1) /= 5) error stop + ii = [-1, -2, -3, -4, -5] +!$omp end target +!$omp target exit data map(from: ii) +if (size(ii) /= 5) error stop +if (lbound(ii, 1) /= 1) error stop +if (ubound(ii, 1) /= 5) error stop +if (any (ii /= [-1, -2, -3, -4, -5])) error stop + +!$omp target enter data map(alloc: ii2) +!$omp target map(alloc: ii2) + ii2 = -410 +!$omp end target +!$omp target exit data map(from: ii2) +if (ii2 /= -410) error stop + + +! character(len=11) :: clstr(-1:1), clstr2 + +!$omp target enter data map(alloc: clstr) +!$omp target map(alloc: clstr) + if (len(clstr) /= 11) error stop + if (size(clstr) /= 3) error stop + if (lbound(clstr, 1) /= -1) error stop + if (ubound(clstr, 1) /= 1) error stop + clstr = ["12345678901", "abcdefghijk", "ABCDEFGHIJK"] +!$omp end target +!$omp target exit data map(from: clstr) +if (len(clstr) /= 11) error stop +if (size(clstr) /= 3) error stop +if (lbound(clstr, 1) /= -1) error stop +if (ubound(clstr, 1) /= 1) error stop +if (any (clstr /= ["12345678901", "abcdefghijk", "ABCDEFGHIJK"])) error stop + +!$omp target enter data map(alloc: clstr2) +!$omp target map(alloc: clstr2) + if (len(clstr2) /= 11) error stop + clstr2 = "ABCDEFghijk" +!$omp end target +!$omp target exit data map(from: clstr2) +if (len(clstr2) /= 11) error stop +if (clstr2 /= "ABCDEFghijk") error stop + + +! character(len=11,kind=4) :: cl4str(0:3), cl4str2 + +!$omp target enter data map(alloc: cl4str) +!$omp target map(alloc: cl4str) + if (len(cl4str) /= 11) error stop + if (size(cl4str) /= 4) error stop + if (lbound(cl4str, 1) /= 0) error stop + if (ubound(cl4str, 1) /= 3) error stop + cl4str = [4_"12345678901", 4_"abcdefghijk", & + 4_"qerftcea6ds", 4_"a1f9g37ga4."] +!$omp end target +!$omp target exit data map(from: cl4str) +if (len(cl4str) /= 11) error stop +if (size(cl4str) /= 4) error stop +if (lbound(cl4str, 1) /= 0) error stop +if (ubound(cl4str, 1) /= 3) error stop +if (cl4str(0) /= 4_"12345678901") error stop +if (cl4str(1) /= 4_"abcdefghijk") error stop +if (cl4str(2) /= 4_"qerftcea6ds") error stop +if (cl4str(3) /= 4_"a1f9g37ga4.") error stop + +!$omp target enter data map(alloc: cl4str2) +!$omp target map(alloc: cl4str2) + if (len(cl4str2) /= 11) error stop + cl4str2 = 4_"ABCDEFGHIJK" +!$omp end target +!$omp target exit data map(from: cl4str2) +if (len(cl4str2) /= 11) error stop +if (cl4str2 /= 4_"ABCDEFGHIJK") error stop + + +! allocate(ip(5), ip2, ia(8), ia2) + +!$omp target enter data map(alloc: ip) +!$omp target map(alloc: ip) + if (.not. associated(ip)) error stop + if (size(ip) /= 5) error stop + if (lbound(ip, 1) /= 1) error stop + if (ubound(ip, 1) /= 5) error stop + ip = [11, 22, 33, 44, 55] +!$omp end target +!$omp target exit data map(from: ip) +if (.not. associated(ip)) error stop +if (size(ip) /= 5) error stop +if (lbound(ip, 1) /= 1) error stop +if (ubound(ip, 1) /= 5) error stop +if (any (ip /= [11, 22, 33, 44, 55])) error stop + +!$omp target enter data map(alloc: ip2) +!$omp target map(alloc: ip2) + if (.not. associated(ip2)) error stop + ip2 = 99 +!$omp end target +!$omp target exit data map(from: ip2) +if (ip2 /= 99) error stop +if (.not. associated(ip2)) error stop + + +! allocate(ip(5), ip2, ia(8), ia2) + +!$omp target enter data map(alloc: ia) +!$omp target map(alloc: ia) + if (.not. allocated(ia)) error stop + if (size(ia) /= 8) error stop + if (lbound(ia, 1) /= 1) error stop + if (ubound(ia, 1) /= 8) error stop + ia = [1,2,3,4,5,6,7,8] +!$omp end target +!$omp target exit data map(from: ia) +if (.not. allocated(ia)) error stop +if (size(ia) /= 8) error stop +if (lbound(ia, 1) /= 1) error stop +if (ubound(ia, 1) /= 8) error stop +if (any (ia /= [1,2,3,4,5,6,7,8])) error stop + +!$omp target enter data map(alloc: ia2) +!$omp target map(alloc: ia2) + if (.not. allocated(ia2)) error stop + ia2 = 102 +!$omp end target +!$omp target exit data map(from: ia2) +if (ia2 /= 102) error stop +if (.not. allocated(ia2)) error stop + + +! character(len=:), pointer :: pstr(:), pstr2 +! allocate(character(len=2) :: pstr(-2:0)) +! allocate(character(len=4) :: pstr2) + +!$omp target enter data map(alloc: pstr) +!$omp target map(alloc: pstr) + if (.not. associated(pstr)) error stop + if (len(pstr) /= 2) error stop + if (size(pstr) /= 3) error stop + if (lbound(pstr, 1) /= -2) error stop + if (ubound(pstr, 1) /= 0) error stop + pstr = ["01", "jk", "aq"] +!$omp end target +!$omp target exit data map(from: pstr) +if (.not. associated(pstr)) error stop +if (len(pstr) /= 2) error stop +if (size(pstr) /= 3) error stop +if (lbound(pstr, 1) /= -2) error stop +if (ubound(pstr, 1) /= 0) error stop +if (any (pstr /= ["01", "jk", "aq"])) error stop + +!$omp target enter data map(alloc: pstr2) +!$omp target map(alloc: pstr2) + if (.not. associated(pstr2)) error stop + if (len(pstr2) /= 4) error stop + pstr2 = "HIJK" +!$omp end target +!$omp target exit data map(from: pstr2) +if (.not. associated(pstr2)) error stop +if (len(pstr2) /= 4) error stop +if (pstr2 /= "HIJK") error stop + + +! character(len=:), allocatable :: astr(:), astr2 +! allocate(character(len=6) :: astr(3:5)) +! allocate(character(len=8) :: astr2) + + +!$omp target enter data map(alloc: astr) +!$omp target map(alloc: astr) + if (.not. allocated(astr)) error stop + if (len(astr) /= 6) error stop + if (size(astr) /= 3) error stop + if (lbound(astr, 1) /= 3) error stop + if (ubound(astr, 1) /= 5) error stop + astr = ["01db45", "jk$D%S", "zutg47"] +!$omp end target +!$omp target exit data map(from: astr) +if (.not. allocated(astr)) error stop +if (len(astr) /= 6) error stop +if (size(astr) /= 3) error stop +if (lbound(astr, 1) /= 3) error stop +if (ubound(astr, 1) /= 5) error stop +if (any (astr /= ["01db45", "jk$D%S", "zutg47"])) error stop + + +!$omp target enter data map(alloc: astr2) +!$omp target map(alloc: astr2) + if (.not. allocated(astr2)) error stop + if (len(astr2) /= 8) error stop + astr2 = "HIJKhijk" +!$omp end target +!$omp target exit data map(from: astr2) +if (.not. allocated(astr2)) error stop +if (len(astr2) /= 8) error stop +if (astr2 /= "HIJKhijk") error stop + + +! character(len=:,kind=4), pointer :: p4str(:), p4str2 +! allocate(character(len=3,kind=4) :: p4str(2:4)) +! allocate(character(len=5,kind=4) :: p4str2) + +! FAILS with value check + +!$omp target enter data map(alloc: p4str) +!$omp target map(alloc: p4str) + if (.not. associated(p4str)) error stop + if (len(p4str) /= 3) error stop + if (size(p4str) /= 3) error stop + if (lbound(p4str, 1) /= 2) error stop + if (ubound(p4str, 1) /= 4) error stop + p4str(:) = [4_"f85", 4_"8af", 4_"A%F"] +!$omp end target +!$omp target exit data map(from: p4str) +if (.not. associated(p4str)) error stop +if (len(p4str) /= 3) error stop +if (size(p4str) /= 3) error stop +if (lbound(p4str, 1) /= 2) error stop +if (ubound(p4str, 1) /= 4) error stop +if (p4str(2) /= 4_"f85") error stop +if (p4str(3) /= 4_"8af") error stop +if (p4str(4) /= 4_"A%F") error stop + +!$omp target enter data map(alloc: p4str2) +!$omp target map(alloc: p4str2) + if (.not. associated(p4str2)) error stop + if (len(p4str2) /= 5) error stop + p4str2 = 4_"9875a" +!$omp end target +!$omp target exit data map(from: p4str2) +if (.not. associated(p4str2)) error stop +if (len(p4str2) /= 5) error stop +if (p4str2 /= 4_"9875a") error stop + + +! character(len=:,kind=4), allocatable :: a4str(:), a4str2 +! allocate(character(len=7,kind=4) :: a4str(-2:3)) +! allocate(character(len=9,kind=4) :: a4str2) + +!$omp target enter data map(alloc: a4str) +!$omp target map(alloc: a4str) + if (.not. allocated(a4str)) error stop + if (len(a4str) /= 7) error stop + if (size(a4str) /= 6) error stop + if (lbound(a4str, 1) /= -2) error stop + if (ubound(a4str, 1) /= 3) error stop + ! See PR fortran/107508 why '(:)' is required + a4str(:) = [4_"sf456aq", 4_"3dtzu24", 4_"_4fh7sm", 4_"=ff85s7", 4_"j=8af4d", 4_".,A%Fsz"] +!$omp end target +!$omp target exit data map(from: a4str) +if (.not. allocated(a4str)) error stop +if (len(a4str) /= 7) error stop +if (size(a4str) /= 6) error stop +if (lbound(a4str, 1) /= -2) error stop +if (ubound(a4str, 1) /= 3) error stop +if (a4str(-2) /= 4_"sf456aq") error stop +if (a4str(-1) /= 4_"3dtzu24") error stop +if (a4str(0) /= 4_"_4fh7sm") error stop +if (a4str(1) /= 4_"=ff85s7") error stop +if (a4str(2) /= 4_"j=8af4d") error stop +if (a4str(3) /= 4_".,A%Fsz") error stop + +!$omp target enter data map(alloc: a4str2) +!$omp target map(alloc: a4str2) + if (.not. allocated(a4str2)) error stop + if (len(a4str2) /= 9) error stop + a4str2 = 4_"98765a23d" +!$omp end target +!$omp target exit data map(from: a4str2) +if (.not. allocated(a4str2)) error stop +if (len(a4str2) /= 9) error stop +if (a4str2 /= 4_"98765a23d") error stop + + +deallocate(dt%pc, dt%pc2) +deallocate(dt%pcstr) +deallocate(dt%pcstr2) + +deallocate(dt%pc4str) +deallocate(dt%pc4str2) + +deallocate(ip, ip2, ia, ia2) +deallocate(pstr) +deallocate(pstr2) +deallocate(astr) +deallocate(astr2) + +deallocate(p4str) +deallocate(p4str2) +deallocate(a4str) +deallocate(a4str2) + +end diff --git a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-1.C b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-1.C index f2ef751..08c5766 100644 --- a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-1.C +++ b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-1.C @@ -4,9 +4,6 @@ { dg-additional-options -fexceptions } */ /* { dg-additional-options -fdump-tree-optimized-raw } { dg-additional-options -foffload-options=-fdump-tree-optimized-raw } */ -/* { dg-bogus {Size expression must be absolute\.} PR119737 { target { openacc_radeon_accel_selected && __OPTIMIZE__ } xfail *-*-* } 0 } - { dg-ice PR119737 { openacc_radeon_accel_selected && __OPTIMIZE__ } } - { dg-excess-errors {'mkoffload' failure etc.} { xfail { openacc_radeon_accel_selected && __OPTIMIZE__ } } } */ /* See also '../libgomp.c++/target-exceptions-throw-1.C'. */ diff --git a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-2.C b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-2.C index f6dc970..a7408cd 100644 --- a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-2.C +++ b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-2.C @@ -6,9 +6,6 @@ { dg-additional-options -foffload-options=-fdump-tree-optimized-raw } */ /* { dg-bogus {undefined symbol: typeinfo name for MyException} PR119806 { target { openacc_radeon_accel_selected && { ! __OPTIMIZE__ } } xfail *-*-* } 0 } { dg-excess-errors {'mkoffload' failure etc.} { xfail { openacc_radeon_accel_selected && { ! __OPTIMIZE__ } } } } */ -/* { dg-bogus {Size expression must be absolute\.} PR119737 { target { openacc_radeon_accel_selected && __OPTIMIZE__ } xfail *-*-* } 0 } - { dg-ice PR119737 { openacc_radeon_accel_selected && __OPTIMIZE__ } } - { dg-excess-errors {'mkoffload' failures etc.} { xfail { openacc_radeon_accel_selected && __OPTIMIZE__ } } } */ /* { dg-bogus {Initial value type mismatch} PR119806 { target { openacc_nvidia_accel_selected && { ! __OPTIMIZE__ } } xfail *-*-* } 0 } { dg-excess-errors {'mkoffload' failure etc.} { xfail { openacc_nvidia_accel_selected && { ! __OPTIMIZE__ } } } } */ |