author     Jakub Jelinek <jakub@redhat.com>    2017-01-17 10:44:17 +0100
committer  Jakub Jelinek <jakub@gcc.gnu.org>   2017-01-17 10:44:17 +0100
commit     2393d337e7c5ff258b1ad167025b9e4d5f518533
tree       eaf913e6c721a84f592e444c942b624bbb14fe06
parent     3c36aa6ba2be894d4092a6ce8129d39ef846c964
configfrag.ac: For --without-cuda-driver don't initialize CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB.
* plugin/configfrag.ac: For --without-cuda-driver don't initialize
CUDA_DRIVER_INCLUDE nor CUDA_DRIVER_LIB.  If both
CUDA_DRIVER_INCLUDE and CUDA_DRIVER_LIB are empty and linking a small
CUDA program fails, define PLUGIN_NVPTX_DYNAMIC to 1 and use
plugin/include/cuda as the include dir and -ldl instead of -lcuda as
the library to link the PTX plugin against.
* plugin/plugin-nvptx.c: Include dlfcn.h if PLUGIN_NVPTX_DYNAMIC.
(CUDA_CALLS): Define.
(cuda_lib, cuda_lib_inited): New variables.
(init_cuda_lib): New function.
(CUDA_CALL_PREFIX): Define.
(CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use CUDA_CALL_PREFIX.
(CUDA_CALL): Use FN instead of (FN).
(CUDA_CALL_NOCHECK): Define.
(cuda_error, fini_streams_for_device, select_stream_for_async,
nvptx_attach_host_thread_to_device, nvptx_open_device, link_ptx,
event_gc, nvptx_exec, nvptx_async_test, nvptx_async_test_all,
nvptx_wait_all, nvptx_set_clocktick, GOMP_OFFLOAD_unload_image,
nvptx_stacks_alloc, nvptx_stacks_free, GOMP_OFFLOAD_run): Use
CUDA_CALL_NOCHECK.
(nvptx_init): Call init_cuda_lib; if it fails, return false.  Use
CUDA_CALL_NOCHECK.
(nvptx_get_num_devices): Call init_cuda_lib; if it fails, return 0.
Use CUDA_CALL_NOCHECK.
* plugin/cuda/cuda.h: New file.
* config.h.in: Regenerated.
* configure: Regenerated.
From-SVN: r244522
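The configure-side change hinges on a link test: only when both
CUDA_DRIVER_INCLUDE and CUDA_DRIVER_LIB are empty and linking a small CUDA
driver-API program against -lcuda fails does the build define
PLUGIN_NVPTX_DYNAMIC and switch to the dlopen path.  Roughly the kind of
probe such a check needs is sketched below; this is illustrative only, the
actual conftest source is generated from plugin/configfrag.ac and may use a
different driver call.

/* Minimal "can we compile and link against the CUDA driver API?" probe.
   Illustrative only -- not the literal conftest emitted by configure.  */
#include "cuda.h"

int
main (void)
{
  CUresult r = cuInit (0);
  return r == CUDA_SUCCESS ? 0 : 1;
}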
Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
 libgomp/plugin/plugin-nvptx.c | 219
 1 file changed, 164 insertions(+), 55 deletions(-)
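Before reading the diff, here is a minimal, self-contained sketch of the
dlopen/dlsym dispatch scheme it introduces, reduced to two driver entry
points.  The demo_* names and the simplified prototypes are illustrative
only; the real list (CUDA_CALLS), struct (cuda_lib_s) and loader
(init_cuda_lib) appear in the diff below.

/* Sketch of the X-macro pattern used by the patch: one list of entry
   points expanded twice, first into struct members, then into dlsym
   lookups.  Build with -ldl on older glibc.  */
#include <dlfcn.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the CUDA driver prototypes (the plugin takes
   the real ones from cuda.h); only their types are used, via __typeof.  */
extern int cuInit (unsigned);
extern int cuDeviceGetCount (int *);

/* One line per wrapped entry point, as CUDA_CALLS does in the patch.  */
#define DEMO_CALLS \
  DEMO_ONE_CALL (cuInit) \
  DEMO_ONE_CALL (cuDeviceGetCount)

/* First expansion: a function-pointer member per entry point.  */
#define DEMO_ONE_CALL(call) __typeof (call) *call;
static struct demo_lib_s { DEMO_CALLS } demo_lib;
#undef DEMO_ONE_CALL

/* Second expansion: resolve every member via dlsym; fail if the library
   or any symbol is missing.  */
static bool
demo_init_lib (void)
{
  void *h = dlopen ("libcuda.so.1", RTLD_LAZY);
  if (h == NULL)
    return false;
#define DEMO_ONE_CALL(call) \
  demo_lib.call = dlsym (h, #call); \
  if (demo_lib.call == NULL) \
    return false;
  DEMO_CALLS
#undef DEMO_ONE_CALL
  return true;
}

int
main (void)
{
  if (!demo_init_lib ())
    {
      puts ("no usable libcuda.so.1; the plugin would report 0 devices");
      return 0;
    }
  int n = 0;
  if (demo_lib.cuInit (0) == 0  /* 0 == CUDA_SUCCESS */
      && demo_lib.cuDeviceGetCount (&n) == 0)
    printf ("%d CUDA device(s)\n", n);
  return 0;
}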
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a2e1aeb..4144218 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -48,30 +48,104 @@
 #include <assert.h>
 #include <errno.h>
 
-static const char *
-cuda_error (CUresult r)
-{
-#if CUDA_VERSION < 7000
-  /* Specified in documentation and present in library from at least
-     5.5.  Not declared in header file prior to 7.0.  */
-  extern CUresult cuGetErrorString (CUresult, const char **);
-#endif
-  const char *desc;
-
-  r = cuGetErrorString (r, &desc);
-  if (r != CUDA_SUCCESS)
-    desc = "unknown cuda error";
-
-  return desc;
+#if PLUGIN_NVPTX_DYNAMIC
+# include <dlfcn.h>
+
+# define CUDA_CALLS \
+CUDA_ONE_CALL (cuCtxCreate) \
+CUDA_ONE_CALL (cuCtxDestroy) \
+CUDA_ONE_CALL (cuCtxGetCurrent) \
+CUDA_ONE_CALL (cuCtxGetDevice) \
+CUDA_ONE_CALL (cuCtxPopCurrent) \
+CUDA_ONE_CALL (cuCtxPushCurrent) \
+CUDA_ONE_CALL (cuCtxSynchronize) \
+CUDA_ONE_CALL (cuDeviceGet) \
+CUDA_ONE_CALL (cuDeviceGetAttribute) \
+CUDA_ONE_CALL (cuDeviceGetCount) \
+CUDA_ONE_CALL (cuEventCreate) \
+CUDA_ONE_CALL (cuEventDestroy) \
+CUDA_ONE_CALL (cuEventElapsedTime) \
+CUDA_ONE_CALL (cuEventQuery) \
+CUDA_ONE_CALL (cuEventRecord) \
+CUDA_ONE_CALL (cuEventSynchronize) \
+CUDA_ONE_CALL (cuFuncGetAttribute) \
+CUDA_ONE_CALL (cuGetErrorString) \
+CUDA_ONE_CALL (cuInit) \
+CUDA_ONE_CALL (cuLaunchKernel) \
+CUDA_ONE_CALL (cuLinkAddData) \
+CUDA_ONE_CALL (cuLinkComplete) \
+CUDA_ONE_CALL (cuLinkCreate) \
+CUDA_ONE_CALL (cuLinkDestroy) \
+CUDA_ONE_CALL (cuMemAlloc) \
+CUDA_ONE_CALL (cuMemAllocHost) \
+CUDA_ONE_CALL (cuMemcpy) \
+CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
+CUDA_ONE_CALL (cuMemcpyDtoH) \
+CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
+CUDA_ONE_CALL (cuMemcpyHtoD) \
+CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
+CUDA_ONE_CALL (cuMemFree) \
+CUDA_ONE_CALL (cuMemFreeHost) \
+CUDA_ONE_CALL (cuMemGetAddressRange) \
+CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
+CUDA_ONE_CALL (cuModuleGetFunction) \
+CUDA_ONE_CALL (cuModuleGetGlobal) \
+CUDA_ONE_CALL (cuModuleLoad) \
+CUDA_ONE_CALL (cuModuleLoadData) \
+CUDA_ONE_CALL (cuModuleUnload) \
+CUDA_ONE_CALL (cuStreamCreate) \
+CUDA_ONE_CALL (cuStreamDestroy) \
+CUDA_ONE_CALL (cuStreamQuery) \
+CUDA_ONE_CALL (cuStreamSynchronize) \
+CUDA_ONE_CALL (cuStreamWaitEvent)
+# define CUDA_ONE_CALL(call) \
+  __typeof (call) *call;
+struct cuda_lib_s {
+  CUDA_CALLS
+} cuda_lib;
+
+/* -1 if init_cuda_lib has not been called yet, false
+   if it has been and failed, true if it has been and succeeded.  */
+static char cuda_lib_inited = -1;
+
+/* Dynamically load the CUDA runtime library and initialize function
+   pointers, return false if unsuccessful, true if successful.  */
+static bool
+init_cuda_lib (void)
+{
+  if (cuda_lib_inited != -1)
+    return cuda_lib_inited;
+  const char *cuda_runtime_lib = "libcuda.so.1";
+  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
+  cuda_lib_inited = false;
+  if (h == NULL)
+    return false;
+# undef CUDA_ONE_CALL
+# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
+# define CUDA_ONE_CALL_1(call) \
+  cuda_lib.call = dlsym (h, #call); \
+  if (cuda_lib.call == NULL) \
+    return false;
+  CUDA_CALLS
+  cuda_lib_inited = true;
+  return true;
 }
+# undef CUDA_ONE_CALL
+# undef CUDA_ONE_CALL_1
+# define CUDA_CALL_PREFIX cuda_lib.
+#else
+# define CUDA_CALL_PREFIX
+# define init_cuda_lib() true
+#endif
 
 /* Convenience macros for the frequently used CUDA library call and
-   error handling sequence.  This does not capture all the cases we
-   use in this file, but is common enough.  */
+   error handling sequence as well as CUDA library calls that
+   do the error checking themselves or don't do it at all.  */
 
 #define CUDA_CALL_ERET(ERET, FN, ...)              \
   do {                                             \
-    unsigned __r = FN (__VA_ARGS__);               \
+    unsigned __r                                   \
+      = CUDA_CALL_PREFIX FN (__VA_ARGS__);         \
     if (__r != CUDA_SUCCESS)                       \
       {                                            \
         GOMP_PLUGIN_error (#FN " error: %s",       \
@@ -81,11 +155,12 @@ cuda_error (CUresult r)
   } while (0)
 
 #define CUDA_CALL(FN, ...)                         \
-  CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
+  CUDA_CALL_ERET (false, FN, __VA_ARGS__)
 
 #define CUDA_CALL_ASSERT(FN, ...)                  \
   do {                                             \
-    unsigned __r = FN (__VA_ARGS__);               \
+    unsigned __r                                   \
+      = CUDA_CALL_PREFIX FN (__VA_ARGS__);         \
     if (__r != CUDA_SUCCESS)                       \
       {                                            \
         GOMP_PLUGIN_fatal (#FN " error: %s",       \
@@ -93,6 +168,26 @@ cuda_error (CUresult r)
       }                                            \
   } while (0)
 
+#define CUDA_CALL_NOCHECK(FN, ...)                 \
+  CUDA_CALL_PREFIX FN (__VA_ARGS__)
+
+static const char *
+cuda_error (CUresult r)
+{
+#if CUDA_VERSION < 7000
+  /* Specified in documentation and present in library from at least
+     5.5.  Not declared in header file prior to 7.0.  */
+  extern CUresult cuGetErrorString (CUresult, const char **);
+#endif
+  const char *desc;
+
+  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
+  if (r != CUDA_SUCCESS)
+    desc = "unknown cuda error";
+
+  return desc;
+}
+
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
@@ -401,7 +496,7 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
 
       ret &= map_fini (s);
 
-      CUresult r = cuStreamDestroy (s->stream);
+      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
       if (r != CUDA_SUCCESS)
         {
           GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
@@ -484,7 +579,8 @@ select_stream_for_async (int async, pthread_t thread, bool create,
         s->stream = existing;
       else
         {
-          r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
+          r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
+                                 CU_STREAM_DEFAULT);
           if (r != CUDA_SUCCESS)
             {
               pthread_mutex_unlock (&ptx_dev->stream_lock);
@@ -554,10 +650,14 @@ nvptx_init (void)
   if (instantiated_devices != 0)
     return true;
 
-  CUDA_CALL (cuInit, 0);
   ptx_events = NULL;
   pthread_mutex_init (&ptx_event_lock, NULL);
 
+  if (!init_cuda_lib ())
+    return false;
+
+  CUDA_CALL (cuInit, 0);
+
   CUDA_CALL (cuDeviceGetCount, &ndevs);
   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                             * ndevs);
@@ -575,7 +675,7 @@ nvptx_attach_host_thread_to_device (int n)
   struct ptx_device *ptx_dev;
   CUcontext thd_ctx;
 
-  r = cuCtxGetDevice (&dev);
+  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
     {
       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@@ -623,7 +723,7 @@ nvptx_open_device (int n)
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
 
-  r = cuCtxGetDevice (&ctx_dev);
+  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
     {
       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@@ -669,7 +769,7 @@ nvptx_open_device (int n)
                   &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
   ptx_dev->clock_khz = pi;
 
-  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, 
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                   &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
   ptx_dev->num_sms = pi;
 
@@ -679,7 +779,7 @@ nvptx_open_device (int n)
 
   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
      in CUDA 6.0 and newer.  */
-  r = cuDeviceGetAttribute (&pi, 82, dev);
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
   /* Fallback: use limit of registers per block, which is usually equal.  */
   if (r == CUDA_ERROR_INVALID_VALUE)
     pi = ptx_dev->regs_per_block;
@@ -698,8 +798,8 @@ nvptx_open_device (int n)
       return NULL;
     }
 
-  r = cuDeviceGetAttribute (&async_engines,
-                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
                          CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
   if (r != CUDA_SUCCESS)
     async_engines = 1;
 
@@ -746,7 +846,9 @@ nvptx_get_num_devices (void)
      further initialization).  */
   if (instantiated_devices == 0)
    {
-      CUresult r = cuInit (0);
+      if (!init_cuda_lib ())
+        return 0;
+      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
       /* This is not an error: e.g. we may have CUDA libraries installed but
          no devices available.  */
       if (r != CUDA_SUCCESS)
@@ -797,8 +899,9 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
       /* cuLinkAddData's 'data' argument erroneously omits the const
          qualifier.  */
       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
-      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
-                         ptx_objs->size, 0, 0, 0, 0);
+      r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
+                             (char *) ptx_objs->code, ptx_objs->size,
+                             0, 0, 0, 0);
       if (r != CUDA_SUCCESS)
         {
           GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
@@ -809,7 +912,7 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
     }
 
   GOMP_PLUGIN_debug (0, "Linking\n");
-  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
+  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 
   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
@@ -844,7 +947,7 @@ event_gc (bool memmap_lockable)
       if (e->ord != nvthd->ptx_dev->ord)
         continue;
 
-      r = cuEventQuery (*e->evt);
+      r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
       if (r == CUDA_SUCCESS)
         {
           bool append_async = false;
@@ -877,7 +980,7 @@ event_gc (bool memmap_lockable)
                 break;
             }
 
-          cuEventDestroy (*te);
+          CUDA_CALL_NOCHECK (cuEventDestroy, *te);
           free ((void *)te);
 
           /* Unlink 'e' from ptx_events list.  */
@@ -1015,10 +1118,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
       cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
 
-      if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
-          && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
-          && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
-          && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
+      if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
+                             dev) == CUDA_SUCCESS
+          && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
+                                dev) == CUDA_SUCCESS
+          && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
+                                dev) == CUDA_SUCCESS
+          && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
+                                dev) == CUDA_SUCCESS)
         {
           GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
                              " dev_size=%d, cpu_size=%d\n",
@@ -1090,7 +1197,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 #ifndef DISABLE_ASYNC
   if (async < acc_async_noval)
     {
-      r = cuStreamSynchronize (dev_str->stream);
+      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
       if (r == CUDA_ERROR_LAUNCH_FAILED)
         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                            maybe_abort_msg);
@@ -1103,7 +1210,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
 
-      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
       if (r == CUDA_ERROR_LAUNCH_FAILED)
         GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                            maybe_abort_msg);
@@ -1117,7 +1224,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
     }
 #else
-  r = cuCtxSynchronize ();
+  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                        maybe_abort_msg);
@@ -1294,7 +1401,7 @@ nvptx_async_test (int async)
   if (!s)
     GOMP_PLUGIN_fatal ("unknown async %d", async);
 
-  r = cuStreamQuery (s->stream);
+  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
   if (r == CUDA_SUCCESS)
     {
       /* The oacc-parallel.c:goacc_wait function calls this hook to determine
@@ -1325,7 +1432,8 @@ nvptx_async_test_all (void)
   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
     {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
-          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
+          && CUDA_CALL_NOCHECK (cuStreamQuery,
+                                s->stream) == CUDA_ERROR_NOT_READY)
         {
           pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
           return 0;
@@ -1400,7 +1508,7 @@ nvptx_wait_all (void)
     {
       if (s->multithreaded || pthread_equal (s->host_thread, self))
         {
-          r = cuStreamQuery (s->stream);
+          r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
@@ -1632,13 +1740,15 @@ static void
 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
 {
   CUdeviceptr dptr;
-  CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
+  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
+                                  module, "__nvptx_clocktick");
   if (r == CUDA_ERROR_NOT_FOUND)
     return;
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
-  r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
+  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
+                         sizeof (__nvptx_clocktick));
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
 }
@@ -1761,7 +1871,7 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
     if (image->target_data == target_data)
       {
         *prev_p = image->next;
-        if (cuModuleUnload (image->module) != CUDA_SUCCESS)
+        if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
           ret = false;
         free (image->fns);
         free (image);
@@ -1974,7 +2084,7 @@ static void *
 nvptx_stacks_alloc (size_t size, int num)
 {
   CUdeviceptr stacks;
-  CUresult r = cuMemAlloc (&stacks, size * num);
+  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
   return (void *) stacks;
@@ -1985,7 +2095,7 @@ nvptx_stacks_alloc (size_t size, int num)
 static void
 nvptx_stacks_free (void *p, int num)
 {
-  CUresult r = cuMemFree ((CUdeviceptr) p);
+  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
 }
@@ -2028,14 +2138,13 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
                          CU_LAUNCH_PARAM_END
                        };
-  r = cuLaunchKernel (function,
-                      teams, 1, 1,
-                      32, threads, 1,
-                      0, ptx_dev->null_stream->stream, NULL, config);
+  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
+                         32, threads, 1, 0, ptx_dev->null_stream->stream,
+                         NULL, config);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 
-  r = cuCtxSynchronize ();
+  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                        maybe_abort_msg);
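For orientation when reading the hunks above: a call wrapped as
CUDA_CALL_NOCHECK (hand-expanded here for illustration, not compiler output)
becomes either an indirect call through the dlsym-resolved pointer or the
plain direct call, depending on how libgomp was configured.

/* Source in the plugin after this patch: */
r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);

/* With PLUGIN_NVPTX_DYNAMIC, CUDA_CALL_PREFIX is "cuda_lib.", so the
   preprocessor produces an indirect call through the loaded pointer: */
r = cuda_lib.cuCtxGetDevice (&dev);

/* Without PLUGIN_NVPTX_DYNAMIC the prefix is empty and the call links
   directly against -lcuda, exactly as before the patch: */
r = cuCtxGetDevice (&dev);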