Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 155
1 file changed, 145 insertions, 10 deletions
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a5cf859..712c8b7 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1799,8 +1799,6 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
 
-  nvptx_stacks_free (ptx_dev, false);
-
   while (blocks)
     {
       tmp = blocks->next;
@@ -1828,6 +1826,48 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
 	  && nvptx_free (ptr, ptx_devices[ord]));
 }
 
+bool
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
+		     __FUNCTION__, ptr, (unsigned long long) size);
+
+  if (size == 0)
+    {
+      /* Special case to ensure omp_alloc specification compliance.  */
+      *ptr = NULL;
+      GOMP_PLUGIN_debug (0, " -> *ptr=null\n");
+      return true;
+    }
+
+  CUresult r;
+
+  unsigned int flags = 0;
+  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+     'flags |= CU_MEMHOSTALLOC_PORTABLE;' here.  */
+  r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+  if (r == CUDA_ERROR_OUT_OF_MEMORY)
+    *ptr = NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+      return false;
+    }
+  GOMP_PLUGIN_debug (0, " -> *ptr=%p\n",
+		     *ptr);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
+		     __FUNCTION__, ptr);
+
+  CUDA_CALL (cuMemFreeHost, ptr);
+  return true;
+}
+
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
 			   size_t mapnum __attribute__((unused)),
@@ -1939,9 +1979,10 @@ nvptx_goacc_asyncqueue_construct (unsigned int flags)
 }
 
 struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+GOMP_OFFLOAD_openacc_async_construct (int device)
 {
-  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+  nvptx_attach_host_thread_to_device (device);
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
 }
 
 static bool
@@ -2019,6 +2060,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
 }
 
 static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+  CUdeviceptr pb1, pb2;
+  size_t ps1, ps2;
+  if (!s)
+    return true;
+  if (!d1 || !d2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+  CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+  if (!pb1 || !pb2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  if ((void *)(d1 + s) > (void *)(pb1 + ps1)
+      || (void *)(d2 + s) > (void *)(pb2 + ps2))
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
+  return true;
+}
+
+static bool
 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
 {
   CUdeviceptr pb;
@@ -2077,6 +2146,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 bool
 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
 {
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
   return true;
 }
@@ -2288,6 +2360,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+				    size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+	     aq->cuda_stream);
+  return true;
+}
+
 union goacc_property_value
 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
 {
@@ -2815,17 +2899,68 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 	  else if (r != CUDA_ERROR_NOT_READY)
 	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
 
-	  if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
+	  struct rev_offload *rev_metadata = ptx_dev->rev_data;
+
+	  /* Claim a portion of the ring buffer to process on this iteration.
+	     Don't mark them as consumed until all the data has been read out.  */
+	  unsigned int consumed = __atomic_load_n (&rev_metadata->consumed,
+						   __ATOMIC_ACQUIRE);
+	  unsigned int from = __atomic_load_n (&rev_metadata->claimed,
+					       __ATOMIC_RELAXED);
+	  unsigned int to = __atomic_load_n (&rev_metadata->next_slot,
+					     __ATOMIC_RELAXED);
+
+	  if (consumed > to)
+	    {
+	      /* Overflow happens when we exceed UINTMAX requests.  */
+	      GOMP_PLUGIN_fatal ("NVPTX reverse offload buffer overflowed.\n");
+	    }
+
+	  to = MIN(to, consumed + REV_OFFLOAD_QUEUE_SIZE / 2);
+	  if (to <= from)
+	    /* Nothing to do; poll again.  */
+	    goto poll_again;
+
+	  if (!__atomic_compare_exchange_n (&rev_metadata->claimed, &from, to,
+					    false,
+					    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+	    /* Collision with another thread ... go around again.  */
+	    goto poll_again;
+
+	  unsigned int index;
+	  for (index = from; index < to; index++)
 	    {
-	      struct rev_offload *rev_data = ptx_dev->rev_data;
+	      int slot = index % REV_OFFLOAD_QUEUE_SIZE;
+
+	      /* Wait while the target finishes filling in the slot.  */
+	      while (__atomic_load_n (&ptx_dev->rev_data->queue[slot].signal,
+				      __ATOMIC_ACQUIRE) == 0)
+		;  /* spin  */
+
+	      /* Pass the request to libgomp; this will queue the request and
+		 return right away, without waiting for the kernel to run.  */
+	      struct rev_req *rev_data = &ptx_dev->rev_data->queue[slot];
 	      GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
 				      rev_data->addrs, rev_data->sizes,
 				      rev_data->kinds, rev_data->dev_num,
-				      reverse_offload_aq);
-	      if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
-		exit (EXIT_FAILURE);
-	      __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+				      rev_data->signal, true);
+
+	      /* Ensure that the slot doesn't trigger early, when reused.  */
+	      __atomic_store_n (&rev_data->signal, 0, __ATOMIC_RELEASE);
 	    }
+
+	  /* The data is now consumed so release the slots for reuse.  */
+	  unsigned int consumed_so_far = from;
+	  while (!__atomic_compare_exchange_n (&rev_metadata->consumed,
+					       &consumed_so_far, to, false,
+					       __ATOMIC_RELEASE, __ATOMIC_RELAXED))
+	    {
+	      /* Another thread didn't consume all it claimed yet....  */
+	      consumed_so_far = from;
+	      usleep (1);
+	    }
+
+poll_again:
 	  usleep (1);
 	}
       else
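The rewritten polling loop in GOMP_OFFLOAD_run replaces the single rev_data->fn handshake with a claim/consume protocol over a ring buffer shared with the device. The standalone sketch below is one reading of that protocol, not code from the patch: the field names (next_slot, claimed, consumed, signal) and REV_OFFLOAD_QUEUE_SIZE follow the diff, but the struct layout, queue size, and the handle_request stand-in (in place of GOMP_PLUGIN_target_rev) are illustrative assumptions.

/* Consumer side of the reverse-offload request ring, modelled on the
   GOMP_OFFLOAD_run hunk above.  Struct layout, queue size and
   handle_request are illustrative stand-ins, not the patch's types.  */

#include <unistd.h>

#define REV_OFFLOAD_QUEUE_SIZE 1024

struct rev_req
{
  unsigned int signal;	/* Set non-zero by the device once the slot is filled.  */
  /* ... request payload: fn, mapnum, addrs, sizes, kinds, dev_num ...  */
};

struct rev_offload
{
  unsigned int next_slot;	/* Advanced by the device (producer).  */
  unsigned int claimed;		/* Slots some host thread has taken.  */
  unsigned int consumed;	/* Slots fully read out and reusable.  */
  struct rev_req queue[REV_OFFLOAD_QUEUE_SIZE];
};

static void
handle_request (struct rev_req *req)
{
  (void) req;	/* Stand-in for GOMP_PLUGIN_target_rev.  */
}

/* Drain whatever the device has published; safe to call from several
   host threads concurrently.  Returns the number of requests handled.  */
unsigned int
consume_requests (struct rev_offload *rb)
{
  unsigned int consumed = __atomic_load_n (&rb->consumed, __ATOMIC_ACQUIRE);
  unsigned int from = __atomic_load_n (&rb->claimed, __ATOMIC_RELAXED);
  unsigned int to = __atomic_load_n (&rb->next_slot, __ATOMIC_RELAXED);

  /* Bound the claim to half the ring, as in the patch.  */
  if (to > consumed + REV_OFFLOAD_QUEUE_SIZE / 2)
    to = consumed + REV_OFFLOAD_QUEUE_SIZE / 2;
  if (to <= from)
    return 0;			/* Nothing published yet; poll again later.  */

  /* Claim [from, to) with one CAS; on collision with another consumer,
     back off and let the caller poll again.  */
  if (!__atomic_compare_exchange_n (&rb->claimed, &from, to, false,
				    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
    return 0;

  for (unsigned int i = from; i < to; i++)
    {
      struct rev_req *req = &rb->queue[i % REV_OFFLOAD_QUEUE_SIZE];
      /* next_slot may run ahead of the payload, so wait for the per-slot
	 signal before reading the request.  */
      while (__atomic_load_n (&req->signal, __ATOMIC_ACQUIRE) == 0)
	;			/* spin */
      handle_request (req);
      /* Clear the signal so the slot cannot trigger early when reused.  */
      __atomic_store_n (&req->signal, 0, __ATOMIC_RELEASE);
    }

  /* Advance 'consumed' only once every earlier claim has been released,
     so slots are never recycled while another thread still reads them.  */
  unsigned int expected = from;
  while (!__atomic_compare_exchange_n (&rb->consumed, &expected, to, false,
				       __ATOMIC_RELEASE, __ATOMIC_RELAXED))
    {
      expected = from;
      usleep (1);
    }
  return to - from;
}

Keeping 'claimed' separate from 'consumed' is what lets several host threads each take a disjoint slot range with a single compare-and-swap, while slots are only recycled once every earlier claim has been fully drained.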
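Earlier in the patch, the new GOMP_OFFLOAD_page_locked_host_alloc and GOMP_OFFLOAD_page_locked_host_free entry points are thin wrappers around the CUDA Driver API. The sketch below shows the same calls outside the plugin; it is not part of the patch, and the device ordinal, buffer size, and error handling are placeholder choices. A CUDA context must be current before cuMemHostAlloc, which the plugin obtains via nvptx_attach_host_thread_to_device rather than the explicit cuCtxCreate used here.

/* Minimal pinned-host-memory round trip with the CUDA Driver API,
   mirroring what GOMP_OFFLOAD_page_locked_host_alloc/_free do.
   Device ordinal, size and error handling are illustrative only.  */

#include <stdio.h>
#include <string.h>
#include <cuda.h>

int
main (void)
{
  CUdevice dev;
  CUcontext ctx;
  void *buf = NULL;
  size_t size = 1 << 20;	/* 1 MiB, arbitrary.  */

  if (cuInit (0) != CUDA_SUCCESS
      || cuDeviceGet (&dev, 0) != CUDA_SUCCESS
      || cuCtxCreate (&ctx, 0, dev) != CUDA_SUCCESS)
    {
      fprintf (stderr, "CUDA initialization failed\n");
      return 1;
    }

  /* Flags stay 0, as in the patch: with unified addressing there is no
     need for CU_MEMHOSTALLOC_PORTABLE.  */
  if (cuMemHostAlloc (&buf, size, 0) != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuMemHostAlloc failed\n");
      return 1;
    }

  memset (buf, 0, size);	/* Usable as ordinary host memory.  */
  printf ("pinned buffer at %p\n", buf);

  cuMemFreeHost (buf);		/* Counterpart of the new _free hook.  */
  cuCtxDestroy (ctx);
  return 0;
}

Something like 'gcc pinned.c -lcuda' should build this on a system with the CUDA driver installed.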