Diffstat (limited to 'libgomp/plugin/plugin-nvptx.c')
 libgomp/plugin/plugin-nvptx.c | 112 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 102 insertions(+), 10 deletions(-)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a5cf859..a6c8198 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1799,8 +1799,6 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
 
-  nvptx_stacks_free (ptx_dev, false);
-
   while (blocks)
     {
       tmp = blocks->next;
@@ -1828,6 +1826,48 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
           && nvptx_free (ptr, ptx_devices[ord]));
 }
 
+bool
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
+                     __FUNCTION__, ptr, (unsigned long long) size);
+
+  if (size == 0)
+    {
+      /* Special case to ensure omp_alloc specification compliance.  */
+      *ptr = NULL;
+      GOMP_PLUGIN_debug (0, "  -> *ptr=null\n");
+      return true;
+    }
+
+  CUresult r;
+
+  unsigned int flags = 0;
+  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+     'flags |= CU_MEMHOSTALLOC_PORTABLE;' here.  */
+  r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+  if (r == CUDA_ERROR_OUT_OF_MEMORY)
+    *ptr = NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+      return false;
+    }
+  GOMP_PLUGIN_debug (0, "  -> *ptr=%p\n",
+                     *ptr);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
+                     __FUNCTION__, ptr);
+
+  CUDA_CALL (cuMemFreeHost, ptr);
+  return true;
+}
+
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
                            size_t mapnum __attribute__((unused)),
@@ -1939,9 +1979,10 @@ nvptx_goacc_asyncqueue_construct (unsigned int flags)
 }
 
 struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+GOMP_OFFLOAD_openacc_async_construct (int device)
 {
-  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+  nvptx_attach_host_thread_to_device (device);
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
 }
 
 static bool
@@ -2815,17 +2856,68 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
         else if (r != CUDA_ERROR_NOT_READY)
           GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
 
-        if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
+        struct rev_offload *rev_metadata = ptx_dev->rev_data;
+
+        /* Claim a portion of the ring buffer to process on this iteration.
+           Don't mark them as consumed until all the data has been read out.  */
+        unsigned int consumed = __atomic_load_n (&rev_metadata->consumed,
+                                                 __ATOMIC_ACQUIRE);
+        unsigned int from = __atomic_load_n (&rev_metadata->claimed,
+                                             __ATOMIC_RELAXED);
+        unsigned int to = __atomic_load_n (&rev_metadata->next_slot,
+                                           __ATOMIC_RELAXED);
+
+        if (consumed > to)
+          {
+            /* Overflow happens when we exceed UINTMAX requests.  */
+            GOMP_PLUGIN_fatal ("NVPTX reverse offload buffer overflowed.\n");
+          }
+
+        to = MIN(to, consumed + REV_OFFLOAD_QUEUE_SIZE / 2);
+        if (to <= from)
+          /* Nothing to do; poll again.  */
+          goto poll_again;
+
+        if (!__atomic_compare_exchange_n (&rev_metadata->claimed, &from, to,
+                                          false,
+                                          __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+          /* Collision with another thread ... go around again.  */
+          goto poll_again;
+
+        unsigned int index;
+        for (index = from; index < to; index++)
           {
-            struct rev_offload *rev_data = ptx_dev->rev_data;
+            int slot = index % REV_OFFLOAD_QUEUE_SIZE;
+
+            /* Wait while the target finishes filling in the slot.  */
+            while (__atomic_load_n (&ptx_dev->rev_data->queue[slot].signal,
+                                    __ATOMIC_ACQUIRE) == 0)
+              ;  /* spin  */
+
+            /* Pass the request to libgomp; this will queue the request and
+               return right away, without waiting for the kernel to run.  */
+            struct rev_req *rev_data = &ptx_dev->rev_data->queue[slot];
             GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
                                     rev_data->addrs, rev_data->sizes,
                                     rev_data->kinds, rev_data->dev_num,
-                                    reverse_offload_aq);
-            if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
-              exit (EXIT_FAILURE);
-            __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+                                    rev_data->signal, true);
+
+            /* Ensure that the slot doesn't trigger early, when reused.  */
+            __atomic_store_n (&rev_data->signal, 0, __ATOMIC_RELEASE);
           }
+
+        /* The data is now consumed so release the slots for reuse.  */
+        unsigned int consumed_so_far = from;
+        while (!__atomic_compare_exchange_n (&rev_metadata->consumed,
+                                             &consumed_so_far, to, false,
+                                             __ATOMIC_RELEASE, __ATOMIC_RELAXED))
+          {
+            /* Another thread didn't consume all it claimed yet....  */
+            consumed_so_far = from;
+            usleep (1);
+          }
+
+poll_again:
         usleep (1);
       }
     else
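
Note on the new page-locked host allocation hooks: GOMP_OFFLOAD_page_locked_host_alloc and GOMP_OFFLOAD_page_locked_host_free are thin wrappers around the CUDA driver API's pinned-memory calls. The following standalone sketch is not part of the patch; the check() helper, device 0, and the 1 MiB size are illustrative assumptions. It only demonstrates the cuMemHostAlloc/cuMemFreeHost semantics the plugin relies on (page-locked host memory, released with cuMemFreeHost), with errors reduced to aborting rather than reporting through GOMP_PLUGIN_error.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/* Abort on any CUDA driver API failure, printing the error string.  */
static void
check (CUresult r, const char *what)
{
  if (r != CUDA_SUCCESS)
    {
      const char *msg = NULL;
      cuGetErrorString (r, &msg);
      fprintf (stderr, "%s failed: %s\n", what, msg ? msg : "unknown error");
      exit (EXIT_FAILURE);
    }
}

int
main (void)
{
  CUdevice dev;
  CUcontext ctx;
  check (cuInit (0), "cuInit");
  check (cuDeviceGet (&dev, 0), "cuDeviceGet");
  check (cuCtxCreate (&ctx, 0, dev), "cuCtxCreate");

  void *buf;
  /* Same call the plugin makes; flags stay 0 because, with unified
     addressing, CU_MEMHOSTALLOC_PORTABLE is not needed.  */
  check (cuMemHostAlloc (&buf, 1 << 20, 0), "cuMemHostAlloc");
  printf ("pinned host buffer at %p\n", buf);
  check (cuMemFreeHost (buf), "cuMemFreeHost");

  check (cuCtxDestroy (ctx), "cuCtxDestroy");
  return 0;
}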
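
Note on the asyncqueue change: GOMP_OFFLOAD_openacc_async_construct now binds the calling host thread to the requested device before building the queue, and passes CU_STREAM_NON_BLOCKING instead of CU_STREAM_DEFAULT, so asyncqueue streams no longer implicitly synchronize with the legacy NULL stream used by the synchronous paths. The body of nvptx_goacc_asyncqueue_construct is not shown in this diff; the sketch below, with a hypothetical stand-in queue type, only illustrates what creating a stream with that flag looks like.

#include <cuda.h>
#include <stdlib.h>

/* Hypothetical stand-in for the plugin's real goacc_asyncqueue type.  */
struct goacc_asyncqueue_sketch
{
  CUstream cuda_stream;
};

/* Create a queue backed by a CUDA stream; 'flags' is CU_STREAM_DEFAULT or
   CU_STREAM_NON_BLOCKING, as in the patched caller.  */
static struct goacc_asyncqueue_sketch *
asyncqueue_construct_sketch (unsigned int flags)
{
  struct goacc_asyncqueue_sketch *aq = malloc (sizeof *aq);
  if (!aq)
    return NULL;
  if (cuStreamCreate (&aq->cuda_stream, flags) != CUDA_SUCCESS)
    {
      free (aq);
      return NULL;
    }
  return aq;
}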
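
Note on the reverse-offload ring buffer: the rewritten polling loop in GOMP_OFFLOAD_run is the consumer side of a multi-producer queue. Device threads append requests, a host thread claims a batch of slots with a compare-and-swap on 'claimed' (capped at half the ring per iteration), drains each slot once its 'signal' becomes nonzero, and only then advances 'consumed' so the slots can be reused. The real 'struct rev_offload' / 'struct rev_req' definitions and the device-side producer are not in this file; the sketch below is a hypothetical reconstruction that mirrors only the fields this hunk dereferences (next_slot, claimed, consumed, queue[].signal, fn, mapnum, addrs, sizes, kinds, dev_num), with assumed types and an assumed queue size, and shows how a producer would publish a slot.

#include <stdint.h>

#define REV_OFFLOAD_QUEUE_SIZE 1024  /* Assumed power-of-two capacity.  */

struct rev_req
{
  uint64_t signal;  /* Nonzero once the producer has filled the slot.  */
  uint64_t fn;
  uint64_t mapnum;
  uint64_t addrs, sizes, kinds;
  int dev_num;
};

struct rev_offload
{
  unsigned int next_slot;  /* Next index a producer will take.  */
  unsigned int claimed;    /* Below this, a host thread is already working.  */
  unsigned int consumed;   /* Below this, slots may be reused.  */
  struct rev_req queue[REV_OFFLOAD_QUEUE_SIZE];
};

/* Producer side: reserve an index, wait until its slot has been released
   by the consumer, fill it, then raise 'signal' so the host polling loop
   in GOMP_OFFLOAD_run picks it up.  */
static void
post_request (struct rev_offload *q, uint64_t fn, uint64_t mapnum,
              uint64_t addrs, uint64_t sizes, uint64_t kinds, int dev_num)
{
  unsigned int index
    = __atomic_fetch_add (&q->next_slot, 1, __ATOMIC_ACQ_REL);
  unsigned int slot = index % REV_OFFLOAD_QUEUE_SIZE;

  /* Don't lap the consumer: wait until this slot has been consumed.
     Unsigned subtraction handles counter wrap-around.  */
  while (index - __atomic_load_n (&q->consumed, __ATOMIC_ACQUIRE)
         >= REV_OFFLOAD_QUEUE_SIZE)
    ;  /* spin  */

  q->queue[slot].fn = fn;
  q->queue[slot].mapnum = mapnum;
  q->queue[slot].addrs = addrs;
  q->queue[slot].sizes = sizes;
  q->queue[slot].kinds = kinds;
  q->queue[slot].dev_num = dev_num;

  /* Publish the slot; the host spins on 'signal' before reading it and
     clears it again once the request has been handed to libgomp.  */
  __atomic_store_n (&q->queue[slot].signal, index + 1, __ATOMIC_RELEASE);
}

int
main (void)
{
  static struct rev_offload q;  /* Zero-initialized ring.  */
  post_request (&q, 0x1234, 2, 0, 0, 0, 0);
  return q.queue[0].signal != 0 ? 0 : 1;
}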