Diffstat (limited to 'libgomp/target.c')
-rw-r--r--  libgomp/target.c | 987
 1 file changed, 863 insertions, 124 deletions
diff --git a/libgomp/target.c b/libgomp/target.c
index 9674ff4..01434f8 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -990,15 +990,155 @@ gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i)
}
}
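+/* Return a human-readable name for map kind KIND, with any implicit-map
+   flag stripped, for use in debug output.  */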
+static const char *
+kind_to_name (unsigned short kind)
+{
+ if (GOMP_MAP_IMPLICIT_P (kind))
+ kind &= ~GOMP_MAP_IMPLICIT;
+
+ switch (kind & 0xff)
+ {
+ case GOMP_MAP_ALLOC: return "GOMP_MAP_ALLOC";
+ case GOMP_MAP_FIRSTPRIVATE: return "GOMP_MAP_FIRSTPRIVATE";
+ case GOMP_MAP_FIRSTPRIVATE_INT: return "GOMP_MAP_FIRSTPRIVATE_INT";
+ case GOMP_MAP_TO: return "GOMP_MAP_TO";
+ case GOMP_MAP_TO_PSET: return "GOMP_MAP_TO_PSET";
+ case GOMP_MAP_FROM: return "GOMP_MAP_FROM";
+ case GOMP_MAP_TOFROM: return "GOMP_MAP_TOFROM";
+ case GOMP_MAP_POINTER: return "GOMP_MAP_POINTER";
+ case GOMP_MAP_ATTACH: return "GOMP_MAP_ATTACH";
+ case GOMP_MAP_DETACH: return "GOMP_MAP_DETACH";
+ case GOMP_MAP_STRUCT: return "GOMP_MAP_STRUCT";
+ case GOMP_MAP_STRUCT_UNORD: return "GOMP_MAP_STRUCT_UNORD";
+ default: return "unknown";
+ }
+}
+
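+/* Append map IDX of HOSTADDRS/SIZES/SKINDS to NEW_HOSTADDRS/NEW_SIZES/
+   NEW_KINDS at *NEW_IDX, advancing *NEW_IDX.  A size of SIZE_MAX marks an
+   iterator map: (*HOSTADDRS)[IDX] then points to an array laid out as
+     { count, addr_0, size_0, addr_1, size_1, ... }
+   and the entry is expanded into COUNT separate maps.  ITERATOR_COUNT
+   records for each new entry which iteration generated it (0 for entries
+   not generated from an iterator).  */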
+static void
+gomp_add_map (size_t idx, size_t *new_idx,
+ void ***hostaddrs, size_t **sizes, unsigned short **skinds,
+ void ***new_hostaddrs, size_t **new_sizes,
+ unsigned short **new_kinds, size_t *iterator_count)
+{
+ if ((*sizes)[idx] == SIZE_MAX)
+ {
+ uintptr_t *iterator_array = (*hostaddrs)[idx];
+ size_t count = *iterator_array++;
+ for (size_t i = 0; i < count; i++)
+ {
+ (*new_hostaddrs)[*new_idx] = (void *) *iterator_array++;
+ (*new_sizes)[*new_idx] = *iterator_array++;
+ (*new_kinds)[*new_idx] = (*skinds)[idx];
+ iterator_count[*new_idx] = i + 1;
+ gomp_debug (1,
+ "Expanding map %u <%s>: "
+ "hostaddrs[%u] = %p, sizes[%u] = %lu\n",
+ (int) idx, kind_to_name ((*new_kinds)[*new_idx]),
+ (int) *new_idx, (*new_hostaddrs)[*new_idx],
+ (int) *new_idx, (unsigned long) (*new_sizes)[*new_idx]);
+ (*new_idx)++;
+ }
+ }
+ else
+ {
+ (*new_hostaddrs)[*new_idx] = (*hostaddrs)[idx];
+ (*new_sizes)[*new_idx] = (*sizes)[idx];
+ (*new_kinds)[*new_idx] = (*skinds)[idx];
+ iterator_count[*new_idx] = 0;
+ (*new_idx)++;
+ }
+}
+
+/* Map entries containing expanded iterators are flattened and merged into
+ HOSTADDRS, SIZES and KINDS, and MAPNUM is updated.  Returns true if any
+ iterator maps were found.  ITERATOR_COUNT holds the iteration count of the
+ iterator that generated each map (0 if not generated from an iterator).
+ HOSTADDRS, SIZES, KINDS and ITERATOR_COUNT must be freed afterwards if any
+ merging occurred.  */
+
+static bool
+gomp_merge_iterator_maps (size_t *mapnum, void ***hostaddrs, size_t **sizes,
+ void **kinds, size_t **iterator_count)
+{
+ bool iterator_p = false;
+ size_t map_count = 0;
+ unsigned short **skinds = (unsigned short **) kinds;
+
+ for (size_t i = 0; i < *mapnum; i++)
+ if ((*sizes)[i] == SIZE_MAX)
+ {
+ uintptr_t *iterator_array = (*hostaddrs)[i];
+ map_count += iterator_array[0];
+ iterator_p = true;
+ }
+ else
+ map_count++;
+
+ if (!iterator_p)
+ return false;
+
+ gomp_debug (1,
+ "Expanding iterator maps - number of map entries: %u -> %u\n",
+ (int) *mapnum, (int) map_count);
+ void **new_hostaddrs = (void **) gomp_malloc (map_count * sizeof (void *));
+ size_t *new_sizes = (size_t *) gomp_malloc (map_count * sizeof (size_t));
+ unsigned short *new_kinds
+ = (unsigned short *) gomp_malloc (map_count * sizeof (unsigned short));
+ size_t new_idx = 0;
+ *iterator_count = (size_t *) gomp_malloc (map_count * sizeof (size_t));
+
+ for (size_t i = 0; i < *mapnum; i++)
+ {
+ int map_type = get_kind (true, *skinds, i) & 0xff;
+ if (map_type == GOMP_MAP_STRUCT || map_type == GOMP_MAP_STRUCT_UNORD)
+ {
+ size_t field_count = (*sizes)[i];
+ size_t idx_i = new_idx;
+
+ gomp_add_map (i, &new_idx, hostaddrs, sizes, skinds,
+ &new_hostaddrs, &new_sizes, &new_kinds,
+ *iterator_count);
+
+ for (size_t j = i + 1; j <= i + field_count; j++)
+ {
+ if ((*sizes)[j] == SIZE_MAX)
+ {
+ uintptr_t *iterator_array = (*hostaddrs)[j];
+ size_t count = iterator_array[0];
+ new_sizes[idx_i] += count - 1;
+ }
+ gomp_add_map (j, &new_idx, hostaddrs, sizes, skinds,
+ &new_hostaddrs, &new_sizes, &new_kinds,
+ *iterator_count);
+ }
+ gomp_debug (1, "Map %u: new field count = %lu\n",
+ (int) i, (unsigned long) new_sizes[idx_i]);
+ i += field_count;
+ }
+ else
+ gomp_add_map (i, &new_idx, hostaddrs, sizes, skinds,
+ &new_hostaddrs, &new_sizes, &new_kinds, *iterator_count);
+ }
+
+ *mapnum = map_count;
+ *hostaddrs = new_hostaddrs;
+ *sizes = new_sizes;
+ *kinds = new_kinds;
+
+ return true;
+}
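+
+/* For example, with mapnum == 2,
+     hostaddrs = { A, I }, sizes = { 8, SIZE_MAX },
+ where I points to the iterator array { 2, B, 16, C, 16 }, the merged
+ result is
+     hostaddrs = { A, B, C }, sizes = { 8, 16, 16 }, mapnum == 3,
+ with *ITERATOR_COUNT = { 0, 1, 2 }: entry 0 is not iterator-generated,
+ entries 1 and 2 come from iterations 1 and 2 of I.  */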
+
static inline __attribute__((always_inline)) struct target_mem_desc *
gomp_map_vars_internal (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq, size_t mapnum,
void **hostaddrs, void **devaddrs, size_t *sizes,
- void *kinds, bool short_mapkind,
- htab_t *refcount_set,
+ void *kinds, struct goacc_ncarray_info *nca_info,
+ bool short_mapkind, htab_t *refcount_set,
enum gomp_map_vars_kind pragma_kind)
{
size_t i, tgt_align, tgt_size, not_found_cnt = 0;
+ size_t nca_data_row_num = (nca_info ? nca_info->num_data_rows : 0);
bool has_firstprivate = false;
bool has_always_ptrset = false;
bool openmp_p = (pragma_kind & GOMP_MAP_VARS_OPENACC) == 0;
@@ -1006,9 +1146,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
const int typemask = short_mapkind ? 0xff : 0x7;
struct splay_tree_s *mem_map = &devicep->mem_map;
struct splay_tree_key_s cur_node;
+ bool iterators_p = false;
+ size_t *iterator_count = NULL;
+ if (short_mapkind)
+ iterators_p = gomp_merge_iterator_maps (&mapnum, &hostaddrs, &sizes,
+ &kinds, &iterator_count);
struct target_mem_desc *tgt
- = gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum);
- tgt->list_count = mapnum;
+ = gomp_malloc (sizeof (*tgt)
+ + sizeof (tgt->list[0]) * (mapnum + nca_data_row_num));
+ tgt->list_count = mapnum + nca_data_row_num;
tgt->refcount = (pragma_kind & GOMP_MAP_VARS_ENTER_DATA) ? 0 : 1;
tgt->device_descr = devicep;
tgt->prev = NULL;
@@ -1162,6 +1308,28 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
has_firstprivate = true;
continue;
}
+ else if (GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+ {
+ /* Ignore non-contiguous arrays for now, we process them together
+ later. */
+ tgt->list[i].key = NULL;
+ tgt->list[i].offset = 0;
+ not_found_cnt++;
+
+ /* The map for the non-contiguous array itself is never copied from
+ during unmapping; it's the data rows that count.  Set the copy-from
+ flags to false here.  */
+ tgt->list[i].copy_from = false;
+ tgt->list[i].always_copy_from = false;
+ tgt->list[i].is_attach = false;
+
+ size_t align = (size_t) 1 << (kind >> rshift);
+ if (tgt_align < align)
+ tgt_align = align;
+
+ continue;
+ }
+
cur_node.host_start = (uintptr_t) hostaddrs[i];
if (!GOMP_MAP_POINTER_P (kind & typemask))
cur_node.host_end = cur_node.host_start + sizes[i];
@@ -1297,6 +1465,45 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
}
}
+ /* For non-contiguous arrays, each data row is one target item, kept
+ separate from the normal map clause items; hence we order them after
+ MAPNUM.  */
+ if (nca_info)
+ {
+ struct target_var_desc *next_var_desc = &tgt->list[mapnum];
+ for (i = 0; i < nca_info->num_ncarray; i++)
+ {
+ struct goacc_ncarray *nca = &nca_info->ncarray[i];
+ int kind = get_kind (short_mapkind, kinds, nca->map_index);
+ size_t align = (size_t) 1 << (kind >> rshift);
+ tgt_size = (tgt_size + align - 1) & ~(align - 1);
+ tgt_size += nca->ptrblock_size;
+
+ for (size_t j = 0; j < nca->data_row_num; j++)
+ {
+ struct target_var_desc *row_desc = next_var_desc++;
+ void *row = nca->data_rows[j];
+ cur_node.host_start = (uintptr_t) row;
+ cur_node.host_end = cur_node.host_start + nca->data_row_size;
+ splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+ if (n)
+ {
+ assert (n->refcount != REFCOUNT_LINK);
+ gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
+ kind & typemask, false, false,
+ /* TODO: cbuf? */ NULL,
+ refcount_set);
+ }
+ else
+ {
+ tgt_size = (tgt_size + align - 1) & ~(align - 1);
+ tgt_size += nca->data_row_size;
+ not_found_cnt++;
+ }
+ }
+ }
+ assert (next_var_desc == &tgt->list[mapnum + nca_info->num_data_rows]);
+ }
+
if (devaddrs)
{
if (mapnum != 1)
@@ -1643,6 +1850,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
default:
break;
}
+
+ if (GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+ {
+ tgt->list[i].key = &array->key;
+ tgt->list[i].key->tgt = tgt;
+ array++;
+ continue;
+ }
+
splay_tree_key k = &array->key;
k->host_start = (uintptr_t) hostaddrs[i];
if (!GOMP_MAP_POINTER_P (kind & typemask))
@@ -1879,18 +2095,120 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
array++;
}
}
+
+ /* Processing of non-contiguous array rows. */
+ if (nca_info)
+ {
+ struct target_var_desc *next_var_desc = &tgt->list[mapnum];
+ for (i = 0; i < nca_info->num_ncarray; i++)
+ {
+ struct goacc_ncarray *nca = &nca_info->ncarray[i];
+ int kind = get_kind (short_mapkind, kinds, nca->map_index);
+ size_t align = (size_t) 1 << (kind >> rshift);
+ tgt_size = (tgt_size + align - 1) & ~(align - 1);
+
+ assert (nca->ptr == hostaddrs[nca->map_index]);
+
+ /* For the map of the non-contiguous array itself, adjust so that
+ the passed device address points to the beginning of the
+ ptrblock. Remember to adjust the first-dimension's bias here. */
+ tgt->list[nca->map_index].key->tgt_offset
+ = tgt_size - nca->descr->dims[0].base;
+
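+ /* Device address at which this array's pointer block will be placed.  */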
+ void *target_ptrblock = (void *) tgt->tgt_start + tgt_size;
+ tgt_size += nca->ptrblock_size;
+
+ /* Add splay key for each data row in current non-contiguous
+ array. */
+ for (size_t j = 0; j < nca->data_row_num; j++)
+ {
+ struct target_var_desc *row_desc = next_var_desc++;
+ void *row = nca->data_rows[j];
+ cur_node.host_start = (uintptr_t) row;
+ cur_node.host_end = cur_node.host_start + nca->data_row_size;
+ splay_tree_key k = splay_tree_lookup (mem_map, &cur_node);
+ if (k)
+ {
+ assert (k->refcount != REFCOUNT_LINK);
+ gomp_map_vars_existing (devicep, aq, k, &cur_node, row_desc,
+ kind & typemask, false, false,
+ cbufp, refcount_set);
+ }
+ else
+ {
+ tgt->refcount++;
+ tgt_size = (tgt_size + align - 1) & ~(align - 1);
+
+ k = &array->key;
+ k->host_start = (uintptr_t) row;
+ k->host_end = k->host_start + nca->data_row_size;
+
+ k->tgt = tgt;
+ k->refcount = 1;
+ k->dynamic_refcount = 0;
+ k->aux = NULL;
+ k->tgt_offset = tgt_size;
+
+ tgt_size += nca->data_row_size;
+
+ row_desc->key = k;
+ row_desc->copy_from
+ = GOMP_MAP_COPY_FROM_P (kind & typemask);
+ row_desc->always_copy_from
+ = GOMP_MAP_COPY_FROM_P (kind & typemask);
+ row_desc->is_attach = false;
+ row_desc->offset = 0;
+ row_desc->length = nca->data_row_size;
+
+ array->left = NULL;
+ array->right = NULL;
+ splay_tree_insert (mem_map, array);
+
+ if (GOMP_MAP_COPY_TO_P (kind & typemask))
+ gomp_copy_host2dev (devicep, aq,
+ (void *) tgt->tgt_start + k->tgt_offset,
+ (void *) k->host_start,
+ nca->data_row_size, false,
+ cbufp);
+ array++;
+ }
+ nca->tgt_data_rows[j]
+ = (void *) (k->tgt->tgt_start + k->tgt_offset);
+ }
+
+ /* Now that the target memory is allocated and the target offsets of
+ all row blocks are assigned and calculated, we can construct the
+ accelerator-side ptrblock and copy it in.  */
+ if (nca->ptrblock_size)
+ {
+ void *ptrblock = gomp_malloc (nca->ptrblock_size);
+ goacc_noncontig_array_create_ptrblock
+ (nca, ptrblock, target_ptrblock);
+ gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
+ nca->ptrblock_size, false, cbufp);
+ if (aq)
+ /* Free once the transfer has completed. */
+ devicep->openacc.async.queue_callback_func (aq, free, ptrblock);
+ else
+ free (ptrblock);
+ }
+ }
+ }
}
if (pragma_kind & GOMP_MAP_VARS_TARGET)
{
+ size_t map_num = 0;
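+ /* Only one pointer slot is emitted per original map entry: entries
+    generated by iterations 2 and up of an iterator (ITERATOR_COUNT > 1)
+    are skipped.  */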
for (i = 0; i < mapnum; i++)
- {
- cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
- gomp_copy_host2dev (devicep, aq,
- (void *) (tgt->tgt_start + i * sizeof (void *)),
- (void *) &cur_node.tgt_offset, sizeof (void *),
- true, cbufp);
- }
+ if (!iterator_count || iterator_count[i] <= 1)
+ {
+ cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
+ gomp_copy_host2dev (devicep, aq,
+ (void *) (tgt->tgt_start + map_num * sizeof (void *)),
+ (void *) &cur_node.tgt_offset, sizeof (void *),
+ true, cbufp);
+ map_num++;
+ }
}
if (cbufp)
@@ -1922,6 +2240,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
}
gomp_mutex_unlock (&devicep->lock);
+
+ if (iterators_p)
+ {
+ free (hostaddrs);
+ free (sizes);
+ free (kinds);
+ free (iterator_count);
+ }
+
return tgt;
}
@@ -1942,8 +2269,8 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
struct target_mem_desc *tgt;
tgt = gomp_map_vars_internal (devicep, NULL, mapnum, hostaddrs, devaddrs,
- sizes, kinds, short_mapkind, refcount_set,
- pragma_kind);
+ sizes, kinds, NULL, short_mapkind,
+ refcount_set, pragma_kind);
if (local_refcount_set)
htab_free (local_refcount_set);
@@ -1954,11 +2281,12 @@ attribute_hidden struct target_mem_desc *
goacc_map_vars (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq, size_t mapnum,
void **hostaddrs, void **devaddrs, size_t *sizes,
- void *kinds, bool short_mapkind,
+ void *kinds, struct goacc_ncarray_info *nca_info,
+ bool short_mapkind,
enum gomp_map_vars_kind pragma_kind)
{
return gomp_map_vars_internal (devicep, aq, mapnum, hostaddrs, devaddrs,
- sizes, kinds, short_mapkind, NULL,
+ sizes, kinds, nca_info, short_mapkind, NULL,
GOMP_MAP_VARS_OPENACC | pragma_kind);
}
@@ -2112,6 +2440,9 @@ gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
false, NULL);
}
+ size_t nrmvars = 0;
+ splay_tree_key remove_vars[tgt->list_count];
+
for (i = 0; i < tgt->list_count; i++)
{
splay_tree_key k = tgt->list[i].key;
@@ -2133,17 +2464,22 @@ gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
(void *) (k->tgt->tgt_start + k->tgt_offset
+ tgt->list[i].offset),
tgt->list[i].length);
+ /* Queue all removals together for processing below.
+ See also 'gomp_exit_data'. */
if (do_remove)
- {
- struct target_mem_desc *k_tgt __attribute__((unused)) = k->tgt;
- bool is_tgt_unmapped __attribute__((unused))
- = gomp_remove_var (devicep, k);
- /* It would be bad if TGT got unmapped while we're still iterating
- over its LIST_COUNT, and also expect to use it in the following
- code. */
- assert (!is_tgt_unmapped
- || k_tgt != tgt);
- }
+ remove_vars[nrmvars++] = k;
+ }
+
+ for (i = 0; i < nrmvars; i++)
+ {
+ splay_tree_key k = remove_vars[i];
+ struct target_mem_desc *k_tgt __attribute__((unused)) = k->tgt;
+ bool is_tgt_unmapped __attribute__((unused))
+ = gomp_remove_var (devicep, k);
+ /* It would be bad if TGT got unmapped while we're still iterating over
+ its LIST_COUNT, and also expect to use it in the following code. */
+ assert (!is_tgt_unmapped
+ || k_tgt != tgt);
}
if (aq)
@@ -2181,6 +2517,14 @@ goacc_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom,
gomp_unmap_vars_internal (tgt, do_copyfrom, NULL, aq);
}
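+/* Forward declaration; the worker is used by gomp_update below for
+   GOMP_MAP_{TO,FROM}_GRID strided updates.  */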
+static int
+omp_target_memcpy_rect_worker (void *, const void *, size_t, size_t, int,
+ const size_t *, const size_t *, const size_t *,
+ const size_t *, const size_t *, const size_t *,
+ struct gomp_device_descr *,
+ struct gomp_device_descr *, size_t *tmp_size,
+ void **tmp);
+
static void
gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
size_t *sizes, void *kinds, bool short_mapkind)
@@ -2188,6 +2532,8 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
size_t i;
struct splay_tree_key_s cur_node;
const int typemask = short_mapkind ? 0xff : 0x7;
+ bool iterators_p = false;
+ size_t *iterator_count = NULL;
if (!devicep)
return;
@@ -2195,6 +2541,10 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
if (mapnum == 0)
return;
+ if (short_mapkind)
+ iterators_p = gomp_merge_iterator_maps (&mapnum, &hostaddrs, &sizes,
+ &kinds, &iterator_count);
+
gomp_mutex_lock (&devicep->lock);
if (devicep->state == GOMP_DEVICE_FINALIZED)
{
@@ -2203,91 +2553,143 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
}
for (i = 0; i < mapnum; i++)
- if (sizes[i])
- {
- cur_node.host_start = (uintptr_t) hostaddrs[i];
- cur_node.host_end = cur_node.host_start + sizes[i];
- splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &cur_node);
- if (n)
- {
- int kind = get_kind (short_mapkind, kinds, i);
- if (n->host_start > cur_node.host_start
- || n->host_end < cur_node.host_end)
- {
- gomp_mutex_unlock (&devicep->lock);
- gomp_fatal ("Trying to update [%p..%p) object when "
- "only [%p..%p) is mapped",
- (void *) cur_node.host_start,
- (void *) cur_node.host_end,
- (void *) n->host_start,
- (void *) n->host_end);
- }
+ {
+ int kind = get_kind (short_mapkind, kinds, i);
+ if ((kind & typemask) == GOMP_MAP_TO_GRID
+ || (kind & typemask) == GOMP_MAP_FROM_GRID)
+ {
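+ /* A GOMP_MAP_{TO,FROM}_GRID map occupies two consecutive entries:
+    HOSTADDRS[i + 1] holds the non-contiguous array descriptor and
+    SIZES[i + 1] the bias of the first dimension, hence the extra
+    increment of I below.  */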
+ omp_noncontig_array_desc *desc
+ = (omp_noncontig_array_desc *) hostaddrs[i + 1];
+ size_t bias = sizes[i + 1];
+ cur_node.host_start = (uintptr_t) hostaddrs[i] + bias;
+ cur_node.host_end = cur_node.host_start + sizes[i];
+ splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &cur_node);
+ if (n)
+ {
+ if (n->aux && n->aux->attach_count)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_error ("noncontiguous update with attached pointers");
+ return;
+ }
+ void *devaddr = (void *) (n->tgt->tgt_start + n->tgt_offset
+ + cur_node.host_start
+ - n->host_start
+ - bias);
+ size_t tmp_size = 0;
+ void *tmp = NULL;
+ if ((kind & typemask) == GOMP_MAP_TO_GRID)
+ omp_target_memcpy_rect_worker (devaddr, hostaddrs[i],
+ desc->elemsize, desc->span,
+ desc->ndims, desc->length,
+ desc->stride, desc->index,
+ desc->index, desc->dim,
+ desc->dim, devicep,
+ NULL, &tmp_size, &tmp);
+ else
+ omp_target_memcpy_rect_worker (hostaddrs[i], devaddr,
+ desc->elemsize, desc->span,
+ desc->ndims, desc->length,
+ desc->stride, desc->index,
+ desc->index, desc->dim,
+ desc->dim, NULL,
+ devicep, &tmp_size, &tmp);
+ }
+ i++;
+ }
+ else if (sizes[i])
+ {
+ cur_node.host_start = (uintptr_t) hostaddrs[i];
+ cur_node.host_end = cur_node.host_start + sizes[i];
+ splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &cur_node);
+ if (n)
+ {
+ if (n->host_start > cur_node.host_start
+ || n->host_end < cur_node.host_end)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("Trying to update [%p..%p) object when "
+ "only [%p..%p) is mapped",
+ (void *) cur_node.host_start,
+ (void *) cur_node.host_end,
+ (void *) n->host_start,
+ (void *) n->host_end);
+ }
- if (n->aux && n->aux->attach_count)
- {
- uintptr_t addr = cur_node.host_start;
- while (addr < cur_node.host_end)
- {
- /* We have to be careful not to overwrite still attached
- pointers during host<->device updates. */
- size_t i = (addr - cur_node.host_start) / sizeof (void *);
- if (n->aux->attach_count[i] == 0)
- {
- void *devaddr = (void *) (n->tgt->tgt_start
- + n->tgt_offset
- + addr - n->host_start);
- if (GOMP_MAP_COPY_TO_P (kind & typemask))
- gomp_copy_host2dev (devicep, NULL,
- devaddr, (void *) addr,
- sizeof (void *), false, NULL);
- if (GOMP_MAP_COPY_FROM_P (kind & typemask))
- gomp_copy_dev2host (devicep, NULL,
- (void *) addr, devaddr,
- sizeof (void *));
- }
- addr += sizeof (void *);
- }
- }
- else
- {
- void *hostaddr = (void *) cur_node.host_start;
- void *devaddr = (void *) (n->tgt->tgt_start + n->tgt_offset
- + cur_node.host_start
- - n->host_start);
- size_t size = cur_node.host_end - cur_node.host_start;
-
- if (GOMP_MAP_COPY_TO_P (kind & typemask))
- gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
- false, NULL);
- if (GOMP_MAP_COPY_FROM_P (kind & typemask))
- gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
- }
- }
- else
- {
- int kind = get_kind (short_mapkind, kinds, i);
+ if (n->aux && n->aux->attach_count)
+ {
+ uintptr_t addr = cur_node.host_start;
+ while (addr < cur_node.host_end)
+ {
+ /* We have to be careful not to overwrite still attached
+ pointers during host<->device updates. */
+ size_t i = (addr - cur_node.host_start) / sizeof (void *);
+ if (n->aux->attach_count[i] == 0)
+ {
+ void *devaddr = (void *) (n->tgt->tgt_start
+ + n->tgt_offset
+ + addr - n->host_start);
+ if (GOMP_MAP_COPY_TO_P (kind & typemask))
+ gomp_copy_host2dev (devicep, NULL,
+ devaddr, (void *) addr,
+ sizeof (void *), false, NULL);
+ if (GOMP_MAP_COPY_FROM_P (kind & typemask))
+ gomp_copy_dev2host (devicep, NULL,
+ (void *) addr, devaddr,
+ sizeof (void *));
+ }
+ addr += sizeof (void *);
+ }
+ }
+ else
+ {
+ void *hostaddr = (void *) cur_node.host_start;
+ void *devaddr = (void *) (n->tgt->tgt_start + n->tgt_offset
+ + cur_node.host_start
+ - n->host_start);
+ size_t size = cur_node.host_end - cur_node.host_start;
+
+ if (GOMP_MAP_COPY_TO_P (kind & typemask))
+ gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
+ false, NULL);
+ if (GOMP_MAP_COPY_FROM_P (kind & typemask))
+ gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
+ }
+ }
+ else
+ {
+ int kind = get_kind (short_mapkind, kinds, i);
- if (GOMP_MAP_PRESENT_P (kind))
- {
- /* We already looked up the memory region above and it
- was missing. */
- gomp_mutex_unlock (&devicep->lock);
+ if (GOMP_MAP_PRESENT_P (kind))
+ {
+ /* We already looked up the memory region above and it
+ was missing. */
+ gomp_mutex_unlock (&devicep->lock);
#ifdef HAVE_INTTYPES_H
- gomp_fatal ("present clause: not present on the device "
- "(addr: %p, size: %"PRIu64" (0x%"PRIx64"), "
- "dev: %d)", (void *) hostaddrs[i],
- (uint64_t) sizes[i], (uint64_t) sizes[i],
- devicep->target_id);
+ gomp_fatal ("present clause: not present on the device "
+ "(addr: %p, size: %"PRIu64" (0x%"PRIx64"), "
+ "dev: %d)", (void *) hostaddrs[i],
+ (uint64_t) sizes[i], (uint64_t) sizes[i],
+ devicep->target_id);
#else
- gomp_fatal ("present clause: not present on the device "
- "(addr: %p, size: %lu (0x%lx), dev: %d)",
- (void *) hostaddrs[i], (unsigned long) sizes[i],
- (unsigned long) sizes[i], devicep->target_id);
+ gomp_fatal ("present clause: not present on the device "
+ "(addr: %p, size: %lu (0x%lx), dev: %d)",
+ (void *) hostaddrs[i], (unsigned long) sizes[i],
+ (unsigned long) sizes[i], devicep->target_id);
#endif
- }
- }
- }
+ }
+ }
+ }
+ }
gomp_mutex_unlock (&devicep->lock);
+
+ if (iterators_p)
+ {
+ free (hostaddrs);
+ free (sizes);
+ free (kinds);
+ free (iterator_count);
+ }
}
static struct gomp_offload_icv_list *
@@ -3481,16 +3883,18 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
tgt_start, tgt_end);
}
-/* Handle reverse offload. This is called by the device plugins for a
- reverse offload; it is not called if the outer target runs on the host.
+/* Handle reverse offload. This is called by the host worker thread to
+ execute a single reverse offload request; it is not called if the outer
+ target runs on the host.
The mapping is simplified: device-affecting constructs (except for target
with device(ancestor:1)) must not be encountered; in particular not
target (enter/exit) data. */
-void
-gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
- uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
- struct goacc_asyncqueue *aq)
+static void
+gomp_target_rev_internal (uint64_t fn_ptr, uint64_t mapnum,
+ uint64_t devaddrs_ptr, uint64_t sizes_ptr,
+ uint64_t kinds_ptr, struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq)
{
/* Return early if there is no offload code. */
if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3507,7 +3911,6 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
unsigned short *kinds;
const bool short_mapkind = true;
const int typemask = short_mapkind ? 0xff : 0x7;
- struct gomp_device_descr *devicep = resolve_device (dev_num, false);
reverse_splay_tree_key n;
struct reverse_splay_tree_key_s k;
@@ -3918,6 +4321,134 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
}
}
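+/* Pending reverse-offload requests form a singly-linked FIFO guarded by
+   TARGET_REV_QUEUE_LOCK; TARGET_REV_THREAD_COUNT tracks the number of
+   worker threads currently servicing the queue.  */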
+static struct target_rev_queue_s
+{
+ uint64_t fn_ptr;
+ uint64_t mapnum;
+ uint64_t devaddrs_ptr;
+ uint64_t sizes_ptr;
+ uint64_t kinds_ptr;
+ struct gomp_device_descr *devicep;
+
+ volatile int *signal;
+ bool use_aq;
+
+ struct target_rev_queue_s *next;
+} *target_rev_queue_head = NULL, *target_rev_queue_last = NULL;
+static gomp_mutex_t target_rev_queue_lock = 0;
+static int target_rev_thread_count = 0;
+
+static void *
+gomp_target_rev_worker_thread (void *data __attribute__ ((unused)))
+{
+ struct target_rev_queue_s *rev_kernel = NULL;
+ struct goacc_asyncqueue *aq = NULL;
+ struct gomp_device_descr *aq_devicep;
+
+ while (1)
+ {
+ gomp_mutex_lock (&target_rev_queue_lock);
+
+ /* Take a reverse-offload kernel request from the queue. */
+ rev_kernel = target_rev_queue_head;
+ if (rev_kernel)
+ {
+ target_rev_queue_head = rev_kernel->next;
+ if (target_rev_queue_head == NULL)
+ target_rev_queue_last = NULL;
+ }
+
+ if (rev_kernel == NULL)
+ {
+ target_rev_thread_count--;
+ gomp_mutex_unlock (&target_rev_queue_lock);
+ break;
+ }
+ gomp_mutex_unlock (&target_rev_queue_lock);
+
+ /* Ensure we have a suitable device queue for the memory transfers. */
+ if (rev_kernel->use_aq)
+ {
+ if (aq && aq_devicep != rev_kernel->devicep)
+ {
+ aq_devicep->openacc.async.destruct_func (aq);
+ aq = NULL;
+ }
+
+ if (!aq)
+ {
+ aq_devicep = rev_kernel->devicep;
+ aq = aq_devicep->openacc.async.construct_func (aq_devicep->target_id);
+ }
+ }
+
+ /* Run the kernel on the host. */
+ gomp_target_rev_internal (rev_kernel->fn_ptr, rev_kernel->mapnum,
+ rev_kernel->devaddrs_ptr, rev_kernel->sizes_ptr,
+ rev_kernel->kinds_ptr, rev_kernel->devicep, aq);
+
+ /* Signal the device that the reverse-offload is completed. */
+ int one = 1;
+ gomp_copy_host2dev (rev_kernel->devicep, aq, (void *) rev_kernel->signal,
+ &one, sizeof (one), false, NULL);
+
+ /* We're done with this request. */
+ free (rev_kernel);
+
+ /* Loop around and see if another request is waiting. */
+ }
+
+ if (aq)
+ aq_devicep->openacc.async.destruct_func (aq);
+
+ return NULL;
+}
+
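+/* Enqueue a reverse-offload request from the device and make sure a worker
+   thread will service it.  Called by the device plugins; SIGNAL points to
+   a device-side flag that is set to 1 once host execution of the region
+   has completed.  */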
+void
+gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
+ uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
+ volatile int *signal, bool use_aq)
+{
+ struct gomp_device_descr *devicep = resolve_device (dev_num, false);
+
+ /* Create a new queue node. */
+ struct target_rev_queue_s *newreq = gomp_malloc (sizeof (*newreq));
+ newreq->fn_ptr = fn_ptr;
+ newreq->mapnum = mapnum;
+ newreq->devaddrs_ptr = devaddrs_ptr;
+ newreq->sizes_ptr = sizes_ptr;
+ newreq->kinds_ptr = kinds_ptr;
+ newreq->devicep = devicep;
+ newreq->signal = signal;
+ newreq->use_aq = use_aq;
+ newreq->next = NULL;
+
+ gomp_mutex_lock (&target_rev_queue_lock);
+
+ /* Enqueue the reverse-offload request. */
+ if (target_rev_queue_last)
+ {
+ target_rev_queue_last->next = newreq;
+ target_rev_queue_last = newreq;
+ }
+ else
+ target_rev_queue_last = target_rev_queue_head = newreq;
+
+ /* Launch a new thread to process the request asynchronously.
+ If the thread pool limit has been reached then an existing thread will
+ pick up the job when it is ready. */
+ if (target_rev_thread_count < gomp_reverse_offload_threads)
+ {
+ target_rev_thread_count++;
+ gomp_mutex_unlock (&target_rev_queue_lock);
+
+ pthread_t t;
+ pthread_create (&t, NULL, gomp_target_rev_worker_thread, NULL);
+ pthread_detach (t);
+ }
+ else
+ gomp_mutex_unlock (&target_rev_queue_lock);
+}
+
/* Host fallback for GOMP_target_data{,_ext} routines. */
static void
@@ -4114,7 +4645,7 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
false, NULL);
}
- int nrmvars = 0;
+ size_t nrmvars = 0;
splay_tree_key remove_vars[mapnum];
for (i = 0; i < mapnum; i++)
@@ -4177,10 +4708,6 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
errors if we still have following element siblings to copy back.
While we're at it, it also seems more disciplined to simply
queue all removals together for processing below.
-
- Structured block unmapping (i.e. gomp_unmap_vars_internal) should
- not have this problem, since they maintain an additional
- tgt->refcount = 1 reference to the target_mem_desc to start with.
*/
if (do_remove)
remove_vars[nrmvars++] = k;
@@ -4195,7 +4722,7 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
}
}
- for (int i = 0; i < nrmvars; i++)
+ for (i = 0; i < nrmvars; i++)
gomp_remove_var (devicep, remove_vars[i]);
gomp_mutex_unlock (&devicep->lock);
@@ -4497,6 +5024,140 @@ omp_target_free (void *device_ptr, int device_num)
gomp_mutex_unlock (&devicep->lock);
}
+/* Device (really: libgomp plugin) to use for page-locked memory.  We
+ assume there is either none or exactly one such device for the lifetime
+ of the process.  */
+
+static struct gomp_device_descr *device_for_page_locked
+ = /* uninitialized */ (void *) -1;
+
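+/* Lazily determine which device, if any, provides page-locked host memory,
+   caching the result in DEVICE_FOR_PAGE_LOCKED.  */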
+static struct gomp_device_descr *
+get_device_for_page_locked (void)
+{
+ gomp_debug (0, "%s\n",
+ __FUNCTION__);
+
+ struct gomp_device_descr *device;
+#ifdef HAVE_SYNC_BUILTINS
+ device
+ = __atomic_load_n (&device_for_page_locked, MEMMODEL_RELAXED);
+ if (device == (void *) -1)
+ {
+ gomp_debug (0, " init\n");
+
+ gomp_init_targets_once ();
+
+ device = NULL;
+ for (int i = 0; i < num_devices; ++i)
+ {
+ gomp_debug (0, " i=%d, target_id=%d\n",
+ i, devices[i].target_id);
+
+ /* We consider only the first device of potentially several of the
+ same type as this functionality is not specific to an individual
+ offloading device, but instead relates to the host-side
+ implementation of the respective offloading implementation. */
+ if (devices[i].target_id != 0)
+ continue;
+
+ if (!devices[i].page_locked_host_alloc_func)
+ continue;
+
+ gomp_debug (0, " found device: %p (%s)\n",
+ &devices[i], devices[i].name);
+ if (device)
+ gomp_fatal ("Unclear how %s and %s libgomp plugins may"
+ " simultaneously provide functionality"
+ " for page-locked memory",
+ device->name, devices[i].name);
+ else
+ device = &devices[i];
+ }
+
+ struct gomp_device_descr *device_old
+ = __atomic_exchange_n (&device_for_page_locked, device,
+ MEMMODEL_RELAXED);
+ gomp_debug (0, " old device_for_page_locked: %p\n",
+ device_old);
+ assert (device_old == (void *) -1
+ /* We shouldn't have concurrently found a different or no
+ device. */
+ || device_old == device);
+ }
+#else /* !HAVE_SYNC_BUILTINS */
+ gomp_debug (0, " not implemented for '!HAVE_SYNC_BUILTINS'\n");
+ (void) &device_for_page_locked;
+ device = NULL;
+#endif /* HAVE_SYNC_BUILTINS */
+
+ gomp_debug (0, " -> device=%p (%s)\n",
+ device, device ? device->name : "[none]");
+ return device;
+}
+
+/* Allocate page-locked host memory.
+ Returns whether a device capable of doing so is available.  */
+
+attribute_hidden bool
+gomp_page_locked_host_alloc (void **ptr, size_t size)
+{
+ gomp_debug (0, "%s: ptr=%p, size=%llu\n",
+ __FUNCTION__, ptr, (unsigned long long) size);
+
+ struct gomp_device_descr *device = get_device_for_page_locked ();
+ gomp_debug (0, " device=%p (%s)\n",
+ device, device ? device->name : "[none]");
+ if (device)
+ {
+ gomp_mutex_lock (&device->lock);
+ if (device->state == GOMP_DEVICE_UNINITIALIZED)
+ gomp_init_device (device);
+ else if (device->state == GOMP_DEVICE_FINALIZED)
+ {
+ gomp_mutex_unlock (&device->lock);
+ gomp_fatal ("Device %s used for for page-locked memory is finalized",
+ device->name);
+ }
+ gomp_mutex_unlock (&device->lock);
+
+ if (!device->page_locked_host_alloc_func (ptr, size))
+ gomp_fatal ("Failed to allocate page-locked host memory"
+ " via %s libgomp plugin",
+ device->name);
+ }
+ return device != NULL;
+}
+
+/* Free page-locked host memory.
+ This must only be called if 'gomp_page_locked_host_alloc' returned
+ 'true'. */
+
+attribute_hidden void
+gomp_page_locked_host_free (void *ptr)
+{
+ gomp_debug (0, "%s: ptr=%p\n",
+ __FUNCTION__, ptr);
+
+ struct gomp_device_descr *device = get_device_for_page_locked ();
+ gomp_debug (0, " device=%p (%s)\n",
+ device, device ? device->name : "[none]");
+ assert (device);
+
+ gomp_mutex_lock (&device->lock);
+ assert (device->state != GOMP_DEVICE_UNINITIALIZED);
+ if (device->state == GOMP_DEVICE_FINALIZED)
+ {
+ gomp_mutex_unlock (&device->lock);
+ return;
+ }
+ gomp_mutex_unlock (&device->lock);
+
+ if (!device->page_locked_host_free_func (ptr))
+ gomp_fatal ("Failed to free page-locked host memory"
+ " via %s libgomp plugin",
+ device->name);
+}
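+
+/* A minimal usage sketch of the two functions above; the fallback when no
+   capable device exists is the caller's choice, not mandated by this API:
+
+     void *buf;
+     if (gomp_page_locked_host_alloc (&buf, size))
+       {
+         ... use pinned BUF for host<->device transfers ...
+         gomp_page_locked_host_free (buf);
+       }
+     else
+       buf = gomp_malloc (size);  // No capable device: ordinary allocation.
+*/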
+
int
omp_target_is_present (const void *ptr, int device_num)
{
@@ -4683,7 +5344,8 @@ omp_target_memcpy_async (void *dst, const void *src, size_t length,
static int
omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
- int num_dims, const size_t *volume,
+ size_t span, int num_dims, const size_t *volume,
+ const size_t *strides,
const size_t *dst_offsets,
const size_t *src_offsets,
const size_t *dst_dimensions,
@@ -4697,7 +5359,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
size_t j, dst_off, src_off, length;
int i, ret;
- if (num_dims == 1)
+ if (num_dims == 1 && (!strides || (strides[0] == 1 && element_size == span)))
{
if (__builtin_mul_overflow (element_size, volume[0], &length)
|| __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
@@ -4751,9 +5413,74 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
}
return ret ? 0 : EINVAL;
}
+ else if (num_dims == 1 && strides)
+ {
+ size_t stride;
+
+ assert ((src_devicep == NULL || dst_devicep == NULL)
+ && (src_devicep != NULL || dst_devicep != NULL));
+
+ if (__builtin_mul_overflow (span, dst_offsets[0], &dst_off)
+ || __builtin_mul_overflow (span, src_offsets[0], &src_off))
+ return EINVAL;
+
+ if (__builtin_mul_overflow (span, strides[0], &stride))
+ return EINVAL;
+
+ if (((src_devicep && src_devicep->memcpy2d_func)
+ || (dst_devicep && dst_devicep->memcpy2d_func))
+ && (stride % element_size) == 0)
+ {
+ /* Try using memcpy2d for a 1-dimensional strided access. Here we
+ treat the transfer as a 2-dimensional array, where the inner
+ dimension is calculated to be (stride in bytes) / element_size.
+ Indices/offsets are adjusted so the source/destination pointers
+ point to the first element to be transferred, to make the sums
+ easier. (There are some configurations of 2D strided accesses
+ that memcpy3d could handle similarly, but those are probably rare
+ and are unimplemented for now.) */
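+ /* For example, ELEMENT_SIZE 8 with a 32-byte STRIDE becomes a 2-D copy
+    of VOLUME[0] rows, each 8 bytes wide with a 32-byte pitch.  */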
+
+ /* If stride is element size, this is a contiguous transfer and
+ should have been handled above. */
+ assert (stride > element_size);
+
+ int dst_id = dst_devicep ? dst_devicep->target_id : -1;
+ int src_id = src_devicep ? src_devicep->target_id : -1;
+ void *subarray_src = (char *) src + src_off;
+ void *subarray_dst = (char *) dst + dst_off;
+
+ struct gomp_device_descr *devp = dst_devicep ? dst_devicep
+ : src_devicep;
+ ret = devp->memcpy2d_func (dst_id, src_id, element_size, volume[0],
+ subarray_dst, 0, 0, stride, subarray_src,
+ 0, 0, stride);
+ if (ret != -1)
+ return ret ? 0 : EINVAL;
+ }
+
+ for (i = 0, ret = 1; i < volume[0] && ret; i++)
+ {
+ if (src_devicep == NULL)
+ ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+ (char *) dst + dst_off,
+ (const char *) src + src_off,
+ element_size);
+ else if (dst_devicep == NULL)
+ ret = src_devicep->dev2host_func (src_devicep->target_id,
+ (char *) dst + dst_off,
+ (const char *) src + src_off,
+ element_size);
+ dst_off += stride;
+ src_off += stride;
+ }
+ return ret ? 0 : EINVAL;
+ }
/* host->device, device->host and intra device. */
if (num_dims == 2
+ && (!strides || (strides[0] == 1
+ && strides[1] == 1
+ && element_size == span))
&& ((src_devicep
&& src_devicep == dst_devicep
&& src_devicep->memcpy2d_func)
@@ -4780,6 +5507,10 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
return ret ? 0 : EINVAL;
}
else if (num_dims == 3
+ && (!strides || (strides[0] == 1
+ && strides[1] == 1
+ && strides[2] == 1
+ && element_size == span))
&& ((src_devicep
&& src_devicep == dst_devicep
&& src_devicep->memcpy3d_func)
@@ -4815,13 +5546,19 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
if (__builtin_mul_overflow (dst_slice, dst_offsets[0], &dst_off)
|| __builtin_mul_overflow (src_slice, src_offsets[0], &src_off))
return EINVAL;
+ if (strides
+ && (__builtin_mul_overflow (dst_slice, strides[0], &dst_slice)
+ || __builtin_mul_overflow (src_slice, strides[0], &src_slice)))
+ return EINVAL;
for (j = 0; j < volume[0]; j++)
{
ret = omp_target_memcpy_rect_worker ((char *) dst + dst_off,
(const char *) src + src_off,
- element_size, num_dims - 1,
- volume + 1, dst_offsets + 1,
- src_offsets + 1, dst_dimensions + 1,
+ element_size, span, num_dims - 1,
+ volume + 1,
+ strides ? strides + 1 : NULL,
+ dst_offsets + 1, src_offsets + 1,
+ dst_dimensions + 1,
src_dimensions + 1, dst_devicep,
src_devicep, tmp_size, tmp);
if (ret)
@@ -4870,8 +5607,8 @@ omp_target_memcpy_rect_copy (void *dst, const void *src,
gomp_mutex_lock (&src_devicep->lock);
if (lock_dst)
gomp_mutex_lock (&dst_devicep->lock);
- int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
- volume, dst_offsets, src_offsets,
+ int ret = omp_target_memcpy_rect_worker (dst, src, element_size,
+                                          element_size, num_dims, volume,
+                                          NULL, dst_offsets, src_offsets,
dst_dimensions, src_dimensions,
dst_devicep, src_devicep,
&tmp_size, &tmp);
@@ -5536,6 +6273,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM (unload_image);
DLSYM (alloc);
DLSYM (free);
+ DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
+ DLSYM_OPT (page_locked_host_free, page_locked_host_free);
DLSYM (dev2host);
DLSYM (host2dev);
DLSYM_OPT (memcpy2d, memcpy2d);