aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libgomp/libgomp.h77
-rw-r--r--libgomp/oacc-host.c1
-rw-r--r--libgomp/plugin/plugin-nvptx.c4
-rw-r--r--libgomp/splay-tree.c22
-rw-r--r--libgomp/splay-tree.h8
-rw-r--r--libgomp/target.c599
-rw-r--r--libgomp/testsuite/libgomp.fortran/reverse-offload-2.f9072
-rw-r--r--libgomp/testsuite/libgomp.fortran/reverse-offload-3.f9068
-rw-r--r--libgomp/testsuite/libgomp.fortran/reverse-offload-4.f90129
-rw-r--r--libgomp/testsuite/libgomp.fortran/reverse-offload-5.f90100
-rw-r--r--libgomp/testsuite/libgomp.fortran/reverse-offload-5a.f9098
11 files changed, 1139 insertions, 39 deletions
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 5803683..8fc9379 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1157,29 +1157,7 @@ struct target_var_desc {
uintptr_t length;
};
-struct target_mem_desc {
- /* Reference count. */
- uintptr_t refcount;
- /* All the splay nodes allocated together. */
- splay_tree_node array;
- /* Start of the target region. */
- uintptr_t tgt_start;
- /* End of the targer region. */
- uintptr_t tgt_end;
- /* Handle to free. */
- void *to_free;
- /* Previous target_mem_desc. */
- struct target_mem_desc *prev;
- /* Number of items in following list. */
- size_t list_count;
-
- /* Corresponding target device descriptor. */
- struct gomp_device_descr *device_descr;
-
- /* List of target items to remove (or decrease refcount)
- at the end of region. */
- struct target_var_desc list[];
-};
+struct target_mem_desc;
/* Special value for refcount - mask to indicate existence of special
values. Right now we allocate 3 bits. */
@@ -1273,6 +1251,58 @@ splay_compare (splay_tree_key x, splay_tree_key y)
#include "splay-tree.h"
+/* Reverse offload splay-tree handling (functions only). */
+
+struct reverse_splay_tree_key_s {
+ /* Address of the device object. */
+ uint64_t dev;
+ splay_tree_key k;
+};
+
+typedef struct reverse_splay_tree_node_s *reverse_splay_tree_node;
+typedef struct reverse_splay_tree_s *reverse_splay_tree;
+typedef struct reverse_splay_tree_key_s *reverse_splay_tree_key;
+
+static inline int
+reverse_splay_compare (reverse_splay_tree_key x, reverse_splay_tree_key y)
+{
+ if (x->dev < y->dev)
+ return -1;
+ if (x->dev > y->dev)
+ return 1;
+ return 0;
+}
+
+#define splay_tree_prefix reverse
+#include "splay-tree.h"
+
+struct target_mem_desc {
+ /* Reference count. */
+ uintptr_t refcount;
+ /* All the splay nodes allocated together. */
+ splay_tree_node array;
+ /* Likewise for the reverse lookup device->host for reverse offload. */
+ reverse_splay_tree_node rev_array;
+ /* Start of the target region. */
+ uintptr_t tgt_start;
+ /* End of the target region. */
+ uintptr_t tgt_end;
+ /* Handle to free. */
+ void *to_free;
+ /* Previous target_mem_desc. */
+ struct target_mem_desc *prev;
+ /* Number of items in following list. */
+ size_t list_count;
+
+ /* Corresponding target device descriptor. */
+ struct gomp_device_descr *device_descr;
+
+ /* List of target items to remove (or decrease refcount)
+ at the end of region. */
+ struct target_var_desc list[];
+};
+
+
typedef struct acc_dispatch_t
{
/* Execute. */
@@ -1367,6 +1397,7 @@ struct gomp_device_descr
/* Splay tree containing information about mapped memory regions. */
struct splay_tree_s mem_map;
+ struct reverse_splay_tree_s mem_map_rev;
/* Mutex for the mutable data. */
gomp_mutex_t lock;
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 4e3971a..77a2427 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -284,6 +284,7 @@ static struct gomp_device_descr host_dispatch =
.run_func = host_run,
.mem_map = { NULL },
+ .mem_map_rev = { NULL },
/* .lock initialized in goacc_host_init. */
.state = GOMP_DEVICE_UNINITIALIZED,
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index e803f08..4f4c25a 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1190,7 +1190,9 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
devices were present. Unified-shared address: see comment in
nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
if (num_devices > 0
- && (omp_requires_mask & ~GOMP_REQUIRES_UNIFIED_ADDRESS) != 0)
+ && ((omp_requires_mask
+ & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
+ | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
return -1;
return num_devices;
}
diff --git a/libgomp/splay-tree.c b/libgomp/splay-tree.c
index 2d8742f..3809e90 100644
--- a/libgomp/splay-tree.c
+++ b/libgomp/splay-tree.c
@@ -236,3 +236,25 @@ splay_tree_foreach (splay_tree sp, splay_tree_callback func, void *data)
{
splay_tree_foreach_internal (sp->root, func, data);
}
+
+/* Like above, except when func returns != 0, stop early. */
+
+static int
+splay_tree_foreach_internal_lazy (splay_tree_node node,
+ splay_tree_callback_stop func, void *data)
+{
+ if (!node)
+ return 0;
+ if (func (&node->key, data))
+ return 1;
+ if (splay_tree_foreach_internal_lazy (node->left, func, data))
+ return 1;
+ /* Tail call; the compiler can optimize this recursion into a loop. */
+ return splay_tree_foreach_internal_lazy (node->right, func, data);
+}
+
+attribute_hidden void
+splay_tree_foreach_lazy (splay_tree sp, splay_tree_callback_stop func, void *data)
+{
+ splay_tree_foreach_internal_lazy (sp->root, func, data);
+}
diff --git a/libgomp/splay-tree.h b/libgomp/splay-tree.h
index 992381c..7dea0cc 100644
--- a/libgomp/splay-tree.h
+++ b/libgomp/splay-tree.h
@@ -78,8 +78,12 @@ typedef struct splay_tree_key_s *splay_tree_key;
splay_tree_name (splay_tree_prefix, splay_tree_remove)
# define splay_tree_foreach \
splay_tree_name (splay_tree_prefix, splay_tree_foreach)
+# define splay_tree_foreach_lazy \
+ splay_tree_name (splay_tree_prefix, splay_tree_foreach_lazy)
# define splay_tree_callback \
splay_tree_name (splay_tree_prefix, splay_tree_callback)
+# define splay_tree_callback_stop \
+ splay_tree_name (splay_tree_prefix, splay_tree_callback_stop)
#endif
#ifndef splay_tree_c
@@ -99,11 +103,13 @@ struct splay_tree_s {
};
typedef void (*splay_tree_callback) (splay_tree_key, void *);
+typedef int (*splay_tree_callback_stop) (splay_tree_key, void *);
extern splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key);
extern void splay_tree_insert (splay_tree, splay_tree_node);
extern void splay_tree_remove (splay_tree, splay_tree_key);
extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *);
+extern void splay_tree_foreach_lazy (splay_tree, splay_tree_callback_stop, void *);
#else /* splay_tree_c */
# ifdef splay_tree_prefix
# include "splay-tree.c"
@@ -125,6 +131,8 @@ extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *);
# undef splay_tree_insert
# undef splay_tree_remove
# undef splay_tree_foreach
+# undef splay_tree_foreach_lazy
# undef splay_tree_callback
+# undef splay_tree_callback_stop
# undef splay_tree_prefix
#endif
diff --git a/libgomp/target.c b/libgomp/target.c
index 06809db..e38cc3b 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -45,6 +45,12 @@
#include "plugin-suffix.h"
#endif
+/* Define another splay tree instantiation - for reverse offload. */
+#define splay_tree_prefix reverse
+#define splay_tree_c
+#include "splay-tree.h"
+
+
typedef uintptr_t *hash_entry_type;
static inline void * htab_alloc (size_t size) { return gomp_malloc (size); }
static inline void htab_free (void *ptr) { free (ptr); }
@@ -200,6 +206,12 @@ gomp_map_lookup (splay_tree mem_map, splay_tree_key key)
return splay_tree_lookup (mem_map, key);
}
+static inline reverse_splay_tree_key
+gomp_map_lookup_rev (reverse_splay_tree mem_map_rev, reverse_splay_tree_key key)
+{
+ return reverse_splay_tree_lookup (mem_map_rev, key);
+}
+
static inline splay_tree_key
gomp_map_0len_lookup (splay_tree mem_map, splay_tree_key key)
{
@@ -1674,7 +1686,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
- k->host_start),
sizes[j], cbufp, false);
}
- }
+ }
i = j - 1;
break;
case GOMP_MAP_FORCE_PRESENT:
@@ -2205,11 +2217,16 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
/* Load image to device and get target addresses for the image. */
struct addr_pair *target_table = NULL;
+ uint64_t *rev_target_fn_table = NULL;
int i, num_target_entries;
+ /* With reverse offload, insert also target-host addresses. */
+ bool rev_lookup = omp_requires_mask & GOMP_REQUIRES_REVERSE_OFFLOAD;
+
num_target_entries
= devicep->load_image_func (devicep->target_id, version,
- target_data, &target_table, NULL);
+ target_data, &target_table,
+ rev_lookup ? &rev_target_fn_table : NULL);
if (num_target_entries != num_funcs + num_vars
/* "+1" due to the additional ICV struct. */
@@ -2228,6 +2245,10 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
/* "+1" due to the additional ICV struct. */
tgt->array = gomp_malloc ((num_funcs + num_vars + 1)
* sizeof (*tgt->array));
+ if (rev_target_fn_table)
+ tgt->rev_array = gomp_malloc (num_funcs * sizeof (*tgt->rev_array));
+ else
+ tgt->rev_array = NULL;
tgt->refcount = REFCOUNT_INFINITY;
tgt->tgt_start = 0;
tgt->tgt_end = 0;
@@ -2236,6 +2257,7 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
tgt->list_count = 0;
tgt->device_descr = devicep;
splay_tree_node array = tgt->array;
+ reverse_splay_tree_node rev_array = tgt->rev_array;
for (i = 0; i < num_funcs; i++)
{
@@ -2250,6 +2272,17 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
+ if (rev_target_fn_table)
+ {
+ reverse_splay_tree_key k2 = &rev_array->key;
+ k2->dev = rev_target_fn_table[i];
+ k2->k = k;
+ rev_array->left = NULL;
+ rev_array->right = NULL;
+ if (k2->dev != 0)
+ reverse_splay_tree_insert (&devicep->mem_map_rev, rev_array);
+ rev_array++;
+ }
array++;
}
@@ -2369,6 +2402,15 @@ gomp_unload_image_from_device (struct gomp_device_descr *devicep,
gomp_mutex_unlock (&devicep->lock);
gomp_fatal ("image unload fail");
}
+ if (devicep->mem_map_rev.root)
+ {
+ /* Free reverse offload splay tree + data; 'tgt->rev_array' is the only
+ real allocation. */
+ assert (node && node->tgt && node->tgt->rev_array);
+ assert (devicep->mem_map_rev.root->key.k->tgt == node->tgt);
+ free (node->tgt->rev_array);
+ devicep->mem_map_rev.root = NULL;
+ }
/* Remove mappings from splay tree. */
int i;
@@ -3114,23 +3156,549 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
}
+
+/* Reverse lookup (device addr -> host addr) for reverse offload. We avoid
+ keeping track of all variable handling - assuming that reverse offload occurs
+ only very rarely. Downside is that the reverse search is slow. */
+
+struct gomp_splay_tree_rev_lookup_data {
+ uintptr_t tgt_start;
+ uintptr_t tgt_end;
+ splay_tree_key key;
+};
+
+static int
+gomp_splay_tree_rev_lookup (splay_tree_key key, void *d)
+{
+ struct gomp_splay_tree_rev_lookup_data *data;
+ data = (struct gomp_splay_tree_rev_lookup_data *)d;
+ uintptr_t tgt_start = key->tgt->tgt_start + key->tgt_offset;
+
+ if (tgt_start > data->tgt_start || key->tgt->list_count == 0)
+ return 0;
+
+ size_t j;
+ for (j = 0; j < key->tgt->list_count; j++)
+ if (key->tgt->list[j].key == key)
+ break;
+ assert (j < key->tgt->list_count);
+ uintptr_t tgt_end = tgt_start + key->tgt->list[j].length;
+
+ if ((tgt_start == data->tgt_start && tgt_end == data->tgt_end)
+ || (tgt_end > data->tgt_start && tgt_start < data->tgt_end))
+ {
+ data->key = key;
+ return 1;
+ }
+ return 0;
+}
+
+static inline splay_tree_key
+gomp_map_rev_lookup (splay_tree mem_map, uint64_t tgt_start, uint64_t tgt_end,
+ bool zero_len)
+{
+ struct gomp_splay_tree_rev_lookup_data data;
+ data.key = NULL;
+ data.tgt_start = tgt_start;
+ data.tgt_end = tgt_end;
+
+ if (tgt_start != tgt_end)
+ {
+ splay_tree_foreach_lazy (mem_map, gomp_splay_tree_rev_lookup, &data);
+ return data.key;
+ }
+
+ data.tgt_end++;
+ splay_tree_foreach_lazy (mem_map, gomp_splay_tree_rev_lookup, &data);
+ if (data.key != NULL || zero_len)
+ return data.key;
+ data.tgt_end--;
+
+ data.tgt_start--;
+ splay_tree_foreach_lazy (mem_map, gomp_splay_tree_rev_lookup, &data);
+ return data.key;
+}
+
+struct cpy_data
+{
+ uint64_t devaddr;
+ bool present, aligned;
+};
+
+
+/* Search just mapped reverse-offload data; returns index if found,
+ otherwise >= n. */
+
+static inline int
+gomp_map_cdata_lookup_int (struct cpy_data *d, uint64_t *devaddrs,
+ unsigned short *kinds, uint64_t *sizes, size_t n,
+ uint64_t tgt_start, uint64_t tgt_end)
+{
+ const bool short_mapkind = true;
+ const int typemask = short_mapkind ? 0xff : 0x7;
+ size_t i;
+ for (i = 0; i < n; i++)
+ {
+ bool is_struct = ((get_kind (short_mapkind, kinds, i) & typemask)
+ == GOMP_MAP_STRUCT);
+ uint64_t dev_end;
+ if (!is_struct)
+ dev_end = d[i].devaddr + sizes[i];
+ else
+ {
+ if (i + sizes[i] < n)
+ dev_end = d[i + sizes[i]].devaddr + sizes[i + sizes[i]];
+ else
+ dev_end = devaddrs[i + sizes[i]] + sizes[i + sizes[i]];
+ }
+ if ((d[i].devaddr == tgt_start && dev_end == tgt_end)
+ || (dev_end > tgt_start && d[i].devaddr < tgt_end))
+ break;
+ if (is_struct)
+ i += sizes[i];
+ }
+ return i;
+}
+
+static inline int
+gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
+ unsigned short *kinds, uint64_t *sizes,
+ size_t n, uint64_t tgt_start, uint64_t tgt_end,
+ bool zero_len)
+{
+ size_t i;
+ if (tgt_start != tgt_end)
+ return gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n,
+ tgt_start, tgt_end);
+ tgt_end++;
+ i = gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n,
+ tgt_start, tgt_end);
+ if (i < n || zero_len)
+ return i;
+ tgt_end--;
+
+ tgt_start--;
+ return gomp_map_cdata_lookup_int (d, devaddrs, kinds, sizes, n,
+ tgt_start, tgt_end);
+}
+
/* Handle reverse offload. This is called by the device plugins for a
- reverse offload; it is not called if the outer target runs on the host. */
+ reverse offload; it is not called if the outer target runs on the host.
+ The mapping is simplified: device-affecting constructs (except for
+ target with device(ancestor:1)) must not be encountered; in particular
+ not target (enter/exit) data. */
void
-gomp_target_rev (uint64_t fn_ptr __attribute__ ((unused)),
- uint64_t mapnum __attribute__ ((unused)),
- uint64_t devaddrs_ptr __attribute__ ((unused)),
- uint64_t sizes_ptr __attribute__ ((unused)),
- uint64_t kinds_ptr __attribute__ ((unused)),
- int dev_num __attribute__ ((unused)),
- void (*dev_to_host_cpy) (void *, const void *, size_t,
- void *) __attribute__ ((unused)),
- void (*host_to_dev_cpy) (void *, const void *, size_t,
- void *) __attribute__ ((unused)),
- void *token __attribute__ ((unused)))
+gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
+ uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
+ void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
+ void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
+ void *token)
{
- __builtin_unreachable ();
+ /* Return early if there is no offload code. */
+ if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
+ return;
+ /* Currently, this fails because of calculate_firstprivate_requirements
+ below; it could be fixed but additional code needs to be updated to
+ handle 32bit hosts - thus, it is not worthwhile. */
+ if (sizeof (void *) != sizeof (uint64_t))
+ gomp_fatal ("Reverse offload of 32bit hosts not supported.");
+
+ struct cpy_data *cdata = NULL;
+ uint64_t *devaddrs;
+ uint64_t *sizes;
+ unsigned short *kinds;
+ const bool short_mapkind = true;
+ const int typemask = short_mapkind ? 0xff : 0x7;
+ struct gomp_device_descr *devicep = resolve_device (dev_num, false);
+
+ reverse_splay_tree_key n;
+ struct reverse_splay_tree_key_s k;
+ k.dev = fn_ptr;
+
+ gomp_mutex_lock (&devicep->lock);
+ n = gomp_map_lookup_rev (&devicep->mem_map_rev, &k);
+ gomp_mutex_unlock (&devicep->lock);
+
+ if (n == NULL)
+ gomp_fatal ("Cannot find reverse-offload function");
+ void (*host_fn)() = (void (*)()) n->k->host_start;
+
+ if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ {
+ devaddrs = (uint64_t *) (uintptr_t) devaddrs_ptr;
+ sizes = (uint64_t *) (uintptr_t) sizes_ptr;
+ kinds = (unsigned short *) (uintptr_t) kinds_ptr;
+ }
+ else
+ {
+ devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
+ sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
+ kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
+ if (dev_to_host_cpy)
+ {
+ dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
+ mapnum * sizeof (uint64_t), token);
+ dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
+ mapnum * sizeof (uint64_t), token);
+ dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
+ mapnum * sizeof (unsigned short), token);
+ }
+ else
+ {
+ gomp_copy_dev2host (devicep, NULL, devaddrs,
+ (const void *) (uintptr_t) devaddrs_ptr,
+ mapnum * sizeof (uint64_t));
+ gomp_copy_dev2host (devicep, NULL, sizes,
+ (const void *) (uintptr_t) sizes_ptr,
+ mapnum * sizeof (uint64_t));
+ gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
+ mapnum * sizeof (unsigned short));
+ }
+ }
+
+ size_t tgt_align = 0, tgt_size = 0;
+
+ /* If actually executed on 32bit systems, the casts lead to wrong code;
+ but 32bit with offloading is not supported; see top of this function. */
+ calculate_firstprivate_requirements (mapnum, (void *) (uintptr_t) sizes,
+ (void *) (uintptr_t) kinds,
+ &tgt_align, &tgt_size);
+
+ if (tgt_align)
+ {
+ char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
+ uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+ if (al)
+ tgt += tgt_align - al;
+ tgt_size = 0;
+ for (uint64_t i = 0; i < mapnum; i++)
+ if (get_kind (short_mapkind, kinds, i) == GOMP_MAP_FIRSTPRIVATE
+ && devaddrs[i] != 0)
+ {
+ size_t align = (size_t) 1 << (kinds[i] >> 8);
+ tgt_size = (tgt_size + align - 1) & ~(align - 1);
+ if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
+ (size_t) sizes[i]);
+ else if (dev_to_host_cpy)
+ dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
+ (size_t) sizes[i], token);
+ else
+ gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
+ (void *) (uintptr_t) devaddrs[i],
+ (size_t) sizes[i]);
+ devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
+ tgt_size = tgt_size + sizes[i];
+ if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ && i + 1 < mapnum
+ && ((get_kind (short_mapkind, kinds, i) & typemask)
+ == GOMP_MAP_ATTACH))
+ {
+ *(uint64_t*) (uintptr_t) (devaddrs[i+1] + sizes[i+1])
+ = (uint64_t) devaddrs[i];
+ ++i;
+ }
+ }
+ }
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM))
+ {
+ size_t j, struct_cpy = 0;
+ splay_tree_key n2;
+ cdata = gomp_alloca (sizeof (*cdata) * mapnum);
+ memset (cdata, '\0', sizeof (*cdata) * mapnum);
+ gomp_mutex_lock (&devicep->lock);
+ for (uint64_t i = 0; i < mapnum; i++)
+ {
+ if (devaddrs[i] == 0)
+ continue;
+ n = NULL;
+ int kind = get_kind (short_mapkind, kinds, i) & typemask;
+ switch (kind)
+ {
+ case GOMP_MAP_FIRSTPRIVATE:
+ case GOMP_MAP_FIRSTPRIVATE_INT:
+ continue;
+
+ case GOMP_MAP_DELETE:
+ case GOMP_MAP_RELEASE:
+ case GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION:
+ /* Assume it is present; look it up - but ignore otherwise. */
+ case GOMP_MAP_ALLOC:
+ case GOMP_MAP_FROM:
+ case GOMP_MAP_FORCE_ALLOC:
+ case GOMP_MAP_FORCE_FROM:
+ case GOMP_MAP_ALWAYS_FROM:
+ case GOMP_MAP_TO:
+ case GOMP_MAP_TOFROM:
+ case GOMP_MAP_FORCE_TO:
+ case GOMP_MAP_FORCE_TOFROM:
+ case GOMP_MAP_ALWAYS_TO:
+ case GOMP_MAP_ALWAYS_TOFROM:
+ case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
+ cdata[i].devaddr = devaddrs[i];
+ bool zero_len = (kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION
+ || kind == GOMP_MAP_ZERO_LEN_ARRAY_SECTION);
+ j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i,
+ devaddrs[i],
+ devaddrs[i] + sizes[i], zero_len);
+ if (j < i)
+ {
+ n2 = NULL;
+ cdata[i].present = true;
+ devaddrs[i] = devaddrs[j] + devaddrs[i] - cdata[j].devaddr;
+ }
+ else
+ {
+ n2 = gomp_map_rev_lookup (&devicep->mem_map,
+ devaddrs[i],
+ devaddrs[i] + sizes[i], zero_len);
+ cdata[i].present = n2 != NULL;
+ }
+ if (!cdata[i].present
+ && kind != GOMP_MAP_DELETE
+ && kind != GOMP_MAP_RELEASE
+ && kind != GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION)
+ {
+ cdata[i].aligned = true;
+ size_t align = (size_t) 1 << (kinds[i] >> 8);
+ devaddrs[i]
+ = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
+ sizes[i]);
+ }
+ else if (n2 != NULL)
+ devaddrs[i] = (n2->host_start + cdata[i].devaddr
+ - (n2->tgt->tgt_start + n2->tgt_offset));
+ if (((!cdata[i].present || struct_cpy)
+ && (kind == GOMP_MAP_TO || kind == GOMP_MAP_TOFROM))
+ || kind == GOMP_MAP_FORCE_TO
+ || kind == GOMP_MAP_FORCE_TOFROM
+ || kind == GOMP_MAP_ALWAYS_TO
+ || kind == GOMP_MAP_ALWAYS_TOFROM)
+ {
+ if (dev_to_host_cpy)
+ dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
+ (void *) (uintptr_t) cdata[i].devaddr,
+ sizes[i], token);
+ else
+ gomp_copy_dev2host (devicep, NULL,
+ (void *) (uintptr_t) devaddrs[i],
+ (void *) (uintptr_t) cdata[i].devaddr,
+ sizes[i]);
+ }
+ if (struct_cpy)
+ struct_cpy--;
+ break;
+ case GOMP_MAP_ATTACH:
+ case GOMP_MAP_POINTER:
+ case GOMP_MAP_ALWAYS_POINTER:
+ n2 = gomp_map_rev_lookup (&devicep->mem_map,
+ devaddrs[i] + sizes[i],
+ devaddrs[i] + sizes[i]
+ + sizeof (void*), false);
+ cdata[i].present = n2 != NULL;
+ cdata[i].devaddr = devaddrs[i];
+ if (n2)
+ devaddrs[i] = (n2->host_start + cdata[i].devaddr
+ - (n2->tgt->tgt_start + n2->tgt_offset));
+ else
+ {
+ j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i,
+ devaddrs[i] + sizes[i],
+ devaddrs[i] + sizes[i]
+ + sizeof (void*), false);
+ if (j < i)
+ {
+ cdata[i].present = true;
+ devaddrs[i] = (devaddrs[j] + devaddrs[i]
+ - cdata[j].devaddr);
+ }
+ }
+ if (!cdata[i].present)
+ devaddrs[i] = (uintptr_t) gomp_malloc (sizeof (void*));
+ /* Assume that when present, the pointer is already correct. */
+ if (!n2)
+ *(uint64_t *) (uintptr_t) (devaddrs[i] + sizes[i])
+ = devaddrs[i-1];
+ break;
+ case GOMP_MAP_TO_PSET:
+ /* Assume that when present, the pointers are fine and no 'to:'
+ is required. */
+ n2 = gomp_map_rev_lookup (&devicep->mem_map,
+ devaddrs[i], devaddrs[i] + sizes[i],
+ false);
+ cdata[i].present = n2 != NULL;
+ cdata[i].devaddr = devaddrs[i];
+ if (n2)
+ devaddrs[i] = (n2->host_start + cdata[i].devaddr
+ - (n2->tgt->tgt_start + n2->tgt_offset));
+ else
+ {
+ j = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, i,
+ devaddrs[i],
+ devaddrs[i] + sizes[i], false);
+ if (j < i)
+ {
+ cdata[i].present = true;
+ devaddrs[i] = (devaddrs[j] + devaddrs[i]
+ - cdata[j].devaddr);
+ }
+ }
+ if (!cdata[i].present)
+ {
+ cdata[i].aligned = true;
+ size_t align = (size_t) 1 << (kinds[i] >> 8);
+ devaddrs[i]
+ = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
+ sizes[i]);
+ if (dev_to_host_cpy)
+ dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
+ (void *) (uintptr_t) cdata[i].devaddr,
+ sizes[i], token);
+ else
+ gomp_copy_dev2host (devicep, NULL,
+ (void *) (uintptr_t) devaddrs[i],
+ (void *) (uintptr_t) cdata[i].devaddr,
+ sizes[i]);
+ }
+ for (j = i + 1; j < mapnum; j++)
+ {
+ kind = get_kind (short_mapkind, kinds, j) & typemask;
+ if (!GOMP_MAP_ALWAYS_POINTER_P (kind)
+ && !GOMP_MAP_POINTER_P (kind))
+ break;
+ if (devaddrs[j] < devaddrs[i])
+ break;
+ if (cdata[i].present)
+ continue;
+ if (devaddrs[j] == 0)
+ {
+ *(uint64_t *) (uintptr_t) (devaddrs[i] + sizes[j]) = 0;
+ continue;
+ }
+ int k;
+ n2 = NULL;
+ cdata[i].present = true;
+ cdata[j].devaddr = devaddrs[j];
+ k = gomp_map_cdata_lookup (cdata, devaddrs, kinds, sizes, j,
+ devaddrs[j],
+ devaddrs[j] + sizeof (void*),
+ false);
+ if (k < j)
+ devaddrs[j] = (devaddrs[k] + devaddrs[j]
+ - cdata[k].devaddr);
+ else
+ {
+ n2 = gomp_map_rev_lookup (&devicep->mem_map,
+ devaddrs[j],
+ devaddrs[j] + sizeof (void*),
+ false);
+ if (n2 == NULL)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("Pointer target wasn't mapped");
+ }
+ devaddrs[j] = (n2->host_start + cdata[j].devaddr
+ - (n2->tgt->tgt_start + n2->tgt_offset));
+ }
+ *(void **) (uintptr_t) (devaddrs[i] + sizes[j])
+ = (void *) (uintptr_t) devaddrs[j];
+ }
+ i = j - 1;
+ break;
+ case GOMP_MAP_STRUCT:
+ n2 = gomp_map_rev_lookup (&devicep->mem_map, devaddrs[i+1],
+ devaddrs[i + sizes[i]]
+ + sizes[i + sizes[i]], false);
+ cdata[i].present = n2 != NULL;
+ cdata[i].devaddr = devaddrs[i];
+ struct_cpy = cdata[i].present ? 0 : sizes[i];
+ if (!n2)
+ {
+ size_t sz = (size_t) (devaddrs[i + sizes[i]]
+ - devaddrs[i+1]
+ + sizes[i + sizes[i]]);
+ size_t align = (size_t) 1 << (kinds[i] >> 8);
+ cdata[i].aligned = true;
+ devaddrs[i] = (uintptr_t) gomp_aligned_alloc (align, sz);
+ devaddrs[i] -= devaddrs[i+1] - cdata[i].devaddr;
+ }
+ else
+ devaddrs[i] = (n2->host_start + cdata[i].devaddr
+ - (n2->tgt->tgt_start + n2->tgt_offset));
+ break;
+ default:
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("gomp_target_rev unhandled kind 0x%.4x", kinds[i]);
+ }
+ }
+ gomp_mutex_unlock (&devicep->lock);
+ }
+
+ host_fn (devaddrs);
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM))
+ {
+ uint64_t struct_cpy = 0;
+ bool clean_struct = false;
+ for (uint64_t i = 0; i < mapnum; i++)
+ {
+ if (cdata[i].devaddr == 0)
+ continue;
+ int kind = get_kind (short_mapkind, kinds, i) & typemask;
+ bool copy = !cdata[i].present || struct_cpy;
+ switch (kind)
+ {
+ case GOMP_MAP_FORCE_FROM:
+ case GOMP_MAP_FORCE_TOFROM:
+ case GOMP_MAP_ALWAYS_FROM:
+ case GOMP_MAP_ALWAYS_TOFROM:
+ copy = true;
+ /* FALLTHRU */
+ case GOMP_MAP_FROM:
+ case GOMP_MAP_TOFROM:
+ if (copy && host_to_dev_cpy)
+ host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
+ (void *) (uintptr_t) devaddrs[i],
+ sizes[i], token);
+ else if (copy)
+ gomp_copy_host2dev (devicep, NULL,
+ (void *) (uintptr_t) cdata[i].devaddr,
+ (void *) (uintptr_t) devaddrs[i],
+ sizes[i], false, NULL);
+ default:
+ break;
+ }
+ if (struct_cpy)
+ {
+ struct_cpy--;
+ continue;
+ }
+ if (kind == GOMP_MAP_STRUCT && !cdata[i].present)
+ {
+ clean_struct = true;
+ struct_cpy = sizes[i];
+ }
+ else if (cdata[i].aligned)
+ gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]);
+ else if (!cdata[i].present)
+ free ((void *) (uintptr_t) devaddrs[i]);
+ }
+ if (clean_struct)
+ for (uint64_t i = 0; i < mapnum; i++)
+ if (!cdata[i].present
+ && ((get_kind (short_mapkind, kinds, i) & typemask)
+ == GOMP_MAP_STRUCT))
+ {
+ devaddrs[i] += cdata[i+1].devaddr - cdata[i].devaddr;
+ gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]);
+ }
+
+ free (devaddrs);
+ free (sizes);
+ free (kinds);
+ }
}
/* Host fallback for GOMP_target_data{,_ext} routines. */
@@ -4510,6 +5078,7 @@ gomp_target_init (void)
/* current_device.capabilities has already been set. */
current_device.type = current_device.get_type_func ();
current_device.mem_map.root = NULL;
+ current_device.mem_map_rev.root = NULL;
current_device.state = GOMP_DEVICE_UNINITIALIZED;
for (i = 0; i < new_num_devs; i++)
{
diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-2.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-2.f90
new file mode 100644
index 0000000..067639bc67
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-2.f90
@@ -0,0 +1,72 @@
+! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
+
+implicit none
+!$omp requires reverse_offload
+integer :: A(50), A2(50)
+integer :: i, error
+logical :: shared_mem
+
+error = 0
+shared_mem = .false.
+A = [(i, i=1,50)]
+A2 = [(i, i=1,50)]
+
+!$omp target map(to: shared_mem)
+ shared_mem = .true.
+!$omp end target
+
+!$omp target map(to: A(20:40), A2(20:40)) map(from: error)
+block
+ integer :: B(10), C(10)
+ B = 99
+ C = 88
+ A(20:40) = -A(20:40)
+ A2(20:40) = -A2(20:40)
+
+ !$omp target device (ancestor:1) &
+ !$omp& map(to: A(25:35)) map(always, to: A2(25:35)) &
+ !$omp& map(from:B(4:8)) map(tofrom:C(4:8))
+ if (shared_mem) then
+ if (any (A(25:35) /= [(-i,i=25,35)])) stop 20
+ else
+ if (any (A(25:35) /= [( i,i=25,35)])) stop 21
+ end if
+ if (any (A2(25:35) /= [(-i,i=25,35)])) stop 22
+ if (any (C(4:8) /= 88)) stop 23
+
+ A(25:35) = -A(25:35)*10
+ A2(25:35) = -A2(25:35)*10
+ B(4:8) = [4,5,6,7,8]
+ C(4:8) = [-4,-5,-6,-7,-8]
+ !$omp end target
+
+ if (any (B(1:3) /= 99) .or. any (B(9:10) /= 99)) then
+ error = 30
+ elseif (any (B(4:8) /= [4,5,6,7,8])) then
+ error = 31
+ elseif (any (C(1:3) /= 88) .or. any (C(9:10) /= 88)) then
+ error = 32
+ elseif (any (C(4:8) /= [-4,-5,-6,-7,-8])) then
+ error = 33
+ else
+ error = 0
+ endif
+end block
+
+if (error /= 0) stop error
+
+if (shared_mem) then
+ if (any (A(1:19) /= [( i, i=1,19)])) stop 1
+ if (any (A(20:24) /= [(-i, i=20,24)])) stop 2
+ if (any (A(36:40) /= [(-i, i=36,40)])) stop 3
+ if (any (A(41:50) /= [( i, i=41,50)])) stop 4
+
+ if (any (A(25:35) /= [( 10*i, i=25,35)])) stop 5
+else
+ if (any (A(1:24) /= [( i, i=1,24)])) stop 6
+ if (any (A(36:50) /= [( i, i=36,50)])) stop 7
+
+ if (any (A(25:35) /= [(-10*i, i=25,35)])) stop 8
+end if
+if (any (A2(25:35) /= [( 10*i, i=25,35)])) stop 9
+end
diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-3.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-3.f90
new file mode 100644
index 0000000..2fd2f5b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-3.f90
@@ -0,0 +1,68 @@
+! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
+
+implicit none
+!$omp requires reverse_offload
+integer :: A(50), A2(50), A3(50)
+integer :: i
+logical :: shared_mem
+
+shared_mem = .false.
+A = [(3*i, i=1,50)]
+A2 = [(7*i, i=1,50)]
+A3 = [(11*i, i=1,50)]
+
+!$omp target map(to: shared_mem)
+ shared_mem = .true.
+!$omp end target
+
+!$omp target map(to: A(20:40), A2(20:40), A3(20:40))
+block
+ integer :: C(10)
+ C = 88
+ A(20:40) = -2*A(20:40)
+ A2(20:40) = -9*A2(20:40)
+ A3(20:40) = -13*A3(20:40)
+
+ !$omp target device (ancestor:1) &
+ !$omp& map(from: A(25:35)) map(always, from: A2(25:35)) &
+ !$omp& map(alloc: A3(25:35)) map(alloc:C(4:8))
+ if (shared_mem) then
+ if (any (A(25:35) /= [(-2*3*i, i=25,35)])) stop 1
+ if (any (A2(25:35) /= [(-9*7*i, i=25,35)])) stop 2
+ if (any (A3(25:35) /= [(-13*11*i, i=25,35)])) stop 3
+ else
+ if (any (A(25:35) /= [(3*i, i=25,35)])) stop 4
+ if (any (A2(25:35) /= [(7*i, i=25,35)])) stop 5
+ if (any (A3(25:35) /= [(11*i, i=25,35)])) stop 6
+ end if
+
+ A(25:35) = A(25:35)*5
+ A2(25:35) = A2(25:35)*8
+ A3(25:35) = A3(25:35)*18
+ C(4:8) = [4,5,6,7,8]
+ !$omp end target
+
+ if (shared_mem) then
+ if (any (A(25:35) /= [(-2*3*5*i, i=25,35)])) stop 7
+ if (any (A2(25:35) /= [(-9*7*8*i, i=25,35)])) stop 8
+ if (any (A3(25:35) /= [(-13*11*18*i, i=25,35)])) stop 9
+ if (any (C(4:8) /= [4,5,6,7,8])) stop 10
+ else
+ if (any (A(25:35) /= [(-2*3*i, i=25,35)])) stop 11
+ if (any (A2(25:35) /= [(7*8*i, i=25,35)])) stop 12
+ if (any (A3(25:35) /= [(-13*11*i, i=25,35)])) stop 13
+ if (any (C(4:8) /= 88)) stop 14
+ end if
+end block
+
+if (shared_mem) then
+ if (any (A(25:35) /= [(-2*3*5*i, i=25,35)])) stop
+ if (any (A2(25:35) /= [(-9*7*8*i, i=25,35)])) stop
+ if (any (A3(25:35) /= [(-13*11*18*i, i=25,35)])) stop
+else
+ if (any (A(25:35) /= [(3*5*i, i=25,35)])) stop
+ if (any (A2(25:35) /= [(7*8*i, i=25,35)])) stop
+ if (any (A3(25:35) /= [(11*18*i, i=25,35)])) stop
+end if
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-4.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-4.f90
new file mode 100644
index 0000000..fb27aa7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-4.f90
@@ -0,0 +1,129 @@
+! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
+
+implicit none
+!$omp requires reverse_offload
+
+type t2
+ integer :: a, b, c
+end type t2
+
+type t
+ integer :: A(5), B(5), C(5)
+ integer, pointer :: P(:), P2 !Just some padding
+ type(t2) :: tt !Just some padding
+end type t
+
+type(t) :: S1, S2
+logical :: shared_mem
+
+shared_mem = .false.
+
+!$omp target map(to: shared_mem)
+ shared_mem = .true.
+!$omp end target
+
+s1%A = [1,2,3,4,5]
+s1%B = [10,20,30,40,50]
+s1%C = [11,22,33,44,55]
+s2%A = 2*s1%A
+s2%B = 2*s1%B
+s2%C = 2*s1%C
+
+!$omp target &
+!$omp& map(to: s1%b, s1%c) &
+!$omp& map(to: s2%b, s2%c)
+block
+ type(t) :: si1, si2, si3, si4
+
+ s1%B = -10 * s1%B
+ s1%C = -10 * s1%C
+ s2%B = -15 * s2%B
+ s2%C = -15 * s2%C
+
+ si1%A = -1 * [1,2,3,4,5]
+ si1%B = -1 * [10,20,30,40,50]
+ si1%C = -1 * [11,22,33,44,55]
+ si2%A = -23 * [1,2,3,4,5]
+ si2%B = -23 * [10,20,30,40,50]
+ si2%C = -23 * [11,22,33,44,55]
+
+ !$omp target device (ancestor:1) &
+ !$omp& map(to: si1%C, si1%B) &
+ !$omp& map(tofrom: si2%C, si2%B) &
+ !$omp& map(always, to: s1%B) &
+ !$omp& map( to: s2%B)
+ if (any (s1%A /= [1,2,3,4,5])) stop 1
+ if (any (s1%B /= -10 * [10,20,30,40,50])) stop 2
+ if (shared_mem) then
+ if (any (s1%C /= -10 * [11,22,33,44,55])) stop 4
+ else
+ if (any (s1%C /= [11,22,33,44,55])) stop 3
+ endif
+  if (any (s2%A /= 2 * [1,2,3,4,5])) stop 12
+ if (shared_mem) then
+ if (any (s2%B /= -15 * 2 * [10,20,30,40,50])) stop 5
+ if (any (s2%C /= -15 * 2 * [11,22,33,44,55])) stop 6
+ else
+ if (any (s2%B /= 2 * [10,20,30,40,50])) stop 7
+ if (any (s2%C /= 2 * [11,22,33,44,55])) stop 8
+ endif
+ if (any (si1%B /= -1 * [10,20,30,40,50])) stop 9
+ if (any (si1%C /= -1 * [11,22,33,44,55])) stop 10
+  if (any (si2%B /= -23 * [10,20,30,40,50])) stop 13
+ if (any (si2%C /= -23 * [11,22,33,44,55])) stop 11
+
+ s1%A = 5 * s1%A
+ s1%B = 7 * s1%B
+ s1%C = 13 * s1%C
+ s2%A = 9 * s2%A
+ s2%B = 21 * s2%B
+ s2%C = 31 * s2%C
+ si1%B = -11 * si1%B
+ si1%C = -13 * si1%C
+ si2%B = -27 * si2%B
+ si2%C = -29 * si2%C
+ !$omp end target
+
+ if (shared_mem) then
+ if (any (s1%B /= -10 * 7 * [10,20,30,40,50])) stop 20
+ if (any (s1%C /= -10 * 13 * [11,22,33,44,55])) stop 21
+ else
+ if (any (s1%B /= -10 * [10,20,30,40,50])) stop 22
+ if (any (s1%C /= -10 * [11,22,33,44,55])) stop 23
+ endif
+ if (shared_mem) then
+ if (any (s2%B /= -15 * 2 * 21 * [10,20,30,40,50])) stop 24
+ if (any (s2%C /= -15 * 2 * 31 * [11,22,33,44,55])) stop 25
+ else
+ if (any (s2%B /= -15 * 2 * [10,20,30,40,50])) stop 26
+ if (any (s2%C /= -15 * 2 * [11,22,33,44,55])) stop 27
+ endif
+ if (any (si1%A /= -1 * [1,2,3,4,5])) stop 28
+ if (shared_mem) then
+ if (any (si1%B /= -1 * (-11) * [10,20,30,40,50])) stop 29
+ if (any (si1%C /= -1 * (-13) * [11,22,33,44,55])) stop 30
+ else
+ if (any (si1%B /= -1 * [10,20,30,40,50])) stop 31
+ if (any (si1%C /= -1 * [11,22,33,44,55])) stop 32
+ endif
+ if (any (si2%A /= -23 * [1,2,3,4,5])) stop 33
+ if (any (si2%B /= -23 * (-27) * [10,20,30,40,50])) stop 34
+ if (any (si2%C /= -23 * (-29) * [11,22,33,44,55])) stop 35
+end block
+
+if (any (s1%A /= 5 * [1,2,3,4,5])) stop 40
+if (any (s1%B /= -10 * 7 * [10,20,30,40,50])) stop 41
+if (shared_mem) then
+ if (any (s1%C /= -10 * 13 * [11,22,33,44,55])) stop 42
+else
+ if (any (s1%C /= 13 * [11,22,33,44,55])) stop 43
+endif
+if (any (s2%A /= 2 * 9 * [1,2,3,4,5])) stop 44
+if (shared_mem) then
+ if (any (s2%B /= -15 * 2 * 21 * [10,20,30,40,50])) stop 45
+ if (any (s2%C /= -15 * 2 * 31 * [11,22,33,44,55])) stop 46
+else
+ if (any (s2%B /= 2 * 21 * [10,20,30,40,50])) stop 47
+ if (any (s2%C /= 2 * 31 * [11,22,33,44,55])) stop 48
+endif
+end
diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-5.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-5.f90
new file mode 100644
index 0000000..ef7eb7b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-5.f90
@@ -0,0 +1,100 @@
+! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
+! { dg-xfail-run-if "Copying on-device allocated memory fails with cuMemcpyDtoHAsync error: invalid argument" { offload_device_nvptx } }
+
+! Because of the nvptx fail, a non-device alloc version has been
+! created: reverse-offload-5a.f90
+
+implicit none
+!$omp requires reverse_offload
+
+integer, allocatable :: A(:), A2(:), s1, s2
+integer :: i
+logical :: shared_mem
+
+shared_mem = .false.
+
+a = [1,2,3,4]
+a2 = [8,7,6,5]
+s1 = 532
+s2 = 55
+
+!$omp target map(to: shared_mem)
+ shared_mem = .true.
+!$omp end target
+
+!$omp target map(to: A, A2, s1, s2)
+block
+ integer, allocatable :: ai(:), ai2(:), si1, si2
+
+ a = a * 2
+ a2 = a2 * 3
+ s1 = s1 * 4
+ s2 = s2 * 5
+
+ ai = [23,35,86,43]
+ ai2 = [8,4,7,1]
+ si1 = 64
+ si2 = 765
+
+ !$omp target device (ancestor:1) &
+ !$omp& map(to: A, s1, ai, si1) map(always, to: a2, s2) &
+ !$omp& map(tofrom: ai2, si2)
+ if (shared_mem) then
+ if (any (a /= 2 * [1,2,3,4])) stop 1
+ if (s1 /= 4 * 532) stop 2
+ else
+ if (any (a /= [1,2,3,4])) stop 3
+ if (s1 /= 532) stop 4
+ endif
+ if (any (a2 /= 3 * [8,7,6,5])) stop 5
+ if (s2 /= 5 * 55) stop 6
+ if (any (ai /= [23,35,86,43])) stop 7
+ if (any (ai2 /= [8,4,7,1])) stop 8
+ if (si1 /= 64) stop 9
+ if (si2 /= 765) stop 10
+
+ a = a*3
+ a2 = a2*7
+ s1 = s1*11
+ s2 = s2*5
+ ai = ai*13
+ ai2 = ai2*21
+ si1 = si1*27
+ si2 = si2*31
+ !$omp end target
+
+ if (shared_mem) then
+ if (any (a /= 3 * 2 * [1,2,3,4])) stop 11
+ if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 12
+ if (s1 /= 11 * 4 * 532) stop 13
+ if (s2 /= 5 * 5 * 55) stop 14
+ if (any (ai /= 13 * [23,35,86,43])) stop 15
+ if (si1 /= 27 * 64) stop 16
+ else
+ if (any (a /= 2 * [1,2,3,4])) stop 17
+ if (any (a2 /= 3 * [8,7,6,5])) stop 18
+ if (s1 /= 4 * 532) stop 19
+ if (s2 /= 5 * 55) stop 20
+ if (any (ai /= [23,35,86,43])) stop 22
+ if (si1 /= 64) stop 23
+ endif
+ if (any (ai2 /= 21 * [8,4,7,1])) stop 24
+ if (si2 /= 31 * 765) stop 25
+
+ deallocate (ai, ai2, si1, si2)
+end block
+
+if (shared_mem) then
+ if (any (a /= 3 * 2 * [1,2,3,4])) stop 30
+ if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 31
+ if (s1 /= 11 * 4 * 532) stop 32
+ if (s2 /= 5 * 5 * 55) stop 33
+else
+ if (any (a /= 3 * [1,2,3,4])) stop 34
+ if (any (a2 /= 3 * 7 * [8,7,6,5])) stop 35
+ if (s1 /= 11 * 532) stop 36
+ if (s2 /= 5 * 5 * 55) stop 37
+endif
+
+deallocate (a, a2, s1, s2)
+end
diff --git a/libgomp/testsuite/libgomp.fortran/reverse-offload-5a.f90 b/libgomp/testsuite/libgomp.fortran/reverse-offload-5a.f90
new file mode 100644
index 0000000..914d10d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/reverse-offload-5a.f90
@@ -0,0 +1,98 @@
+! { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } }
+
+! Copying on-device allocated memory fails with cuMemcpyDtoHAsync error: invalid argument
+! Hence, reverse-offload-5.f90 has been copied to *this* file,
+! which uses on-host allocated vars - but only on the device side stack variables
+
+implicit none
+!$omp requires reverse_offload
+
+integer, allocatable :: A(:), A2(:), s1, s2
+integer :: i,D(4)
+logical :: shared_mem
+
+shared_mem = .false.
+
+a = [1,2,3,4]
+a2 = [8,7,6,5]
+s1 = 532
+s2 = 55
+
+!$omp target map(to: shared_mem)
+ shared_mem = .true.
+!$omp end target
+
+!$omp target map(to: A, A2, s1, s2)
+block
+ ! stack variables:
+ integer :: ai(4), ai2(4), si1, si2
+
+ a = a * 2
+ a2 = a2 * 3
+ s1 = s1 * 4
+ s2 = s2 * 5
+
+ ai = [23,35,86,43]
+ ai2 = [8,4,7,1]
+ si1 = 64
+ si2 = 765
+
+ !$omp target device (ancestor:1) &
+ !$omp& map(to: A, s1, ai, si1) map(always, to: a2, s2) &
+ !$omp& map(tofrom: ai2, si2)
+ if (shared_mem) then
+ if (any (a /= 2 * [1,2,3,4])) stop 1
+ if (s1 /= 4 * 532) stop 2
+ else
+ if (any (a /= [1,2,3,4])) stop 3
+ if (s1 /= 532) stop 4
+ endif
+ if (any (a2 /= 3 * [8,7,6,5])) stop 5
+ if (s2 /= 5 * 55) stop 6
+ if (any (ai /= [23,35,86,43])) stop 7
+ if (any (ai2 /= [8,4,7,1])) stop 8
+ if (si1 /= 64) stop 9
+ if (si2 /= 765) stop 10
+
+ a = a*3
+ a2 = a2*7
+ s1 = s1*11
+ s2 = s2*5
+ ai = ai*13
+ ai2 = ai2*21
+ si1 = si1*27
+ si2 = si2*31
+ !$omp end target
+
+ if (shared_mem) then
+ if (any (a /= 3 * 2 * [1,2,3,4])) stop 11
+ if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 12
+ if (s1 /= 11 * 4 * 532) stop 13
+ if (s2 /= 5 * 5 * 55) stop 14
+ if (any (ai /= 13 * [23,35,86,43])) stop 15
+ if (si1 /= 27 * 64) stop 16
+ else
+ if (any (a /= 2 * [1,2,3,4])) stop 17
+ if (any (a2 /= 3 * [8,7,6,5])) stop 18
+ if (s1 /= 4 * 532) stop 19
+ if (s2 /= 5 * 55) stop 20
+ if (any (ai /= [23,35,86,43])) stop 22
+ if (si1 /= 64) stop 23
+ endif
+ if (any (ai2 /= 21 * [8,4,7,1])) stop 24
+ if (si2 /= 31 * 765) stop 25
+end block
+if (shared_mem) then
+ if (any (a /= 3 * 2 * [1,2,3,4])) stop 30
+ if (any (a2 /= 7 * 3 * [8,7,6,5])) stop 31
+ if (s1 /= 11 * 4 * 532) stop 32
+ if (s2 /= 5 * 5 * 55) stop 33
+else
+ if (any (a /= 3 * [1,2,3,4])) stop 34
+ if (any (a2 /= 3 * 7 * [8,7,6,5])) stop 35
+ if (s1 /= 11 * 532) stop 36
+ if (s2 /= 5 * 5 * 55) stop 37
+endif
+
+deallocate (a, a2, s1, s2)
+end