349 files changed, 22553 insertions, 1408 deletions
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
new file mode 100644
index 0000000..5ecab6d
--- /dev/null
+++ b/libgomp/ChangeLog.omp
@@ -0,0 +1,1053 @@
+2025-05-30  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* testsuite/libgomp.c++/target-flex-300.C: XFAIL.
+	* testsuite/libgomp.c++/target-flex-60.C: Likewise.
+	* testsuite/libgomp.c++/target-flex-61.C: Likewise.
+	* testsuite/libgomp.c++/target-flex-62.C: Likewise.
+	* testsuite/libgomp.c++/target-flex-81.C: Likewise.
+
+2025-05-30  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-05-30  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* testsuite/libgomp.c++/target-std__valarray-1.C: New.
+	* testsuite/libgomp.c++/target-std__valarray-1.output: Likewise.
+
+2025-05-30  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-05-30  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* testsuite/libgomp.c++/target-std__array-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__array-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__bitset-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__bitset-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__deque-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__deque-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__forward_list-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__forward_list-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__list-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__list-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__map-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__map-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__multimap-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__multimap-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__multiset-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__multiset-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__set-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__set-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__span-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__span-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__valarray-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__valarray-concurrent.C: Adjust.
+	* testsuite/libgomp.c++/target-std__vector-concurrent-usm.C: New.
+	* testsuite/libgomp.c++/target-std__vector-concurrent.C: Adjust.
+
+2025-05-30  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+
+	Backported from master:
+	2025-05-30  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+		    Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* testsuite/libgomp.c++/target-std__array-concurrent.C: New.
+	* testsuite/libgomp.c++/target-std__bitset-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__deque-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__flat_map-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__flat_multimap-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__flat_multiset-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__flat_set-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__forward_list-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__list-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__map-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__multimap-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__multiset-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__set-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__span-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__unordered_map-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__unordered_multimap-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__unordered_multiset-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__unordered_set-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__valarray-concurrent.C: Likewise.
+	* testsuite/libgomp.c++/target-std__vector-concurrent.C: Likewise.
+
+2025-05-30  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+
+	Backported from master:
+	2025-05-30  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+
+	* testsuite/libgomp.c++/target-std__cmath.C: New.
+	* testsuite/libgomp.c++/target-std__complex.C: Likewise.
+	* testsuite/libgomp.c++/target-std__numbers.C: Likewise.
+
+2025-05-30  Waffl3x  <waffl3x@baylibre.com>
+
+	Backported from master:
+	2025-05-30  Waffl3x  <waffl3x@baylibre.com>
+		    Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* testsuite/libgomp.c++/target-flex-10.C: New test.
+	* testsuite/libgomp.c++/target-flex-100.C: New test.
+	* testsuite/libgomp.c++/target-flex-101.C: New test.
+	* testsuite/libgomp.c++/target-flex-11.C: New test.
+	* testsuite/libgomp.c++/target-flex-12.C: New test.
+	* testsuite/libgomp.c++/target-flex-2000.C: New test.
+	* testsuite/libgomp.c++/target-flex-2001.C: New test.
+	* testsuite/libgomp.c++/target-flex-2002.C: New test.
+	* testsuite/libgomp.c++/target-flex-2003.C: New test.
+	* testsuite/libgomp.c++/target-flex-30.C: New test.
+	* testsuite/libgomp.c++/target-flex-300.C: New test.
+	* testsuite/libgomp.c++/target-flex-31.C: New test.
+	* testsuite/libgomp.c++/target-flex-32.C: New test.
+	* testsuite/libgomp.c++/target-flex-33.C: New test.
+	* testsuite/libgomp.c++/target-flex-41.C: New test.
+	* testsuite/libgomp.c++/target-flex-60.C: New test.
+	* testsuite/libgomp.c++/target-flex-61.C: New test.
+	* testsuite/libgomp.c++/target-flex-62.C: New test.
+	* testsuite/libgomp.c++/target-flex-70.C: New test.
+	* testsuite/libgomp.c++/target-flex-80.C: New test.
+	* testsuite/libgomp.c++/target-flex-81.C: New test.
+	* testsuite/libgomp.c++/target-flex-90.C: New test.
+	* testsuite/libgomp.c++/target-flex-common.h: New test.
+
+2025-05-30  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-05-30  Thomas Schwinge  <tschwinge@baylibre.com>
+		    Richard Biener  <rguenther@suse.de>
+
+	PR middle-end/119835
+	* testsuite/libgomp.oacc-c-c++-common/abi-struct-1.c:
+	'#pragma GCC optimize "-fno-inline"'.
+	* testsuite/libgomp.c-c++-common/target-abi-struct-1.c: New.
+	* testsuite/libgomp.c-c++-common/target-abi-struct-1-O0.c: Adjust.
+
+2025-05-30  Tobias Burnus  <tburnus@baylibre.com>
+
+	Backported from master:
+	2025-05-29  Tobias Burnus  <tburnus@baylibre.com>
+
+	PR libgomp/93226
+	* libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_dev2dev): New
+	prototype.
+	* libgomp.h (struct acc_dispatch_t): Add dev2dev_func.
+	(gomp_copy_dev2dev): New prototype.
+	* libgomp.map (OACC_2.6.1): New; add acc_memcpy_device{,_async}.
+	* libgomp.texi (acc_memcpy_device): New.
+	* oacc-mem.c (memcpy_tofrom_device): Change to take from/to
+	device boolean; use memcpy not memmove; add early return if
+	size == 0 or same device + same ptr.
+	(acc_memcpy_to_device, acc_memcpy_to_device_async,
+	acc_memcpy_from_device, acc_memcpy_from_device_async): Update.
+	(acc_memcpy_device, acc_memcpy_device_async): New.
+	* openacc.f90 (acc_memcpy_device, acc_memcpy_device_async):
+	Add interface.
+	* openacc_lib.h (acc_memcpy_device, acc_memcpy_device_async):
+	Likewise.
+	* openacc.h (acc_memcpy_device, acc_memcpy_device_async): Add
+	prototype.
+	* plugin/plugin-gcn.c (GOMP_OFFLOAD_openacc_async_host2dev):
+	Update comment.
+	(GOMP_OFFLOAD_openacc_async_dev2host): Update call.
+	(GOMP_OFFLOAD_openacc_async_dev2dev): New.
+	* plugin/plugin-nvptx.c (cuda_memcpy_dev_sanity_check): New.
+	(GOMP_OFFLOAD_dev2dev): Call it.
+	(GOMP_OFFLOAD_openacc_async_dev2dev): New.
+	* target.c (gomp_copy_dev2dev): New.
+	(gomp_load_plugin_for_device): Load dev2dev and async_dev2dev.
+	* testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c: New test.
+	* testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90: New test.
+
+2025-05-22  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-05-19  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	PR lto/120308
+	* testsuite/libgomp.oacc-c-c++-common/abi-struct-1.c: Add empty
+	structure testing.
+
+2025-05-22  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-05-19  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* testsuite/libgomp.c-c++-common/target-abi-struct-1-O0.c: New.
+	* testsuite/libgomp.oacc-c-c++-common/abi-struct-1.c: Likewise.
+
+2025-05-22  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-05-12  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	PR target/119692
+	* testsuite/libgomp.c++/pr119692-1-4.C: '{ dg-timeout 10 }'.
+	* testsuite/libgomp.c++/pr119692-1-5.C: Likewise.
+	* testsuite/libgomp.c++/target-exceptions-bad_cast-1.C: Likewise.
+	* testsuite/libgomp.c++/target-exceptions-bad_cast-2.C: Likewise.
+	* testsuite/libgomp.oacc-c++/exceptions-bad_cast-1.C: Likewise.
+	* testsuite/libgomp.oacc-c++/exceptions-bad_cast-2.C: Likewise.
+
+2025-05-16  Tobias Burnus  <tburnus@baylibre.com>
+
+	Backported from master:
+	2025-05-14  Tobias Burnus  <tburnus@baylibre.com>
+
+	* target.c (gomp_attach_pointer): Return bool; accept additional
+	bool to optionally silence the fatal pointee-not-found error.
+	(gomp_map_vars_internal): If the pointee could not be found,
+	check whether it was mapped as GOMP_MAP_ZERO_LEN_ARRAY_SECTION.
+	* libgomp.h (gomp_attach_pointer): Update prototype.
+	* oacc-mem.c (acc_attach_async, goacc_enter_data_internal): Update
+	calls.
+	* testsuite/libgomp.c/target-map-zero-sized.c: New test.
+	* testsuite/libgomp.c/target-map-zero-sized-2.c: New test.
+	* testsuite/libgomp.c/target-map-zero-sized-3.c: New test.
+
+2025-05-15  Tobias Burnus  <tburnus@baylibre.com>
+
+	Backported from master:
+	2025-05-09  Tobias Burnus  <tburnus@baylibre.com>
+
+	* testsuite/libgomp.c/interop-cuda-full.c: Use 'link' instead
+	of 'run' when the default device is "! offload_device_nvptx".
+	* testsuite/libgomp.c/interop-cuda-libonly.c: Likewise.
+	* testsuite/libgomp.c/interop-hip-nvidia-full.c: Likewise.
+	* testsuite/libgomp.c/interop-hip-nvidia-no-headers.c: Likewise.
+	* testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c: Likewise.
+	* testsuite/libgomp.fortran/interop-hip-nvidia-full.F90: Likewise.
+	* testsuite/libgomp.fortran/interop-hip-nvidia-no-module.F90: Likewise.
+	* testsuite/libgomp.c/interop-hip-amd-full.c: Use 'link' instead
+	of 'run' when the default device is "! offload_device_gcn".
+	* testsuite/libgomp.c/interop-hip-amd-no-hip-header.c: Likewise.
+	* testsuite/libgomp.fortran/interop-hip-amd-full.F90: Likewise.
+	* testsuite/libgomp.fortran/interop-hip-amd-no-module.F90: Likewise.
+
+2025-05-15  Sandra Loosemore  <sloosemore@baylibre.com>
+	    Tobias Burnus  <tburnus@baylibre.com>
+
+	* libgomp.texi: Mark need_device_addr as supported.
+	* testsuite/libgomp.c++/need-device-ptr.C: New.
+	* testsuite/libgomp.c-c++-common/dispatch-3.c: New.
+	* testsuite/libgomp.fortran/adjust-args-array-descriptor.f90: New.
+	* testsuite/libgomp.fortran/need-device-ptr.f90: New.
+
+2025-05-15  waffl3x  <waffl3x@baylibre.com>
+
+	PR c++/119659
+	PR c++/118859
+	PR c++/119601
+	PR c++/119602
+	PR c++/119775
+	* libgomp.texi: Set 'adjust args' variadic arguments support to Y.
+
+2025-05-15  Andrew Stubbs  <ams@baylibre.com>
+
+	Backported from master:
+	2025-04-25  Andrew Stubbs  <ams@baylibre.com>
+
+	* testsuite/libgomp.c/interop-hsa.c: New test.
+
+2025-05-15  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-04-25  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	PR target/119853
+	PR target/119854
+	* target-cxa-dso-dtor.c: New.
+	* config/accel/target-cxa-dso-dtor.c: Likewise.
+	* Makefile.am (libgomp_la_SOURCES): Add it.
+	* Makefile.in: Regenerate.
+	* testsuite/libgomp.c++/target-cdtor-1.C: New.
+	* testsuite/libgomp.c++/target-cdtor-2.C: Likewise.
+
+2025-05-15  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-04-25  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* testsuite/libgomp.c-c++-common/target-cdtor-1.c: New.
+
+2025-05-15  Andrew Pinski  <quic_apinski@quicinc.com>
+
+	Backported from master:
+	2025-04-25  Andrew Pinski  <quic_apinski@quicinc.com>
+		    Thomas Schwinge  <tschwinge@baylibre.com>
+
+	PR target/119737
+	* testsuite/libgomp.c++/target-exceptions-throw-1.C: Remove
+	PR119737 XFAILing.
+	* testsuite/libgomp.c++/target-exceptions-throw-2.C: Likewise.
+	* testsuite/libgomp.oacc-c++/exceptions-throw-1.C: Likewise.
+	* testsuite/libgomp.oacc-c++/exceptions-throw-2.C: Likewise.
+
+2025-05-15  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	Backported from master:
+	2025-04-25  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	PR target/118794
+	* testsuite/libgomp.c++/target-exceptions-pr118794-1.C: Adjust for
+	'targetm.arm_eabi_unwinder'.
+	* testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C:
+	Likewise.
+	* testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C:
+	Likewise.
+
+2025-05-15  Tobias Burnus  <tburnus@baylibre.com>
+
+	Backported from master:
+	2025-04-24  Tobias Burnus  <tburnus@baylibre.com>
+
+	* testsuite/lib/libgomp.exp
+	(check_effective_target_gomp_hip_header_nvidia): Compile with
+	"-Wno-deprecated-declarations".
+	* testsuite/libgomp.c/interop-hip-nvidia-full.c: Likewise.
+	* testsuite/libgomp.c/interop-hipblas-nvidia-full.c: Likewise.
+	* testsuite/libgomp.c/interop-hipblas.h: Add workarounds
+	when using the HIP headers with __HIP_PLATFORM_NVIDIA__.
+
+2025-05-15  Tobias Burnus  <tburnus@baylibre.com>
+
+	Backported from master:
+	2025-04-24  Tobias Burnus  <tburnus@baylibre.com>
+
+	* testsuite/lib/libgomp.exp (check_effective_target_openacc_cublas,
+	check_effective_target_openacc_cudart): Update description as
+	the check requires more.
+	(check_effective_target_openacc_libcuda,
+	check_effective_target_openacc_libcublas,
+	check_effective_target_openacc_libcudart,
+	check_effective_target_gomp_hip_header_amd,
+	check_effective_target_gomp_hip_header_nvidia,
+	check_effective_target_gomp_hipfort_module,
+	check_effective_target_gomp_libamdhip64,
+	check_effective_target_gomp_libhipblas): New.
+	* testsuite/libgomp.c-c++-common/interop-2.c: New test.
+	* testsuite/libgomp.c/interop-cublas-full.c: New test.
+	* testsuite/libgomp.c/interop-cublas-libonly.c: New test.
+	* testsuite/libgomp.c/interop-cuda-full.c: New test.
+	* testsuite/libgomp.c/interop-cuda-libonly.c: New test.
+	* testsuite/libgomp.c/interop-hip-amd-full.c: New test.
+	* testsuite/libgomp.c/interop-hip-amd-no-hip-header.c: New test.
+	* testsuite/libgomp.c/interop-hip-nvidia-full.c: New test.
+	* testsuite/libgomp.c/interop-hip-nvidia-no-headers.c: New test.
+	* testsuite/libgomp.c/interop-hip-nvidia-no-hip-header.c: New test.
+	* testsuite/libgomp.c/interop-hip.h: New test.
+	* testsuite/libgomp.c/interop-hipblas-amd-full.c: New test.
+	* testsuite/libgomp.c/interop-hipblas-amd-no-hip-header.c: New test.
+	* testsuite/libgomp.c/interop-hipblas-nvidia-full.c: New test.
+	* testsuite/libgomp.c/interop-hipblas-nvidia-no-headers.c: New test.
+	* testsuite/libgomp.c/interop-hipblas-nvidia-no-hip-header.c: New test.
+	* testsuite/libgomp.c/interop-hipblas.h: New test.
+	* testsuite/libgomp.fortran/interop-hip-amd-full.F90: New test.
+	* testsuite/libgomp.fortran/interop-hip-amd-no-module.F90: New test.
+	* testsuite/libgomp.fortran/interop-hip-nvidia-full.F90: New test.
+	* testsuite/libgomp.fortran/interop-hip-nvidia-no-module.F90: New test.
+	* testsuite/libgomp.fortran/interop-hip.h: New test.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+
+	* testsuite/libgomp.fortran/allocatable-comp-iterators.f90: Add test
+	for non-const iterator boundaries.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+
+	* testsuite/libgomp.fortran/allocatable-comp-iterators.f90: New.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+	    Andrew Stubbs  <ams@baylibre.com>
+
+	* testsuite/libgomp.fortran/mapper-iterators-1.f90: New test.
+	* testsuite/libgomp.fortran/mapper-iterators-2.f90: New test.
+	* testsuite/libgomp.fortran/mapper-iterators-3.f90: New test.
+	* testsuite/libgomp.fortran/mapper-iterators-4.f90: New test.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+	    Andrew Stubbs  <ams@baylibre.com>
+
+	* testsuite/libgomp.c-c++-common/mapper-iterators-1.c: New test.
+	* testsuite/libgomp.c-c++-common/mapper-iterators-2.c: New test.
+	* testsuite/libgomp.c-c++-common/mapper-iterators-3.c: New test.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+
+	* testsuite/libgomp.c-c++-common/target-map-iterators-4.c: New.
+	* testsuite/libgomp.c-c++-common/target-map-iterators-5.c: New.
+	* testsuite/libgomp.c-c++-common/target-update-iterators-4.c: New.
+	* testsuite/libgomp.fortran/target-map-iterators-4.f90: New.
+	* testsuite/libgomp.fortran/target-map-iterators-5.f90: New.
+	* testsuite/libgomp.fortran/target-update-iterators-4.f90: New.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+	    Andrew Stubbs  <ams@baylibre.com>
+
+	* testsuite/libgomp.fortran/target-update-iterators-1.f90: New.
+	* testsuite/libgomp.fortran/target-update-iterators-2.f90: New.
+	* testsuite/libgomp.fortran/target-update-iterators-3.f90: New.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+	    Andrew Stubbs  <ams@baylibre.com>
+
+	* target.c (kind_to_name): Handle GOMP_MAP_STRUCT and
+	GOMP_MAP_STRUCT_UNORD.
+	(gomp_add_map): New.
+	(gomp_merge_iterator_maps): Expand fields of a struct mapping
+	breadth-first.
+	* testsuite/libgomp.fortran/target-map-iterators-1.f90: New.
+	* testsuite/libgomp.fortran/target-map-iterators-2.f90: New.
+	* testsuite/libgomp.fortran/target-map-iterators-3.f90: New.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+
+	* target.c (gomp_update): Call gomp_merge_iterator_maps.  Free
+	allocated variables.
+	* testsuite/libgomp.c-c++-common/target-update-iterators-1.c: New.
+	* testsuite/libgomp.c-c++-common/target-update-iterators-2.c: New.
+	* testsuite/libgomp.c-c++-common/target-update-iterators-3.c: New.
+
+2025-05-15  Kwok Cheung Yeung  <kcyeung@baylibre.com>
+	    Andrew Stubbs  <ams@baylibre.com>
+
+	* target.c (kind_to_name): New.
+	(gomp_merge_iterator_maps): New.
+	(gomp_map_vars_internal): Call gomp_merge_iterator_maps.  Copy
+	address of only the first iteration to target vars.  Free allocated
+	variables.
+	* testsuite/libgomp.c-c++-common/target-map-iterators-1.c: New.
+	* testsuite/libgomp.c-c++-common/target-map-iterators-2.c: New.
+	* testsuite/libgomp.c-c++-common/target-map-iterators-3.c: New.
+
+2025-05-15  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* testsuite/libgomp.oacc-c++/exceptions-bad_cast-3.C: Adjust.
+	* testsuite/libgomp.oacc-c++/exceptions-throw-3.C: Likewise.
+	* testsuite/libgomp.oacc-c++/pr119692-1-1.C: Likewise.
+	* testsuite/libgomp.oacc-c++/pr119692-1-2.C: Likewise.
+	* testsuite/libgomp.oacc-c++/pr119692-1-3.C: Likewise.
+
+2025-05-15  Chung-Lin Tang  <cltang@baylibre.com>
+	    Sandra Loosemore  <sloosemore@baylibre.com>
+
+	* acc_prof.h (_ACC_PROF_INFO_VERSION): Adjust to 201811.
+	* libgomp.texi (Enabling OpenACC): Adjust version
+	references to 2.7 from 2.6.
+	* openacc.f90 (module openacc): Adjust openacc_version to 201811.
+	* openacc_lib.h (openacc_version): Adjust openacc_version to 201811.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
+	Adjust test value to 201811.
+	* testsuite/libgomp.oacc-fortran/openacc_version-1.f: Adjust
+	test value to 201811.
+	* testsuite/libgomp.oacc-fortran/openacc_version-2.f90: Likewise.
+
+2025-05-15  Chung-Lin Tang  <cltang@baylibre.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c: Adjust test.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c: Likewise.
+
+2025-05-15  Chung-Lin Tang  <cltang@baylibre.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/reduction.h
+	(check_reduction_array_xx): New macro.
+	(operator_apply): Likewise.
+	(check_reduction_array_op): Likewise.
+	(check_reduction_arraysec_op): Likewise.
+	(function_apply): Likewise.
+	(check_reduction_array_macro): Likewise.
+	(check_reduction_arraysec_macro): Likewise.
+	(check_reduction_xxx_xx_all): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-1.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-structs-1.c: New test.
+	* testsuite/libgomp.oacc-fortran/reduction-10.f90: New test.
+	* testsuite/libgomp.oacc-fortran/reduction-11.f90: New test.
+	* testsuite/libgomp.oacc-fortran/reduction-12.f90: New test.
+	* testsuite/libgomp.oacc-fortran/reduction-13.f90: New test.
+	* testsuite/libgomp.oacc-fortran/reduction-14.f90: New test.
+	* testsuite/libgomp.oacc-fortran/reduction-15.f90: New test.
+	* testsuite/libgomp.oacc-fortran/reduction-16.f90: New test.
+
+2025-05-15  Sandra Loosemore  <sloosemore@baylibre.com>
+
+	* testsuite/libgomp.c-c++-common/delim-declare-variant-1.c: New.
+
+2025-05-15  Sandra Loosemore  <sloosemore@baylibre.com>
+	    Julian Brown  <julian@codesourcery.com>
+	    waffl3x  <waffl3x@baylibre.com>
+
+	* testsuite/libgomp.c++/delim-declare-variant-1.C: New.
+	* testsuite/libgomp.c++/delim-declare-variant-2.C: New.
+	* testsuite/libgomp.c++/delim-declare-variant-7.C: New.
+
+2025-05-15  Paul-Antoine Arras  <parras@baylibre.com>
+
+	* target.c (omp_target_memcpy_rect_worker): Require unit strides
+	and matching element size.
+
+2025-05-15  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from master:
+	2025-05-01  Tobias Burnus  <tobias@codesourcery.com>
+
+	* testsuite/libgomp.fortran/allocate-8a.f90: New test.
+
+2025-05-15  waffl3x  <waffl3x@baylibre.com>
+	    Tobias Burnus  <tobias@codesourcery.com>
+
+	* libgomp.texi: Document C++ support.
+	* testsuite/libgomp.c/allocate-4.c: Move to...
+	* testsuite/libgomp.c-c++-common/allocate-4.c: ...here.
+	* testsuite/libgomp.c/allocate-5.c: Move to...
+	* testsuite/libgomp.c-c++-common/allocate-5.c: ...here.
+	* testsuite/libgomp.c/allocate-6.c: Move to...
+	* testsuite/libgomp.c-c++-common/allocate-6.c: ...here.
+	* testsuite/libgomp.c++/allocate-2.C: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* target.c (omp_target_memcpy_rect_worker): Add 1D strided transfer
+	support.
+
+2025-05-15  Andrew Stubbs  <ams@codesourcery.com>
+
+	* config/gcn/target.c (GOMP_target_ext): Add "signal" field.
+	Fix atomics race condition.
+	* config/nvptx/libgomp-nvptx.h (REV_OFFLOAD_QUEUE_SIZE): New define.
+	(struct rev_offload): Implement ring buffer.
+	* config/nvptx/target.c (GOMP_target_ext): Likewise.
+	* env.c (initialize_env): Read GOMP_REVERSE_OFFLOAD_THREADS.
+	* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Replace "aq" parameter
+	with "signal" and "use_aq".
+	* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Likewise.
+	* libgomp.h (gomp_target_rev): Likewise.
+	* plugin/plugin-gcn.c (process_reverse_offload): Add "signal".
+	(console_output): Pass signal value through.
+	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_openacc_async_construct):
+	Attach new threads to the numbered device.
+	Change the flag to CU_STREAM_NON_BLOCKING.
+	(GOMP_OFFLOAD_run): Implement ring-buffer and remove signalling.
+	* target.c (gomp_target_rev): Rename to ...
+	(gomp_target_rev_internal): ... this, and change "dev_num" to
+	"devicep".
+	(gomp_target_rev_worker_thread): New function.
+	(gomp_target_rev): New function (old name).
+	* libgomp.texi: Document GOMP_REVERSE_OFFLOAD_THREADS.
+	* testsuite/libgomp.c/reverse-offload-threads-1.c: New test.
+	* testsuite/libgomp.c/reverse-offload-threads-2.c: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+	    Andrew Stubbs  <ams@baylibre.com>
+	    Kwok Cheung Yeung  <kcyeung@baylibre.com>
+	    Sandra Loosemore  <sloosemore@baylibre.com>
+
+	* testsuite/libgomp.c-c++-common/declare-mapper-18.c: New test.
+	* testsuite/libgomp.fortran/declare-mapper-25.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-28.f90: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+	    Sandra Loosemore  <sloosemore@baylibre.com>
+
+	* libgomp.h (omp_noncontig_array_desc): Add span field.
+	* target.c (omp_target_memcpy_rect_worker): Add span parameter.  Update
+	forward declaration.  Handle span != element_size.
+	(gomp_update): Handle bias in descriptor's size slot.  Update calls to
+	omp_target_memcpy_rect_worker.
+	* testsuite/libgomp.fortran/noncontig-updates-1.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-2.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-3.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-4.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-5.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-6.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-7.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-8.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-9.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-10.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-11.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-12.f90: New test.
+	* testsuite/libgomp.fortran/noncontig-updates-13.f90: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.c-c++-common/array-shaping-14.c: New test.
+	* testsuite/libgomp.c/array-shaping-1.c: New test.
+	* testsuite/libgomp.c/array-shaping-2.c: New test.
+	* testsuite/libgomp.c/array-shaping-3.c: New test.
+	* testsuite/libgomp.c/array-shaping-4.c: New test.
+	* testsuite/libgomp.c/array-shaping-5.c: New test.
+	* testsuite/libgomp.c/array-shaping-6.c: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* libgomp.h (omp_noncontig_array_desc): New struct.
+	* target.c (omp_target_memcpy_rect_worker): Add stride array
+	parameter.  Forward declare.  Add STRIDES parameter and strided
+	update support.
+	(gomp_update): Add noncontiguous (strided/shaped) update support.
+	* testsuite/libgomp.c++/array-shaping-1.C: New test.
+	* testsuite/libgomp.c++/array-shaping-2.C: New test.
+	* testsuite/libgomp.c++/array-shaping-3.C: New test.
+	* testsuite/libgomp.c++/array-shaping-4.C: New test.
+	* testsuite/libgomp.c++/array-shaping-5.C: New test.
+	* testsuite/libgomp.c++/array-shaping-6.C: New test.
+	* testsuite/libgomp.c++/array-shaping-7.C: New test.
+	* testsuite/libgomp.c++/array-shaping-8.C: New test.
+	* testsuite/libgomp.c++/array-shaping-9.C: New test.
+	* testsuite/libgomp.c++/array-shaping-10.C: New test.
+	* testsuite/libgomp.c++/array-shaping-11.C: New test.
+	* testsuite/libgomp.c++/array-shaping-12.C: New test.
+	* testsuite/libgomp.c++/array-shaping-13.C: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.fortran/declare-mapper-30.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-4.f90: Adjust test for new
+	lookup behaviour.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.fortran/declare-mapper-2.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-3.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-4.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-6.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-7.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-8.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-9.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-10.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-11.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-12.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-13.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-15.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-17.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-18.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-19.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-20.f90: New test.
+	* testsuite/libgomp.fortran/declare-mapper-21.f90: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.c-c++-common/declare-mapper-9.c: Enable for C.
+	* testsuite/libgomp.c-c++-common/declare-mapper-10.c: Likewise.
+	* testsuite/libgomp.c-c++-common/declare-mapper-11.c: Likewise.
+	* testsuite/libgomp.c-c++-common/declare-mapper-12.c: Likewise.
+	* testsuite/libgomp.c-c++-common/declare-mapper-13.c: Likewise.
+	* testsuite/libgomp.c-c++-common/declare-mapper-14.c: Likewise.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.c++/declare-mapper-1.C: New test.
+	* testsuite/libgomp.c++/declare-mapper-2.C: New test.
+	* testsuite/libgomp.c++/declare-mapper-3.C: New test.
+	* testsuite/libgomp.c++/declare-mapper-4.C: New test.
+	* testsuite/libgomp.c++/declare-mapper-5.C: New test.
+	* testsuite/libgomp.c++/declare-mapper-6.C: New test.
+	* testsuite/libgomp.c++/declare-mapper-7.C: New test.
+	* testsuite/libgomp.c++/declare-mapper-8.C: New test.
+	* testsuite/libgomp.c-c++-common/declare-mapper-9.c: New test (only
+	enabled for C++ for now).
+	* testsuite/libgomp.c-c++-common/declare-mapper-10.c: Likewise.
+	* testsuite/libgomp.c-c++-common/declare-mapper-11.c: Likewise.
+	* testsuite/libgomp.c-c++-common/declare-mapper-12.c: Likewise.
+	* testsuite/libgomp.c-c++-common/declare-mapper-13.c: Likewise.
+	* testsuite/libgomp.c-c++-common/declare-mapper-14.c: Likewise.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+	    Thomas Schwinge  <tschwinge@baylibre.com>
+	    Sandra Loosemore  <sloosemore@baylibre.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/implicit-mapping-1.c: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-fortran/nonlexical-assumed-size-1.f90: New
+	test.
+	* testsuite/libgomp.oacc-fortran/nonlexical-assumed-size-2.f90: New
+	test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+	    Paul-Antoine Arras  <parras@baylibre.com>
+	    Sandra Loosemore  <sloosemore@baylibre.com>
+
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-1-directive.f90:
+	Remove xfails.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-1-runtime.f90:
+	Remove xfails.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90:
+	Remove xfails.
+	* testsuite/libgomp.oacc-fortran/declare-create-1.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-create-2.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-create-3.f90: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/pr70828.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/pr70828-2.c: Likewise.
+	* testsuite/libgomp.oacc-fortran/pr70828.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/pr70828-2.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/pr70828-3.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/pr70828-4.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/pr70828-5.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/pr70828-6.f90: Likewise.
+
+2025-05-15  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	Backported from master:
+	2023-05-19  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* config/nvptx/team.c (__nvptx_omp_num_threads): New global variable in
+	shared memory.
+	* testsuite/libgomp.c-c++-common/for-17.c: New file.
+	* testsuite/libgomp.c-c++-common/for-18.c: New file.
+
+2025-05-15  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* libgomp.texi (AMD Radeon, nvptx): Document OpenMP 'pinned'
+	memory.
+
+2025-05-15  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* target.c (gomp_unmap_vars_internal): Queue splay-tree keys for
+	removal after main loop.
+
+2025-05-15  Tobias Burnus  <tobias@codesourcery.com>
+
+	* testsuite/libgomp.fortran/target-enter-data-3a.f90: New test.
+
+2025-05-15  Tobias Burnus  <tobias@codesourcery.com>
+
+	* testsuite/libgomp.fortran/target-13.f90: Update test.
+
+2025-05-15  Tobias Burnus  <tobias@codesourcery.com>
+
+	* testsuite/libgomp.c++/c++.exp (check_effective_target_c,
+	check_effective_target_c++): Add.
+	* testsuite/libgomp.c/c.exp (check_effective_target_c,
+	check_effective_target_c++): Add.
+	* testsuite/libgomp.fortran/uses_allocators_2.f90: Remove 'sorry'.
+	* testsuite/libgomp.c-c++-common/uses_allocators-1.c: New test.
+	* testsuite/libgomp.c-c++-common/uses_allocators-2.c: New test.
+	* testsuite/libgomp.c-c++-common/uses_allocators-3.c: New test.
+	* testsuite/libgomp.c-c++-common/uses_allocators-4.c: New test.
+	* testsuite/libgomp.fortran/uses_allocators_3.f90: New test.
+	* testsuite/libgomp.fortran/uses_allocators_4.f90: New test.
+	* testsuite/libgomp.fortran/uses_allocators_5.f90: New test.
+	* testsuite/libgomp.fortran/uses_allocators_6.f90: New test.
+
+2025-05-15  Andrew Stubbs  <ams@baylibre.com>
+
+	* Makefile.am (libgomp_la_SOURCES): Add usmpin-allocator.c.
+	* Makefile.in: Regenerate.
+	* config/linux/allocator.c: Include unistd.h.
+	(pin_ctx): New variable.
+	(ctxlock): New variable.
+	(linux_init_pin_ctx): New function.
+	(linux_memspace_alloc): Use usmpin-allocator for pinned memory.
+	(linux_memspace_free): Likewise.
+	(linux_memspace_realloc): Likewise.
+	* libgomp.h (usmpin_init_context): New prototype.
+	(usmpin_register_memory): New prototype.
+	(usmpin_alloc): New prototype.
+	(usmpin_free): New prototype.
+	(usmpin_realloc): New prototype.
+	* testsuite/libgomp.c/alloc-pinned-8.c: New test.
+	* usmpin-allocator.c: New file.
+
+2025-05-15  Andrew Stubbs  <ams@baylibre.com>
+	    Thomas Schwinge  <thomas@codesourcery.com>
+
+	* config/linux/allocator.c: Include assert.h.
+	(using_device_for_page_locked): New variable.
+	(linux_memspace_alloc): Add init0 parameter. Support device pinning.
+	(linux_memspace_calloc): Set init0 to true.
+	(linux_memspace_free): Support device pinning.
+	(linux_memspace_realloc): Support device pinning.
+	(MEMSPACE_ALLOC): Set init0 to false.
+	* libgomp-plugin.h
+	(GOMP_OFFLOAD_page_locked_host_alloc): New prototype.
+	(GOMP_OFFLOAD_page_locked_host_free): Likewise.
+	* libgomp.h (gomp_page_locked_host_alloc): Likewise.
+	(gomp_page_locked_host_free): Likewise.
+	(struct gomp_device_descr): Add page_locked_host_alloc_func and
+	page_locked_host_free_func.
+	* libgomp.texi: Adjust the docs for the pinned trait.
+	* libgomp_g.h (GOMP_enable_pinned_mode): New prototype.
+	* plugin/plugin-nvptx.c
+	(GOMP_OFFLOAD_page_locked_host_alloc): New function.
+	(GOMP_OFFLOAD_page_locked_host_free): Likewise.
+	* target.c (device_for_page_locked): New variable.
+	(get_device_for_page_locked): New function.
+	(gomp_page_locked_host_alloc): Likewise.
+	(gomp_page_locked_host_free): Likewise.
+	(gomp_load_plugin_for_device): Add page_locked_host_alloc and
+	page_locked_host_free.
+	* testsuite/libgomp.c/alloc-pinned-1.c: Change expectations for NVPTX
+	devices.
+	* testsuite/libgomp.c/alloc-pinned-2.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-3.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-4.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-5.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-6.c: Likewise.
+
+2025-05-15  Andrew Stubbs  <ams@baylibre.com>
+
+	* config/linux/allocator.c (always_pinned_mode): New variable.
+	(GOMP_enable_pinned_mode): New function.
+	(linux_memspace_alloc): Disable pinning when always_pinned_mode set.
+	(linux_memspace_calloc): Likewise.
+	(linux_memspace_free): Likewise.
+	(linux_memspace_realloc): Likewise.
+	* libgomp.map: Add GOMP_enable_pinned_mode.
+	* testsuite/libgomp.c/alloc-pinned-7.c: New test.
+	* testsuite/libgomp.c-c++-common/alloc-pinned-1.c: New test.
+
+2025-05-15  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* testsuite/libgomp.c-c++-common/collapse-4.c: New.
+	* testsuite/libgomp.fortran/collapse5.f90: New.
+
+2025-05-15  Andrew Stubbs  <ams@codesourcery.com>
+
+	* config/gcn/bar.h (gomp_barrier_init): Limit thread count to the
+	actual physical number.
+	* config/gcn/team.c (gomp_team_start): Don't attempt to set up
+	threads that do not exist.
+
+2025-05-15  Andrew Stubbs  <ams@codesourcery.com>
+
+	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_alloc): Remove early call to
+	nvptx_stacks_free.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust for loop
+	lowering changes.
+	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
+
+2025-05-15  Cesar Philippidis  <cesar@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+	    Kwok Cheung Yeung  <kcy@codesourcery.com>
+	    Tobias Burnus  <tobias@codesourcery.com>
+	    Thomas Schwinge  <thomas@codesourcery.com>
+	    Paul-Antoine Arras  <parras@baylibre.com>
+
+	* libgomp.h (gomp_acc_declare_allocate): Remove prototype.
+	* oacc-mem.c (gomp_acc_declare_allocate): New function.
+	(find_group_last): Handle GOMP_MAP_DECLARE_ALLOCATE and
+	GOMP_MAP_DECLARE_DEALLOCATE groupings.
+	(goacc_enter_data_internal): Fix kind check for
+	GOMP_MAP_DECLARE_ALLOCATE.  Pass new pointer argument to
+	gomp_acc_declare_allocate.  Unlock mutex before calling
+	gomp_acc_declare_allocate and relock it afterwards.
+	(goacc_exit_data_internal): Unlock device mutex around
+	gomp_acc_declare_allocate call.  Pass new pointer argument.  Handle
+	group pointer mapping for deallocate.
+	* testsuite/libgomp.oacc-fortran/allocatable-scalar.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-1-directive.f90:
+	Adjust.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-1-runtime.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-2.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-3.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-4.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-directive.f90:
+	Adjust.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-runtime.f90:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1.f90:
+	New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+	    Tobias Burnus  <tobias@codesourcery.com>
+	    Sandra Loosemore  <sandra@baylibre.com>
+
+	* testsuite/libgomp.oacc-c++/privatized-ref-3.C: Add xfails.
+	* testsuite/libgomp.oacc-fortran/optional-private.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/privatized-ref-1.f95: Likewise.
+
+2025-05-15  Cesar Philippidis  <cesar@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+	    Sandra Loosemore  <sloosemore@baylibre.com>
+
+	* testsuite/libgomp.oacc-fortran/optional-reduction.f90: Remove
+	xfail on bogus warnings.
+	* testsuite/libgomp.oacc-fortran/parallel-reduction.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/pr70643.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/reduction-5.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/reduction-7.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/reference-reductions.f90: Likewise.
+
+2025-05-15  Thomas Schwinge  <thomas@codesourcery.com>
+	    Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* Makefile.am (libgomp_la_SOURCES): Add
+	oacc-profiling-acc_register_library.c.
+	* Makefile.in: Regenerate.
+	* libgomp.texi: Remove paragraph about acc_register_library.
+	* oacc-init.c (get_property_any): Add profiling code.
+	* oacc-parallel.c (GOACC_parallel_keyed_internal): Set device_api for
+	profiling.
+	* oacc-profiling-acc_register_library.c: New file.
+	* oacc-profiling.c (goacc_profiling_initialize): Call
+	acc_register_library.  Avoid duplicate registration.
+	(acc_register_library): Remove.
+	* config/nvptx/oacc-profiling-acc_register_library.c:
+	New empty file.
+	* config/nvptx/oacc-profiling.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: Remove
+	call to acc_register_library.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c: Likewise.
+
+2025-05-15  Cesar Philippidis  <cesar@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c: New
+	test.
+	* testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c: New
+	test.
+
+2025-05-15  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust test case
+	to conform to the new behavior of the auto clause in OpenACC 2.5.
+
+2025-05-15  Cesar Philippidis  <cesar@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+	    Tobias Burnus  <tobias@codesourcery.com>
+	    Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c++/firstprivate-int.C: New test.
+	* testsuite/libgomp.oacc-c-c++-common/firstprivate-int.c: New
+	test.
+	* testsuite/libgomp.oacc-c-c++-common/data-firstprivate-1.c: XFAIL
+	execution test.
+	* testsuite/libgomp.oacc-fortran/firstprivate-int.f90: New test.
+
+2025-05-15  Nathan Sidwell  <nathan@acm.org>
+	    Tom de Vries  <tdevries@suse.de>
+	    Thomas Schwinge  <thomas@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c: New.
+
+2025-05-15  Cesar Philippidis  <cesar@codesourcery.com>
+	    James Norris  <jnorris@codesourcery.com>
+	    Tom de Vries  <tom@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+	    Tobias Burnus  <tburnus@baylibre.com>
+
+	* testsuite/libgomp.oacc-fortran/data-3.f90: Update parallel
+	regions to denote variables copied in via acc enter data as
+	present.
+	* testsuite/libgomp.oacc-c-c++-common/subr.h: Reimplement.
+	* testsuite/libgomp.oacc-c-c++-common/subr.ptx: Regenerated PTX.
+	* testsuite/libgomp.oacc-c-c++-common/timer.h: Removed.
+	* testsuite/libgomp.oacc-c-c++-common/lib-69.c: Change async checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-70.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-72.c: Rework kernel i/f and
+	change async checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-73.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-74.c: Rework kernel i/f and
+	timing checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-75.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-76.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-78.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-82.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-93.c: New test.
+
+2025-05-15  Julian Brown  <julian@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+	    Nathan Sidwell  <nathan@acm.org>
+
+	* testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c: New.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c: New.
+	* testsuite/libgomp.oacc-fortran/reduction-9.f90: New.
+
+2025-05-15  Cesar Philippidis  <cesar@codesourcery.com>
+	    James Norris  <jnorris@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+	    Tobias Burnus  <tobias@codesourcery.com>
+	    Thomas Schwinge  <tschwinge@baylibre.com>
+
+	* oacc-parallel.c (GOACC_parallel_keyed): Handle Fortran deviceptr
+	clause.
+	(GOACC_data_start): Likewise.
+	* testsuite/libgomp.oacc-fortran/deviceptr-1.f90: New test.
+
+2025-05-15  Thomas Schwinge  <thomas@codesourcery.com>
+
+	PR other/76739
+	* target.c (gomp_map_vars_internal): Pass pre-allocated 'ptrblock'
+	to 'goacc_noncontig_array_create_ptrblock'.
+	* oacc-parallel.c (goacc_noncontig_array_create_ptrblock): Adjust.
+	* oacc-int.h (goacc_noncontig_array_create_ptrblock): Adjust.
+
+2025-05-15  Thomas Schwinge  <thomas@codesourcery.com>
+
+	PR other/76739
+	* oacc-parallel.c (GOACC_parallel_keyed): Given OpenACC 'async',
+	defer 'free' of non-contiguous array support data structures.
+	* target.c (gomp_map_vars_internal): Likewise.
+
+2025-05-15  Thomas Schwinge  <thomas@codesourcery.com>
+
+	PR other/76739
+	* libgomp.h (goacc_map_vars): Add 'struct goacc_ncarray_info *'
+	formal parameter.
+	(gomp_map_vars_openacc): Remove.
+	* target.c (goacc_map_vars): Adjust.
+	(gomp_map_vars_openacc): Remove.
+	* oacc-mem.c (acc_map_data, goacc_enter_datum)
+	(goacc_enter_data_internal): Adjust.
+	* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start):
+	Adjust.
+
+2025-05-15  Chung-Lin Tang  <cltang@codesourcery.com>
+	    Kwok Cheung Yeung  <kcy@codesourcery.com>
+	    Paul-Antoine Arras  <parras@baylibre.com>
+
+	PR other/76739
+	* libgomp.h (gomp_map_vars_openacc): New function declaration.
+	* libgomp_g.h (GOACC_data_start): Add variadic '...' to declaration.
+	* oacc-int.h (struct goacc_ncarray_dim): New struct declaration.
+	(struct goacc_ncarray_descr_type): Likewise.
+	(struct goacc_ncarray): Likewise.
+	(struct goacc_ncarray_info): Likewise.
+	(goacc_noncontig_array_create_ptrblock): New function declaration.
+	* oacc-parallel.c (goacc_noncontig_array_count_rows): New function.
+	(goacc_noncontig_array_compute_sizes): Likewise.
+	(goacc_noncontig_array_fill_rows_1): Likewise.
+	(goacc_noncontig_array_fill_rows): Likewise.
+	(goacc_process_noncontiguous_arrays): Likewise.
+	(goacc_noncontig_array_create_ptrblock): Likewise.
+	(GOACC_parallel_keyed): Use goacc_process_noncontiguous_arrays to
+	handle non-contiguous array descriptors at end of varargs, adjust
+	to use gomp_map_vars_openacc.
+	(GOACC_data_start): Likewise. Adjust function type to accept varargs.
+	* target.c (gomp_map_vars_internal): Add struct goacc_ncarray_info *
+	nca_info parameter, add handling code for non-contiguous arrays.
+	(gomp_map_vars_openacc): Add new function for specialization of
+	gomp_map_vars_internal for OpenACC structured region usage.
+	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-1.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-2.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-3.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-4.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/noncontig_array-utils.h: Support
+	header for new tests.
diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am
index e3202ae..aece103 100644
--- a/libgomp/Makefile.am
+++ b/libgomp/Makefile.am
@@ -70,7 +70,8 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
 	target.c splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \
 	oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \
 	priority_queue.c affinity-fmt.c teams.c allocator.c oacc-profiling.c \
-	oacc-target.c target-indirect.c
+	oacc-target.c target-indirect.c oacc-profiling-acc_register_library.c \
+	usmpin-allocator.c target-cxa-dso-dtor.c
 
 include $(top_srcdir)/plugin/Makefrag.am
 
diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in
index 2a0a842..89dc47c 100644
--- a/libgomp/Makefile.in
+++ b/libgomp/Makefile.in
@@ -219,7 +219,9 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
 	oacc-parallel.lo oacc-host.lo oacc-init.lo oacc-mem.lo \
 	oacc-async.lo oacc-plugin.lo oacc-cuda.lo priority_queue.lo \
 	affinity-fmt.lo teams.lo allocator.lo oacc-profiling.lo \
-	oacc-target.lo target-indirect.lo $(am__objects_1)
+	oacc-target.lo target-indirect.lo \
+	oacc-profiling-acc_register_library.lo usmpin-allocator.lo \
+	target-cxa-dso-dtor.lo $(am__objects_1)
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@@ -552,7 +554,9 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
 	oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
 	oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
 	affinity-fmt.c teams.c allocator.c oacc-profiling.c \
-	oacc-target.c target-indirect.c $(am__append_3)
+	oacc-target.c target-indirect.c \
+	oacc-profiling-acc_register_library.c usmpin-allocator.c \
+	target-cxa-dso-dtor.c $(am__append_3)
 
 # Nvidia PTX OpenACC plugin.
 @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -768,6 +772,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling-acc_register_library.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-target.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
@@ -780,12 +785,14 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/splay-tree.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target-cxa-dso-dtor.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target-indirect.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/teams.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/usmpin-allocator.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/work.Plo@am__quote@
 
 .c.o:
diff --git a/libgomp/acc_prof.h b/libgomp/acc_prof.h
index 635d0a1..07ce41b 100644
--- a/libgomp/acc_prof.h
+++ b/libgomp/acc_prof.h
@@ -117,9 +117,11 @@ typedef struct acc_prof_info
 				_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
 } acc_prof_info;
 
-/* We implement the OpenACC 2.6 Profiling Interface.  */
+/* We implement the OpenACC 2.7 Profiling Interface, or at least according
+   to the OpenACC spec the number in the version field of acc_prof_info must
+   be _OPENACC.  */
 
-#define _ACC_PROF_INFO_VERSION 201711
+#define _ACC_PROF_INFO_VERSION 201811
 
 typedef enum acc_construct_t
 {
diff --git a/libgomp/config/accel/target-cxa-dso-dtor.c b/libgomp/config/accel/target-cxa-dso-dtor.c
new file mode 100644
index 0000000..e40a5f0
--- /dev/null
+++ b/libgomp/config/accel/target-cxa-dso-dtor.c
@@ -0,0 +1,62 @@
+/* Host/device compatibility: Itanium C++ ABI, DSO Object Destruction API
+
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+
+extern void __cxa_finalize (void *);
+
+/* See <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#dso-dtor>.
+
+   Even if the device is '!DEFAULT_USE_CXA_ATEXIT', we may see '__cxa_atexit'
+   calls, referencing '__dso_handle', via a 'DEFAULT_USE_CXA_ATEXIT' host.
+   '__cxa_atexit' is provided by newlib, but use of '__dso_handle' for nvptx
+   results in 'ld' error:
+
+       unresolved symbol __dso_handle
+       collect2: error: ld returned 1 exit status
+       nvptx mkoffload: fatal error: [...]/x86_64-pc-linux-gnu-accel-nvptx-none-gcc returned 1 exit status
+
+   ..., or for GCN get an implicit definition (running with
+   '--trace-symbol=__dso_handle'):
+
+       ./a.xamdgcn-amdhsa.mkoffload.hsaco-a.xamdgcn-amdhsa.mkoffload.2.o: reference to __dso_handle
+       <internal>: definition of __dso_handle
+
+   ..., which might be fine, but let's just make it explicit.  */
+
+/* There are no DSOs; this is the main program.  */
+attribute_hidden void * const __dso_handle = 0;
+
+/* If this file gets linked in, that means that '__dso_handle' has been
+   referenced (for '__cxa_atexit'), and in that case, we also have to run
+   '__cxa_finalize'.  Make that happen by overriding the weak libgcc dummy
+   function '__GCC_offload___cxa_finalize'.  */
+
+void
+__GCC_offload___cxa_finalize (void *dso_handle)
+{
+  __cxa_finalize (dso_handle);
+}
diff --git a/libgomp/config/gcn/bar.h b/libgomp/config/gcn/bar.h
index b62d3af..4402b10 100644
--- a/libgomp/config/gcn/bar.h
+++ b/libgomp/config/gcn/bar.h
@@ -55,6 +55,9 @@ typedef unsigned int gomp_barrier_state_t;
 
 static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
 {
+  unsigned actual_thread_count = __builtin_gcn_dim_size (1);
+  if (count > actual_thread_count)
+    count = actual_thread_count;
   bar->total = count;
   bar->awaited = count;
   bar->awaited_final = count;
diff --git a/libgomp/config/gcn/target.c b/libgomp/config/gcn/target.c
index 1e98f1d..5327ad3 100644
--- a/libgomp/config/gcn/target.c
+++ b/libgomp/config/gcn/target.c
@@ -122,19 +122,38 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
 	   <= (index - 1024))
       asm ("s_sleep 64");
 
+  /* In theory, it should be enough to write "written" with __ATOMIC_RELEASE,
+     and have the rest of the data flushed to memory automatically, but some
+     devices (gfx908) seem to have a race condition where the flushed data
+     arrives after the atomic data, and the host does the wrong thing.
+     If we just write everything atomically in the correct order then we're
+     safe.  */
+
   unsigned int slot = index % 1024;
-  data->queue[slot].value_u64[0] = (uint64_t) fn;
-  data->queue[slot].value_u64[1] = (uint64_t) mapnum;
-  data->queue[slot].value_u64[2] = (uint64_t) hostaddrs;
-  data->queue[slot].value_u64[3] = (uint64_t) sizes;
-  data->queue[slot].value_u64[4] = (uint64_t) kinds;
-  data->queue[slot].value_u64[5] = (uint64_t) GOMP_ADDITIONAL_ICVS.device_num;
-
-  data->queue[slot].type = 4; /* Reverse offload.  */
+  __atomic_store_n (&data->queue[slot].value_u64[0], (uint64_t) fn,
+		    __ATOMIC_RELAXED);
+  __atomic_store_n (&data->queue[slot].value_u64[1], (uint64_t) mapnum,
+		    __ATOMIC_RELAXED);
+  __atomic_store_n (&data->queue[slot].value_u64[2], (uint64_t) hostaddrs,
+		    __ATOMIC_RELAXED);
+  __atomic_store_n (&data->queue[slot].value_u64[3], (uint64_t) sizes,
+		    __ATOMIC_RELAXED);
+  __atomic_store_n (&data->queue[slot].value_u64[4], (uint64_t) kinds,
+		    __ATOMIC_RELAXED);
+  __atomic_store_n (&data->queue[slot].value_u64[5],
+		    (uint64_t) GOMP_ADDITIONAL_ICVS.device_num,
+		    __ATOMIC_RELAXED);
+
+  volatile int signal = 0;
+  __atomic_store_n (&data->queue[slot].value_u64[6], (uint64_t) &signal,
+		    __ATOMIC_RELAXED);
+
+  __atomic_store_n (&data->queue[slot].type, 4 /* Reverse offload.  */,
+		    __ATOMIC_RELAXED);
   __atomic_store_n (&data->queue[slot].written, 1, __ATOMIC_RELEASE);
 
-  /* Spinlock while the host catches up.  */
-  while (__atomic_load_n (&data->queue[slot].written, __ATOMIC_ACQUIRE) != 0)
+  /* Spinlock while the host runs the kernel.  */
+  while (__atomic_load_n (&signal, __ATOMIC_ACQUIRE) == 0)
     asm ("s_sleep 64");
 }
 
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index 40827ce..939ee87 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -209,6 +209,10 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
   if (nthreads == 1)
     return;
 
+  unsigned actual_thread_count = __builtin_gcn_dim_size (1);
+  if (nthreads > actual_thread_count)
+    nthreads = actual_thread_count;
+
   /* Release existing idle threads.  */
   for (unsigned i = 1; i < nthreads; ++i)
     {
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index 8dea959..845ee27 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -36,6 +36,11 @@
 
 /* Implement malloc routines that can handle pinned memory on Linux.
    
+   Given that pinned memory is typically used to help host <-> device memory
+   transfers, we attempt to allocate such memory using a device (really:
+   libgomp plugin), but fall back to mmap plus mlock if no suitable device is
+   available.
+
    It's possible to use mlock on any heap memory, but using munlock is
    problematic if there are multiple pinned allocations on the same page.
    Tracking all that manually would be possible, but adds overhead. This may
@@ -48,50 +53,147 @@
 
 #define _GNU_SOURCE
 #include <sys/mman.h>
+#include <unistd.h>
 #include <string.h>
+#include <assert.h>
 #include "libgomp.h"
 #ifdef HAVE_INTTYPES_H
 # include <inttypes.h>  /* For PRIu64.  */
 #endif
 
+static bool always_pinned_mode = false;
+
+/* This function is called by the compiler when -foffload-memory=pinned
+   is used.  */
+
+void
+GOMP_enable_pinned_mode ()
+{
+  if (mlockall (MCL_CURRENT | MCL_FUTURE) != 0)
+    gomp_error ("failed to pin all memory (ulimit too low?)");
+  else
+    always_pinned_mode = true;
+}
+
+static int using_device_for_page_locked
+  = /* uninitialized */ -1;
+
+
+static usmpin_ctx_p pin_ctx = NULL;
+static pthread_once_t ctxlock = PTHREAD_ONCE_INIT;
+
+static void
+linux_init_pin_ctx ()
+{
+  pin_ctx = usmpin_init_context ();
+}
+
 static void *
-linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
+linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
+		      bool init0)
 {
-  (void)memspace;
+  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d, init0=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace,
+	      (unsigned long long) size, pin, init0);
+
+  void *addr = NULL;
+
+  /* Explicit pinning may not be required.  */
+  pin = pin && !always_pinned_mode;
 
   if (pin)
     {
-      /* Note that mmap always returns zeroed memory and is therefore also a
-	 suitable implementation of calloc.  */
-      void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
-			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-      if (addr == MAP_FAILED)
-	return NULL;
-
-      if (mlock (addr, size))
+      int using_device
+	= __atomic_load_n (&using_device_for_page_locked,
+			   MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
+      if (using_device != 0)
 	{
+	  using_device = gomp_page_locked_host_alloc (&addr, size);
+	  int using_device_old
+	    = __atomic_exchange_n (&using_device_for_page_locked,
+				   using_device, MEMMODEL_RELAXED);
+	  gomp_debug (0, "  using_device=%d, using_device_old=%d\n",
+		      using_device, using_device_old);
+	  assert (using_device_old == -1
+		  /* We shouldn't have concurrently changed our mind.  */
+		  || using_device_old == using_device);
+	}
+      if (using_device == 0)
+	{
+	  static int pagesize = 0;
+	  static void *addrhint = NULL;
+
+	  if (!pagesize)
+	    pagesize = sysconf(_SC_PAGE_SIZE);
+ 
+	  while (1)
+	    {
+	      addr = usmpin_alloc (pin_ctx, size);
+	      if (addr)
+		break;
+
+	      gomp_debug (0, "  mmap\n");
+
+	      /* Round up to a whole page.  */
+	      size_t misalignment = size % pagesize;
+	      size_t mmap_size = (misalignment > 0
+				  ? size + pagesize - misalignment
+				  : size);
+	      void *newpage = mmap (addrhint, mmap_size, PROT_READ | PROT_WRITE,
+				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	      if (newpage == MAP_FAILED)
+		break;
+	      else
+		{
+		  gomp_debug (0, "  mlock\n");
+		  if (mlock (newpage, size))
+		    {
 #ifdef HAVE_INTTYPES_H
-	  gomp_debug (0, "libgomp: failed to pin %"PRIu64" bytes of"
-		      " memory (ulimit too low?)\n", (uint64_t) size);
+		      gomp_debug (0, "libgomp: failed to pin %"PRIu64" bytes"
+				  " of memory (ulimit too low?)\n",
+				  (uint64_t) size);
 #else
-	  gomp_debug (0, "libgomp: failed to pin %lu bytes of"
-		      " memory (ulimit too low?)\n", (unsigned long) size);
+		      gomp_debug (0, "libgomp: failed to pin %lu bytes of"
+				  " memory (ulimit too low?)\n",
+				  (unsigned long) size);
 #endif
-	  munmap (addr, size);
-	  return NULL;
-	}
+		      munmap (newpage, size);
+		      break;
+		    }
 
-      return addr;
+		  addrhint = newpage + mmap_size;
+
+		  pthread_once (&ctxlock, linux_init_pin_ctx);
+		  usmpin_register_memory (pin_ctx, newpage, mmap_size);
+		}
+	    }
+	}
     }
   else
-    return malloc (size);
+    addr = malloc (size);
+
+  if (addr && init0)
+    {
+      gomp_debug (0, "  init0\n");
+      memset (addr, 0, size);
+    }
+
+  return addr;
 }
 
 static void *
 linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
+
+  /* Explicit pinning may not be required.  */
+  pin = pin && !always_pinned_mode;
+
   if (pin)
-    return linux_memspace_alloc (memspace, size, pin);
+    return linux_memspace_alloc (memspace, size, pin, true);
   else
     return calloc (1, size);
 }
@@ -100,10 +202,24 @@ static void
 linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
 		     int pin)
 {
-  (void)memspace;
+  gomp_debug (0, "%s: memspace=%llu, addr=%p, size=%llu, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) size, pin);
+
+  /* Explicit pinning may not be required.  */
+  pin = pin && !always_pinned_mode;
 
   if (pin)
-    munmap (addr, size);
+    {
+      int using_device
+	= __atomic_load_n (&using_device_for_page_locked,
+			   MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
+      if (using_device == 1)
+	gomp_page_locked_host_free (addr);
+      else
+	usmpin_free (pin_ctx, addr);
+    }
   else
     free (addr);
 }
@@ -112,27 +228,45 @@ static void *
 linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
 			size_t oldsize, size_t size, int oldpin, int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, addr=%p, oldsize=%llu, size=%llu, oldpin=%d, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) oldsize, (unsigned long long) size, oldpin, pin);
+
+  /* Explicit pinning may not be required.  */
+  pin = pin && !always_pinned_mode;
+
   if (oldpin && pin)
     {
-      void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
-      if (newaddr == MAP_FAILED)
-	return NULL;
+      int using_device
+	= __atomic_load_n (&using_device_for_page_locked,
+		       MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
 
-      return newaddr;
-    }
-  else if (oldpin || pin)
-    {
-      void *newaddr = linux_memspace_alloc (memspace, size, pin);
-      if (newaddr)
+      /* The device plugin API does not support realloc,
+	 but the usmpin allocator does.  */
+      if (using_device == 0)
 	{
-	  memcpy (newaddr, addr, oldsize < size ? oldsize : size);
-	  linux_memspace_free (memspace, addr, oldsize, oldpin);
+	  /* This can fail if there is insufficient pinned memory free.  */
+	  void *newaddr = usmpin_realloc (pin_ctx, addr, size);
+	  if (newaddr)
+	    return newaddr;
 	}
-
-      return newaddr;
     }
+  else if (oldpin || pin)
+    /* Moving from pinned to unpinned memory cannot be done in-place.  */
+    ;
   else
     return realloc (addr, size);
+
+  /* In-place reallocation failed.  Fall back to copy.  */
+  void *newaddr = linux_memspace_alloc (memspace, size, pin, false);
+  if (newaddr)
+    {
+      memcpy (newaddr, addr, oldsize < size ? oldsize : size);
+      linux_memspace_free (memspace, addr, oldsize, oldpin);
+    }
+
+  return newaddr;
 }
 
 static int
@@ -143,7 +277,7 @@ linux_memspace_validate (omp_memspace_handle_t, unsigned, int)
 }
 
 #define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
-  linux_memspace_alloc (MEMSPACE, SIZE, PIN)
+  linux_memspace_alloc (MEMSPACE, SIZE, PIN, false)
 #define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
   linux_memspace_calloc (MEMSPACE, SIZE, PIN)
 #define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
diff --git a/libgomp/config/nvptx/libgomp-nvptx.h b/libgomp/config/nvptx/libgomp-nvptx.h
index 40586d3..fb1891b 100644
--- a/libgomp/config/nvptx/libgomp-nvptx.h
+++ b/libgomp/config/nvptx/libgomp-nvptx.h
@@ -25,20 +25,41 @@
 
 /* This file contains defines and type definitions shared between the
    nvptx target's libgomp.a and the plugin-nvptx.c, but that is only
-   needef for this target.  */
+   needed for this target.  */
 
 #ifndef LIBGOMP_NVPTX_H
 #define LIBGOMP_NVPTX_H 1
 
 #define GOMP_REV_OFFLOAD_VAR __gomp_rev_offload_var
+#define REV_OFFLOAD_QUEUE_SIZE 1024
 
 struct rev_offload {
-  uint64_t fn;
-  uint64_t mapnum;
-  uint64_t addrs;
-  uint64_t sizes;
-  uint64_t kinds;
-  int32_t dev_num;
+  /* The target can grab a slot by incrementing "next_slot".
+     Each host thread may claim some slots for processing.
+     When the host processing is completed "consumed" indicates that the
+     corresponding slots in the ring-buffer "queue" are available for reuse.
+
+     Note that "next_slot" is an index, and "consumed"/"claimed" are counters,
+     so beware of the fence-posts.  */
+  unsigned int next_slot;
+  unsigned int consumed;
+  unsigned int claimed;
+
+  struct rev_req {
+    /* The target writes an address to "signal" as the last item, which
+       indicates to the host that the record is completely written.  The target
+       must not assume that it still owns the slot, after that.  The signal
+       address is then used by the host to communicate that the reverse-offload
+       kernel has completed execution.  */
+    volatile int *signal;
+
+    uint64_t fn;
+    uint64_t mapnum;
+    uint64_t addrs;
+    uint64_t sizes;
+    uint64_t kinds;
+    int32_t dev_num;
+  } queue[REV_OFFLOAD_QUEUE_SIZE];
 };
 
 #if (__SIZEOF_SHORT__ != 2 \
diff --git a/libgomp/config/nvptx/oacc-profiling-acc_register_library.c b/libgomp/config/nvptx/oacc-profiling-acc_register_library.c
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/libgomp/config/nvptx/oacc-profiling-acc_register_library.c
diff --git a/libgomp/config/nvptx/oacc-profiling.c b/libgomp/config/nvptx/oacc-profiling.c
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/libgomp/config/nvptx/oacc-profiling.c
diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c
index 715d993..690db59 100644
--- a/libgomp/config/nvptx/target.c
+++ b/libgomp/config/nvptx/target.c
@@ -101,7 +101,6 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
 		 void **hostaddrs, size_t *sizes, unsigned short *kinds,
 		 unsigned int flags, void **depend, void **args)
 {
-  static int lock = 0;  /* == gomp_mutex_t lock; gomp_mutex_init (&lock); */
   (void) flags;
   (void) depend;
   (void) args;
@@ -111,43 +110,57 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
       || GOMP_REV_OFFLOAD_VAR == NULL)
     return;
 
-  gomp_mutex_lock (&lock);
-
-  GOMP_REV_OFFLOAD_VAR->mapnum = mapnum;
-  GOMP_REV_OFFLOAD_VAR->addrs = (uint64_t) hostaddrs;
-  GOMP_REV_OFFLOAD_VAR->sizes = (uint64_t) sizes;
-  GOMP_REV_OFFLOAD_VAR->kinds = (uint64_t) kinds;
-  GOMP_REV_OFFLOAD_VAR->dev_num = GOMP_ADDITIONAL_ICVS.device_num;
-
-  /* Set 'fn' to trigger processing on the host; wait for completion,
-     which is flagged by setting 'fn' back to 0 on the host.  */
-  uint64_t addr_struct_fn = (uint64_t) &GOMP_REV_OFFLOAD_VAR->fn;
+  /* Reserve one slot. */
+  unsigned int index = __atomic_fetch_add (&GOMP_REV_OFFLOAD_VAR->next_slot,
+					   1, __ATOMIC_ACQUIRE);
+
+  if ((unsigned int) (index + 1) < GOMP_REV_OFFLOAD_VAR->consumed)
+    abort ();  /* Overflow.  */
+
+  /* Spinlock while the host catches up.  */
+  if (index >= REV_OFFLOAD_QUEUE_SIZE)
+    while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->consumed, __ATOMIC_ACQUIRE)
+	   <= (index - REV_OFFLOAD_QUEUE_SIZE))
+      ; /* spin  */
+
+  unsigned int slot = index % REV_OFFLOAD_QUEUE_SIZE;
+  GOMP_REV_OFFLOAD_VAR->queue[slot].fn = (uint64_t) fn;
+  GOMP_REV_OFFLOAD_VAR->queue[slot].mapnum = mapnum;
+  GOMP_REV_OFFLOAD_VAR->queue[slot].addrs = (uint64_t) hostaddrs;
+  GOMP_REV_OFFLOAD_VAR->queue[slot].sizes = (uint64_t) sizes;
+  GOMP_REV_OFFLOAD_VAR->queue[slot].kinds = (uint64_t) kinds;
+  GOMP_REV_OFFLOAD_VAR->queue[slot].dev_num = GOMP_ADDITIONAL_ICVS.device_num;
+
+  /* Set 'signal' to trigger processing on the host; the slot is now consumed
+     by the host, so we should not touch it again.  */
+  volatile int signal = 0;
+  uint64_t addr_struct_signal = (uint64_t) &GOMP_REV_OFFLOAD_VAR->queue[slot].signal;
 #if __PTX_SM__ >= 700
   asm volatile ("st.global.release.sys.u64 [%0], %1;"
-		: : "r"(addr_struct_fn), "r" (fn) : "memory");
+		: : "r"(addr_struct_signal), "r" (&signal) : "memory");
 #else
   __sync_synchronize ();  /* membar.sys */
   asm volatile ("st.volatile.global.u64 [%0], %1;"
-		: : "r"(addr_struct_fn), "r" (fn) : "memory");
+		: : "r"(addr_struct_signal), "r" (&signal) : "memory");
 #endif
 
+  /* The host signals completion by writing a non-zero value to the 'signal'
+     variable.  */
 #if __PTX_SM__ >= 700
-  uint64_t fn2;
+  uint64_t signal2;
   do
     {
       asm volatile ("ld.acquire.sys.global.u64 %0, [%1];"
-		    : "=r" (fn2) : "r" (addr_struct_fn) : "memory");
+		    : "=r" (signal2) : "r" (&signal) : "memory");
     }
-  while (fn2 != 0);
+  while (signal2 == 0);
 #else
   /* ld.global.u64 %r64,[__gomp_rev_offload_var];
      ld.u64 %r36,[%r64];
      membar.sys;  */
-  while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->fn, __ATOMIC_ACQUIRE) != 0)
+  while (__atomic_load_n (&signal, __ATOMIC_ACQUIRE) == 0)
     ;  /* spin  */
 #endif
-
-  gomp_mutex_unlock (&lock);
 }
 
 void
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index 4227344..e1a0d37 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -34,6 +34,9 @@
 struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
 int __gomp_team_num __attribute__((shared,nocommon));
 
+/* Number of active target threads in team, used in ACC mode.  */
+unsigned int __nvptx_omp_num_threads __attribute__((shared,nocommon));
+
 static void gomp_thread_start (struct gomp_thread_pool *);
 extern void build_indirect_map (void);
 
diff --git a/libgomp/env.c b/libgomp/env.c
index 626a753..f48a95e 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -124,6 +124,7 @@ size_t gomp_affinity_format_len;
 char *goacc_device_type;
 int goacc_device_num;
 int goacc_default_dims[GOMP_DIM_MAX];
+int gomp_reverse_offload_threads = 8;  /* Reasonable default.  */
 
 #ifndef LIBGOMP_OFFLOADED_ONLY
 
@@ -2489,6 +2490,11 @@ initialize_env (void)
 
   handle_omp_display_env ();
 
+  /* Control the number of background threads reverse offload is permitted
+     to use.  */
+  parse_int_secure ("GOMP_REVERSE_OFFLOAD_THREADS",
+		    &gomp_reverse_offload_threads, false);
+
   /* OpenACC.  */
 
   if (!parse_int ("ACC_DEVICE_NUM", getenv ("ACC_DEVICE_NUM"),
diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
index 5e2cb9b..edbdfb7 100644
--- a/libgomp/libgomp-plugin.c
+++ b/libgomp/libgomp-plugin.c
@@ -82,8 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
 void
 GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
 			uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
-			struct goacc_asyncqueue *aq)
+			volatile int *signal, bool use_aq)
 {
   gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
-		   aq);
+		   signal, use_aq);
 }
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 924fc1f..3c7741b 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -150,7 +150,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
 	__attribute__ ((noreturn, format (printf, 1, 2)));
 
 extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
-				    uint64_t, int, struct goacc_asyncqueue *);
+				    uint64_t, int, volatile int *, bool);
 
 /* Prototypes for functions implemented by libgomp plugins.  */
 extern const char *GOMP_OFFLOAD_get_name (void);
@@ -167,6 +167,8 @@ extern int GOMP_OFFLOAD_load_image (int, unsigned, const void *,
 extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *);
 extern void *GOMP_OFFLOAD_alloc (int, size_t);
 extern bool GOMP_OFFLOAD_free (int, void *);
+extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
+extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
 extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
@@ -200,6 +202,8 @@ extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size
 						 struct goacc_asyncqueue *);
 extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
 						 struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_dev2dev (int, void *, const void *, size_t,
+						struct goacc_asyncqueue *);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 6030f9d..571ac62c 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -620,6 +620,7 @@ extern struct gomp_offload_icv_list *gomp_offload_icv_list;
 extern int goacc_device_num;
 extern char *goacc_device_type;
 extern int goacc_default_dims[GOMP_DIM_MAX];
+extern int gomp_reverse_offload_threads;
 
 enum gomp_task_kind
 {
@@ -1134,7 +1135,9 @@ extern void gomp_init_targets_once (void);
 extern int gomp_get_num_devices (void);
 extern bool gomp_target_task_fn (void *);
 extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
-			     int, struct goacc_asyncqueue *);
+			     int, volatile int *, bool);
+extern bool gomp_page_locked_host_alloc (void **, size_t);
+extern void gomp_page_locked_host_free (void *);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
@@ -1331,6 +1334,21 @@ struct target_mem_desc {
 };
 
 
+/* A rectangular section of an array, for noncontiguous target update
+   operations.  Must be kept in sync with
+   omp-low.cc:omp_noncontig_descriptor_type.  */
+
+typedef struct {
+  size_t ndims;
+  size_t elemsize;
+  size_t span;
+  size_t *dim;
+  size_t *index;
+  size_t *length;
+  size_t *stride;
+} omp_noncontig_array_desc;
+
+
 typedef struct acc_dispatch_t
 {
   /* Execute.  */
@@ -1360,6 +1378,7 @@ typedef struct acc_dispatch_t
     __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
     __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
     __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_dev2dev) *dev2dev_func;
   } async;
 
   __typeof (GOMP_OFFLOAD_openacc_get_property) *get_property_func;
@@ -1418,6 +1437,8 @@ struct gomp_device_descr
   __typeof (GOMP_OFFLOAD_unload_image) *unload_image_func;
   __typeof (GOMP_OFFLOAD_alloc) *alloc_func;
   __typeof (GOMP_OFFLOAD_free) *free_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
   __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
   __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
   __typeof (GOMP_OFFLOAD_memcpy2d) *memcpy2d_func;
@@ -1458,8 +1479,6 @@ enum gomp_map_vars_kind
   GOMP_MAP_VARS_ENTER_DATA = 8
 };
 
-extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
-				       unsigned short *);
 struct gomp_coalesce_buf;
 extern void gomp_copy_host2dev (struct gomp_device_descr *,
 				struct goacc_asyncqueue *, void *, const void *,
@@ -1467,6 +1486,9 @@ extern void gomp_copy_host2dev (struct gomp_device_descr *,
 extern void gomp_copy_dev2host (struct gomp_device_descr *,
 				struct goacc_asyncqueue *, void *, const void *,
 				size_t);
+extern void gomp_copy_dev2dev (struct gomp_device_descr *,
+			       struct goacc_asyncqueue *, void *, const void *,
+			       size_t);
 extern uintptr_t gomp_map_val (struct target_mem_desc *, void **, size_t);
 extern bool gomp_attach_pointer (struct gomp_device_descr *,
 				 struct goacc_asyncqueue *, splay_tree,
@@ -1475,10 +1497,13 @@ extern bool gomp_attach_pointer (struct gomp_device_descr *,
 extern void gomp_detach_pointer (struct gomp_device_descr *,
 				 struct goacc_asyncqueue *, splay_tree_key,
 				 uintptr_t, bool, struct gomp_coalesce_buf *);
+struct goacc_ncarray_info;
 extern struct target_mem_desc *goacc_map_vars (struct gomp_device_descr *,
 					       struct goacc_asyncqueue *,
 					       size_t, void **, void **,
-					       size_t *, void *, bool,
+					       size_t *, void *,
+					       struct goacc_ncarray_info *,
+					       bool,
 					       enum gomp_map_vars_kind);
 extern void goacc_unmap_vars (struct target_mem_desc *, bool,
 			      struct goacc_asyncqueue *);
@@ -1663,4 +1688,14 @@ gomp_thread_to_pthread_t (struct gomp_thread *thr)
 }
 #endif
 
+/* usmpin-allocator.c  */
+
+typedef struct usmpin_context *usmpin_ctx_p;
+
+usmpin_ctx_p usmpin_init_context ();
+void usmpin_register_memory (usmpin_ctx_p ctx, char *base, size_t size);
+void *usmpin_alloc (usmpin_ctx_p ctx, size_t size);
+void usmpin_free (usmpin_ctx_p ctx, void *addr);
+void *usmpin_realloc (usmpin_ctx_p ctx, void *addr, size_t newsize);
+
 #endif /* LIBGOMP_H */
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index eae2f53..bc2de6b 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -406,6 +406,7 @@ GOMP_5.0.1 {
   global:
 	GOMP_alloc;
 	GOMP_free;
+	GOMP_enable_pinned_mode;
 } GOMP_5.0;
 
 GOMP_5.1 {
@@ -609,6 +610,12 @@ OACC_2.6 {
 	acc_get_property_string_h_;
 } OACC_2.5.1;
 
+OACC_2.6.1 {
+  global:
+	acc_memcpy_device;
+	acc_memcpy_device_async;
+} OACC_2.6;
+
 GOACC_2.0 {
   global:
 	GOACC_data_end;
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 6909c2b..e1b70b0 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -232,7 +232,7 @@ The OpenMP 4.5 specification is fully supported.
       @tab Y @tab See also @ref{Memory allocation}
 @item Memory management routines @tab Y @tab
 @item @code{allocate} directive @tab P
-      @tab C++ unsupported; see also @ref{Memory allocation}
+      @tab Stack and static variables; see also @ref{Memory allocation}
 @item @code{allocate} clause @tab P @tab Clause has no effect on @code{target}
       (@uref{https://gcc.gnu.org/PR113436,PR113436})
 @item @code{use_device_addr} clause on @code{target data} @tab Y @tab
@@ -305,7 +305,7 @@ The OpenMP 4.5 specification is fully supported.
 @item @code{strict} modifier in the @code{grainsize} and @code{num_tasks}
       clauses of the @code{taskloop} construct @tab Y @tab
 @item @code{align} clause in @code{allocate} directive @tab P
-      @tab Only C and Fortran
+      @tab Supported
 @item @code{align} modifier in @code{allocate} clause @tab Y @tab
 @item @code{thread_limit} clause to @code{target} construct @tab Y @tab
 @item @code{has_device_addr} clause to @code{target} construct @tab Y @tab
@@ -484,7 +484,7 @@ to address of matching mapped list item per 5.1, Sect. 2.21.7.2 @tab N @tab
 @item Support for pure directives in Fortran's @code{do concurrent} @tab N @tab
 @item All inarguable clauses take now an optional Boolean argument @tab N @tab
 @item The @code{adjust_args} clause was extended to specify the argument by position
-      and supports variadic arguments @tab N @tab
+      and supports variadic arguments @tab Y @tab
 @item For Fortran, @emph{locator list} can be also function reference with
       data pointer result @tab N @tab
 @item Concept of @emph{assumed-size arrays} in C and C++
@@ -543,7 +543,7 @@ to address of matching mapped list item per 5.1, Sect. 2.21.7.2 @tab N @tab
 @item New @code{partitioner} value to @code{partition} allocator trait
       @tab N @tab
 @item Semicolon-separated list to @code{uses_allocators} @tab N @tab
-@item New @code{need_device_addr} modifier to @code{adjust_args} clause @tab N @tab
+@item New @code{need_device_addr} modifier to @code{adjust_args} clause @tab Y @tab
 @item @code{interop} clause to @code{dispatch} @tab Y @tab
 @item Scope requirement changes for @code{declare_target} @tab N @tab
 @item @code{message} and @code{severity} clauses to @code{parallel} directive
@@ -3912,6 +3912,7 @@ variable is not set.
 * GOMP_STACKSIZE::          Set default thread stack size
 * GOMP_SPINCOUNT::          Set the busy-wait spin count
 * GOMP_RTEMS_THREAD_POOLS:: Set the RTEMS specific thread pools
+* GOMP_REVERSE_OFFLOAD_THREADS:: Set the maximum number of host threads
 @end menu
 
 
@@ -4677,6 +4678,22 @@ pools available and their worker threads run at priority four.
 
 
 
+@node GOMP_REVERSE_OFFLOAD_THREADS
+@section @env{GOMP_REVERSE_OFFLOAD_THREADS} -- Set the maximum number of host threads
+@cindex Environment Variable
+@table @asis
+@item @emph{Description}
+Set the maximum number of threads that may be used to run reverse offload
+code sections (host code nested within offload regions, declared using
+@code{#pragma omp target device(ancestor:1)}).  The value should be a positive
+integer.  The default is 8 threads.
+
+The threads are created on demand, up to the maximum number given, and are
+destroyed when no reverse offload requests remain.
+@end table
+
+
+
 @c ---------------------------------------------------------------------
 @c Enabling OpenACC
 @c ---------------------------------------------------------------------
@@ -4695,7 +4712,7 @@ See @uref{https://gcc.gnu.org/wiki/OpenACC} for more information.
 
 A complete description of all OpenACC directives accepted may be found in 
 the @uref{https://www.openacc.org, OpenACC} Application Programming
-Interface manual, version 2.6.
+Interface manual, version 2.7.
 
 
 
@@ -4707,7 +4724,7 @@ Interface manual, version 2.6.
 @chapter OpenACC Runtime Library Routines
 
 The runtime routines described here are defined by section 3 of the OpenACC
-specifications in version 2.6.
+specifications in version 2.7.
 They have C linkage, and do not throw exceptions.
 Generally, they are available only for the host, with the exception of
 @code{acc_on_device}, which is available for both the host and the
@@ -4763,6 +4780,7 @@ acceleration device.
                                 present on device.
 * acc_memcpy_to_device::        Copy host memory to device memory.
 * acc_memcpy_from_device::      Copy device memory to host memory.
+* acc_memcpy_device::           Copy memory within a device.
 * acc_attach::                  Let device pointer point to device-pointer target.
 * acc_detach::                  Let device pointer point to host-pointer target.
 
@@ -4802,7 +4820,7 @@ for the device type specified in @var{devicetype}.
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+@uref{https://www.openacc.org, OpenACC specification v2.7}, section
 3.2.1.
 @end table
 
@@ -5837,6 +5855,44 @@ This function copies device memory specified by device address of
 
 
 
+@node acc_memcpy_device
+@section @code{acc_memcpy_device} -- Copy memory within a device.
+@table @asis
+@item @emph{Description}
+This function copies device memory from one memory location to another
+on the current device.  It copies @var{bytes} bytes of data from the device
+address, specified by @var{data_dev_src}, to the device address
+@var{data_dev_dest}.  The @code{_async} version performs the transfer
+asynchronously using the queue associated with @var{async_arg}.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_memcpy_device(d_void* data_dev_dest,}
+@item                   @tab @code{d_void* data_dev_src, size_t bytes);}
+@item @emph{Prototype}: @tab @code{void acc_memcpy_device_async(d_void* data_dev_dest,}
+@item                   @tab @code{d_void* data_dev_src, size_t bytes, int async_arg);}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device(data_dev_dest, &}
+@item                   @tab @code{data_dev_src, bytes)}
+@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device_async(data_dev_dest, &}
+@item                   @tab @code{data_dev_src, bytes, async_arg)}
+@item                   @tab @code{type(c_ptr), value :: data_dev_dest}
+@item                   @tab @code{type(c_ptr), value :: data_dev_src}
+@item                   @tab @code{integer(c_size_t), value :: bytes}
+@item                   @tab @code{integer(acc_handle_kind), value :: async_arg}
+@end multitable
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.33.  @uref{https://www.openacc.org, OpenACC specification v3.3}, section
+3.2.28.
+@end table
+
+
+
 @node acc_attach
 @section @code{acc_attach} -- Let device pointer point to device-pointer target.
 @table @asis
@@ -6074,7 +6130,7 @@ Function for library registration.
 @chapter OpenACC Environment Variables
 
 The variables @env{ACC_DEVICE_TYPE} and @env{ACC_DEVICE_NUM}
-are defined by section 4 of the OpenACC specification in version 2.0.
+are defined by section 4 of the OpenACC specification in version 2.6.
 The variable @env{ACC_PROFLIB}
 is defined by section 4 of the OpenACC specification in version 2.6.
 
@@ -6377,14 +6433,6 @@ We just handle one case specially, as required by CUDA 9.0
 @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
 callbacks.
 
-We're not yet implementing initialization via a
-@code{acc_register_library} function that is either statically linked
-in, or dynamically via @env{LD_PRELOAD}.
-Initialization via @code{acc_register_library} functions dynamically
-loaded via the @env{ACC_PROFLIB} environment variable does work, as
-does directly calling @code{acc_prof_register},
-@code{acc_prof_unregister}, @code{acc_prof_lookup}.
-
 As currently there are no inquiry functions defined, calls to
 @code{acc_prof_lookup} always returns @code{NULL}.
 
@@ -6803,8 +6851,9 @@ a @code{nearest} allocation.
 
 Additional notes regarding the traits:
 @itemize
-@item The @code{pinned} trait is supported on Linux hosts, but is subject to
-      the OS @code{ulimit}/@code{rlimit} locked memory settings.
+@item The @code{pinned} trait is supported on Linux hosts, but is usually
+      subject to the OS @code{ulimit}/@code{rlimit} locked memory settings (see
+      @ref{Offload-Target Specifics} for exceptions).
 @item The default for the @code{pool_size} trait is no pool and for every
       (re)allocation the associated library routine is called, which might
       internally use a memory pool.
@@ -6896,6 +6945,9 @@ The implementation remark:
       does not support XNACK, consider using @code{ROCR_VISIBLE_DEVICES} to
       enable only the APU.  If not supported, all AMD GPU devices are removed
       from the list of available devices (``host fallback'').
+@item OpenMP @emph{pinned} memory (@code{omp_atk_pinned},
+      @code{ompx_gnu_pinned_mem_alloc}, for example)
+      is allocated via @code{mmap}, @code{mlock}.
 @item The available stack size can be changed using the @code{GCN_STACK_SIZE}
       environment variable; the default is 32 kiB per thread.
 @item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the
@@ -6911,6 +6963,12 @@ The implementation remark:
       @code{omp_thread_mem_alloc}, all use low-latency memory as first
       preference, and fall back to main graphics memory when the low-latency
       pool is exhausted.
+@item Pinned memory allocated using @code{omp_alloc} with the
+      @code{ompx_gnu_pinned_mem_alloc} allocator or the @code{pinned} trait is
+      obtained via the CUDA API when an NVPTX device is present.  This provides
+      a performance boost for NVPTX offload code and also allows unlimited use
+      of pinned memory regardless of the OS @code{ulimit}/@code{rlimit}
+      settings.
 @item The OpenMP routines @code{omp_target_memcpy_rect} and
       @code{omp_target_memcpy_rect_async} and the @code{target update}
       directive for non-contiguous list items use the 3D memory-copy function
@@ -7054,6 +7112,11 @@ The implementation remark:
       @uref{https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements}}
       otherwise, all nvptx device are removed from the list of available
       devices (``host fallback'').
+@item OpenMP @emph{pinned} memory (@code{omp_atk_pinned},
+      @code{ompx_gnu_pinned_mem_alloc}, for example)
+      is allocated via @code{cuMemHostAlloc} (CUDA Driver API).
+      This potentially helps optimization of host <-> device data
+      transfers.
 @item The default per-warp stack size is 128 kiB; see also @code{-msoft-stack}
       in the GCC manual.
 @item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the
diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h
index 8993ec6..cdc4fc8 100644
--- a/libgomp/libgomp_g.h
+++ b/libgomp/libgomp_g.h
@@ -375,6 +375,7 @@ extern bool GOMP_is_alloc (void *);
 
 extern void *GOMP_alloc (size_t, size_t, uintptr_t);
 extern void GOMP_free (void *, uintptr_t);
+extern void GOMP_enable_pinned_mode (void);
 
 /* error.c */
 
@@ -402,7 +403,7 @@ extern void GOACC_parallel_keyed (int, void (*) (void *), size_t,
 extern void GOACC_parallel (int, void (*) (void *), size_t, void **, size_t *,
 			    unsigned short *, int, int, int, int, int, ...);
 extern void GOACC_data_start (int, size_t, void **, size_t *,
-			      unsigned short *);
+			      unsigned short *, ...);
 extern void GOACC_data_end (void);
 extern void GOACC_update (int, size_t, void **, size_t *,
 			  unsigned short *, int, int, ...);
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 3856f85..5fb1bb8 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -810,6 +810,16 @@ get_property_any (int ord, acc_device_t d, acc_device_property_t prop)
   if (d == acc_device_current && thr && thr->dev)
     return thr->dev->openacc.get_property_func (thr->dev->target_id, prop);
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
+  if (profiling_p)
+    {
+      prof_info.device_type = d;
+      prof_info.device_number = ord;
+    }
+
   gomp_mutex_lock (&acc_device_lock);
 
   struct gomp_device_descr *dev = resolve_device (d, true);
@@ -830,7 +840,16 @@ get_property_any (int ord, acc_device_t d, acc_device_property_t prop)
 
   assert (dev);
 
-  return dev->openacc.get_property_func (dev->target_id, prop);
+  union goacc_property_value propval =
+      dev->openacc.get_property_func (dev->target_id, prop);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return propval;
 }
 
 size_t
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 85b91dd..a24de9d 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -165,6 +165,58 @@ bool _goacc_profiling_setup_p (struct goacc_thread *,
 void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
 			       acc_api_info *);
 
+/* Definitions for data structures describing OpenACC non-contiguous arrays
+   (Note: interfaces with compiler)
+
+   The compiler generates a descriptor for each such array, places the
+   descriptor on stack, and passes the address of the descriptor to the libgomp
+   runtime as a normal map argument. The runtime then processes the array
+   data structure setup, and replaces the argument with the new actual
+   array address for the child function.
+
+   Care must be taken such that the struct field and layout assumptions
+   of struct goacc_ncarray_dim, goacc_ncarray_descr_type inside the compiler
+   be consistent with the declarations below.  */
+
+struct goacc_ncarray_dim {
+  size_t base;
+  size_t length;
+  size_t elem_size;
+  size_t is_array;
+};
+
+struct goacc_ncarray_descr_type
+{
+  size_t ndims;
+  struct goacc_ncarray_dim dims[];
+};
+
+/* Internal non-contiguous array info struct, used only here inside the runtime. */
+
+struct goacc_ncarray
+{
+  struct goacc_ncarray_descr_type *descr;
+  void *ptr;
+  size_t map_index;
+  size_t ptrblock_size;
+  void **data_rows;
+  void **tgt_data_rows;
+  size_t data_row_num;
+  size_t data_row_size;
+};
+
+struct goacc_ncarray_info
+{
+  size_t num_data_rows, num_ncarray;
+  void **data_rows;
+  void **tgt_data_rows;
+  struct goacc_ncarray ncarray[];
+};
+
+extern void goacc_noncontig_array_create_ptrblock (struct goacc_ncarray *,
+						   void *, void *);
+
+
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility pop
 #endif
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 0482ed3..e40b41b 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -171,21 +171,22 @@ acc_free (void *d)
 }
 
 static void
-memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
-		      const char *libfnname)
+memcpy_tofrom_device (bool dev_to, bool dev_from, void *dst, void *src,
+		      size_t s, int async, const char *libfnname)
 {
   /* No need to call lazy open here, as the device pointer must have
      been obtained from a routine that did that.  */
   struct goacc_thread *thr = goacc_thread ();
 
   assert (thr && thr->dev);
+  if (s == 0)
+    return;
 
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     {
-      if (from)
-	memmove (h, d, s);
-      else
-	memmove (d, h, s);
+      if (src == dst)
+	return;
+      memcpy (dst, src, s);
       return;
     }
 
@@ -199,10 +200,15 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
     }
 
   goacc_aq aq = get_goacc_asyncqueue (async);
-  if (from)
-    gomp_copy_dev2host (thr->dev, aq, h, d, s);
+  if (dev_to && dev_from)
+    {
+      if (dst != src)
+	gomp_copy_dev2dev (thr->dev, aq, dst, src, s);
+    }
+  else if (dev_from)
+    gomp_copy_dev2host (thr->dev, aq, dst, src, s);
   else
-    gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
+    gomp_copy_host2dev (thr->dev, aq, dst, src, s, false, /* TODO: cbuf? */ NULL);
 
   if (profiling_p)
     {
@@ -214,25 +220,37 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
 void
 acc_memcpy_to_device (void *d, void *h, size_t s)
 {
-  memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
+  memcpy_tofrom_device (true, false, d, h, s, acc_async_sync, __FUNCTION__);
 }
 
 void
 acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
 {
-  memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
+  memcpy_tofrom_device (true, false, d, h, s, async, __FUNCTION__);
 }
 
 void
 acc_memcpy_from_device (void *h, void *d, size_t s)
 {
-  memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
+  memcpy_tofrom_device (false, true, h, d, s, acc_async_sync, __FUNCTION__);
 }
 
 void
 acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
 {
-  memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
+  memcpy_tofrom_device (false, true, h, d, s, async, __FUNCTION__);
+}
+
+void
+acc_memcpy_device (void *dst, void *src, size_t s)
+{
+  memcpy_tofrom_device (true, true, dst, src, s, acc_async_sync, __FUNCTION__);
+}
+
+void
+acc_memcpy_device_async (void *dst, void *src, size_t s, int async)
+{
+  memcpy_tofrom_device (true, true, dst, src, s, async, __FUNCTION__);
 }
 
 /* Return the device pointer that corresponds to host data H.  Or NULL
@@ -403,7 +421,7 @@ acc_map_data (void *h, void *d, size_t s)
 
       struct target_mem_desc *tgt
 	= goacc_map_vars (acc_dev, NULL, mapnum, &hostaddrs, &devaddrs, &sizes,
-			  &kinds, true, GOMP_MAP_VARS_ENTER_DATA);
+			  &kinds, NULL, true, GOMP_MAP_VARS_ENTER_DATA);
       assert (tgt);
       assert (tgt->list_count == 1);
       splay_tree_key n = tgt->list[0].key;
@@ -568,7 +586,7 @@ goacc_enter_datum (void **hostaddrs, size_t *sizes, void *kinds, int async)
 
       struct target_mem_desc *tgt
 	= goacc_map_vars (acc_dev, aq, mapnum, hostaddrs, NULL, sizes,
-			  kinds, true, GOMP_MAP_VARS_ENTER_DATA);
+			  kinds, NULL, true, GOMP_MAP_VARS_ENTER_DATA);
       assert (tgt);
       assert (tgt->list_count == 1);
       n = tgt->list[0].key;
@@ -925,6 +943,35 @@ acc_update_self_async (void *h, size_t s, int async)
   update_dev_host (0, h, s, async);
 }
 
+/* Implement "declare allocate" and "declare deallocate" operations.  The
+   device lock must not be held before calling this function.  */
+
+static void
+gomp_acc_declare_allocate (bool allocate, bool pointer, void **hostaddrs,
+			   size_t *sizes, unsigned short *kinds)
+{
+  gomp_debug (0, "  %s: processing\n", __FUNCTION__);
+
+  if (allocate)
+    {
+      /* Allocate memory for the array data.  */
+      uintptr_t data = (uintptr_t) acc_create (hostaddrs[0], sizes[0]);
+
+      if (pointer)
+	{
+	  /* Update the PSET.  */
+	  acc_update_device (hostaddrs[1], sizes[1]);
+	  void *pset = acc_deviceptr (hostaddrs[1]);
+	  acc_memcpy_to_device (pset, &data, sizeof (uintptr_t));
+	}
+    }
+  else
+    /* Deallocate memory for the array data.  */
+    acc_delete (hostaddrs[0], sizes[0]);
+
+  gomp_debug (0, "  %s: end\n", __FUNCTION__);
+}
+
 void
 acc_attach_async (void **hostaddr, int async)
 {
@@ -1056,6 +1103,28 @@ find_group_last (int pos, size_t mapnum, size_t *sizes, unsigned short *kinds)
     case GOMP_MAP_ATTACH:
       break;
 
+    case GOMP_MAP_DECLARE_ALLOCATE:
+    case GOMP_MAP_DECLARE_DEALLOCATE:
+      {
+	/* The "declare allocate" and "declare deallocate" mappings can be
+	   used to specify either a scalar allocatable (which just appears as
+	   GOMP_MAP_DECLARE_{ALLOCATE,DEALLOCATE} by itself), or an array
+	   allocatable (which appears as that directive followed by a
+	   GOMP_MAP_TO_PSET and one (or more?) GOMP_MAP_POINTER mappings).  */
+	if (pos + 1 >= mapnum)
+	  break;
+
+	unsigned char kind1 = kinds[pos + 1] & 0xff;
+	if (kind1 != GOMP_MAP_TO_PSET)
+	  break;
+
+	pos++;
+
+	while (pos + 1 < mapnum && (kinds[pos + 1] & 0xff) == GOMP_MAP_POINTER)
+	  pos++;
+      }
+      break;
+
     default:
       /* GOMP_MAP_ALWAYS_POINTER can only appear directly after some other
 	 mapping.  */
@@ -1121,7 +1190,14 @@ goacc_enter_data_internal (struct gomp_device_descr *acc_dev, size_t mapnum,
 
       n = lookup_host (acc_dev, hostaddrs[i], size);
 
-      if (n && struct_p)
+      if ((kinds[i] & 0xff) == GOMP_MAP_DECLARE_ALLOCATE)
+	{
+	  gomp_mutex_unlock (&acc_dev->lock);
+	  gomp_acc_declare_allocate (true, group_last > i, &hostaddrs[i],
+				     &sizes[i], &kinds[i]);
+	  gomp_mutex_lock (&acc_dev->lock);
+	}
+      else if (n && struct_p)
 	{
 	  for (size_t j = i + 1; j <= group_last; j++)
 	    {
@@ -1206,7 +1282,7 @@ goacc_enter_data_internal (struct gomp_device_descr *acc_dev, size_t mapnum,
 	      gomp_mutex_unlock (&acc_dev->lock);
 	      struct target_mem_desc *tgt_ __attribute__((unused))
 		= goacc_map_vars (acc_dev, aq, groupnum, &hostaddrs[i], NULL,
-				  &sizes[i], &kinds[i], true,
+				  &sizes[i], &kinds[i], NULL, true,
 				  GOMP_MAP_VARS_ENTER_DATA);
 	      assert (tgt_ == NULL);
 	      gomp_mutex_lock (&acc_dev->lock);
@@ -1257,7 +1333,7 @@ goacc_enter_data_internal (struct gomp_device_descr *acc_dev, size_t mapnum,
 
 	  struct target_mem_desc *tgt
 	    = goacc_map_vars (acc_dev, aq, groupnum, &hostaddrs[i], NULL,
-			      &sizes[i], &kinds[i], true,
+			      &sizes[i], &kinds[i], NULL, true,
 			      GOMP_MAP_VARS_ENTER_DATA);
 	  assert (tgt);
 
@@ -1365,6 +1441,24 @@ goacc_exit_data_internal (struct gomp_device_descr *acc_dev, size_t mapnum,
 	     reference counts ('n->refcount', 'n->dynamic_refcount').  */
 	  break;
 
+	case GOMP_MAP_DECLARE_DEALLOCATE:
+	  {
+	    bool deallocate_pointer
+	      = i + 1 < mapnum && (kinds[i + 1] & 0xff) == GOMP_MAP_TO_PSET;
+	    gomp_mutex_unlock (&acc_dev->lock);
+	    gomp_acc_declare_allocate (false, deallocate_pointer,
+				       &hostaddrs[i], &sizes[i], &kinds[i]);
+	    gomp_mutex_lock (&acc_dev->lock);
+	    if (deallocate_pointer)
+	      {
+		i++;
+		while (i + 1 < mapnum
+		       && (kinds[i + 1] & 0xff) == GOMP_MAP_POINTER)
+		  i++;
+	      }
+	  }
+	  break;
+
 	default:
 	  gomp_fatal (">>>> goacc_exit_data_internal UNHANDLED kind 0x%.2x",
 			  kind);
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 388cabd..a1fb11b 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -36,7 +36,7 @@
 #include <string.h>
 #include <stdarg.h>
 #include <assert.h>
-
+#include <stdio.h>
 
 /* In the ABI, the GOACC_FLAGs are encoded as an inverted bitmask, so that we
    continue to support the following two legacy values.  */
@@ -46,6 +46,171 @@ _Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_HOST_FALLBACK)
 		== GOACC_FLAG_HOST_FALLBACK,
 		"legacy GOMP_DEVICE_HOST_FALLBACK broken");
 
+static size_t
+goacc_noncontig_array_count_rows (struct goacc_ncarray_descr_type *descr)
+{
+  size_t nrows = 1;
+  for (size_t d = 0; d < descr->ndims - 1; d++)
+    nrows *= descr->dims[d].length / sizeof (void *);
+  return nrows;
+}
+
+static void
+goacc_noncontig_array_compute_sizes (struct goacc_ncarray *nca)
+{
+  size_t d, n = 1;
+  struct goacc_ncarray_descr_type *descr = nca->descr;
+
+  nca->ptrblock_size = 0;
+  for (d = 0; d < descr->ndims - 1; d++)
+    {
+      size_t dim_count = descr->dims[d].length / descr->dims[d].elem_size;
+      size_t dim_ptrblock_size = (descr->dims[d + 1].is_array
+				  ? 0 : descr->dims[d].length * n);
+      nca->ptrblock_size += dim_ptrblock_size;
+      n *= dim_count;
+    }
+  nca->data_row_num = n;
+  nca->data_row_size = descr->dims[d].length;
+}
+
+static void
+goacc_noncontig_array_fill_rows_1 (struct goacc_ncarray_descr_type *descr, void *nca,
+				   size_t d, void ***row_ptr, size_t *count)
+{
+  if (d < descr->ndims - 1)
+    {
+      size_t elsize = descr->dims[d].elem_size;
+      size_t n = descr->dims[d].length / elsize;
+      void *p = nca + descr->dims[d].base;
+      for (size_t i = 0; i < n; i++)
+	{
+	  void *ptr = p + i * elsize;
+	  /* Deref if next dimension is not array.  */
+	  if (!descr->dims[d + 1].is_array)
+	    ptr = *((void **) ptr);
+	  goacc_noncontig_array_fill_rows_1 (descr, ptr, d + 1, row_ptr, count);
+	}
+    }
+  else
+    {
+      **row_ptr = nca + descr->dims[d].base;
+      *row_ptr += 1;
+      *count += 1;
+    }
+}
+
+static size_t
+goacc_noncontig_array_fill_rows (struct goacc_ncarray *nca)
+{
+  size_t count = 0;
+  void **p = nca->data_rows;
+  goacc_noncontig_array_fill_rows_1 (nca->descr, nca->ptr, 0, &p, &count);
+  return count;
+}
+
+static struct goacc_ncarray_info *
+goacc_process_noncontiguous_arrays (size_t mapnum, void **hostaddrs,
+				    unsigned short *kinds, va_list* ap)
+{
+  size_t i, nr, num_data_rows = 0, num_ncarray = 0, curr_row_start = 0;
+  struct goacc_ncarray_descr_type *descr;
+
+  /* We need to go over *ap twice, so preserve *ap state here.  */
+  va_list itr;
+  va_copy (itr, *ap);
+  for (i = 0; i < mapnum; i++)
+    if (GOMP_MAP_NONCONTIG_ARRAY_P (kinds[i] & 0xff))
+      {
+	descr = va_arg (itr, struct goacc_ncarray_descr_type *);
+	num_data_rows += goacc_noncontig_array_count_rows (descr);
+	num_ncarray += 1;
+      }
+    else
+      break;
+
+  /* Allocate the entire info struct, array entries, and row pointer
+     arrays in one large block.  */
+  struct goacc_ncarray_info *nca_info
+    = gomp_malloc (sizeof (struct goacc_ncarray_info)
+		   + sizeof (struct goacc_ncarray) * num_ncarray
+		   + sizeof (void *) * num_data_rows * 2);
+  nca_info->num_data_rows = num_data_rows;
+  nca_info->num_ncarray = num_ncarray;
+  nca_info->data_rows = (void **) (nca_info->ncarray + num_ncarray);
+  nca_info->tgt_data_rows = nca_info->data_rows + num_data_rows;
+
+  struct goacc_ncarray *curr_ncarray = nca_info->ncarray;
+  for (i = 0; i < mapnum; i++)
+    if (GOMP_MAP_NONCONTIG_ARRAY_P (kinds[i] & 0xff))
+      {
+	descr = va_arg (*ap, struct goacc_ncarray_descr_type *);
+	curr_ncarray->descr = descr;
+	curr_ncarray->ptr = hostaddrs[i];
+	curr_ncarray->map_index = i;
+
+	goacc_noncontig_array_compute_sizes (curr_ncarray);
+
+	curr_ncarray->data_rows = nca_info->data_rows + curr_row_start;
+	curr_ncarray->tgt_data_rows = nca_info->tgt_data_rows + curr_row_start;
+
+	nr = goacc_noncontig_array_fill_rows (curr_ncarray);
+	assert (nr == curr_ncarray->data_row_num);
+	curr_row_start += nr;
+	curr_ncarray += 1;
+      }
+    else
+      break;
+
+  return nca_info;
+}
+
+void
+goacc_noncontig_array_create_ptrblock (struct goacc_ncarray *nca,
+				       void *ptrblock,
+				       void *tgt_ptrblock_addr)
+{
+  struct goacc_ncarray_descr_type *descr = nca->descr;
+  void **tgt_data_rows = nca->tgt_data_rows;
+  void **curr_dim_ptrblock = (void **) ptrblock;
+  size_t n = 1;
+
+  for (size_t d = 0; d < descr->ndims - 1; d++)
+    {
+      int curr_dim_len = descr->dims[d].length;
+      int next_dim_len = descr->dims[d + 1].length;
+      int curr_dim_num = curr_dim_len / sizeof (void *);
+      size_t next_dim_bias = descr->dims[d + 1].base;
+
+      void *next_dim_ptrblock
+	= (void *)(curr_dim_ptrblock + n * curr_dim_num);
+
+      for (int b = 0; b < n; b++)
+	for (int i = 0; i < curr_dim_num; i++)
+	  {
+	    if (d < descr->ndims - 2)
+	      {
+		void *ptr = (next_dim_ptrblock
+			     + b * curr_dim_num * next_dim_len
+			     + i * next_dim_len);
+		void *tgt_ptr = (tgt_ptrblock_addr
+				 + (ptr - ptrblock) - next_dim_bias);
+		curr_dim_ptrblock[b * curr_dim_num + i] = tgt_ptr;
+	      }
+	    else
+	      {
+		curr_dim_ptrblock[b * curr_dim_num + i]
+		  = tgt_data_rows[b * curr_dim_num + i] - next_dim_bias;
+	      }
+	    void *addr = &curr_dim_ptrblock[b * curr_dim_num + i];
+	    assert (ptrblock <= addr && addr < ptrblock + nca->ptrblock_size);
+	  }
+
+      n *= curr_dim_num;
+      curr_dim_ptrblock = next_dim_ptrblock;
+    }
+  assert (n == nca->data_row_num);
+}
 
 /* Handle the mapping pair that are presented when a
    deviceptr clause is used with Fortran.  */
@@ -115,6 +280,7 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
   int async = GOMP_ASYNC_SYNC;
   unsigned dims[GOMP_DIM_MAX];
   unsigned tag;
+  struct goacc_ncarray_info *nca_info = NULL;
 
 #ifdef HAVE_INTTYPES_H
   gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n",
@@ -201,6 +367,8 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
       fn (hostaddrs);
       goto out_prof;
     }
+  else if (profiling_p)
+    api_info.device_api = acc_device_api_cuda;
 
   /* Default: let the runtime choose.  */
   for (i = 0; i != GOMP_DIM_MAX; i++)
@@ -250,13 +418,22 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
 	    break;
 	  }
 
+	  /*case GOMP_LAUNCH_NONCONTIG_ARRAYS:
+	  nca_info = goacc_process_noncontiguous_arrays (mapnum, hostaddrs,
+							 kinds, &ap);
+							 break;*/
+
 	default:
 	  gomp_fatal ("unrecognized offload code '%d',"
 		      " libgomp is too old", GOMP_LAUNCH_CODE (tag));
 	}
     }
+
+  if (mapnum > 0 && GOMP_MAP_NONCONTIG_ARRAY_P (kinds[0] & 0xff))
+    nca_info = goacc_process_noncontiguous_arrays (mapnum, hostaddrs, kinds, &ap);
+
   va_end (ap);
-  
+
   if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
     {
       k.host_start = (uintptr_t) fn;
@@ -292,8 +469,12 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
   goacc_aq aq = get_goacc_asyncqueue (async);
 
   struct target_mem_desc *tgt
-    = goacc_map_vars (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		      GOMP_MAP_VARS_TARGET);
+      = goacc_map_vars (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
+			nca_info, true, GOMP_MAP_VARS_TARGET);
+  if (aq == NULL)
+    free (nca_info);
+  else
+    acc_dev->openacc.async.queue_callback_func (aq, free, nca_info);
 
   if (profiling_p)
     {
@@ -362,7 +543,7 @@ GOACC_parallel (int flags_m, void (*fn) (void *),
 
 void
 GOACC_data_start (int flags_m, size_t mapnum,
-		  void **hostaddrs, size_t *sizes, unsigned short *kinds)
+		  void **hostaddrs, size_t *sizes, unsigned short *kinds, ...)
 {
   int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
 
@@ -447,6 +628,8 @@ GOACC_data_start (int flags_m, size_t mapnum,
   if (profiling_p)
     goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
 
+  handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
+
   /* Host fallback or 'do nothing'.  */
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK)
@@ -454,16 +637,26 @@ GOACC_data_start (int flags_m, size_t mapnum,
     {
       prof_info.device_type = acc_device_host;
       api_info.device_type = prof_info.device_type;
-      tgt = goacc_map_vars (NULL, NULL, 0, NULL, NULL, NULL, NULL, true, 0);
+      tgt = goacc_map_vars (NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, true, 0);
       tgt->prev = thr->mapped_data;
       thr->mapped_data = tgt;
 
       goto out_prof;
     }
 
+  struct goacc_ncarray_info *nca_info = NULL;
+  if (mapnum > 0 && GOMP_MAP_NONCONTIG_ARRAY_P (kinds[0] & 0xff))
+    {
+      va_list ap;
+      va_start (ap, kinds);
+      nca_info = goacc_process_noncontiguous_arrays (mapnum, hostaddrs, kinds, &ap);
+      va_end (ap);
+    }
+
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
   tgt = goacc_map_vars (acc_dev, NULL, mapnum, hostaddrs, NULL, sizes, kinds,
-			true, 0);
+			nca_info, true, 0);
+  free (nca_info);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
   tgt->prev = thr->mapped_data;
   thr->mapped_data = tgt;
diff --git a/libgomp/oacc-profiling-acc_register_library.c b/libgomp/oacc-profiling-acc_register_library.c
new file mode 100644
index 0000000..f6b482b
--- /dev/null
+++ b/libgomp/oacc-profiling-acc_register_library.c
@@ -0,0 +1,39 @@
+/* Copyright (C) 2017 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file provides a stub acc_register_library function.  It's in a
+   separate file so that this function can easily be overridden when linking
+   statically.  */
+
+#include "libgomp.h"
+#include "acc_prof.h"
+
+void
+acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
+		      acc_prof_lookup_func lookup)
+{
+  gomp_debug (0, "dummy %s\n", __FUNCTION__);
+}
diff --git a/libgomp/oacc-profiling.c b/libgomp/oacc-profiling.c
index f98cc0a..d6cc9ce 100644
--- a/libgomp/oacc-profiling.c
+++ b/libgomp/oacc-profiling.c
@@ -104,7 +104,12 @@ goacc_profiling_initialize (void)
   for (int i = 0; i < acc_ev_last; ++i)
     goacc_prof_callbacks_enabled[i] = true;
 
-
+  /* We are to invoke an external acc_register_library routine, defaulting to
+     our stub oacc-profiling-acc_register_library.c:acc_register_library
+     implementation.  */
+  gomp_debug (0, "%s: calling acc_register_library\n", __FUNCTION__);
+  //TODO.
+  acc_register_library (acc_prof_register, acc_prof_unregister, NULL);
 #ifdef PLUGIN_SUPPORT
   char *acc_proflibs = secure_getenv ("ACC_PROFLIB");
   while (acc_proflibs != NULL && acc_proflibs[0] != '\0')
@@ -141,10 +146,20 @@ goacc_profiling_initialize (void)
 		= dlsym (dl_handle, "acc_register_library");
 	      if (a_r_l == NULL)
 		goto dl_fail;
-	      gomp_debug (0, "  %s: calling %s:acc_register_library\n",
-			  __FUNCTION__, acc_proflib);
-	      a_r_l (acc_prof_register, acc_prof_unregister,
-		     acc_prof_lookup);
+	      /* Avoid duplicate registration, for example if the same shared
+		 library is specified in LD_PRELOAD and ACC_PROFLIB -- which
+		 TAU 2.26 does when using "tau_exec -openacc".  */
+	      if (a_r_l != acc_register_library)
+		{
+		  gomp_debug (0, "  %s: calling %s:acc_register_library\n",
+			      __FUNCTION__, acc_proflib);
+		  //TODO.
+		  a_r_l (acc_prof_register, acc_prof_unregister, NULL);
+		}
+	      else
+		gomp_debug (0, "  %s: skipping duplicate"
+			    " %s:acc_register_library\n",
+			    __FUNCTION__, acc_proflib);
 	    }
 	  else
 	    {
@@ -487,13 +502,6 @@ acc_prof_lookup (const char *name)
   return NULL;
 }
 
-void
-acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
-		      acc_prof_lookup_func lookup)
-{
-  gomp_fatal ("TODO");
-}
-
 /* Prepare to dispatch events?  */
 
 bool
diff --git a/libgomp/openacc.f90 b/libgomp/openacc.f90
index 8ef107e..55894df 100644
--- a/libgomp/openacc.f90
+++ b/libgomp/openacc.f90
@@ -797,8 +797,9 @@ module openacc
   public :: acc_copyout_finalize, acc_delete_finalize
   public :: acc_memcpy_to_device, acc_memcpy_to_device_async
   public :: acc_memcpy_from_device, acc_memcpy_from_device_async
+  public :: acc_memcpy_device, acc_memcpy_device_async
 
-  integer, parameter :: openacc_version = 201711
+  integer, parameter :: openacc_version = 201811
 
   interface acc_get_num_devices
     procedure :: acc_get_num_devices_h
@@ -1046,6 +1047,27 @@ module openacc
     end subroutine
   end interface
 
+  interface
+    subroutine acc_memcpy_device (data_dev_dest, data_dev_src, bytes) bind(C)
+      use iso_c_binding, only: c_ptr, c_size_t
+      type(c_ptr), value :: data_dev_dest
+      type(c_ptr), value :: data_dev_src
+      integer(c_size_t), value :: bytes
+    end subroutine
+  end interface
+
+  interface
+    subroutine acc_memcpy_device_async (data_dev_dest, data_dev_src,  &
+                                        bytes, async_arg) bind(C)
+      use iso_c_binding, only: c_ptr, c_size_t
+      import :: acc_handle_kind
+      type(c_ptr), value :: data_dev_dest
+      type(c_ptr), value :: data_dev_src
+      integer(c_size_t), value :: bytes
+      integer(acc_handle_kind), value :: async_arg
+    end subroutine
+  end interface
+
   interface acc_copyin_async
     procedure :: acc_copyin_async_32_h
     procedure :: acc_copyin_async_64_h
diff --git a/libgomp/openacc.h b/libgomp/openacc.h
index a520bbe..3085b00 100644
--- a/libgomp/openacc.h
+++ b/libgomp/openacc.h
@@ -123,6 +123,7 @@ void *acc_hostptr (void *) __GOACC_NOTHROW;
 int acc_is_present (void *, size_t) __GOACC_NOTHROW;
 void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
 void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
+void acc_memcpy_device (void *, void *, size_t) __GOACC_NOTHROW;
 void acc_attach (void **) __GOACC_NOTHROW;
 void acc_attach_async (void **, int) __GOACC_NOTHROW;
 void acc_detach (void **) __GOACC_NOTHROW;
@@ -136,7 +137,7 @@ void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_detach_finalize (void **) __GOACC_NOTHROW;
 void acc_detach_finalize_async (void **, int) __GOACC_NOTHROW;
 
-/* Async functions, specified in OpenACC 2.5.  */
+/* Async functions, specified in OpenACC 2.5, acc_memcpy_device in 2.6.  */
 void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_create_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW;
@@ -145,6 +146,7 @@ void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
 void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
+void acc_memcpy_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
 
 /* CUDA-specific routines.  */
 void *acc_get_current_cuda_device (void) __GOACC_NOTHROW;
diff --git a/libgomp/openacc_lib.h b/libgomp/openacc_lib.h
index b0d287e..e0e7788 100644
--- a/libgomp/openacc_lib.h
+++ b/libgomp/openacc_lib.h
@@ -70,7 +70,7 @@
       integer (acc_handle_kind), parameter :: acc_async_noval = -1
       integer (acc_handle_kind), parameter :: acc_async_sync = -2
 
-      integer, parameter :: openacc_version = 201711
+      integer, parameter :: openacc_version = 201811
 
       interface acc_get_num_devices
         function acc_get_num_devices_h (devicetype)
@@ -528,6 +528,30 @@
         end subroutine
       end interface
 
+      interface
+        subroutine acc_memcpy_device(data_dev_dest, data_dev_src,       &
+     &                               bytes) bind(C)
+          use iso_c_binding, only: c_ptr, c_size_t
+          type(c_ptr), value :: data_dev_dest
+          type(c_ptr), value :: data_dev_src
+          integer(c_size_t), value :: bytes
+        end subroutine
+      end interface
+
+      interface
+        subroutine acc_memcpy_device_async(data_dev_dest,               &
+     &                                     data_dev_src, bytes,         &
+     &                                     async_arg) bind(C)
+          use iso_c_binding, only: c_ptr, c_size_t
+          import :: acc_handle_kind
+          type(c_ptr), value :: data_dev_dest
+          type(c_ptr), value :: data_dev_src
+          integer(c_size_t), value :: bytes
+          integer(acc_handle_kind), value :: async_arg
+        end subroutine
+      end interface
+
+
       interface acc_copyin_async
         subroutine acc_copyin_async_32_h (a, len, async)
           use iso_c_binding, only: c_int32_t
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 4b42a59..f823b27 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -2026,11 +2026,12 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
 
 static void
 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
-			 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
+			 uint64_t sizes, uint64_t kinds, uint64_t dev_num64,
+			 uint64_t signal)
 {
   int dev_num = dev_num64;
   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
-			  NULL);
+			  (volatile int *) signal, false);
 }
 
 /* Output any data written to console output from the kernel.  It is expected
@@ -2080,7 +2081,8 @@ console_output (struct kernel_info *kernel, struct kernargs *kernargs,
 	case 4:
 	  process_reverse_offload (data->value_u64[0], data->value_u64[1],
 				   data->value_u64[2], data->value_u64[3],
-				   data->value_u64[4], data->value_u64[5]);
+				   data->value_u64[4], data->value_u64[5],
+				   data->value_u64[6]);
 	  break;
 	default: printf ("GCN print buffer error!\n"); break;
 	}
@@ -5079,7 +5081,8 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
   queue_push_callback (aq, fn, data);
 }
 
-/* Queue up an asynchronous data copy from host to DEVICE.  */
+/* Queue up an asynchronous data copy from host to DEVICE.
+   (Also handles dev2host and dev2dev.)  */
 
 bool
 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
@@ -5097,10 +5100,16 @@ bool
 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
 				     size_t n, struct goacc_asyncqueue *aq)
 {
-  struct agent_info *agent = get_agent_info (device);
-  assert (agent == aq->agent);
-  queue_push_copy (aq, dst, src, n);
-  return true;
+  return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
+}
+
+/* Queue up an asynchronous data copy from DEVICE to DEVICE.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int device, void *dst, const void *src,
+				    size_t n, struct goacc_asyncqueue *aq)
+{
+  return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
 }
 
 union goacc_property_value
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a5cf859..712c8b7 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1799,8 +1799,6 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
 
-  nvptx_stacks_free (ptx_dev, false);
-
   while (blocks)
     {
       tmp = blocks->next;
@@ -1828,6 +1826,48 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
 	  && nvptx_free (ptr, ptx_devices[ord]));
 }
 
+bool
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
+		     __FUNCTION__, ptr, (unsigned long long) size);
+
+  if (size == 0)
+    {
+      /* Special case to ensure omp_alloc specification compliance.  */
+      *ptr = NULL;
+      GOMP_PLUGIN_debug (0, "  -> *ptr=null\n");
+      return true;
+    }
+
+  CUresult r;
+
+  unsigned int flags = 0;
+  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+     'flags |= CU_MEMHOSTALLOC_PORTABLE;' here.  */
+  r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+  if (r == CUDA_ERROR_OUT_OF_MEMORY)
+    *ptr = NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+      return false;
+    }
+  GOMP_PLUGIN_debug (0, "  -> *ptr=%p\n",
+		     *ptr);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
+		     __FUNCTION__, ptr);
+
+  CUDA_CALL (cuMemFreeHost, ptr);
+  return true;
+}
+
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
 			   size_t mapnum  __attribute__((unused)),
@@ -1939,9 +1979,10 @@ nvptx_goacc_asyncqueue_construct (unsigned int flags)
 }
 
 struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+GOMP_OFFLOAD_openacc_async_construct (int device)
 {
-  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+  nvptx_attach_host_thread_to_device (device);
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
 }
 
 static bool
@@ -2019,6 +2060,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
 }
 
 static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+  CUdeviceptr pb1, pb2;
+  size_t ps1, ps2;
+  if (!s)
+    return true;
+  if (!d1 || !d2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+  CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+  if (!pb1 || !pb2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  if ((void *)(d1 + s) > (void *)(pb1 + ps1)
+      || (void *)(d2 + s) > (void *)(pb2 + ps2))
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
+  return true;
+}
+
+static bool
 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
 {
   CUdeviceptr pb;
@@ -2077,6 +2146,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 bool
 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
 {
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
   return true;
 }
@@ -2288,6 +2360,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+				    size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+	     aq->cuda_stream);
+  return true;
+}
+
 union goacc_property_value
 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
 {
@@ -2815,17 +2899,68 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 	else if (r != CUDA_ERROR_NOT_READY)
 	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
 
-	if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
+	struct rev_offload *rev_metadata = ptx_dev->rev_data;
+
+	/* Claim a portion of the ring buffer to process on this iteration.
+	   Don't mark them as consumed until all the data has been read out.  */
+	unsigned int consumed = __atomic_load_n (&rev_metadata->consumed,
+						 __ATOMIC_ACQUIRE);
+	unsigned int from = __atomic_load_n (&rev_metadata->claimed,
+						__ATOMIC_RELAXED);
+	unsigned int to = __atomic_load_n (&rev_metadata->next_slot,
+					   __ATOMIC_RELAXED);
+
+	if (consumed > to)
+	  {
+	    /* Overflow happens when we exceed UINT_MAX requests.  */
+	    GOMP_PLUGIN_fatal ("NVPTX reverse offload buffer overflowed.\n");
+	  }
+
+	to = MIN(to, consumed + REV_OFFLOAD_QUEUE_SIZE / 2);
+	if (to <= from)
+	  /* Nothing to do; poll again.  */
+	  goto poll_again;
+
+	if (!__atomic_compare_exchange_n (&rev_metadata->claimed, &from, to,
+					  false,
+					  __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+	  /* Collision with another thread ... go around again.  */
+	  goto poll_again;
+
+	unsigned int index;
+	for (index = from; index < to; index++)
 	  {
-	    struct rev_offload *rev_data = ptx_dev->rev_data;
+	    int slot = index % REV_OFFLOAD_QUEUE_SIZE;
+
+	    /* Wait while the target finishes filling in the slot.  */
+	    while (__atomic_load_n (&ptx_dev->rev_data->queue[slot].signal,
+				    __ATOMIC_ACQUIRE) == 0)
+	      ; /* spin  */
+
+	    /* Pass the request to libgomp; this will queue the request and
+	       return right away, without waiting for the kernel to run.  */
+	    struct rev_req *rev_data = &ptx_dev->rev_data->queue[slot];
 	    GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
 				    rev_data->addrs, rev_data->sizes,
 				    rev_data->kinds, rev_data->dev_num,
-				    reverse_offload_aq);
-	    if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
-	      exit (EXIT_FAILURE);
-	    __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+				    rev_data->signal, true);
+
+	    /* Ensure that the slot doesn't trigger early, when reused.  */
+	    __atomic_store_n (&rev_data->signal, 0, __ATOMIC_RELEASE);
 	  }
+
+	/* The data is now consumed so release the slots for reuse.  */
+	unsigned int consumed_so_far = from;
+	while (!__atomic_compare_exchange_n (&rev_metadata->consumed,
+					    &consumed_so_far, to, false,
+					    __ATOMIC_RELEASE, __ATOMIC_RELAXED))
+	  {
+	    /* Another thread hasn't consumed all it claimed yet...  */
+	    consumed_so_far = from;
+	    usleep (1);
+	  }
+
+poll_again:
 	usleep (1);
       }
   else
diff --git a/libgomp/target-cxa-dso-dtor.c b/libgomp/target-cxa-dso-dtor.c
new file mode 100644
index 0000000..d1a898d
--- /dev/null
+++ b/libgomp/target-cxa-dso-dtor.c
@@ -0,0 +1,3 @@
+/* Host/device compatibility: Itanium C++ ABI, DSO Object Destruction API */
+
+/* Nothing needed here.  */
diff --git a/libgomp/target.c b/libgomp/target.c
index 9674ff4..4ad803a 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -461,6 +461,19 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep,
     gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
 }
 
+attribute_hidden void
+gomp_copy_dev2dev (struct gomp_device_descr *devicep,
+		   struct goacc_asyncqueue *aq,
+		   void *dst, const void *src, size_t sz)
+{
+  if (__builtin_expect (aq != NULL, 0))
+    goacc_device_copy_async (devicep, devicep->openacc.async.dev2dev_func,
+			     "dev", dst, "dev", src, NULL, sz, aq);
+  else
+    gomp_device_copy (devicep, devicep->dev2dev_func, "dev", dst,
+		      "dev", src, sz);
+}
+
 static void
 gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
 {
@@ -990,15 +1003,155 @@ gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i)
     }
 }
 
+static const char *
+kind_to_name (unsigned short kind)
+{
+  if (GOMP_MAP_IMPLICIT_P (kind))
+    kind &= ~GOMP_MAP_IMPLICIT;
+
+  switch (kind & 0xff)
+    {
+    case GOMP_MAP_ALLOC: return "GOMP_MAP_ALLOC";
+    case GOMP_MAP_FIRSTPRIVATE: return "GOMP_MAP_FIRSTPRIVATE";
+    case GOMP_MAP_FIRSTPRIVATE_INT: return "GOMP_MAP_FIRSTPRIVATE_INT";
+    case GOMP_MAP_TO: return "GOMP_MAP_TO";
+    case GOMP_MAP_TO_PSET: return "GOMP_MAP_TO_PSET";
+    case GOMP_MAP_FROM: return "GOMP_MAP_FROM";
+    case GOMP_MAP_TOFROM: return "GOMP_MAP_TOFROM";
+    case GOMP_MAP_POINTER: return "GOMP_MAP_POINTER";
+    case GOMP_MAP_ATTACH: return "GOMP_MAP_ATTACH";
+    case GOMP_MAP_DETACH: return "GOMP_MAP_DETACH";
+    case GOMP_MAP_STRUCT: return "GOMP_MAP_STRUCT";
+    case GOMP_MAP_STRUCT_UNORD: return "GOMP_MAP_STRUCT_UNORD";
+    default: return "unknown";
+    }
+}
+
+static void
+gomp_add_map (size_t idx, size_t *new_idx,
+	      void ***hostaddrs, size_t **sizes, unsigned short **skinds,
+	      void ***new_hostaddrs, size_t **new_sizes,
+	      unsigned short **new_kinds, size_t *iterator_count)
+{
+  if ((*sizes)[idx] == SIZE_MAX)
+    {
+      uintptr_t *iterator_array = (*hostaddrs)[idx];
+      size_t count = *iterator_array++;
+      for (size_t i = 0; i < count; i++)
+	{
+	  (*new_hostaddrs)[*new_idx] = (void *) *iterator_array++;
+	  (*new_sizes)[*new_idx] = *iterator_array++;
+	  (*new_kinds)[*new_idx] = (*skinds)[idx];
+	  iterator_count[*new_idx] = i + 1;
+	  gomp_debug (1,
+		      "Expanding map %u <%s>: "
+		      "hostaddrs[%u] = %p, sizes[%u] = %lu\n",
+		      (int) idx, kind_to_name ((*new_kinds)[*new_idx]),
+		      (int) *new_idx, (*new_hostaddrs)[*new_idx],
+		      (int) *new_idx, (unsigned long) (*new_sizes)[*new_idx]);
+	  (*new_idx)++;
+	}
+    }
+  else
+    {
+      (*new_hostaddrs)[*new_idx] = (*hostaddrs)[idx];
+      (*new_sizes)[*new_idx] = (*sizes)[idx];
+      (*new_kinds)[*new_idx] = (*skinds)[idx];
+      iterator_count[*new_idx] = 0;
+      (*new_idx)++;
+    }
+}
+
+
+/* Map entries containing expanded iterators will be flattened and merged into
+   HOSTADDRS, SIZES and KINDS, and MAPNUM updated.  Returns true if there are
+   any iterators found.  ITERATOR_COUNT holds the iteration count of the
+   iterator that generates each map (0 if not generated from an iterator).
+   HOSTADDRS, SIZES, KINDS and ITERATOR_COUNT must be freed afterwards if any
+   merging occurs.  */
+
+static bool
+gomp_merge_iterator_maps (size_t *mapnum, void ***hostaddrs, size_t **sizes,
+			  void **kinds, size_t **iterator_count)
+{
+  bool iterator_p = false;
+  size_t map_count = 0;
+  unsigned short **skinds = (unsigned short **) kinds;
+
+  for (size_t i = 0; i < *mapnum; i++)
+    if ((*sizes)[i] == SIZE_MAX)
+      {
+	uintptr_t *iterator_array = (*hostaddrs)[i];
+	map_count += iterator_array[0];
+	iterator_p = true;
+      }
+    else
+      map_count++;
+
+  if (!iterator_p)
+    return false;
+
+  gomp_debug (1,
+	      "Expanding iterator maps - number of map entries: %u -> %u\n",
+	      (int) *mapnum, (int) map_count);
+  void **new_hostaddrs = (void **) gomp_malloc (map_count * sizeof (void *));
+  size_t *new_sizes = (size_t *) gomp_malloc (map_count * sizeof (size_t));
+  unsigned short *new_kinds
+    = (unsigned short *) gomp_malloc (map_count * sizeof (unsigned short));
+  size_t new_idx = 0;
+  *iterator_count = (size_t *) gomp_malloc (map_count * sizeof (size_t));
+
+  for (size_t i = 0; i < *mapnum; i++)
+    {
+      int map_type = get_kind (true, *skinds, i) & 0xff;
+      if (map_type == GOMP_MAP_STRUCT || map_type == GOMP_MAP_STRUCT_UNORD)
+	{
+	  size_t field_count = (*sizes)[i];
+	  size_t idx_i = new_idx;
+
+	  gomp_add_map (i, &new_idx, hostaddrs, sizes, skinds,
+			&new_hostaddrs, &new_sizes, &new_kinds,
+			*iterator_count);
+
+	  for (size_t j = i + 1; j <= i + field_count; j++)
+	    {
+	      if ((*sizes)[j] == SIZE_MAX)
+		{
+		  uintptr_t *iterator_array = (*hostaddrs)[j];
+		  size_t count = iterator_array[0];
+		  new_sizes[idx_i] += count - 1;
+		}
+	      gomp_add_map (j, &new_idx, hostaddrs, sizes, skinds,
+			    &new_hostaddrs, &new_sizes, &new_kinds,
+			    *iterator_count);
+	    }
+	  gomp_debug (1, "Map %u: new field count = %lu\n",
+		      (int) i, (unsigned long) new_sizes[idx_i]);
+	  i += field_count;
+	}
+      else
+	gomp_add_map (i, &new_idx, hostaddrs, sizes, skinds,
+		      &new_hostaddrs, &new_sizes, &new_kinds, *iterator_count);
+    }
+
+  *mapnum = map_count;
+  *hostaddrs = new_hostaddrs;
+  *sizes = new_sizes;
+  *kinds = new_kinds;
+
+  return true;
+}
+
 static inline __attribute__((always_inline)) struct target_mem_desc *
 gomp_map_vars_internal (struct gomp_device_descr *devicep,
 			struct goacc_asyncqueue *aq, size_t mapnum,
 			void **hostaddrs, void **devaddrs, size_t *sizes,
-			void *kinds, bool short_mapkind,
-			htab_t *refcount_set,
+			void *kinds, struct goacc_ncarray_info *nca_info,
+			bool short_mapkind, htab_t *refcount_set,
 			enum gomp_map_vars_kind pragma_kind)
 {
   size_t i, tgt_align, tgt_size, not_found_cnt = 0;
+  size_t nca_data_row_num = (nca_info ? nca_info->num_data_rows : 0);
   bool has_firstprivate = false;
   bool has_always_ptrset = false;
   bool openmp_p = (pragma_kind & GOMP_MAP_VARS_OPENACC) == 0;
@@ -1006,9 +1159,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
   const int typemask = short_mapkind ? 0xff : 0x7;
   struct splay_tree_s *mem_map = &devicep->mem_map;
   struct splay_tree_key_s cur_node;
+  bool iterators_p = false;
+  size_t *iterator_count = NULL;
+  if (short_mapkind)
+    iterators_p = gomp_merge_iterator_maps (&mapnum, &hostaddrs, &sizes,
+					    &kinds, &iterator_count);
   struct target_mem_desc *tgt
-    = gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum);
-  tgt->list_count = mapnum;
+    = gomp_malloc (sizeof (*tgt)
+		   + sizeof (tgt->list[0]) * (mapnum + nca_data_row_num));
+  tgt->list_count = mapnum + nca_data_row_num;
   tgt->refcount = (pragma_kind & GOMP_MAP_VARS_ENTER_DATA) ? 0 : 1;
   tgt->device_descr = devicep;
   tgt->prev = NULL;
@@ -1162,6 +1321,28 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 	  has_firstprivate = true;
 	  continue;
 	}
+      else if (GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+	{
+	  /* Ignore non-contiguous arrays for now, we process them together
+	     later.  */
+	  tgt->list[i].key = NULL;
+	  tgt->list[i].offset = 0;
+	  not_found_cnt++;
+
+	  /* The map for the non-contiguous array itself is never copied from
+	     during unmapping; it's the data rows that count.  Set copy-from
+	     flags to false here.  */
+	  tgt->list[i].copy_from = false;
+	  tgt->list[i].always_copy_from = false;
+	  tgt->list[i].is_attach = false;
+
+	  size_t align = (size_t) 1 << (kind >> rshift);
+	  if (tgt_align < align)
+	    tgt_align = align;
+
+	  continue;
+	}
+
       cur_node.host_start = (uintptr_t) hostaddrs[i];
       if (!GOMP_MAP_POINTER_P (kind & typemask))
 	cur_node.host_end = cur_node.host_start + sizes[i];
@@ -1297,6 +1478,45 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 	}
     }
 
+  /* For non-contiguous arrays. Each data row is one target item, separated
+     from the normal map clause items, hence we order them after mapnum.  */
+  if (nca_info)
+    {
+      struct target_var_desc *next_var_desc = &tgt->list[mapnum];
+      for (i = 0; i < nca_info->num_ncarray; i++)
+	{
+	  struct goacc_ncarray *nca = &nca_info->ncarray[i];
+	  int kind = get_kind (short_mapkind, kinds, nca->map_index);
+	  size_t align = (size_t) 1 << (kind >> rshift);
+	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  tgt_size += nca->ptrblock_size;
+
+	  for (size_t j = 0; j < nca->data_row_num; j++)
+	    {
+	      struct target_var_desc *row_desc = next_var_desc++;
+	      void *row = nca->data_rows[j];
+	      cur_node.host_start = (uintptr_t) row;
+	      cur_node.host_end = cur_node.host_start + nca->data_row_size;
+	      splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+	      if (n)
+		{
+		  assert (n->refcount != REFCOUNT_LINK);
+		  gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
+					  kind & typemask, false, false,
+					  /* TODO: cbuf? */ NULL,
+					  refcount_set);
+		}
+	      else
+		{
+		  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		  tgt_size += nca->data_row_size;
+		  not_found_cnt++;
+		}
+	    }
+	}
+      assert (next_var_desc == &tgt->list[mapnum + nca_info->num_data_rows]);
+    }
+
   if (devaddrs)
     {
       if (mapnum != 1)
@@ -1643,6 +1863,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 	      default:
 		break;
 	      }
+
+	    if (GOMP_MAP_NONCONTIG_ARRAY_P (kind & typemask))
+	      {
+		tgt->list[i].key = &array->key;
+		tgt->list[i].key->tgt = tgt;
+		array++;
+		continue;
+	      }
+
 	    splay_tree_key k = &array->key;
 	    k->host_start = (uintptr_t) hostaddrs[i];
 	    if (!GOMP_MAP_POINTER_P (kind & typemask))
@@ -1879,18 +2108,120 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		array++;
 	      }
 	  }
+
+      /* Processing of non-contiguous array rows.  */
+      if (nca_info)
+	{
+	  struct target_var_desc *next_var_desc = &tgt->list[mapnum];
+	  for (i = 0; i < nca_info->num_ncarray; i++)
+	    {
+	      struct goacc_ncarray *nca = &nca_info->ncarray[i];
+	      int kind = get_kind (short_mapkind, kinds, nca->map_index);
+	      size_t align = (size_t) 1 << (kind >> rshift);
+	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
+
+	      assert (nca->ptr == hostaddrs[nca->map_index]);
+
+	      /* For the map of the non-contiguous array itself, adjust so that
+		 the passed device address points to the beginning of the
+		 ptrblock.  Remember to adjust the first dimension's bias here.  */
+	      tgt->list[nca->map_index].key->tgt_offset
+		= tgt_size - nca->descr->dims[0].base;
+
+	      void *target_ptrblock = (void*) tgt->tgt_start + tgt_size;
+	      tgt_size += nca->ptrblock_size;
+
+	      /* Add splay key for each data row in current non-contiguous
+		 array.  */
+	      for (size_t j = 0; j < nca->data_row_num; j++)
+		{
+		  struct target_var_desc *row_desc = next_var_desc++;
+		  void *row = nca->data_rows[j];
+		  cur_node.host_start = (uintptr_t) row;
+		  cur_node.host_end = cur_node.host_start + nca->data_row_size;
+		  splay_tree_key k = splay_tree_lookup (mem_map, &cur_node);
+		  if (k)
+		    {
+		      assert (k->refcount != REFCOUNT_LINK);
+		      gomp_map_vars_existing (devicep, aq, k, &cur_node, row_desc,
+					      kind & typemask, false, false,
+					      cbufp, refcount_set);
+		    }
+		  else
+		    {
+		      tgt->refcount++;
+		      tgt_size = (tgt_size + align - 1) & ~(align - 1);
+
+		      k = &array->key;
+		      k->host_start = (uintptr_t) row;
+		      k->host_end = k->host_start + nca->data_row_size;
+
+		      k->tgt = tgt;
+		      k->refcount = 1;
+		      k->dynamic_refcount = 0;
+		      k->aux = NULL;
+		      k->tgt_offset = tgt_size;
+
+		      tgt_size += nca->data_row_size;
+
+		      row_desc->key = k;
+		      row_desc->copy_from
+			= GOMP_MAP_COPY_FROM_P (kind & typemask);
+		      row_desc->always_copy_from
+			= GOMP_MAP_COPY_FROM_P (kind & typemask);
+		      row_desc->is_attach = false;
+		      row_desc->offset = 0;
+		      row_desc->length = nca->data_row_size;
+
+		      array->left = NULL;
+		      array->right = NULL;
+		      splay_tree_insert (mem_map, array);
+
+		      if (GOMP_MAP_COPY_TO_P (kind & typemask))
+			gomp_copy_host2dev (devicep, aq,
+					    (void *) tgt->tgt_start + k->tgt_offset,
+					    (void *) k->host_start,
+					    nca->data_row_size, false,
+					    cbufp);
+		      array++;
+		    }
+		  nca->tgt_data_rows[j]
+		    = (void *) (k->tgt->tgt_start + k->tgt_offset);
+		}
+
+	      /* Now that the target memory is allocated, and target offsets of all
+		 row blocks assigned and calculated, we can construct the
+		 accelerator side ptrblock and copy it in.  */
+	      if (nca->ptrblock_size)
+		{
+		  void *ptrblock = gomp_malloc (nca->ptrblock_size);
+		  goacc_noncontig_array_create_ptrblock
+		    (nca, ptrblock, target_ptrblock);
+		  gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
+				      nca->ptrblock_size, false, cbufp);
+		  if (aq)
+		    /* Free once the transfer has completed.  */
+		    devicep->openacc.async.queue_callback_func (aq, free, ptrblock);
+		  else
+		    free (ptrblock);
+		}
+	    }
+	}
     }
 
   if (pragma_kind & GOMP_MAP_VARS_TARGET)
     {
+      size_t map_num = 0;
       for (i = 0; i < mapnum; i++)
-	{
-	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  gomp_copy_host2dev (devicep, aq,
-			      (void *) (tgt->tgt_start + i * sizeof (void *)),
-			      (void *) &cur_node.tgt_offset, sizeof (void *),
-			      true, cbufp);
-	}
+	if (!iterator_count || iterator_count[i] <= 1)
+	  {
+	    cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
+	    gomp_copy_host2dev (devicep, aq,
+				(void *) (tgt->tgt_start + map_num * sizeof (void *)),
+				(void *) &cur_node.tgt_offset, sizeof (void *),
+				true, cbufp);
+	    map_num++;
+	  }
     }
 
   if (cbufp)
@@ -1922,6 +2253,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
     }
 
   gomp_mutex_unlock (&devicep->lock);
+
+  if (iterators_p)
+    {
+      free (hostaddrs);
+      free (sizes);
+      free (kinds);
+      free (iterator_count);
+    }
+
   return tgt;
 }
 
@@ -1942,8 +2282,8 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 
   struct target_mem_desc *tgt;
   tgt = gomp_map_vars_internal (devicep, NULL, mapnum, hostaddrs, devaddrs,
-				sizes, kinds, short_mapkind, refcount_set,
-				pragma_kind);
+				sizes, kinds, NULL, short_mapkind,
+				refcount_set, pragma_kind);
   if (local_refcount_set)
     htab_free (local_refcount_set);
 
@@ -1954,11 +2294,12 @@ attribute_hidden struct target_mem_desc *
 goacc_map_vars (struct gomp_device_descr *devicep,
 		struct goacc_asyncqueue *aq, size_t mapnum,
 		void **hostaddrs, void **devaddrs, size_t *sizes,
-		void *kinds, bool short_mapkind,
+		void *kinds, struct goacc_ncarray_info *nca_info,
+		bool short_mapkind,
 		enum gomp_map_vars_kind pragma_kind)
 {
   return gomp_map_vars_internal (devicep, aq, mapnum, hostaddrs, devaddrs,
-				 sizes, kinds, short_mapkind, NULL,
+				 sizes, kinds, nca_info, short_mapkind, NULL,
 				 GOMP_MAP_VARS_OPENACC | pragma_kind);
 }
 
@@ -2112,6 +2453,9 @@ gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
 			     false, NULL);
     }
 
+  size_t nrmvars = 0;
+  splay_tree_key remove_vars[tgt->list_count];
+
   for (i = 0; i < tgt->list_count; i++)
     {
       splay_tree_key k = tgt->list[i].key;
@@ -2133,17 +2477,22 @@ gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
 			    (void *) (k->tgt->tgt_start + k->tgt_offset
 				      + tgt->list[i].offset),
 			    tgt->list[i].length);
+      /* Queue all removals together for processing below.
+	 See also 'gomp_exit_data'.  */
       if (do_remove)
-	{
-	  struct target_mem_desc *k_tgt __attribute__((unused)) = k->tgt;
-	  bool is_tgt_unmapped __attribute__((unused))
-	    = gomp_remove_var (devicep, k);
-	  /* It would be bad if TGT got unmapped while we're still iterating
-	     over its LIST_COUNT, and also expect to use it in the following
-	     code.  */
-	  assert (!is_tgt_unmapped
-		  || k_tgt != tgt);
-	}
+	remove_vars[nrmvars++] = k;
+    }
+
+  for (i = 0; i < nrmvars; i++)
+    {
+      splay_tree_key k = remove_vars[i];
+      struct target_mem_desc *k_tgt __attribute__((unused)) = k->tgt;
+      bool is_tgt_unmapped __attribute__((unused))
+	= gomp_remove_var (devicep, k);
+      /* It would be bad if TGT got unmapped while we're still iterating over
+	 its LIST_COUNT, and also expect to use it in the following code.  */
+      assert (!is_tgt_unmapped
+	      || k_tgt != tgt);
     }
 
   if (aq)
@@ -2181,6 +2530,14 @@ goacc_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom,
   gomp_unmap_vars_internal (tgt, do_copyfrom, NULL, aq);
 }
 
+static int
+omp_target_memcpy_rect_worker (void *, const void *, size_t, size_t, int,
+			       const size_t *, const size_t *, const size_t *,
+			       const size_t *, const size_t *, const size_t *,
+			       struct gomp_device_descr *,
+			       struct gomp_device_descr *, size_t *tmp_size,
+			       void **tmp);
+
 static void
 gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
 	     size_t *sizes, void *kinds, bool short_mapkind)
@@ -2188,6 +2545,8 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
   size_t i;
   struct splay_tree_key_s cur_node;
   const int typemask = short_mapkind ? 0xff : 0x7;
+  bool iterators_p = false;
+  size_t *iterator_count = NULL;
 
   if (!devicep)
     return;
@@ -2195,6 +2554,10 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
   if (mapnum == 0)
     return;
 
+  if (short_mapkind)
+    iterators_p = gomp_merge_iterator_maps (&mapnum, &hostaddrs, &sizes,
+					    &kinds, &iterator_count);
+
   gomp_mutex_lock (&devicep->lock);
   if (devicep->state == GOMP_DEVICE_FINALIZED)
     {
@@ -2203,91 +2566,143 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
     }
 
   for (i = 0; i < mapnum; i++)
-    if (sizes[i])
-      {
-	cur_node.host_start = (uintptr_t) hostaddrs[i];
-	cur_node.host_end = cur_node.host_start + sizes[i];
-	splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &cur_node);
-	if (n)
-	  {
-	    int kind = get_kind (short_mapkind, kinds, i);
-	    if (n->host_start > cur_node.host_start
-		|| n->host_end < cur_node.host_end)
-	      {
-		gomp_mutex_unlock (&devicep->lock);
-		gomp_fatal ("Trying to update [%p..%p) object when "
-			    "only [%p..%p) is mapped",
-			    (void *) cur_node.host_start,
-			    (void *) cur_node.host_end,
-			    (void *) n->host_start,
-			    (void *) n->host_end);
-	      }
+    {
+      int kind = get_kind (short_mapkind, kinds, i);
+      if ((kind & typemask) == GOMP_MAP_TO_GRID
+	  || (kind & typemask) == GOMP_MAP_FROM_GRID)
+	{
+	  omp_noncontig_array_desc *desc
+	    = (omp_noncontig_array_desc *) hostaddrs[i + 1];
+	  size_t bias = sizes[i + 1];
+	  cur_node.host_start = (uintptr_t) hostaddrs[i] + bias;
+	  cur_node.host_end = cur_node.host_start + sizes[i];
+	  splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &cur_node);
+	  if (n)
+	    {
+	      if (n->aux && n->aux->attach_count)
+		{
+		  gomp_mutex_unlock (&devicep->lock);
+		  gomp_error ("noncontiguous update with attached pointers");
+		  return;
+		}
+	      void *devaddr = (void *) (n->tgt->tgt_start + n->tgt_offset
+					+ cur_node.host_start
+					- n->host_start
+					- bias);
+	      size_t tmp_size = 0;
+	      void *tmp = NULL;
+	      if ((kind & typemask) == GOMP_MAP_TO_GRID)
+		omp_target_memcpy_rect_worker (devaddr, hostaddrs[i],
+					       desc->elemsize, desc->span,
+					       desc->ndims, desc->length,
+					       desc->stride, desc->index,
+					       desc->index, desc->dim,
+					       desc->dim, devicep,
+					       NULL, &tmp_size, &tmp);
+	      else
+		omp_target_memcpy_rect_worker (hostaddrs[i], devaddr,
+					       desc->elemsize, desc->span,
+					       desc->ndims, desc->length,
+					       desc->stride, desc->index,
+					       desc->index, desc->dim,
+					       desc->dim, NULL,
+					       devicep, &tmp_size, &tmp);
+	    }
+	  i++;
+	}
+      else if (sizes[i])
+	{
+	  cur_node.host_start = (uintptr_t) hostaddrs[i];
+	  cur_node.host_end = cur_node.host_start + sizes[i];
+	  splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &cur_node);
+	  if (n)
+	    {
+	      if (n->host_start > cur_node.host_start
+		  || n->host_end < cur_node.host_end)
+		{
+		  gomp_mutex_unlock (&devicep->lock);
+		  gomp_fatal ("Trying to update [%p..%p) object when "
+			      "only [%p..%p) is mapped",
+			      (void *) cur_node.host_start,
+			      (void *) cur_node.host_end,
+			      (void *) n->host_start,
+			      (void *) n->host_end);
+		}
 
-	    if (n->aux && n->aux->attach_count)
-	      {
-		uintptr_t addr = cur_node.host_start;
-		while (addr < cur_node.host_end)
-		  {
-		    /* We have to be careful not to overwrite still attached
-		       pointers during host<->device updates.  */
-		    size_t i = (addr - cur_node.host_start) / sizeof (void *);
-		    if (n->aux->attach_count[i] == 0)
-		      {
-			void *devaddr = (void *) (n->tgt->tgt_start
-						  + n->tgt_offset
-						  + addr - n->host_start);
-			if (GOMP_MAP_COPY_TO_P (kind & typemask))
-			  gomp_copy_host2dev (devicep, NULL,
-					      devaddr, (void *) addr,
-					      sizeof (void *), false, NULL);
-			if (GOMP_MAP_COPY_FROM_P (kind & typemask))
-			  gomp_copy_dev2host (devicep, NULL,
-					      (void *) addr, devaddr,
-					      sizeof (void *));
-		      }
-		    addr += sizeof (void *);
-		  }
-	      }
-	    else
-	      {
-		void *hostaddr = (void *) cur_node.host_start;
-		void *devaddr = (void *) (n->tgt->tgt_start + n->tgt_offset
-					  + cur_node.host_start
-					  - n->host_start);
-		size_t size = cur_node.host_end - cur_node.host_start;
-
-		if (GOMP_MAP_COPY_TO_P (kind & typemask))
-		  gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
-				      false, NULL);
-		if (GOMP_MAP_COPY_FROM_P (kind & typemask))
-		  gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
-	      }
-	  }
-	else
-	  {
-	    int kind = get_kind (short_mapkind, kinds, i);
+	      if (n->aux && n->aux->attach_count)
+		{
+		  uintptr_t addr = cur_node.host_start;
+		  while (addr < cur_node.host_end)
+		    {
+		      /* We have to be careful not to overwrite still attached
+			 pointers during host<->device updates.  */
+		      size_t i = (addr - cur_node.host_start) / sizeof (void *);
+		      if (n->aux->attach_count[i] == 0)
+			{
+			  void *devaddr = (void *) (n->tgt->tgt_start
+						    + n->tgt_offset
+						    + addr - n->host_start);
+			  if (GOMP_MAP_COPY_TO_P (kind & typemask))
+			    gomp_copy_host2dev (devicep, NULL,
+						devaddr, (void *) addr,
+						sizeof (void *), false, NULL);
+			  if (GOMP_MAP_COPY_FROM_P (kind & typemask))
+			    gomp_copy_dev2host (devicep, NULL,
+						(void *) addr, devaddr,
+						sizeof (void *));
+			}
+		      addr += sizeof (void *);
+		    }
+		}
+	      else
+		{
+		  void *hostaddr = (void *) cur_node.host_start;
+		  void *devaddr = (void *) (n->tgt->tgt_start + n->tgt_offset
+					    + cur_node.host_start
+					    - n->host_start);
+		  size_t size = cur_node.host_end - cur_node.host_start;
+
+		  if (GOMP_MAP_COPY_TO_P (kind & typemask))
+		    gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
+					false, NULL);
+		  if (GOMP_MAP_COPY_FROM_P (kind & typemask))
+		    gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
+		}
+	    }
+	  else
+	    {
+	      int kind = get_kind (short_mapkind, kinds, i);
 
-	    if (GOMP_MAP_PRESENT_P (kind))
-	      {
-		/* We already looked up the memory region above and it
-		   was missing.  */
-		gomp_mutex_unlock (&devicep->lock);
+	      if (GOMP_MAP_PRESENT_P (kind))
+		{
+		  /* We already looked up the memory region above and it
+		     was missing.  */
+		  gomp_mutex_unlock (&devicep->lock);
 #ifdef HAVE_INTTYPES_H
-		gomp_fatal ("present clause: not present on the device "
-			    "(addr: %p, size: %"PRIu64" (0x%"PRIx64"), "
-			    "dev: %d)", (void *) hostaddrs[i],
-			    (uint64_t) sizes[i], (uint64_t) sizes[i],
-			    devicep->target_id);
+		  gomp_fatal ("present clause: not present on the device "
+			      "(addr: %p, size: %"PRIu64" (0x%"PRIx64"), "
+			      "dev: %d)", (void *) hostaddrs[i],
+			      (uint64_t) sizes[i], (uint64_t) sizes[i],
+			      devicep->target_id);
 #else
-		gomp_fatal ("present clause: not present on the device "
-			    "(addr: %p, size: %lu (0x%lx), dev: %d)",
-			    (void *) hostaddrs[i], (unsigned long) sizes[i],
-			    (unsigned long) sizes[i], devicep->target_id);
+		  gomp_fatal ("present clause: not present on the device "
+			      "(addr: %p, size: %lu (0x%lx), dev: %d)",
+			      (void *) hostaddrs[i], (unsigned long) sizes[i],
+			      (unsigned long) sizes[i], devicep->target_id);
 #endif
-	      }
-	  }
-      }
+		}
+	    }
+	}
+    }
   gomp_mutex_unlock (&devicep->lock);
+
+  if (iterators_p)
+    {
+      free (hostaddrs);
+      free (sizes);
+      free (kinds);
+      free (iterator_count);
+    }
 }
 
 static struct gomp_offload_icv_list *
@@ -3481,16 +3896,18 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
 				    tgt_start, tgt_end);
 }
 
-/* Handle reverse offload.  This is called by the device plugins for a
-   reverse offload; it is not called if the outer target runs on the host.
+/* Handle reverse offload.  This is called by the host worker thread to
+   execute a single reverse offload request; it is not called if the outer
+   target runs on the host.
    The mapping is simplified device-affecting constructs (except for target
    with device(ancestor:1)) must not be encountered; in particular not
    target (enter/exit) data.  */
 
-void
-gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
-		 uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
-		 struct goacc_asyncqueue *aq)
+static void
+gomp_target_rev_internal (uint64_t fn_ptr, uint64_t mapnum,
+			  uint64_t devaddrs_ptr, uint64_t sizes_ptr,
+			  uint64_t kinds_ptr, struct gomp_device_descr *devicep,
+			  struct goacc_asyncqueue *aq)
 {
   /* Return early if there is no offload code.  */
   if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3507,7 +3924,6 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
   unsigned short *kinds;
   const bool short_mapkind = true;
   const int typemask = short_mapkind ? 0xff : 0x7;
-  struct gomp_device_descr *devicep = resolve_device (dev_num, false);
 
   reverse_splay_tree_key n;
   struct reverse_splay_tree_key_s k;
@@ -3918,6 +4334,134 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
     }
 }
 
+static struct target_rev_queue_s
+{
+  uint64_t fn_ptr;
+  uint64_t mapnum;
+  uint64_t devaddrs_ptr;
+  uint64_t sizes_ptr;
+  uint64_t kinds_ptr;
+  struct gomp_device_descr *devicep;
+
+  volatile int *signal;
+  bool use_aq;
+
+  struct target_rev_queue_s *next;
+} *target_rev_queue_head = NULL, *target_rev_queue_last = NULL;
+static gomp_mutex_t target_rev_queue_lock = 0;
+static int target_rev_thread_count = 0;
+
+static void *
+gomp_target_rev_worker_thread (void *)
+{
+  struct target_rev_queue_s *rev_kernel = NULL;
+  struct goacc_asyncqueue *aq = NULL;
+  struct gomp_device_descr *aq_devicep;
+
+  while (1)
+    {
+      gomp_mutex_lock (&target_rev_queue_lock);
+
+      /* Take a reverse-offload kernel request from the queue.  */
+      rev_kernel = target_rev_queue_head;
+      if (rev_kernel)
+	{
+	  target_rev_queue_head = rev_kernel->next;
+	  if (target_rev_queue_head == NULL)
+	    target_rev_queue_last = NULL;
+	}
+
+      if (rev_kernel == NULL)
+	{
+	  target_rev_thread_count--;
+	  gomp_mutex_unlock (&target_rev_queue_lock);
+	  break;
+	}
+      gomp_mutex_unlock (&target_rev_queue_lock);
+
+      /* Ensure we have a suitable device queue for the memory transfers.  */
+      if (rev_kernel->use_aq)
+	{
+	  if (aq && aq_devicep != rev_kernel->devicep)
+	    {
+	      aq_devicep->openacc.async.destruct_func (aq);
+	      aq = NULL;
+	    }
+
+	  if (!aq)
+	    {
+	      aq_devicep = rev_kernel->devicep;
+	      aq = aq_devicep->openacc.async.construct_func (aq_devicep->target_id);
+	    }
+	}
+
+      /* Run the kernel on the host.  */
+      gomp_target_rev_internal (rev_kernel->fn_ptr, rev_kernel->mapnum,
+				rev_kernel->devaddrs_ptr, rev_kernel->sizes_ptr,
+				rev_kernel->kinds_ptr, rev_kernel->devicep, aq);
+
+      /* Signal the device that the reverse-offload is completed.  */
+      int one = 1;
+      gomp_copy_host2dev (rev_kernel->devicep, aq, (void*)rev_kernel->signal,
+			  &one, sizeof (one), false, NULL);
+
+      /* We're done with this request.  */
+      free (rev_kernel);
+
+      /* Loop around and see if another request is waiting.  */
+    }
+
+  if (aq)
+    aq_devicep->openacc.async.destruct_func (aq);
+
+  return NULL;
+}
+
+void
+gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
+		 uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
+		 volatile int *signal, bool use_aq)
+{
+  struct gomp_device_descr *devicep = resolve_device (dev_num, false);
+
+  /* Create a new queue node.  */
+  struct target_rev_queue_s *newreq = gomp_malloc (sizeof (*newreq));
+  newreq->fn_ptr = fn_ptr;
+  newreq->mapnum = mapnum;
+  newreq->devaddrs_ptr = devaddrs_ptr;
+  newreq->sizes_ptr = sizes_ptr;
+  newreq->kinds_ptr = kinds_ptr;
+  newreq->devicep = devicep;
+  newreq->signal = signal;
+  newreq->use_aq = use_aq;
+  newreq->next = NULL;
+
+  gomp_mutex_lock (&target_rev_queue_lock);
+
+  /* Enqueue the reverse-offload request.  */
+  if (target_rev_queue_last)
+    {
+      target_rev_queue_last->next = newreq;
+      target_rev_queue_last = newreq;
+    }
+  else
+    target_rev_queue_last = target_rev_queue_head = newreq;
+
+  /* Launch a new thread to process the request asynchronously.
+     If the thread pool limit has been reached then an existing thread will
+     pick up the job when it is ready.  */
+  if (target_rev_thread_count < gomp_reverse_offload_threads)
+    {
+      target_rev_thread_count++;
+      gomp_mutex_unlock (&target_rev_queue_lock);
+
+      pthread_t t;
+      pthread_create (&t, NULL, gomp_target_rev_worker_thread, NULL);
+    }
+  else
+    gomp_mutex_unlock (&target_rev_queue_lock);
+}
+
 /* Host fallback for GOMP_target_data{,_ext} routines.  */
 
 static void
@@ -4114,7 +4658,7 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
 			       false, NULL);
       }
 
-  int nrmvars = 0;
+  size_t nrmvars = 0;
   splay_tree_key remove_vars[mapnum];
 
   for (i = 0; i < mapnum; i++)
@@ -4177,10 +4721,6 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
 	     errors if we still have following element siblings to copy back.
 	     While we're at it, it also seems more disciplined to simply
 	     queue all removals together for processing below.
-
-	     Structured block unmapping (i.e. gomp_unmap_vars_internal) should
-	     not have this problem, since they maintain an additional
-	     tgt->refcount = 1 reference to the target_mem_desc to start with.
 	  */
 	  if (do_remove)
 	    remove_vars[nrmvars++] = k;
@@ -4195,7 +4735,7 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
 	}
     }
 
-  for (int i = 0; i < nrmvars; i++)
+  for (i = 0; i < nrmvars; i++)
     gomp_remove_var (devicep, remove_vars[i]);
 
   gomp_mutex_unlock (&devicep->lock);
@@ -4497,6 +5037,140 @@ omp_target_free (void *device_ptr, int device_num)
   gomp_mutex_unlock (&devicep->lock);
 }
 
+/* Device (really: libgomp plugin) to use for page-locked memory.  We
+   assume there is either none or exactly one such device for the lifetime of
+   the process.  '(void *) -1' means "not yet determined"; NULL means none.  */
+
+static struct gomp_device_descr *device_for_page_locked
+  = /* uninitialized */ (void *) -1;
+
+/* Return the device (really: libgomp plugin) providing page-locked host
+   memory, or NULL if there is none.  The result is determined on first use
+   and cached in 'device_for_page_locked'.  */
+
+static struct gomp_device_descr *
+get_device_for_page_locked (void)
+{
+  gomp_debug (0, "%s\n", __FUNCTION__);
+
+  struct gomp_device_descr *device;
+#ifdef HAVE_SYNC_BUILTINS
+  /* Acquire here pairs with the releasing exchange below, so that a thread
+     seeing the cached pointer also sees the descriptor it points to.  */
+  device = __atomic_load_n (&device_for_page_locked, MEMMODEL_ACQUIRE);
+  if (device == (void *) -1)
+    {
+      gomp_debug (0, "  init\n");
+
+      gomp_init_targets_once ();
+
+      device = NULL;
+      for (int i = 0; i < num_devices; ++i)
+	{
+	  gomp_debug (0, "  i=%d, target_id=%d\n", i, devices[i].target_id);
+
+	  /* We consider only the first device of potentially several of the
+	     same type as this functionality is not specific to an individual
+	     offloading device, but instead relates to the host-side
+	     implementation of the respective offloading implementation.  */
+	  if (devices[i].target_id != 0)
+	    continue;
+
+	  if (!devices[i].page_locked_host_alloc_func)
+	    continue;
+
+	  gomp_debug (0, "  found device: %p (%s)\n",
+		      &devices[i], devices[i].name);
+	  if (device)
+	    gomp_fatal ("Unclear how %s and %s libgomp plugins may"
+			" simultaneously provide functionality"
+			" for page-locked memory",
+			device->name, devices[i].name);
+	  else
+	    device = &devices[i];
+	}
+
+      struct gomp_device_descr *device_old
+	= __atomic_exchange_n (&device_for_page_locked, device,
+			       MEMMODEL_ACQ_REL);
+      gomp_debug (0, "  old device_for_page_locked: %p\n", device_old);
+      /* We shouldn't have concurrently found a different or no device.  */
+      assert (device_old == (void *) -1 || device_old == device);
+    }
+#else /* !HAVE_SYNC_BUILTINS */
+  gomp_debug (0, "  not implemented for '!HAVE_SYNC_BUILTINS'\n");
+  (void) &device_for_page_locked;
+  device = NULL;
+#endif /* HAVE_SYNC_BUILTINS */
+
+  gomp_debug (0, "  -> device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  return device;
+}
+
+/* Allocate page-locked host memory by passing PTR and SIZE to the plugin of
+   the respective device, which gets initialized if necessary.  Returns
+   whether we have a device capable of that; a failing allocation is fatal.  */
+
+attribute_hidden bool
+gomp_page_locked_host_alloc (void **ptr, size_t size)
+{
+  gomp_debug (0, "%s: ptr=%p, size=%llu\n",
+	      __FUNCTION__, ptr, (unsigned long long) size);
+
+  struct gomp_device_descr *device = get_device_for_page_locked ();
+  gomp_debug (0, "  device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  if (device)
+    {
+      gomp_mutex_lock (&device->lock);
+      if (device->state == GOMP_DEVICE_UNINITIALIZED)
+	gomp_init_device (device);
+      else if (device->state == GOMP_DEVICE_FINALIZED)
+	{
+	  gomp_mutex_unlock (&device->lock);
+	  gomp_fatal ("Device %s used for page-locked memory is finalized",
+		      device->name);
+	}
+      gomp_mutex_unlock (&device->lock);
+
+      /* The plugin call itself is made without holding the device lock.  */
+      if (!device->page_locked_host_alloc_func (ptr, size))
+	gomp_fatal ("Failed to allocate page-locked host memory"
+		    " via %s libgomp plugin", device->name);
+    }
+  return device != NULL;
+}
+
+/* Free page-locked host memory at PTR; a no-op if the device has already
+   been finalized.  This must only be called if
+   'gomp_page_locked_host_alloc' returned 'true'.  */
+
+attribute_hidden void
+gomp_page_locked_host_free (void *ptr)
+{
+  gomp_debug (0, "%s: ptr=%p\n",
+	      __FUNCTION__, ptr);
+
+  struct gomp_device_descr *device = get_device_for_page_locked ();
+  gomp_debug (0, "  device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  assert (device);
+
+  gomp_mutex_lock (&device->lock);
+  assert (device->state != GOMP_DEVICE_UNINITIALIZED);
+  if (device->state == GOMP_DEVICE_FINALIZED)
+    {
+      gomp_mutex_unlock (&device->lock);
+      return;
+    }
+  gomp_mutex_unlock (&device->lock);
+
+  if (!device->page_locked_host_free_func (ptr))
+    gomp_fatal ("Failed to free page-locked host memory"
+		" via %s libgomp plugin",
+		device->name);
+}
+
 int
 omp_target_is_present (const void *ptr, int device_num)
 {
@@ -4683,7 +5357,8 @@ omp_target_memcpy_async (void *dst, const void *src, size_t length,
 
 static int
 omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
-			       int num_dims, const size_t *volume,
+			       size_t span, int num_dims, const size_t *volume,
+			       const size_t *strides,
 			       const size_t *dst_offsets,
 			       const size_t *src_offsets,
 			       const size_t *dst_dimensions,
@@ -4697,7 +5372,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
   size_t j, dst_off, src_off, length;
   int i, ret;
 
-  if (num_dims == 1)
+  if (num_dims == 1 && (!strides || (strides[0] == 1 && element_size == span)))
     {
       if (__builtin_mul_overflow (element_size, volume[0], &length)
 	  || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
@@ -4751,9 +5426,74 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
 	}
       return ret ? 0 : EINVAL;
     }
+  else if (num_dims == 1 && strides)
+    {
+      /* One-dimensional strided transfer between the host and a device.  */
+      size_t stride;
+
+      assert ((src_devicep == NULL || dst_devicep == NULL)
+	      && (src_devicep != NULL || dst_devicep != NULL));
+
+      if (__builtin_mul_overflow (span, dst_offsets[0], &dst_off)
+	  || __builtin_mul_overflow (span, src_offsets[0], &src_off)
+	  || __builtin_mul_overflow (span, strides[0], &stride))
+	return EINVAL;
+
+      if (((src_devicep && src_devicep->memcpy2d_func)
+	   || (dst_devicep && dst_devicep->memcpy2d_func))
+	  && (stride % element_size) == 0)
+	{
+	  /* Try using memcpy2d for a 1-dimensional strided access.  Here we
+	     treat the transfer as a 2-dimensional array, where the inner
+	     dimension is calculated to be (stride in bytes) / element_size.
+	     Indices/offsets are adjusted so the source/destination pointers
+	     point to the first element to be transferred, to make the sums
+	     easier.  (There are some configurations of 2D strided accesses
+	     that memcpy3d could handle similarly, but those are probably rare
+	     and are unimplemented for now.)   */
+
+	  /* If stride is element size, this is a contiguous transfer and
+	     should have been handled above.  */
+	  assert (stride > element_size);
+
+	  int dst_id = dst_devicep ? dst_devicep->target_id : -1;
+	  int src_id = src_devicep ? src_devicep->target_id : -1;
+	  void *subarray_src = (char *) src + src_off;
+	  void *subarray_dst = (char *) dst + dst_off;
+
+	  struct gomp_device_descr *devp = dst_devicep ? dst_devicep
+						       : src_devicep;
+	  ret = devp->memcpy2d_func (dst_id, src_id, element_size, volume[0],
+				     subarray_dst, 0, 0, stride, subarray_src,
+				     0, 0, stride);
+	  if (ret != -1)
+	    return ret ? 0 : EINVAL;
+	}
+
+      /* Fall back to per-element copies; J is 'size_t' like VOLUME[0].  */
+      for (j = 0, ret = 1; j < volume[0] && ret; j++)
+	{
+	  if (src_devicep == NULL)
+	    ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+					      (char *) dst + dst_off,
+					      (const char *) src + src_off,
+					      element_size);
+	  else if (dst_devicep == NULL)
+	    ret = src_devicep->dev2host_func (src_devicep->target_id,
+					      (char *) dst + dst_off,
+					      (const char *) src + src_off,
+					      element_size);
+	  dst_off += stride;
+	  src_off += stride;
+	}
+      return ret ? 0 : EINVAL;
+    }
 
   /* host->device, device->host and intra device.  */
   if (num_dims == 2
+      && (!strides || (strides[0] == 1
+		       && strides[1] == 1
+		       && element_size == span))
       && ((src_devicep
 	   && src_devicep == dst_devicep
 	   && src_devicep->memcpy2d_func)
@@ -4780,6 +5520,10 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
 	return ret ? 0 : EINVAL;
     }
   else if (num_dims == 3
+	   && (!strides || (strides[0] == 1
+			    && strides[1] == 1
+			    && strides[2] == 1
+			    && element_size == span))
 	   && ((src_devicep
 		&& src_devicep == dst_devicep
 		&& src_devicep->memcpy3d_func)
@@ -4815,13 +5559,19 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
   if (__builtin_mul_overflow (dst_slice, dst_offsets[0], &dst_off)
       || __builtin_mul_overflow (src_slice, src_offsets[0], &src_off))
     return EINVAL;
+  if (strides
+      && (__builtin_mul_overflow (dst_slice, strides[0], &dst_slice)
+	  || __builtin_mul_overflow (src_slice, strides[0], &src_slice)))
+    return EINVAL;
   for (j = 0; j < volume[0]; j++)
     {
       ret = omp_target_memcpy_rect_worker ((char *) dst + dst_off,
 					   (const char *) src + src_off,
-					   element_size, num_dims - 1,
-					   volume + 1, dst_offsets + 1,
-					   src_offsets + 1, dst_dimensions + 1,
+					   element_size, span, num_dims - 1,
+					   volume + 1,
+					   strides ? strides + 1 : NULL,
+					   dst_offsets + 1, src_offsets + 1,
+					   dst_dimensions + 1,
 					   src_dimensions + 1, dst_devicep,
 					   src_devicep, tmp_size, tmp);
       if (ret)
@@ -4870,8 +5620,8 @@ omp_target_memcpy_rect_copy (void *dst, const void *src,
     gomp_mutex_lock (&src_devicep->lock);
   if (lock_dst)
     gomp_mutex_lock (&dst_devicep->lock);
-  int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
-					   volume, dst_offsets, src_offsets,
+  int ret = omp_target_memcpy_rect_worker (dst, src, element_size, element_size, num_dims,
+					   volume, NULL, dst_offsets, src_offsets,
 					   dst_dimensions, src_dimensions,
 					   dst_devicep, src_devicep,
 					   &tmp_size, &tmp);
@@ -5536,6 +6286,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   DLSYM (unload_image);
   DLSYM (alloc);
   DLSYM (free);
+  DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
+  DLSYM_OPT (page_locked_host_free, page_locked_host_free);
   DLSYM (dev2host);
   DLSYM (host2dev);
   DLSYM_OPT (memcpy2d, memcpy2d);
@@ -5573,6 +6325,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
 	  || !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
 	  || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
 	  || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev)
+	  || !DLSYM_OPT (openacc.async.dev2dev, openacc_async_dev2dev)
 	  || !DLSYM_OPT (openacc.get_property, openacc_get_property))
 	{
 	  /* Require all the OpenACC handlers if we have
diff --git a/libgomp/testsuite/libgomp.c++/allocate-2.C b/libgomp/testsuite/libgomp.c++/allocate-2.C
new file mode 100644
index 0000000..f79cada
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/allocate-2.C
@@ -0,0 +1,329 @@
+/* { dg-do run } */
+/* { dg-additional-options "-fdump-tree-omplower" } */
+
+/* For the 4 vars in omp_parallel, 4 in omp_target and 1 of 2 in each of no_alloc{,2}_func.  */
+/* { dg-final { scan-tree-dump-times "__builtin_GOMP_alloc \\(" 10 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_GOMP_free \\(" 10 "omplower" } } */
+
+#include <omp.h>
+
+
+void
+check_int (int *x, int y)
+{
+  if (*x != y)
+    __builtin_abort ();
+}
+
+void
+check_ptr (int **x, int *y)
+{
+  if (*x != y)
+    __builtin_abort ();
+}
+
+
+template<typename t>
+t
+no_alloc_func ()
+{
+  /* There is no __builtin_GOMP_alloc / __builtin_GOMP_free as
+     allocator == omp_default_mem_alloc (known at compile time).  */
+  t no_alloc, alloc_has_align = 3;
+  #pragma omp allocate(no_alloc) allocator(omp_default_mem_alloc)
+  /* But this one is allocated because of align.  */
+  #pragma omp allocate(alloc_has_align) allocator(omp_default_mem_alloc) align(sizeof(t))
+  no_alloc = 7;
+  return no_alloc + alloc_has_align;
+}
+
+template<typename t>
+t
+no_alloc2_func()
+{
+  /* There is no __builtin_GOMP_alloc / __builtin_GOMP_free as
+     no_alloc2 is TREE_UNUSED.  But there is for is_alloc2.  */
+  t no_alloc2, is_alloc2;
+  #pragma omp allocate(no_alloc2, is_alloc2)
+  is_alloc2 = 7;
+  return is_alloc2;
+}
+
+
+template<typename t>
+void
+omp_parallel ()
+{
+  int n = 6;
+  t iii = 5, jjj[5], kkk[n];
+  t *ptr = (t *) 0x1234;
+  #pragma omp allocate(iii, jjj, kkk, ptr)
+
+  for (int i = 0; i < 5; i++)
+    jjj[i] = 3*i;
+  for (int i = 0; i < 6; i++)
+    kkk[i] = 7*i;
+
+  #pragma omp parallel default(none) firstprivate(iii, jjj, kkk, ptr) if(0)
+  {
+    if (iii != 5)
+      __builtin_abort();
+    iii = 7;
+    check_int (&iii, 7);
+    for (int i = 0; i < 5; i++)
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+    for (int i = 0; i < 6; i++)
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+    for (int i = 0; i < 5; i++)
+      jjj[i] = 4*i;
+    for (int i = 0; i < 6; i++)
+      kkk[i] = 8*i;
+    for (int i = 0; i < 5; i++)
+      check_int (&jjj[i], 4*i);
+    for (int i = 0; i < 6; i++)
+      check_int (&kkk[i], 8*i);
+    if (ptr != (int *) 0x1234)
+      __builtin_abort ();
+    ptr = (int *) 0xabcd;
+    if (ptr != (int *) 0xabcd)
+      __builtin_abort ();
+    check_ptr (&ptr, (int *) 0xabcd);
+  }
+  if (iii != 5)
+    __builtin_abort ();
+  check_int (&iii, 5);
+  for (int i = 0; i < 5; i++)
+    {
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+      check_int (&jjj[i], 3*i);
+    }
+  for (int i = 0; i < 6; i++)
+    {
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+      check_int (&kkk[i], 7*i);
+    }
+  if (ptr != (int *) 0x1234)
+    __builtin_abort ();
+  check_ptr (&ptr, (int *) 0x1234);
+
+  #pragma omp parallel default(firstprivate) if(0)
+  {
+    if (iii != 5)
+      __builtin_abort();
+    iii = 7;
+    check_int (&iii, 7);
+    for (int i = 0; i < 5; i++)
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+    for (int i = 0; i < 6; i++)
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+    for (int i = 0; i < 5; i++)
+      jjj[i] = 4*i;
+    for (int i = 0; i < 6; i++)
+      kkk[i] = 8*i;
+    for (int i = 0; i < 5; i++)
+      check_int (&jjj[i], 4*i);
+    for (int i = 0; i < 6; i++)
+      check_int (&kkk[i], 8*i);
+    if (ptr != (int *) 0x1234)
+      __builtin_abort ();
+    ptr = (int *) 0xabcd;
+    if (ptr != (int *) 0xabcd)
+      __builtin_abort ();
+    check_ptr (&ptr, (int *) 0xabcd);
+  }
+  if (iii != 5)
+    __builtin_abort ();
+  check_int (&iii, 5);
+  for (int i = 0; i < 5; i++)
+    {
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+      check_int (&jjj[i], 3*i);
+    }
+  for (int i = 0; i < 6; i++)
+    {
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+      check_int (&kkk[i], 7*i);
+    }
+  if (ptr != (int *) 0x1234)
+    __builtin_abort ();
+  check_ptr (&ptr, (int *) 0x1234);
+}
+
+
+template<typename t>
+void
+omp_target ()
+{
+  int n = 6;
+  t iii = 5, jjj[5], kkk[n];
+  t *ptr = (int *) 0x1234;
+  #pragma omp allocate(iii, jjj, kkk, ptr)
+
+  for (int i = 0; i < 5; i++)
+    jjj[i] = 3*i;
+  for (int i = 0; i < 6; i++)
+    kkk[i] = 7*i;
+
+  #pragma omp target defaultmap(none) firstprivate(iii, jjj, kkk, ptr)
+  {
+    if (iii != 5)
+      __builtin_abort();
+    iii = 7;
+    check_int (&iii, 7);
+    for (int i = 0; i < 5; i++)
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+    for (int i = 0; i < 6; i++)
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+    for (int i = 0; i < 5; i++)
+      jjj[i] = 4*i;
+    for (int i = 0; i < 6; i++)
+      kkk[i] = 8*i;
+    for (int i = 0; i < 5; i++)
+      check_int (&jjj[i], 4*i);
+    for (int i = 0; i < 6; i++)
+      check_int (&kkk[i], 8*i);
+    if (ptr != (int *) 0x1234)
+      __builtin_abort ();
+    ptr = (int *) 0xabcd;
+    if (ptr != (int *) 0xabcd)
+      __builtin_abort ();
+    check_ptr (&ptr, (int *) 0xabcd);
+  }
+  if (iii != 5)
+    __builtin_abort ();
+  check_int (&iii, 5);
+  for (int i = 0; i < 5; i++)
+    {
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+      check_int (&jjj[i], 3*i);
+    }
+  for (int i = 0; i < 6; i++)
+    {
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+      check_int (&kkk[i], 7*i);
+    }
+  if (ptr != (int *) 0x1234)
+    __builtin_abort ();
+  check_ptr (&ptr, (int *) 0x1234);
+
+  #pragma omp target defaultmap(firstprivate)
+  {
+    if (iii != 5)
+      __builtin_abort();
+    iii = 7;
+    check_int (&iii, 7);
+    for (int i = 0; i < 5; i++)
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+    for (int i = 0; i < 6; i++)
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+    for (int i = 0; i < 5; i++)
+      jjj[i] = 4*i;
+    for (int i = 0; i < 6; i++)
+      kkk[i] = 8*i;
+    for (int i = 0; i < 5; i++)
+      check_int (&jjj[i], 4*i);
+    for (int i = 0; i < 6; i++)
+      check_int (&kkk[i], 8*i);
+    if (ptr != (int *) 0x1234)
+      __builtin_abort ();
+    ptr = (int *) 0xabcd;
+    if (ptr != (int *) 0xabcd)
+      __builtin_abort ();
+    check_ptr (&ptr, (int *) 0xabcd);
+  }
+  if (iii != 5)
+    __builtin_abort ();
+  check_int (&iii, 5);
+  for (int i = 0; i < 5; i++)
+    {
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+      check_int (&jjj[i], 3*i);
+    }
+  for (int i = 0; i < 6; i++)
+    {
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+      check_int (&kkk[i], 7*i);
+    }
+  if (ptr != (int *) 0x1234)
+    __builtin_abort ();
+  check_ptr (&ptr, (int *) 0x1234);
+
+  #pragma omp target defaultmap(tofrom)
+  {
+    if (iii != 5)
+      __builtin_abort();
+    iii = 7;
+    check_int (&iii, 7);
+    for (int i = 0; i < 5; i++)
+      if (jjj[i] != 3*i)
+	__builtin_abort ();
+    for (int i = 0; i < 6; i++)
+      if (kkk[i] != 7*i)
+	__builtin_abort ();
+    for (int i = 0; i < 5; i++)
+      jjj[i] = 4*i;
+    for (int i = 0; i < 6; i++)
+      kkk[i] = 8*i;
+    for (int i = 0; i < 5; i++)
+      check_int (&jjj[i], 4*i);
+    for (int i = 0; i < 6; i++)
+      check_int (&kkk[i], 8*i);
+    if (ptr != (int *) 0x1234)
+      __builtin_abort ();
+    ptr = (int *) 0xabcd;
+    if (ptr != (int *) 0xabcd)
+      __builtin_abort ();
+    check_ptr (&ptr, (int *) 0xabcd);
+  }
+
+  if (iii != 7)
+    __builtin_abort ();
+  check_int (&iii, 7);
+  for (int i = 0; i < 5; i++)
+    {
+      if (jjj[i] != 4*i)
+	__builtin_abort ();
+      check_int (&jjj[i], 4*i);
+    }
+  for (int i = 0; i < 6; i++)
+    {
+      if (kkk[i] != 8*i)
+	__builtin_abort ();
+      check_int (&kkk[i], 8*i);
+    }
+  if (ptr != (int *) 0xabcd)
+    __builtin_abort ();
+  check_ptr (&ptr, (int *) 0xabcd);
+}
+
+int
+foo()
+{
+  return no_alloc_func<int>() + no_alloc2_func<int>();
+}
+
+int
+main ()
+{
+  omp_parallel<int> ();
+  omp_target<int> ();
+  if (foo() != 10 + 7)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-1.C b/libgomp/testsuite/libgomp.c++/array-shaping-1.C
new file mode 100644
index 0000000..6ff5f94
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-1.C
@@ -0,0 +1,469 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <string.h>
+#include <assert.h>
+
+volatile int yy = 4, zz = 2, str_str = 2;
+
+template<typename T>
+void foo()
+{
+  T *arr;
+  int x = 5;
+  T arr2d[10][10];
+
+  arr = new T[100];
+
+  /* Update whole reshaped array.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < x; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = i ^ j;
+
+#pragma omp target update to(([10][x]) arr)
+
+#pragma omp target exit data map(from: arr[:100])
+
+  /* Only the elements with J < X were updated on the device; all others
+     must still be zero.  The expected value is parenthesized because '=='
+     binds tighter than '^'.  */
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      assert (arr[j * 10 + i] == (j < x ? (i ^ j) : 0));
+
+
+  /* Strided update.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      arr[j * 5 + i] = i + j;
+
+#pragma omp target update to(([5][5]) arr[0:3][0:3:2])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      if (j < 3 && (i & 1) == 0 && i < 6)
+	assert (arr[j * 5 + i] == i + j);
+      else
+	assert (arr[j * 5 + i] == 0);
+
+
+  /* Reshaped update, contiguous.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      arr[j * 5 + i] = 2 * j + i;
+
+#pragma omp target update to(([5][5]) arr[0:5][0:5])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      if (j < 5 && i < 5)
+	assert (arr[j * 5 + i] == 2 * j + i);
+      else
+	assert (arr[j * 5 + i] == 0);
+
+
+  /* Strided update on actual array.  */
+
+  memset (arr2d, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr2d)
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr2d[j][i] = j + 2 * i;
+
+#pragma omp target update to(arr2d[0:5:2][5:2])
+
+#pragma omp target exit data map(from: arr2d)
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if ((j & 1) == 0 && i >= 5 && i < 7)
+	assert (arr2d[j][i] == j + 2 * i);
+      else
+	assert (arr2d[j][i] == 0);
+
+
+  /* Update with non-constant bounds.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = (2 * j) ^ i;
+
+  x = 3;
+  int y = yy, z = zz, str = str_str;
+  /* This is actually [0:3:2] [4:2:2].  */
+#pragma omp target update to(([10][10]) arr[0:x:2][y:z:str])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  /* '==' binds tighter than '^', so the expected value is parenthesized.  */
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      assert (arr[j * 10 + i]
+	      == ((j & 1) == 0 && j < 6 && (i & 1) == 0 && i >= 4 && i < 8
+		  ? ((2 * j) ^ i) : 0));
+
+
+  /* Update with full "major" dimension.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = i + j;
+
+#pragma omp target update to(([10][10]) arr[0:10][3:1])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (i == 3)
+	assert (arr[j * 10 + i] == i + j);
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* Update with full "minor" dimension.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = 3 * (i + j);
+
+#pragma omp target update to(([10][10]) arr[3:2][0:10])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5)
+	assert (arr[j * 10 + i] == 3 * (i + j));
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* Rectangle update.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = 5 * (i + j);
+
+#pragma omp target update to(([10][10]) arr[3:2][0:9])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5 && i < 9)
+	assert (arr[j * 10 + i] == 5 * (i + j));
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* One-dimensional strided update.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    arr[i] = i + 99;
+
+#pragma omp target update to(([100]) arr[3:33:3])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    if (i >= 3 && ((i - 3) % 3) == 0)
+      assert (arr[i] == i + 99);
+    else
+      assert (arr[i] == 0);
+
+
+  /* One-dimensional strided update without explicit array shape.  */
+
+  memset (arr, 0, 100 * sizeof (T));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    arr[i] = i + 121;
+
+#pragma omp target update to(arr[3:33:3])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    if (i >= 3 && ((i - 3) % 3) == 0)
+      assert (arr[i] == i + 121);
+    else
+      assert (arr[i] == 0);
+
+  delete[] arr;
+}
+
+int main()
+{
+  int *arr;
+  int x = 5;
+  int arr2d[10][10];
+
+  arr = new int[100];
+
+  /* Update whole reshaped array.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < x; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = i ^ j;
+
+#pragma omp target update to(([10][x]) arr)
+
+#pragma omp target exit data map(from: arr[:100])
+
+  /* Only the elements with J < X were updated on the device; all others
+     must still be zero.  The expected value is parenthesized because '=='
+     binds tighter than '^'.  */
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      assert (arr[j * 10 + i] == (j < x ? (i ^ j) : 0));
+
+
+  /* Strided update.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      arr[j * 5 + i] = i + j;
+
+#pragma omp target update to(([5][5]) arr[0:3][0:3:2])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      if (j < 3 && (i & 1) == 0 && i < 6)
+	assert (arr[j * 5 + i] == i + j);
+      else
+	assert (arr[j * 5 + i] == 0);
+
+
+  /* Reshaped update, contiguous.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      arr[j * 5 + i] = 2 * j + i;
+
+#pragma omp target update to(([5][5]) arr[0:5][0:5])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      if (j < 5 && i < 5)
+	assert (arr[j * 5 + i] == 2 * j + i);
+      else
+	assert (arr[j * 5 + i] == 0);
+
+
+  /* Strided update on actual array.  */
+
+  memset (arr2d, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr2d)
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr2d[j][i] = j + 2 * i;
+
+#pragma omp target update to(arr2d[0:5:2][5:2])
+
+#pragma omp target exit data map(from: arr2d)
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if ((j & 1) == 0 && i >= 5 && i < 7)
+	assert (arr2d[j][i] == j + 2 * i);
+      else
+	assert (arr2d[j][i] == 0);
+
+
+  /* Update with non-constant bounds.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = (2 * j) ^ i;
+
+  x = 3;
+  int y = yy, z = zz, str = str_str;
+  /* This is actually [0:3:2] [4:2:2].  */
+#pragma omp target update to(([10][10]) arr[0:x:2][y:z:str])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  /* '==' binds tighter than '^', so the expected value is parenthesized.  */
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      assert (arr[j * 10 + i]
+	      == ((j & 1) == 0 && j < 6 && (i & 1) == 0 && i >= 4 && i < 8
+		  ? ((2 * j) ^ i) : 0));
+
+
+  /* Update with full "major" dimension.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = i + j;
+
+#pragma omp target update to(([10][10]) arr[0:10][3:1])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (i == 3)
+	assert (arr[j * 10 + i] == i + j);
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* Update with full "minor" dimension.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = 3 * (i + j);
+
+#pragma omp target update to(([10][10]) arr[3:2][0:10])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5)
+	assert (arr[j * 10 + i] == 3 * (i + j));
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* Rectangle update.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = 5 * (i + j);
+
+#pragma omp target update to(([10][10]) arr[3:2][0:9])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5 && i < 9)
+	assert (arr[j * 10 + i] == 5 * (i + j));
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* One-dimensional strided update.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    arr[i] = i + 99;
+
+#pragma omp target update to(([100]) arr[3:33:3])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    if (i >= 3 && ((i - 3) % 3) == 0)
+      assert (arr[i] == i + 99);
+    else
+      assert (arr[i] == 0);
+
+
+  /* One-dimensional strided update without explicit array shape.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    arr[i] = i + 121;
+
+#pragma omp target update to(arr[3:33:3])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    if (i >= 3 && ((i - 3) % 3) == 0)
+      assert (arr[i] == i + 121);
+    else
+      assert (arr[i] == 0);
+
+  delete[] arr;
+
+  foo<long> ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-10.C b/libgomp/testsuite/libgomp.c++/array-shaping-10.C
new file mode 100644
index 0000000..648f02d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-10.C
@@ -0,0 +1,61 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+#define N 10
+
+template<typename T>
+void foo ()
+{
+  T tarr[N * N];
+
+  memset (tarr, 0, N * N * sizeof (T));
+
+#pragma omp target enter data map(to: tarr)
+
+#pragma omp target
+  {
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++)
+	tarr[i * N + j] = 2 * (i + j);
+  }
+
+  /* An array, but cast to a pointer, then reshaped.  */
+#pragma omp target update from(([N][N]) ((T *) &tarr[0])[4:3][5:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 5; j < 8; j++)
+      assert (tarr[i * N + j] == 2 * (i + j));
+
+#pragma omp target exit data map(delete: tarr)
+}
+
+int main ()
+{
+  int iarr[N * N];
+
+  memset (iarr, 0, N * N * sizeof (int));
+
+#pragma omp target enter data map(to: iarr)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	iarr[i * 10 + j] = i + j;
+  }
+
+  /* An array, but cast to a pointer, then reshaped.  */
+#pragma omp target update from(([10][10]) ((int *) &iarr[0])[4:3][4:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 4; j < 7; j++)
+      assert (iarr[i * 10 + j] == i + j);
+
+#pragma omp target exit data map(delete: iarr)
+
+  foo<unsigned short> ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-11.C b/libgomp/testsuite/libgomp.c++/array-shaping-11.C
new file mode 100644
index 0000000..6b15bd6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-11.C
@@ -0,0 +1,63 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+#define N 10
+
+template<typename T>
+void foo ()
+{
+  T tarr_real[N * N];
+  T (&tarr)[N * N] = tarr_real;
+
+  memset (tarr, 0, N * N * sizeof (T));
+
+#pragma omp target enter data map(to: tarr)
+
+#pragma omp target
+  {
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++)
+	tarr[i * N + j] = 2 * (i + j);
+  }
+
+  /* A ref to an array, but cast to a pointer, then reshaped.  */
+#pragma omp target update from(([N][N]) ((T *) &tarr[0])[4:3][5:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 5; j < 8; j++)
+      assert (tarr[i * N + j] == 2 * (i + j));
+
+#pragma omp target exit data map(delete: tarr)
+}
+
+int main ()
+{
+  int iarr_real[N * N];
+  int (&iarr)[N * N] = iarr_real;
+
+  memset (iarr, 0, N * N * sizeof (int));
+
+#pragma omp target enter data map(to: iarr)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	iarr[i * 10 + j] = i + j;
+  }
+
+  /* A ref to an array, but cast to a pointer, then reshaped.  */
+#pragma omp target update from(([10][10]) ((int *) &iarr[0])[4:3][4:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 4; j < 7; j++)
+      assert (iarr[i * 10 + j] == i + j);
+
+#pragma omp target exit data map(delete: iarr)
+
+  foo<unsigned short> ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-12.C b/libgomp/testsuite/libgomp.c++/array-shaping-12.C
new file mode 100644
index 0000000..103c99a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-12.C
@@ -0,0 +1,65 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+#define N 10
+
+template<typename T>
+void foo ()
+{
+  T tarr_real[N * N];
+  T *tarrp = &tarr_real[0];
+  T **tarrpp = &tarrp;
+
+  memset (tarrp, 0, N * N * sizeof (T));
+
+#pragma omp target enter data map(to: tarr_real)
+
+#pragma omp target
+  {
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++)
+	tarrp[i * N + j] = 2 * (i + j);
+  }
+
+  /* A pointer with an extra indirection.  */
+#pragma omp target update from(([N][N]) (*tarrpp)[4:3][5:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 5; j < 8; j++)
+      assert (tarrp[i * N + j] == 2 * (i + j));
+
+#pragma omp target exit data map(delete: tarr_real)
+}
+
+int main ()
+{
+  int iarr_real[N * N];
+  int *iarrp = &iarr_real[0];
+  int **iarrpp = &iarrp;
+
+  memset (iarrp, 0, N * N * sizeof (int));
+
+#pragma omp target enter data map(to: iarr_real)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	iarrp[i * 10 + j] = i + j;
+  }
+
+  /* A pointer with an extra indirection.  */
+#pragma omp target update from(([10][10]) (*iarrpp)[4:3][4:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 4; j < 7; j++)
+      assert (iarrp[i * 10 + j] == i + j);
+
+#pragma omp target exit data map(delete: iarr_real)
+
+  foo<unsigned short> ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-13.C b/libgomp/testsuite/libgomp.c++/array-shaping-13.C
new file mode 100644
index 0000000..29345ca
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-13.C
@@ -0,0 +1,89 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+#define N 10
+
+template<typename T>
+void foo ()
+{
+  T *tptr = new T[N * N * N];
+
+  memset (tptr, 0, N * N * N * sizeof (T));
+
+#pragma omp target enter data map(to: tptr[0:N*N*N])
+
+#pragma omp target
+  {
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++)
+	tptr[i * N * N + 4 * N + j] = 2 * (i + j);
+  }
+
+  /* An array ref between two array sections.  */
+#pragma omp target update from(([N][N][N]) tptr[4:3][4][5:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 5; j < 8; j++)
+      assert (tptr[i * N * N + 4 * N + j] == 2 * (i + j));
+
+  memset (tptr, 0, N * N * N * sizeof (T));
+
+  for (int i = 0; i < N; i++)
+    tptr[2 * N * N + i * N + 4] = 4 * i;
+
+  /* Array section between two array refs.  */
+#pragma omp target update to(([N][N][N]) tptr[2][3:6][4])
+
+#pragma omp target exit data map(from: tptr[0:N*N*N])
+
+  for (int i = 3; i < 9; i++)
+    assert (tptr[2 * N * N + i * N + 4] == 4 * i);
+
+#pragma omp target exit data map(delete: tptr[0:N*N*N])
+
+  delete[] tptr;
+}
+
+int main ()
+{
+  int *iptr = new int[N * N * N];
+
+  memset (iptr, 0, N * N * N * sizeof (int));
+
+#pragma omp target enter data map(to: iptr[0:N*N*N])
+
+#pragma omp target
+  {
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++)
+	iptr[i * N * N + 4 * N + j] = i + j;
+  }
+
+  /* An array ref between two array sections.  */
+#pragma omp target update from(([N][N][N]) iptr[2:3][4][6:3])
+
+  for (int i = 2; i < 5; i++)
+    for (int j = 6; j < 9; j++)
+      assert (iptr[i * N * N + 4 * N + j] == i + j);
+
+  memset (iptr, 0, N * N * N * sizeof (int));
+
+  for (int i = 0; i < N; i++)
+    iptr[2 * N * N + i * N + 4] = 3 * i;
+
+  /* Array section between two array refs.  */
+#pragma omp target update to(([N][N][N]) iptr[2][3:6][4])
+
+#pragma omp target exit data map(from: iptr[0:N*N*N])
+
+  for (int i = 3; i < 9; i++)
+    assert (iptr[2 * N * N + i * N + 4] == 3 * i);
+
+  delete[] iptr;
+
+  foo<unsigned long> ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-2.C b/libgomp/testsuite/libgomp.c++/array-shaping-2.C
new file mode 100644
index 0000000..027543e8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-2.C
@@ -0,0 +1,38 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <string.h>
+#include <assert.h>
+
+template<typename T>
+void foo (T *w)
+{
+  memset (w, 0, sizeof (T) * 100);
+
+#pragma omp target enter data map(to: w[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      w[j * 10 + i] = i + j;
+
+#pragma omp target update to(([10][10]) w[3:2][1:8])
+
+#pragma omp target exit data map(from: w[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5 && i >= 1 && i < 9)
+	assert (w[j * 10 + i] == i + j);
+      else
+	assert (w[j * 10 + i] == 0);
+}
+
+int main()
+{
+  int *arr = new int[100];
+
+  foo<int> (arr);
+
+  delete[] arr;
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-3.C b/libgomp/testsuite/libgomp.c++/array-shaping-3.C
new file mode 100644
index 0000000..09ff04b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-3.C
@@ -0,0 +1,38 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <string.h>
+#include <assert.h>
+
+template<int C, int D>
+void foo (double *w)
+{
+  memset (w, 0, sizeof (double) * 100);
+
+#pragma omp target enter data map(to: w[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      w[j * 10 + i] = i * 3 + j * 2;
+
+#pragma omp target update to(([C][D]) w[3:2][1:8])
+
+#pragma omp target exit data map(from: w[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5 && i >= 1 && i < 9)
+	assert (w[j * 10 + i] == i * 3 + j * 2);
+      else
+	assert (w[j * 10 + i] == 0.0f);
+}
+
+int main()
+{
+  double *arr = new double[100];
+
+  foo<10, 10> (arr);
+
+  delete[] arr;
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-4.C b/libgomp/testsuite/libgomp.c++/array-shaping-4.C
new file mode 100644
index 0000000..efa115e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-4.C
@@ -0,0 +1,38 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <string.h>
+#include <assert.h>
+
+template<auto C, auto D>
+void foo (double *w)
+{
+  memset (w, 0, sizeof (double) * 100);
+
+#pragma omp target enter data map(to: w[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      w[j * 10 + i] = i * 2 + j * 3;
+
+#pragma omp target update to(([C][D]) w[3:2][1:8])
+
+#pragma omp target exit data map(from: w[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5 && i >= 1 && i < 9)
+	assert (w[j * 10 + i] == i * 2 + j * 3);
+      else
+	assert (w[j * 10 + i] == 0.0f);
+}
+
+int main()
+{
+  double *arr = new double[100];
+
+  foo<10, 10> (arr);
+
+  delete[] arr;
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-5.C b/libgomp/testsuite/libgomp.c++/array-shaping-5.C
new file mode 100644
index 0000000..7046a13
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-5.C
@@ -0,0 +1,38 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <string.h>
+#include <assert.h>
+
+template<typename T, auto C>
+void foo (T *w, int e, int f, int g)
+{
+  memset (w, 0, sizeof (T) * 100);
+
+#pragma omp target enter data map(to: w[:100])
+
+  for (int j = 0; j < e; j++)
+    for (int i = 0; i < C; i++)
+      w[j * C + i] = i + j;
+
+#pragma omp target update to(([e][C]) w[3:2][f:g])
+
+#pragma omp target exit data map(from: w[:100])
+
+  for (int j = 0; j < e; j++)
+    for (int i = 0; i < C; i++)
+      if (j >= 3 && j < 5 && i >= f && i < f + g)
+	assert (w[j * C + i] == i + j);
+      else
+	assert (w[j * C + i] == 0.0f);
+}
+
+int main()
+{
+  float *arr = new float[100];
+
+  foo<float, 10> (arr, 10, 1, 8);
+
+  delete[] arr;
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-6.C b/libgomp/testsuite/libgomp.c++/array-shaping-6.C
new file mode 100644
index 0000000..b960b5e5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-6.C
@@ -0,0 +1,54 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+template<typename T>
+void foo (T *&aref)
+{
+#pragma omp target enter data map(to: aref[:100])
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	aref[i * 10 + j] = i + j;
+  }
+
+#pragma omp target update from(([10][10]) aref[2:3:2][7:3])
+
+  for (int i = 2; i < 8; i += 2)
+    for (int j = 7; j < 10; j++)
+      assert (aref[i * 10 + j] == i + j);
+
+#pragma omp target exit data map(delete: aref[:100])
+}
+
+int main()
+{
+  float *arr = new float[100];
+  float *&w = arr;
+
+  memset (arr, 0, 100 * sizeof (float));
+
+#pragma omp target enter data map(to: w[:100])
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	w[i * 10 + j] = i + j;
+  }
+
+#pragma omp target update from(([10][10]) w[4:3][4:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 4; j < 7; j++)
+      assert (w[i * 10 + j] == i + j);
+
+#pragma omp target exit data map(delete: w[:100])
+
+  foo<float> (arr);
+
+  delete[] arr;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-7.C b/libgomp/testsuite/libgomp.c++/array-shaping-7.C
new file mode 100644
index 0000000..b6193f8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-7.C
@@ -0,0 +1,54 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+template<typename T>
+void foo (T (&aref)[10][10])
+{
+#pragma omp target enter data map(to: aref)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	aref[i][j] = i + j;
+  }
+
+#pragma omp target update from(aref[2:3:2][7:3])
+
+  for (int i = 2; i < 8; i += 2)
+    for (int j = 7; j < 10; j++)
+      assert (aref[i][j] == i + j);
+
+#pragma omp target exit data map(delete: aref)
+}
+
+int main()
+{
+  float arr2d[10][10];
+  float (&w)[10][10] = arr2d;
+
+  memset (&arr2d, 0, 100 * sizeof (float));
+
+#pragma omp target enter data map(to: w)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	w[i][j] = i + j;
+  }
+
+#pragma omp target update from(w[4:3][4:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 4; j < 7; j++)
+      assert (w[i][j] == i + j);
+
+#pragma omp target exit data map(delete: w)
+
+  foo<float> (arr2d);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-8.C b/libgomp/testsuite/libgomp.c++/array-shaping-8.C
new file mode 100644
index 0000000..a96cf3c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-8.C
@@ -0,0 +1,65 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+template<typename T>
+struct C {
+  T *&aptr;
+
+  C(T *&aptr_1) : aptr(aptr_1)
+  {
+  }
+};
+
+template<typename T>
+void foo (T *c)
+{
+#pragma omp target enter data map(to: c->aptr, c->aptr[:100])
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	c->aptr[i * 10 + j] = i + j;
+  }
+
+#pragma omp target update from(([10][10]) c->aptr[2:3:2][7:3])
+
+  for (int i = 2; i < 8; i += 2)
+    for (int j = 7; j < 10; j++)
+      assert (c->aptr[i * 10 + j] == i + j);
+
+#pragma omp target exit data map(delete: c->aptr, c->aptr[:100])
+}
+
+int main()
+{
+  float *arr = new float[100];
+  C<float> cvar(arr);
+
+  memset (arr, 0, 100 * sizeof (float));
+
+#pragma omp target enter data map(to: cvar.aptr, cvar.aptr[:100])
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	cvar.aptr[i * 10 + j] = i + j;
+  }
+
+#pragma omp target update from(([10][10]) cvar.aptr[4:3][4:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 4; j < 7; j++)
+      assert (cvar.aptr[i * 10 + j] == i + j);
+
+#pragma omp target exit data map(delete: cvar.aptr, cvar.aptr[:100])
+
+  foo<C<float> > (&cvar);
+
+  delete[] arr;
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/array-shaping-9.C b/libgomp/testsuite/libgomp.c++/array-shaping-9.C
new file mode 100644
index 0000000..786fe9d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/array-shaping-9.C
@@ -0,0 +1,95 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+#define N 10
+
+struct B {
+  int (&aref)[N][N];
+
+  B(int (&aref1)[N][N]) : aref(aref1)
+  {
+  }
+};
+
+template<typename T, int S>
+struct C {
+  T (&aref)[S][S];
+
+  C(T (&aref1)[S][S]) : aref(aref1)
+  {
+  }
+};
+
+template<typename T>
+void foo (T *c)
+{
+#pragma omp target enter data map(to: c->aref)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	c->aref[i][j] = 2 * (i + j);
+  }
+
+#pragma omp target update from(c->aref[2:3:2][7:3])
+
+  for (int i = 2; i < 8; i += 2)
+    for (int j = 7; j < 10; j++)
+      assert (c->aref[i][j] == 2 * (i + j));
+
+#pragma omp target exit data map(delete: c->aref)
+}
+
+int main()
+{
+  int iarr[N][N];
+  float farr[N][N];
+  B bvar(iarr);
+  C<float, N> cvar(farr);
+
+  memset (iarr, 0, N * N * sizeof (int));
+  memset (farr, 0, N * N * sizeof (float));
+
+#pragma omp target enter data map(to: bvar.aref)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	bvar.aref[i][j] = i + j;
+  }
+
+#pragma omp target update from(bvar.aref[4:3][4:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 4; j < 7; j++)
+      assert (bvar.aref[i][j] == i + j);
+
+#pragma omp target exit data map(delete: bvar.aref)
+
+#pragma omp target enter data map(to: cvar.aref)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	cvar.aref[i][j] = i + j;
+  }
+
+#pragma omp target update from(cvar.aref[4:3][4:3])
+
+  for (int i = 4; i < 7; i++)
+    for (int j = 4; j < 7; j++)
+      assert (cvar.aref[i][j] == i + j);
+
+#pragma omp target exit data map(delete: cvar.aref)
+
+  memset (farr, 0, N * N * sizeof (float));
+
+  foo<C<float, N> > (&cvar);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/c++.exp b/libgomp/testsuite/libgomp.c++/c++.exp
index ed096e1..5be949b 100644
--- a/libgomp/testsuite/libgomp.c++/c++.exp
+++ b/libgomp/testsuite/libgomp.c++/c++.exp
@@ -1,6 +1,15 @@
 load_lib libgomp-dg.exp
 load_gcc_lib gcc-dg.exp
 
+proc check_effective_target_c { } {
+    return 0
+}
+
+proc check_effective_target_c++ { } {
+    return 1
+}
+
+
 if { $blddir != "" } {
     set libstdc++_library_path "../libstdc++-v3/src/.libs"
     set shlib_ext [get_shlib_extension]
diff --git a/libgomp/testsuite/libgomp.c++/declare-mapper-1.C b/libgomp/testsuite/libgomp.c++/declare-mapper-1.C
new file mode 100644
index 0000000..aba4f42
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/declare-mapper-1.C
@@ -0,0 +1,87 @@
+// { dg-do run }
+
+#include <cstdlib>
+#include <cassert>
+
+#define N 64
+
+struct points
+{
+  double *x;
+  double *y;
+  double *z;
+  size_t len;
+};
+
+#pragma omp declare mapper(points p) map(to:p.x, p.y, p.z) \
+				     map(p.x[0:p.len]) \
+				     map(p.y[0:p.len]) \
+				     map(p.z[0:p.len])
+
+struct shape
+{
+  points tmp;
+  points *pts;
+  int metadata[128];
+};
+
+#pragma omp declare mapper(shape s) map(tofrom:s.pts, *s.pts) map(alloc:s.tmp)
+
+/* Point PTS at three freshly allocated, zero-filled arrays of SZ doubles
+   and record SZ as its length.  */
+void
+alloc_points (points *pts, size_t sz)
+{
+  pts->x = new double[sz] ();
+  pts->y = new double[sz] ();
+  pts->z = new double[sz] ();
+  pts->len = sz;
+}
+
+int main (int argc, char *argv[])
+{
+  shape myshape;
+
+  /* Both point sets get N zeroed coordinates; only *myshape.pts is
+     accessed inside the target regions below.  */
+  alloc_points (&myshape.tmp, N);
+  myshape.pts = new points;
+  alloc_points (myshape.pts, N);
+
+  #pragma omp target map(myshape)
+  {
+    for (int i = 0; i < N; i++)
+      {
+	myshape.pts->x[i]++;
+	myshape.pts->y[i]++;
+	myshape.pts->z[i]++;
+      }
+  }
+
+  for (int i = 0; i < N; i++)
+    {
+      assert (myshape.pts->x[i] == 1);
+      assert (myshape.pts->y[i] == 1);
+      assert (myshape.pts->z[i] == 1);
+    }
+
+  /* Same update again, this time without an explicit map clause.  */
+  #pragma omp target
+  {
+    for (int i = 0; i < N; i++)
+      {
+	myshape.pts->x[i]++;
+	myshape.pts->y[i]++;
+	myshape.pts->z[i]++;
+      }
+  }
+
+  for (int i = 0; i < N; i++)
+    {
+      assert (myshape.pts->x[i] == 2);
+      assert (myshape.pts->y[i] == 2);
+      assert (myshape.pts->z[i] == 2);
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/declare-mapper-2.C b/libgomp/testsuite/libgomp.c++/declare-mapper-2.C
new file mode 100644
index 0000000..d848fdb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/declare-mapper-2.C
@@ -0,0 +1,55 @@
+// { dg-do run }
+
+#include <cassert>
+
+#define N 256
+
+struct doublebuf
+{
+  int buf_a[N][N];
+  int buf_b[N][N];
+};
+
+#pragma omp declare mapper(lo:doublebuf b) map(b.buf_a[0:N/2][0:N]) \
+					   map(b.buf_b[0:N/2][0:N])
+
+#pragma omp declare mapper(hi:doublebuf b) map(b.buf_a[N/2:N/2][0:N]) \
+					   map(b.buf_b[N/2:N/2][0:N])
+
+int main (int argc, char *argv[])
+{
+  doublebuf db;
+
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      db.buf_a[i][j] = db.buf_b[i][j] = 0;
+
+  #pragma omp target map(mapper(lo), tofrom:db)
+  {
+    for (int i = 0; i < N / 2; i++)
+      for (int j = 0; j < N; j++)
+	{
+	  db.buf_a[i][j]++;
+	  db.buf_b[i][j]++;
+	}
+  }
+
+  #pragma omp target map(mapper(hi), tofrom:db)
+  {
+    for (int i = N / 2; i < N; i++)
+      for (int j = 0; j < N; j++)
+	{
+	  db.buf_a[i][j]++;
+	  db.buf_b[i][j]++;
+	}
+  }
+
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      {
+	assert (db.buf_a[i][j] == 1);
+	assert (db.buf_b[i][j] == 1);
+      }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/declare-mapper-3.C b/libgomp/testsuite/libgomp.c++/declare-mapper-3.C
new file mode 100644
index 0000000..ea9b7de
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/declare-mapper-3.C
@@ -0,0 +1,63 @@
+// { dg-do run }
+
+#include <cstdlib>
+#include <cassert>
+
+struct S {
+  int *myarr;
+};
+
+#pragma omp declare mapper (S s) map(to:s.myarr) map (tofrom: s.myarr[0:20])
+
+namespace A {
+#pragma omp declare mapper (S s) map(to:s.myarr) map (tofrom: s.myarr[0:100])
+}
+
+namespace B {
+#pragma omp declare mapper (S s) map(to:s.myarr) map (tofrom: s.myarr[100:100])
+}
+
+namespace A
+{
+  void incr_a (S my_s)
+  {
+#pragma omp target
+    {
+      for (int i = 0; i < 100; i++)
+	my_s.myarr[i]++;
+    }
+  }
+}
+
+namespace B
+{
+  void incr_b (S my_s)
+  {
+#pragma omp target
+    {
+      for (int i = 100; i < 200; i++)
+	my_s.myarr[i]++;
+    }
+  }
+}
+
+int main (int argc, char *argv[])
+{
+  S my_s;
+
+  my_s.myarr = (int *) calloc (200, sizeof (int));
+
+#pragma omp target
+  {
+    for (int i = 0; i < 20; i++)
+      my_s.myarr[i]++;
+  }
+
+  A::incr_a (my_s);
+  B::incr_b (my_s);
+
+  /* [0,20) is bumped twice (here and in A::incr_a), everything else once.  */
+  for (int i = 0; i < 200; i++)
+    assert (my_s.myarr[i] == ((i < 20) ? 2 : 1));
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/declare-mapper-4.C b/libgomp/testsuite/libgomp.c++/declare-mapper-4.C
new file mode 100644
index 0000000..f194e63
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/declare-mapper-4.C
@@ -0,0 +1,63 @@
+// { dg-do run }
+
+#include <cstdlib>
+#include <cassert>
+
+struct S {
+  int *myarr;
+};
+
+#pragma omp declare mapper (S s) map(to:s.myarr) map (tofrom: s.myarr[0:20])
+
+namespace A {
+#pragma omp declare mapper (S s) map(to:s.myarr) map (tofrom: s.myarr[0:100])
+}
+
+namespace B {
+#pragma omp declare mapper (S s) map(to:s.myarr) map (tofrom: s.myarr[100:100])
+}
+
+namespace A
+{
+  void incr_a (S &my_s)
+  {
+#pragma omp target
+    {
+      for (int i = 0; i < 100; i++)
+	my_s.myarr[i]++;
+    }
+  }
+}
+
+namespace B
+{
+  void incr_b (S &my_s)
+  {
+#pragma omp target
+    {
+      for (int i = 100; i < 200; i++)
+	my_s.myarr[i]++;
+    }
+  }
+}
+
+int main (int argc, char *argv[])
+{
+  S my_s;
+
+  my_s.myarr = (int *) calloc (200, sizeof (int));
+
+#pragma omp target
+  {
+    for (int i = 0; i < 20; i++)
+      my_s.myarr[i]++;
+  }
+
+  A::incr_a (my_s);
+  B::incr_b (my_s);
+
+  /* [0,20) is bumped twice (here and in A::incr_a), everything else once.  */
+  for (int i = 0; i < 200; i++)
+    assert (my_s.myarr[i] == ((i < 20) ? 2 : 1));
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/declare-mapper-5.C b/libgomp/testsuite/libgomp.c++/declare-mapper-5.C
new file mode 100644
index 0000000..0030de8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/declare-mapper-5.C
@@ -0,0 +1,52 @@
+// { dg-do run }
+
+#include <cassert>
+
+struct S
+{
+  int *myarr;
+  int len;
+};
+
+class C
+{
+  S smemb;
+#pragma omp declare mapper (custom:S s) map(to:s.myarr) \
+					map(tofrom:s.myarr[0:s.len])
+
+public:
+  C(int l)
+  {
+    smemb.myarr = new int[l];
+    smemb.len = l;
+    for (int i = 0; i < l; i++)
+      smemb.myarr[i] = 0;
+  }
+  void bump();
+  void check();
+};
+
+void
+C::bump ()
+{
+#pragma omp target map(mapper(custom), tofrom: smemb)
+  {
+    for (int i = 0; i < smemb.len; i++)
+      smemb.myarr[i]++;
+  }
+}
+
+void
+C::check ()
+{
+  for (int i = 0; i < smemb.len; i++)
+    assert (smemb.myarr[i] == 1);
+}
+
+int main (int argc, char *argv[])
+{
+  C test (100);
+  test.bump ();
+  test.check ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/declare-mapper-6.C b/libgomp/testsuite/libgomp.c++/declare-mapper-6.C
new file mode 100644
index 0000000..14ed10d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/declare-mapper-6.C
@@ -0,0 +1,37 @@
+// { dg-do run }
+
+#include <cassert>
+
+template <typename T>
+void adjust (T param)
+{
+#pragma omp declare mapper (T x) map(to:x.len, x.base) \
+				 map(tofrom:x.base[0:x.len])
+
+#pragma omp target
+  for (int i = 0; i < param.len; i++)
+    param.base[i]++;
+}
+
+struct S {
+  int len;
+  int *base;
+};
+
+int main (int argc, char *argv[])
+{
+  S a;
+
+  a.len = 100;
+  a.base = new int[a.len];
+
+  for (int i = 0; i < a.len; i++)
+    a.base[i] = 0;
+
+  adjust (a);
+
+  for (int i = 0; i < a.len; i++)
+    assert (a.base[i] == 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/declare-mapper-7.C b/libgomp/testsuite/libgomp.c++/declare-mapper-7.C
new file mode 100644
index 0000000..ab63209
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/declare-mapper-7.C
@@ -0,0 +1,48 @@
+// { dg-do run }
+
+#include <cassert>
+
+struct S
+{
+  int *myarr;
+};
+
+struct T
+{
+  S *s;
+};
+
+#pragma omp declare mapper (s100: S x) map(to: x.myarr) \
+				       map(tofrom: x.myarr[0:100])
+
+void
+bump (T t)
+{
+  /* Here we have an implicit/default mapper invoking a named mapper.  We
+     need to make sure that can be located properly at gimplification
+     time.  */
+#pragma omp declare mapper (T t) map(to:t.s) map(mapper(s100), tofrom: t.s[0])
+
+#pragma omp target
+  for (int i = 0; i < 100; i++)
+    t.s->myarr[i]++;
+}
+
+int main (int argc, char *argv[])
+{
+  S my_s;
+  T my_t;
+
+  my_s.myarr = new int[100];
+  my_t.s = &my_s;
+
+  for (int i = 0; i < 100; i++)
+    my_s.myarr[i] = 0;
+
+  bump (my_t);
+
+  for (int i = 0; i < 100; i++)
+    assert (my_s.myarr[i] == 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/declare-mapper-8.C b/libgomp/testsuite/libgomp.c++/declare-mapper-8.C
new file mode 100644
index 0000000..3818e52
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/declare-mapper-8.C
@@ -0,0 +1,61 @@
+// { dg-do run }
+
+#include <cassert>
+
+struct S
+{
+  int *myarr;
+  int len;
+};
+
+template<typename T>
+class C
+{
+  T memb;
+#pragma omp declare mapper (T t) map(to:t.len, t.myarr) \
+				 map(tofrom:t.myarr[0:t.len])
+
+public:
+  C(int sz);
+  ~C();
+  void bump();
+  void check();
+};
+
+template<typename T>
+C<T>::C(int sz)
+{
+  memb.myarr = new int[sz];
+  for (int i = 0; i < sz; i++)
+    memb.myarr[i] = 0;
+  memb.len = sz;
+}
+
+template<typename T>
+C<T>::~C()
+{
+  delete[] memb.myarr;
+}
+
+template<typename T>
+void C<T>::bump()
+{
+#pragma omp target map(memb)
+  for (int i = 0; i < memb.len; i++)
+    memb.myarr[i]++;
+}
+
+template<typename T>
+void C<T>::check()
+{
+  for (int i = 0; i < memb.len; i++)
+    assert (memb.myarr[i] == 1);
+}
+
+int main(int argc, char *argv[])
+{
+  C<S> c_int(100);
+  c_int.bump();
+  c_int.check();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/delim-declare-variant-1.C b/libgomp/testsuite/libgomp.c++/delim-declare-variant-1.C
new file mode 100644
index 0000000..bf146dd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/delim-declare-variant-1.C
@@ -0,0 +1,29 @@
+/* { dg-additional-options "-foffload=disable" } */
+
+/* Check that variants within a "begin declare variant" directive
+   are attached to the correct overloaded function.  */
+
+int f (int x) { return x; }
+
+#pragma omp begin declare variant match (implementation={vendor("gnu")})
+int f (int x) { return -1; }
+#pragma omp end declare variant
+
+int f (int x, int y) { return x * y; }
+
+#pragma omp begin declare variant match (construct={target})
+int f (int x, int y) { return -2; }
+#pragma omp end declare variant
+
+int f (int x, int y, int z) { return x * y * z; }
+
+#pragma omp begin declare variant match (device={kind("host")})
+int f (int x, int y, int z) { return -3; }
+#pragma omp end declare variant
+
+int main (void)
+{
+  if (f (10) != -1) __builtin_abort ();
+  if (f (10, 20) != 200) __builtin_abort ();   /* no match on this one */
+  if (f (10, 20, 30) != -3) __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/delim-declare-variant-2.C b/libgomp/testsuite/libgomp.c++/delim-declare-variant-2.C
new file mode 100644
index 0000000..6641768
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/delim-declare-variant-2.C
@@ -0,0 +1,37 @@
+/* Check that "omp begin declare variant" works on methods in a
+   class declaration.  */
+
+class test1 {
+
+ private:
+  int n;
+  static int m;
+
+ public:
+
+  void set_n (int x) { n = x; }
+  int get_n (void) { return n; }
+
+  static void set_m (int x) { m = x; }
+  static int get_m (void) { return m; }
+
+  #pragma omp begin declare variant match (implementation={vendor("gnu")})
+  int get_n (void) { return n * 2; }
+  static int get_m (void) { return m * 2; }
+  #pragma omp end declare variant
+
+  #pragma omp begin declare variant match (construct={target})
+  int get_n (void) { return this->n * 2; }
+  #pragma omp end declare variant
+};
+
+int test1::m;
+
+int main (void)
+{
+  test1 t1;
+  t1.set_n (10);
+  if (t1.get_n () != 20) __builtin_abort ();
+  test1::set_m (1);
+  if (test1::get_m () != 2) __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/delim-declare-variant-7.C b/libgomp/testsuite/libgomp.c++/delim-declare-variant-7.C
new file mode 100644
index 0000000..60cc5d8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/delim-declare-variant-7.C
@@ -0,0 +1,39 @@
+/* Check that "omp begin declare variant" works on methods in a template
+   class declaration.  */
+
+template <typename T>
+class test1 {
+
+ private:
+  T n;
+  static T m;
+
+ public:
+
+  void set_n (T x) { n = x; }
+  T get_n (void) { return n; }
+
+  static void set_m (T x) { m = x; }
+  static T get_m (void) { return m; }
+
+  #pragma omp begin declare variant match (implementation={vendor("gnu")})
+  T get_n (void) { return n * 2; }
+  static T get_m (void) { return m * 2; }
+  #pragma omp end declare variant
+
+  #pragma omp begin declare variant match (construct={target})
+  T get_n (void) { return this->n * 2; }
+  #pragma omp end declare variant
+};
+
+template <typename T>
+T test1<T>::m;
+
+int main (void)
+{
+  test1<int> t1;
+  t1.set_n (10);
+  if (t1.get_n () != 20) __builtin_abort ();
+  test1<int>::set_m (1);
+  if (test1<int>::get_m () != 2) __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/need-device-ptr.C b/libgomp/testsuite/libgomp.c++/need-device-ptr.C
new file mode 100644
index 0000000..d7babff
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/need-device-ptr.C
@@ -0,0 +1,175 @@
+// Test the need_device_ptr and need_device_addr modifiers to the adjust_args clause
+
+#include <omp.h>
+
+void fptr_var (int *x1, int *x2, int *x3, int **x3a, int *x4, int *x5, int *x6, int **x6a)
+{
+  #pragma omp target is_device_ptr (x1)
+  { if (*x1 != 1) __builtin_abort (); *x1 *= -1; }
+
+  #pragma omp target is_device_ptr (x2)
+  { if (*x2 != 2) __builtin_abort (); *x2 *= -1; }
+
+  #pragma omp target is_device_ptr (x3)
+  { if (*x3 != 3) __builtin_abort (); *x3 *= -1; }
+
+  #pragma omp target is_device_ptr (x3a)
+  { if (**x3a != 30) __builtin_abort (); **x3a *= -1; }
+
+  #pragma omp target is_device_ptr (x4)
+  { if (*x4 != 4) __builtin_abort (); *x4 *= -1; }
+
+  #pragma omp target is_device_ptr (x5)
+  { if (*x5 != 5) __builtin_abort (); *x5 *= -1; }
+
+  #pragma omp target is_device_ptr (x6)
+  { if (*x6 != 6) __builtin_abort (); *x6 *= -1; }
+
+  #pragma omp target is_device_ptr (x6a)
+  { if (**x6a != 60) __builtin_abort (); **x6a *= -1; }
+}
+
+#pragma omp declare variant(fptr_var) match(construct={dispatch}) adjust_args (need_device_ptr : 1:8)
+void fptr (int *x1, int *x2, int *x3, int **x3a, int *x4, int *x5, int *x6, int **x6a);
+
+void faddr_var (int &x1, int &x2, int &x3, int *&x3a, int &x4, int &x5, int &x6, int *&x6a)
+{
+  #pragma omp target has_device_addr (x1)
+  { if (x1 != 1) __builtin_abort (); x1 *= -1; }
+
+  #pragma omp target has_device_addr (x2)
+  { if (x2 != 2) __builtin_abort (); x2 *= -1; }
+
+  #pragma omp target has_device_addr (x3)
+  { if (x3 != 3) __builtin_abort (); x3 *= -1; }
+
+  #pragma omp target has_device_addr (x3a)
+  { if (*x3a != 30) __builtin_abort (); *x3a *= -1; }
+
+  #pragma omp target has_device_addr (x4)
+  { if (x4 != 4) __builtin_abort (); x4 *= -1; }
+
+  #pragma omp target has_device_addr (x5)
+  { if (x5 != 5) __builtin_abort (); x5 *= -1; }
+
+  #pragma omp target has_device_addr (x6)
+  { if (x6 != 6) __builtin_abort (); x6 *= -1; }
+
+  #pragma omp target has_device_addr (x6a)
+  { if (*x6a != 60) __builtin_abort (); *x6a *= -1; }
+}
+
+#pragma omp declare variant(faddr_var) match(construct={dispatch}) adjust_args (need_device_addr : 1:8)
+void faddr (int &x1, int &x2, int &x3, int *&, int &x4, int &x5, int &x6, int *&);
+
+void caller_ptr(int x, int &y, int *z, int *zptr)
+{
+  int a = 4;
+  int bval = 5;
+  int &b = bval;
+  int *c = (int*) __builtin_malloc (sizeof (int));
+  int *cptr;
+  *c = 6;
+
+  zptr = (int *) omp_target_alloc (sizeof (int), omp_get_default_device ()); 
+  cptr = (int *) omp_target_alloc (sizeof (int), omp_get_default_device ()); 
+
+  #pragma omp target is_device_ptr(cptr, zptr)
+  {
+    *zptr = 30;
+    *cptr = 60;
+  }
+
+  #pragma omp target enter data map(x, a, b, c[:1], cptr, zptr)
+
+  #pragma omp dispatch
+  fptr (&x, &y, z, &zptr, &a, &b, c, &cptr);
+
+  #pragma omp target exit data map(x, a, b, c[:1], cptr, zptr)
+  #pragma omp target update from(y, z[:1])
+
+  if (x != -1) __builtin_abort ();
+  if (y != -2) __builtin_abort ();
+  if (*z != -3) __builtin_abort ();
+
+  if (a != -4) __builtin_abort ();
+  if (b != -5) __builtin_abort ();
+  if (*c != -6) __builtin_abort ();
+
+  #pragma omp target is_device_ptr(cptr, zptr)
+  {
+    if (*zptr != -30) __builtin_abort ();
+    if (*cptr != -60) __builtin_abort ();
+  }
+
+  __builtin_free (c);
+  omp_target_free (cptr, omp_get_default_device ());
+  omp_target_free (zptr, omp_get_default_device ());
+}
+
+void caller_addr(int x, int &y, int *z, int *zptr)
+{
+  int a = 4;
+  int bval = 5;
+  int &b = bval;
+  int *c = (int*) __builtin_malloc (sizeof (int));
+  int *cptr;
+  *c = 6;
+
+  zptr = (int *) omp_target_alloc (sizeof (int), omp_get_default_device ());
+  cptr = (int *) omp_target_alloc (sizeof (int), omp_get_default_device ());
+
+  #pragma omp target is_device_ptr(cptr, zptr)
+  {
+    *zptr = 30;
+    *cptr = 60;
+  }
+
+  #pragma omp target enter data map(x, a, b, c[:1], cptr, zptr)
+
+  #pragma omp dispatch
+  faddr (x, y, *z, zptr, a, b, *c, cptr);
+
+  #pragma omp target exit data map(x, a, b, c[:1], cptr, zptr)
+  #pragma omp target update from(y, z[:1])
+
+  if (x != -1) __builtin_abort ();
+  if (y != -2) __builtin_abort ();
+  if (*z != -3) __builtin_abort ();
+
+  if (a != -4) __builtin_abort ();
+  if (b != -5) __builtin_abort ();
+  if (*c != -6) __builtin_abort ();
+
+  #pragma omp target is_device_ptr(cptr, zptr)
+  {
+    if (*zptr != -30) __builtin_abort ();
+    if (*cptr != -60) __builtin_abort ();
+  }
+  __builtin_free (c);
+  omp_target_free (cptr, omp_get_default_device ());  // Release the device buffers allocated above.
+  omp_target_free (zptr, omp_get_default_device ());
+}
+
+int
+main ()
+{
+  int x = 1;
+  int yval = 2;
+  int &y = yval;
+  int *z = (int *) __builtin_malloc (sizeof (int));
+  int *zptr = nullptr;  // Callees assign their own copy; don't pass an indeterminate value.
+  *z = 3;
+
+  #pragma omp target data map(y, z[:1])
+    caller_ptr (x, y, z, zptr);
+
+  x = 1;
+  y = 2;
+  *z = 3;
+
+  #pragma omp target data map(y, z[:1], zptr)
+    caller_addr (x, y, z, zptr);
+
+  __builtin_free (z);
+}
diff --git a/libgomp/testsuite/libgomp.c++/pr119692-1-4.C b/libgomp/testsuite/libgomp.c++/pr119692-1-4.C
index 6995f26..af9fe1c 100644
--- a/libgomp/testsuite/libgomp.c++/pr119692-1-4.C
+++ b/libgomp/testsuite/libgomp.c++/pr119692-1-4.C
@@ -3,6 +3,9 @@
 /* { dg-additional-options -DDEFAULT=defaultmap(firstprivate) }
    Wrong code for offloading execution.
    { dg-xfail-run-if PR119692 { offload_device } } */
+/* There are configurations where we 'WARNING: program timed out.' while in
+   'dynamic_cast', see <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119692#c6>.
+   { dg-timeout 10 } ... to make sure that happens quickly.  */
 /* { dg-additional-options -fdump-tree-gimple } */
 
 #include "pr119692-1-1.C"
diff --git a/libgomp/testsuite/libgomp.c++/pr119692-1-5.C b/libgomp/testsuite/libgomp.c++/pr119692-1-5.C
index 02121b6..e5c6e07 100644
--- a/libgomp/testsuite/libgomp.c++/pr119692-1-5.C
+++ b/libgomp/testsuite/libgomp.c++/pr119692-1-5.C
@@ -3,6 +3,9 @@
 /* { dg-additional-options -DDEFAULT=defaultmap(to) }
    Wrong code for offloading execution.
    { dg-xfail-run-if PR119692 { offload_device } } */
+/* There are configurations where we 'WARNING: program timed out.' while in
+   'dynamic_cast', see <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119692#c6>.
+   { dg-timeout 10 } ... to make sure that happens quickly.  */
 /* { dg-additional-options -fdump-tree-gimple } */
 
 #include "pr119692-1-1.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-cdtor-1.C b/libgomp/testsuite/libgomp.c++/target-cdtor-1.C
new file mode 100644
index 0000000..ecb029e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-cdtor-1.C
@@ -0,0 +1,104 @@
+/* Offloaded C++ objects construction and destruction.  */
+
+/* { dg-additional-options -fdump-tree-optimized-raw-asmname }
+   { dg-additional-options -foffload-options=-fdump-tree-optimized-raw-asmname } */
+
+#include <omp.h>
+#include <vector>
+
+#pragma omp declare target
+
+struct S
+{
+  int x;
+
+  S()
+    : x(-1)
+  {
+    __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device());
+  }
+  S(int x)
+    : x(x)
+  {
+    __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device());
+  }
+  ~S()
+  {
+    __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device());
+  }
+};
+
+#pragma omp end declare target
+
+S sH1(7);
+
+#pragma omp declare target
+
+S sHD1(5);
+
+std::vector<S> svHD1(2);
+
+#pragma omp end declare target
+
+S sH2(3);
+
+int main()
+{
+  int c = 0;
+
+  __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+
+#pragma omp target map(c)
+  {
+    __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+  }
+
+#pragma omp target map(c)
+  {
+    __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+  }
+
+  __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+
+  return 0;
+}
+
+/* Verify '__cxa_atexit' calls.
+
+   For the host, there are four expected calls:
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, } 4 optimized { target cxa_atexit } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sH1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZNSt6vectorI1SSaIS0_EED1Ev, \&svHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sH2, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+
+   For the device, there are two expected calls:
+   { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, } 2 optimized { target cxa_atexit } } }
+   { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+   { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZNSt6vectorI1SSaIS0_EED1Ev, \&svHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+*/
+
+/* C++ objects are constructed in order of appearance (..., and destructed in reverse order).
+   { dg-output {S, 7, 1[\r\n]+} }
+   { dg-output {S, 5, 1[\r\n]+} }
+   { dg-output {S, -1, 1[\r\n]+} }
+   { dg-output {S, -1, 1[\r\n]+} }
+   { dg-output {S, 3, 1[\r\n]+} }
+   { dg-output {main:1, 1[\r\n]+} }
+   { dg-output {S, 5, 0[\r\n]+} { target offload_device } }
+   { dg-output {S, -1, 0[\r\n]+} { target offload_device } }
+   { dg-output {S, -1, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:2, 1[\r\n]+} { target { ! offload_device } } }
+   { dg-output {main:2, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:3, 1[\r\n]+} { target { ! offload_device } } }
+   { dg-output {main:3, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:4, 1[\r\n]+} }
+   { dg-output {~S, -1, 0[\r\n]+} { target offload_device } }
+   { dg-output {~S, -1, 0[\r\n]+} { target offload_device } }
+   { dg-output {~S, 5, 0[\r\n]+} { target offload_device } }
+   { dg-output {~S, 3, 1[\r\n]+} }
+   { dg-output {~S, -1, 1[\r\n]+} }
+   { dg-output {~S, -1, 1[\r\n]+} }
+   { dg-output {~S, 5, 1[\r\n]+} }
+   { dg-output {~S, 7, 1[\r\n]+} }
+*/
diff --git a/libgomp/testsuite/libgomp.c++/target-cdtor-2.C b/libgomp/testsuite/libgomp.c++/target-cdtor-2.C
new file mode 100644
index 0000000..75e48ca
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-cdtor-2.C
@@ -0,0 +1,140 @@
+/* Offloaded 'constructor' and 'destructor' functions, and C++ objects construction and destruction.  */
+
+/* { dg-require-effective-target init_priority } */
+
+/* { dg-additional-options -fdump-tree-optimized-raw-asmname }
+   { dg-additional-options -foffload-options=-fdump-tree-optimized-raw-asmname } */
+
+#include <omp.h>
+#include <vector>
+
+#pragma omp declare target
+
+struct S
+{
+  int x;
+
+  S()
+    : x(-1)
+  {
+    __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device());
+  }
+  S(int x)
+    : x(x)
+  {
+    __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device());
+  }
+  ~S()
+  {
+    __builtin_printf("%s, %d, %d\n", __FUNCTION__, x, omp_is_initial_device());
+  }
+};
+
+#pragma omp end declare target
+
+S sH1 __attribute__((init_priority(1500))) (7);
+
+#pragma omp declare target
+
+S sHD1 __attribute__((init_priority(2000))) (5);
+
+std::vector<S> svHD1 __attribute__((init_priority(1000))) (2);
+
+static void
+__attribute__((constructor(20000)))
+initDH1()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+static void
+__attribute__((destructor(20000)))
+finiDH1()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+#pragma omp end declare target
+
+S sH2 __attribute__((init_priority(500))) (3);
+
+static void
+__attribute__((constructor(10000)))
+initH1()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+static void
+__attribute__((destructor(10000)))
+finiH1()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+int main()
+{
+  int c = 0;
+
+  __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+
+#pragma omp target map(c)
+  {
+    __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+  }
+
+#pragma omp target map(c)
+  {
+    __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+  }
+
+  __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+
+  return 0;
+}
+
+/* Verify '__cxa_atexit' calls.
+
+   For the host, there are four expected calls:
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, } 4 optimized { target cxa_atexit } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sH1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZNSt6vectorI1SSaIS0_EED1Ev, \&svHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sH2, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+
+   For the device, there are two expected calls:
+   { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, } 2 optimized { target cxa_atexit } } }
+   { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZN1SD1Ev, \&sHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+   { dg-final { scan-offload-tree-dump-times {gimple_call <__cxa_atexit, NULL, _ZNSt6vectorI1SSaIS0_EED1Ev, \&svHD1, \&__dso_handle>} 1 optimized { target cxa_atexit } } }
+*/
+
+/* Defined order in which 'constructor' functions, and 'destructor' functions are run, and C++ objects are constructed (..., and destructed in reverse order).
+   { dg-output {S, 3, 1[\r\n]+} }
+   { dg-output {S, -1, 1[\r\n]+} }
+   { dg-output {S, -1, 1[\r\n]+} }
+   { dg-output {S, 7, 1[\r\n]+} }
+   { dg-output {S, 5, 1[\r\n]+} }
+   { dg-output {initH1, 1[\r\n]+} }
+   { dg-output {initDH1, 1[\r\n]+} }
+   { dg-output {main:1, 1[\r\n]+} }
+   { dg-output {S, -1, 0[\r\n]+} { target offload_device } }
+   { dg-output {S, -1, 0[\r\n]+} { target offload_device } }
+   { dg-output {S, 5, 0[\r\n]+} { target offload_device } }
+   { dg-output {initDH1, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:2, 1[\r\n]+} { target  { ! offload_device } } }
+   { dg-output {main:2, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:3, 1[\r\n]+} { target { ! offload_device } } }
+   { dg-output {main:3, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:4, 1[\r\n]+} }
+   { dg-output {~S, 5, 0[\r\n]+} { target offload_device } }
+   { dg-output {~S, -1, 0[\r\n]+} { target offload_device } }
+   { dg-output {~S, -1, 0[\r\n]+} { target offload_device } }
+   { dg-output {finiDH1, 0[\r\n]+} { target offload_device } }
+   { dg-output {~S, 5, 1[\r\n]+} }
+   { dg-output {~S, 7, 1[\r\n]+} }
+   { dg-output {~S, -1, 1[\r\n]+} }
+   { dg-output {~S, -1, 1[\r\n]+} }
+   { dg-output {~S, 3, 1[\r\n]+} }
+   { dg-output {finiDH1, 1[\r\n]+} }
+   { dg-output {finiH1, 1[\r\n]+} }
+*/
diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-bad_cast-1.C b/libgomp/testsuite/libgomp.c++/target-exceptions-bad_cast-1.C
index 3848295..a862652 100644
--- a/libgomp/testsuite/libgomp.c++/target-exceptions-bad_cast-1.C
+++ b/libgomp/testsuite/libgomp.c++/target-exceptions-bad_cast-1.C
@@ -23,3 +23,6 @@
    PR119692.
 
    { dg-shouldfail {'std::bad_cast' exception} } */
+/* There are configurations where we 'WARNING: program timed out.' while in
+   'dynamic_cast', see <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119692#c6>.
+   { dg-timeout 10 } ... to make sure that happens quickly.  */
diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-bad_cast-2.C b/libgomp/testsuite/libgomp.c++/target-exceptions-bad_cast-2.C
index 8861740..ff15c9f 100644
--- a/libgomp/testsuite/libgomp.c++/target-exceptions-bad_cast-2.C
+++ b/libgomp/testsuite/libgomp.c++/target-exceptions-bad_cast-2.C
@@ -22,3 +22,6 @@
 
    For GCN, nvptx offload execution, there is no 'catch'ing; any exception is fatal.
    { dg-shouldfail {'MyException' exception} { offload_device } } */
+/* There are configurations where we 'WARNING: program timed out.' while in
+   'dynamic_cast', see <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119692#c6>.
+   { dg-timeout 10 } ... to make sure that happens quickly.  */
diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C
index 3cdedf4..d4dccf1 100644
--- a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C
+++ b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-GCN.C
@@ -14,8 +14,10 @@
 
 /* In this specific C++ arrangement, distilled from PR118794, GCC synthesizes
    '__builtin_eh_pointer', '__builtin_unwind_resume' calls as dead code in 'f':
-   { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } }
-   { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } }
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } }
+   ..., just 'targetm.arm_eabi_unwinder' is different:
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_cxa_end_cleanup, } 1 optimized { target { arm_eabi || tic6x-*-* } } } }
    { dg-final { only_for_offload_target amdgcn-amdhsa scan-offload-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } }
    { dg-final { only_for_offload_target amdgcn-amdhsa scan-offload-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } }
    Given '-O0' and '-foffload-options=-mno-fake-exceptions', offload compilation fails:
diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C
index ef996cf..724e34b 100644
--- a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C
+++ b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1-offload-sorry-nvptx.C
@@ -14,8 +14,10 @@
 
 /* In this specific C++ arrangement, distilled from PR118794, GCC synthesizes
    '__builtin_eh_pointer', '__builtin_unwind_resume' calls as dead code in 'f':
-   { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } }
-   { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } }
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } }
+   ..., just 'targetm.arm_eabi_unwinder' is different:
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_cxa_end_cleanup, } 1 optimized { target { arm_eabi || tic6x-*-* } } } }
    { dg-final { only_for_offload_target nvptx-none scan-offload-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } }
    { dg-final { only_for_offload_target nvptx-none scan-offload-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } }
    Given '-O0' and '-foffload-options=-mno-fake-exceptions', offload compilation fails:
diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1.C b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1.C
index 24e3d07..24eb7a5 100644
--- a/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1.C
+++ b/libgomp/testsuite/libgomp.c++/target-exceptions-pr118794-1.C
@@ -51,7 +51,9 @@ int main()
 
 /* In this specific C++ arrangement, distilled from PR118794, GCC synthesizes
    '__builtin_eh_pointer', '__builtin_unwind_resume' calls as dead code in 'f':
-   { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } }
-   { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } }
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } }
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized { target { ! { arm_eabi || tic6x-*-* } } } } }
+   ..., just 'targetm.arm_eabi_unwinder' is different:
+   { dg-final { scan-tree-dump-times {gimple_call <__builtin_cxa_end_cleanup, } 1 optimized { target { arm_eabi || tic6x-*-* } } } }
    { dg-final { scan-offload-tree-dump-times {gimple_call <__builtin_eh_pointer, } 1 optimized } }
    { dg-final { scan-offload-tree-dump-times {gimple_call <__builtin_unwind_resume, } 1 optimized } } */
diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-throw-1.C b/libgomp/testsuite/libgomp.c++/target-exceptions-throw-1.C
index 2467061..a4e7a10 100644
--- a/libgomp/testsuite/libgomp.c++/target-exceptions-throw-1.C
+++ b/libgomp/testsuite/libgomp.c++/target-exceptions-throw-1.C
@@ -4,9 +4,6 @@
    { dg-additional-options -fexceptions } */
 /* { dg-additional-options -fdump-tree-optimized-raw }
    { dg-additional-options -foffload-options=-fdump-tree-optimized-raw } */
-/* { dg-bogus {Size expression must be absolute\.} PR119737 { target offload_target_amdgcn xfail *-*-* } 0 }
-   { dg-ice PR119737 { offload_target_amdgcn } }
-   { dg-excess-errors {'mkoffload' failures etc.} { xfail offload_target_amdgcn } } */
 
 #include "../libgomp.oacc-c++/exceptions-throw-1.C"
 
diff --git a/libgomp/testsuite/libgomp.c++/target-exceptions-throw-2.C b/libgomp/testsuite/libgomp.c++/target-exceptions-throw-2.C
index e85e6c3..97f4845 100644
--- a/libgomp/testsuite/libgomp.c++/target-exceptions-throw-2.C
+++ b/libgomp/testsuite/libgomp.c++/target-exceptions-throw-2.C
@@ -4,9 +4,6 @@
    { dg-additional-options -fexceptions } */
 /* { dg-additional-options -fdump-tree-optimized-raw }
    { dg-additional-options -foffload-options=-fdump-tree-optimized-raw } */
-/* { dg-bogus {Size expression must be absolute\.} PR119737 { target offload_target_amdgcn xfail *-*-* } 0 }
-   { dg-ice PR119737 { offload_target_amdgcn } }
-   { dg-excess-errors {'mkoffload' failures etc.} { xfail offload_target_amdgcn } } */
 
 #include "../libgomp.oacc-c++/exceptions-throw-2.C"
 
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-10.C b/libgomp/testsuite/libgomp.c++/target-flex-10.C
new file mode 100644
index 0000000..8fa9af7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-10.C
@@ -0,0 +1,215 @@
+/* Basic container usage.  */
+
+#include <vector>
+#include <deque>
+#include <list>
+#include <set>
+#include <map>
+#if __cplusplus >= 201103L
+#include <array>
+#include <forward_list>
+#include <unordered_set>
+#include <unordered_map>
+#endif
+
+bool vector_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::vector<int> vector;
+      ok = vector.empty();
+    }
+  return ok;
+}
+
+bool deque_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::deque<int> deque;
+      ok = deque.empty();
+    }
+  return ok;
+}
+
+bool list_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::list<int> list;
+      ok = list.empty();
+    }
+  return ok;
+}
+
+bool map_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::map<int, int> map;
+      ok = map.empty();
+    }
+  return ok;
+}
+
+bool set_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::set<int> set;
+      ok = set.empty();
+    }
+  return ok;
+}
+
+bool multimap_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::multimap<int, int> multimap;
+      ok = multimap.empty();
+    }
+  return ok;
+}
+
+bool multiset_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::multiset<int> multiset;  /* Default 'Compare'; 'int' is not a valid comparator type.  */
+      ok = multiset.empty();
+    }
+  return ok;
+}
+
+#if __cplusplus >= 201103L
+
+bool array_test()
+{
+  static constexpr std::size_t array_size = 42;
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::array<int, array_size> array{};
+      ok = array[0] == 0
+	   && array[array_size - 1] == 0;
+    }
+  return ok;
+}
+
+bool forward_list_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::forward_list<int> forward_list;
+      ok = forward_list.empty();
+    }
+  return ok;
+}
+
+bool unordered_map_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::unordered_map<int, int> unordered_map;
+      ok = unordered_map.empty();
+    }
+  return ok;
+}
+
+bool unordered_set_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::unordered_set<int> unordered_set;
+      ok = unordered_set.empty();
+    }
+  return ok;
+}
+
+bool unordered_multimap_test()
+{
+
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::unordered_multimap<int, int> unordered_multimap;
+      ok = unordered_multimap.empty();
+    }
+  return ok;
+}
+
+bool unordered_multiset_test()
+{
+
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      std::unordered_multiset<int> unordered_multiset;
+      ok = unordered_multiset.empty();
+    }
+  return ok;
+}
+
+#else
+bool array_test() { return true; }
+bool forward_list_test() { return true; }
+bool unordered_map_test() { return true; }
+bool unordered_set_test() { return true; }
+bool unordered_multimap_test() { return true; }
+bool unordered_multiset_test() { return true; }
+#endif
+
+int main()
+{
+  const bool vec_res                = vector_test();
+  __builtin_printf("vector            : %s\n", vec_res                ? "PASS" : "FAIL");
+  const bool deque_res              = deque_test();
+  __builtin_printf("deque             : %s\n", deque_res              ? "PASS" : "FAIL");
+  const bool list_res               = list_test();
+  __builtin_printf("list              : %s\n", list_res               ? "PASS" : "FAIL");
+  const bool map_res                = map_test();
+  __builtin_printf("map               : %s\n", map_res                ? "PASS" : "FAIL");
+  const bool set_res                = set_test();
+  __builtin_printf("set               : %s\n", set_res                ? "PASS" : "FAIL");
+  const bool multimap_res           = multimap_test();
+  __builtin_printf("multimap          : %s\n", multimap_res           ? "PASS" : "FAIL");
+  const bool multiset_res           = multiset_test();
+  __builtin_printf("multiset          : %s\n", multiset_res           ? "PASS" : "FAIL");
+  const bool array_res              = array_test();
+  __builtin_printf("array             : %s\n", array_res              ? "PASS" : "FAIL");
+  const bool forward_list_res       = forward_list_test();
+  __builtin_printf("forward_list      : %s\n", forward_list_res       ? "PASS" : "FAIL");
+  const bool unordered_map_res      = unordered_map_test();
+  __builtin_printf("unordered_map     : %s\n", unordered_map_res      ? "PASS" : "FAIL");
+  const bool unordered_set_res      = unordered_set_test();
+  __builtin_printf("unordered_set     : %s\n", unordered_set_res      ? "PASS" : "FAIL");
+  const bool unordered_multimap_res = unordered_multimap_test();
+  __builtin_printf("unordered_multimap: %s\n", unordered_multimap_res ? "PASS" : "FAIL");
+  const bool unordered_multiset_res = unordered_multiset_test();
+  __builtin_printf("unordered_multiset: %s\n", unordered_multiset_res ? "PASS" : "FAIL");
+  const bool ok = vec_res
+		  && deque_res
+		  && list_res
+		  && map_res
+		  && set_res
+		  && multimap_res
+		  && multiset_res
+		  && array_res
+		  && forward_list_res
+		  && unordered_map_res
+		  && unordered_set_res
+		  && unordered_multimap_res
+		  && unordered_multiset_res;
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-100.C b/libgomp/testsuite/libgomp.c++/target-flex-100.C
new file mode 100644
index 0000000..7ab047f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-100.C
@@ -0,0 +1,210 @@
+/* Container adaptors in target region.
+   Does not test comparison operators other than equality to allow these tests
+   to be generalized to arbitrary input data.  */
+
+#include <algorithm>
+#include <cstdio>
+#include <deque>
+#include <queue>
+#include <stack>
+#include <vector>
+
+#include "target-flex-common.h"
+
+template<typename T, std::size_t Size>
+bool test_stack(T (&arr)[Size])
+{
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      const std::size_t half_size = Size / 2;
+      const T first_element = arr[0];
+      const T middle_element = arr[half_size - 1];
+      const T last_element = arr[Size - 1];
+      typedef std::stack<T, std::vector<T> > stack_type;
+      stack_type stack;
+      VERIFY (stack.empty());
+      VERIFY (stack.size() == 0);
+      {
+	/* Do half with push.  */
+	std::size_t idx = 0;
+	for (; idx < half_size; ++idx)
+	  {
+	    stack.push(arr[idx]);
+	    VERIFY (stack.top() == arr[idx]);
+	  }
+	VERIFY (stack.size() == half_size);
+	VERIFY (static_cast<const stack_type&>(stack).size() == half_size);
+	for (; idx < Size; ++idx)
+	  {
+	    #if __cplusplus >= 201103L
+	      /* Do the rest with emplace if C++11 or higher.  */
+	      stack.emplace(arr[idx]);
+	    #else
+	      /* Otherwise just use push again.  */
+	      stack.push(arr[idx]);
+	    #endif
+	    VERIFY (stack.top() == arr[idx]);
+	  }
+	VERIFY (stack.size() == Size);
+	VERIFY (static_cast<const stack_type&>(stack).size() == Size);
+
+	const stack_type stack_orig = stack_type(std::vector<T>(arr, arr + Size));
+	VERIFY (stack == stack_orig);
+	/* References are contained in their own scope so we don't accidentally
+	   add tests referencing them after they have been invalidated.  */
+	{
+	  const T& const_top = static_cast<const stack_type&>(stack).top();
+	  VERIFY (const_top == last_element);
+	  T& mutable_top = stack.top();
+	  mutable_top = first_element;
+	  VERIFY (const_top == first_element);
+	}
+	/* Will only compare unequal if the first and last elements are different.  */
+	VERIFY (first_element != last_element || stack != stack_orig);
+	for (std::size_t count = Size - half_size; count != 0; --count)
+	  stack.pop();
+	VERIFY (stack.top() == middle_element);
+	const stack_type stack_half_orig = stack_type(std::vector<T>(arr, arr + half_size));
+	VERIFY (stack == stack_half_orig);
+      }
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+
+template<typename T, std::size_t Size>
+bool test_queue(T (&arr)[Size])
+{
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      const std::size_t half_size = Size / 2;
+      const T first_element = arr[0];
+      const T last_element = arr[Size - 1];
+      typedef std::queue<T, std::deque<T> > queue_type;
+      queue_type queue;
+      VERIFY (queue.empty());
+      VERIFY (queue.size() == 0);
+      {
+	/* Do half with push.  */
+	std::size_t idx = 0;
+	for (; idx < half_size; ++idx)
+	  {
+	    queue.push(arr[idx]);
+	    VERIFY (queue.back() == arr[idx]);
+	    VERIFY (queue.front() == first_element);
+	  }
+	VERIFY (queue.size() == half_size);
+	VERIFY (static_cast<const queue_type&>(queue).size() == half_size);
+	for (; idx < Size; ++idx)
+	  {
+	    #if __cplusplus >= 201103L
+	      /* Do the rest with emplace if C++11 or higher.  */
+	      queue.emplace(arr[idx]);
+	    #else
+	      /* Otherwise just use push again.  */
+	      queue.push(arr[idx]);
+	    #endif
+	    VERIFY (queue.back() == arr[idx]);
+	  }
+	VERIFY (queue.size() == Size);
+	VERIFY (static_cast<const queue_type&>(queue).size() == Size);
+
+	const queue_type queue_orig = queue_type(std::deque<T>(arr, arr + Size));
+	VERIFY (queue == queue_orig);
+
+	/* References are contained in their own scope so we don't accidentally
+	   add tests referencing them after they have been invalidated.  */
+	{
+	  const T& const_front = static_cast<const queue_type&>(queue).front();
+	  VERIFY (const_front == first_element);
+	  T& mutable_front = queue.front();
+
+	  const T& const_back = static_cast<const queue_type&>(queue).back();
+	  VERIFY (const_back == last_element);
+	  T& mutable_back = queue.back();
+	  {
+	    using std::swap;
+	    swap(mutable_front, mutable_back);
+	  }
+	  VERIFY (const_front == last_element);
+	  VERIFY (const_back == first_element);
+	  /* Will only compare unequal if the first and last elements are different.  */
+	  VERIFY (first_element != last_element || queue != queue_orig);
+	  /* Return the last element to normal for the next comparison.  */
+	  mutable_back = last_element;
+	}
+	/* Pop the lower half (half_size elements) so [half_size, Size) remains, also for odd Size.  */
+	const T middle_element = arr[half_size];
+	for (std::size_t count = half_size; count != 0; --count)
+	  queue.pop();
+	VERIFY (queue.front() == middle_element);
+	const queue_type queue_upper_half = queue_type(std::deque<T>(arr + half_size, arr + Size));
+	VERIFY (queue == queue_upper_half);
+      }
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+
+template<typename T, std::size_t Size>
+bool test_priority_queue(T (&arr)[Size], const T min_value, const T max_value)
+{
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      typedef std::priority_queue<T, std::vector<T> > priority_queue_type;
+      {
+	priority_queue_type pqueue;
+	VERIFY (pqueue.empty());
+	VERIFY (pqueue.size() == 0);
+      }
+      {
+	priority_queue_type pqueue(arr, arr + Size);
+	VERIFY (!pqueue.empty());
+	VERIFY (pqueue.size() == Size);
+	VERIFY (static_cast<const priority_queue_type&>(pqueue).size() == Size);
+
+	const T old_max = pqueue.top();
+
+	#if __cplusplus >= 201103L
+	  pqueue.emplace(max_value);
+	#else
+	  pqueue.push(max_value);
+	#endif
+	VERIFY (pqueue.top() == max_value);
+	pqueue.pop();
+	VERIFY (pqueue.top() == old_max);
+	pqueue.push(min_value);
+	VERIFY (pqueue.top() == old_max);
+	pqueue.push(max_value);
+	VERIFY (pqueue.top() == max_value);
+	pqueue.pop();
+	VERIFY (pqueue.top() == old_max);
+	VERIFY (pqueue.size() == Size + 1);
+
+	for (std::size_t count = Size; count != 0; --count)
+	  pqueue.pop();
+	VERIFY (pqueue.size() == 1);
+	VERIFY (pqueue.top() == min_value);
+      }
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+
+int main()
+{
+  int arr[10] = {0,1,2,3,4,5,6,7,8,9};
+
+  return test_stack(arr)
+	 && test_queue(arr)
+	 && test_priority_queue(arr, 0, 1000) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-101.C b/libgomp/testsuite/libgomp.c++/target-flex-101.C
new file mode 100644
index 0000000..be9037e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-101.C
@@ -0,0 +1,136 @@
+/* { dg-additional-options -std=c++23 } */
+
+/* C++23 container adaptors in target region.
+   Severely needs additional tests.  */
+
+#include <cstdio>
+#include <utility>
+#include <version>
+
+#if __cpp_lib_flat_map >= 202207L
+#define ENABLE_FLAT_MAP 1
+#endif
+#if __cpp_lib_flat_set >= 202207L
+#define ENABLE_FLAT_SET 1
+#endif
+
+#ifdef ENABLE_FLAT_MAP
+#include <flat_map>
+#endif
+#ifdef ENABLE_FLAT_SET
+#include <flat_set>
+#endif
+
+#include "target-flex-common.h"
+/* test_flat_map: build a std::flat_map from ARR on the device and look up every key.  */
+#ifdef ENABLE_FLAT_MAP
+template<typename K, typename V, typename std::size_t Size>
+bool test_flat_map(std::pair<K, V> (&arr)[Size])
+{
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	using flat_map_type = std::flat_map<K, V>;
+	flat_map_type map = {arr, arr + Size};  /* Iterator-range construction.  */
+
+	VERIFY (!map.empty());
+	for (const auto& element : arr)
+	  VERIFY (map.contains(element.first));
+      }
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* Build a std::flat_multimap from ARR on the device and look up every key; returns the verdict.  */
+template<typename K, typename V, typename std::size_t Size>
+bool test_flat_multimap(std::pair<K, V> (&arr)[Size])
+{
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	using flat_multimap_type = std::flat_multimap<K, V>;  /* Was std::flat_map by mistake.  */
+	flat_multimap_type multimap = {arr, arr + Size};  /* Iterator-range construction.  */
+
+	VERIFY (!multimap.empty());
+	for (const auto& element : arr)
+	  VERIFY (multimap.contains(element.first));
+      }
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+#else /* !ENABLE_FLAT_MAP: nothing to test, so both stubs report success.  */
+template<typename K, typename V, typename std::size_t Size>
+bool test_flat_map(std::pair<K, V> (&arr)[Size]) { return true; }
+
+template<typename K, typename V, typename std::size_t Size>
+bool test_flat_multimap(std::pair<K, V> (&arr)[Size]) { return true; }
+#endif /* ENABLE_FLAT_MAP */
+/* test_flat_set: build a std::flat_set from ARR on the device and look up every element.  */
+#ifdef ENABLE_FLAT_SET
+template<typename T, typename std::size_t Size>
+bool test_flat_set(T (&arr)[Size])
+{
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	using flat_set_type = std::flat_set<T>;
+	flat_set_type set = {arr, arr + Size};  /* Iterator-range construction.  */
+
+	VERIFY (!set.empty());
+	for (const auto& element : arr)
+	  VERIFY (set.contains(element));
+      }
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* Build a std::flat_multiset from ARR on the device and look up every element.  */
+template<typename T, typename std::size_t Size>
+bool test_flat_multiset(T (&arr)[Size])
+{
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	using flat_multiset_type = std::flat_multiset<T>;
+	flat_multiset_type multiset = {arr, arr + Size};  /* Iterator-range construction.  */
+
+	VERIFY (!multiset.empty());
+	for (const auto& element : arr)
+	  VERIFY (multiset.contains(element));
+      }
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+#else /* !ENABLE_FLAT_SET: nothing to test, so both stubs report success.  */
+template<typename T, typename std::size_t Size>
+bool test_flat_set(T (&arr)[Size]) { return true; }
+
+template<typename T, typename std::size_t Size>
+bool test_flat_multiset(T (&arr)[Size]) { return true; }
+#endif /* ENABLE_FLAT_SET */
+/* Run the flat container tests in turn; exit status 0 only if every one passes.  */
+int main()
+{
+  int arr[10] = {0,1,2,3,4,5,6,7,8,9};
+  std::pair<int, int> pairs[10] = {{ 1,  2}, { 2,  4}, { 3,  6}, { 4,  8}, { 5, 10},
+				   { 6, 12}, { 7, 14}, { 8, 16}, { 9, 18}, {10, 20}};
+  /* && short-circuits: later tests are skipped after the first failure.  */
+  if (test_flat_set(arr) && test_flat_multiset(arr)
+      && test_flat_map(pairs) && test_flat_multimap(pairs))
+    return 0;
+  return 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-11.C b/libgomp/testsuite/libgomp.c++/target-flex-11.C
new file mode 100644
index 0000000..6d55129
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-11.C
@@ -0,0 +1,444 @@
+/* Check constructors/destructors are called in containers.  */
+
+#include <vector>
+#include <deque>
+#include <list>
+#include <set>
+#include <map>
+#include <utility>
+#if __cplusplus >= 201103L
+#include <array>
+#include <forward_list>
+#include <unordered_set>
+#include <unordered_map>
+#endif
+
+#include "target-flex-common.h"
+/* Bumps *_count_ptr on every construction (copies included) and drops it on destruction.  */
+struct indirect_counter
+{
+  typedef int counter_value_type;
+  counter_value_type *_count_ptr;
+
+  indirect_counter(counter_value_type *count_ptr) BL_NOEXCEPT
+    : _count_ptr(count_ptr)
+  { *_count_ptr += 1; }
+  indirect_counter(const indirect_counter& other) BL_NOEXCEPT
+    : _count_ptr(other._count_ptr)
+  { *_count_ptr += 1; }
+  /* Don't declare a move constructor, we want to copy no matter what.  */
+  ~indirect_counter()
+  { *_count_ptr -= 1; }
+  /* NOTE(review): the implicit copy assignment rebinds _count_ptr without adjusting counts.  */
+};
+/* Identity is the tracked address.  Order via std::less: '<' on unrelated pointers is unspecified.  */
+bool operator==(indirect_counter const& lhs, indirect_counter const& rhs) BL_NOEXCEPT
+  { return lhs._count_ptr == rhs._count_ptr; }
+bool operator<(indirect_counter const& lhs, indirect_counter const& rhs) BL_NOEXCEPT
+  { return std::less<indirect_counter::counter_value_type *>()(lhs._count_ptr, rhs._count_ptr); }
+/* Hash by the tracked address, the same field operator== compares.  */
+#if __cplusplus >= 201103L
+template<>
+struct std::hash<indirect_counter>
+{
+  std::size_t operator()(const indirect_counter& ic) const noexcept
+    { return std::hash<indirect_counter::counter_value_type *>{}(ic._count_ptr); }
+};
+#endif
+
+/* Not a container, just a sanity check: a named object and a temporary must both balance out.  */
+bool automatic_lifetime_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	indirect_counter c = indirect_counter(&counter);
+	indirect_counter(static_cast<int*>(&counter));  /* Temporary; the cast keeps this an expression.  */
+      }
+      VERIFY (counter == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* Track live indirect_counter objects through std::vector fill/resize/push_back/pop_back on the device.  */
+bool vector_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	std::vector<indirect_counter> vec(42, indirect_counter(&counter));
+	VERIFY (counter == 42);  /* The fill-value temporary is already destroyed.  */
+	vec.resize(32, indirect_counter(&counter));
+	VERIFY (counter == 32);
+	vec.push_back(indirect_counter(&counter));
+	VERIFY (counter == 33);
+	vec.pop_back();
+	VERIFY (counter == 32);
+	vec.pop_back();
+	VERIFY (counter == 31);
+	vec.resize(100, indirect_counter(&counter));
+	VERIFY (counter == 100);
+      }
+      VERIFY (counter == 0);  /* Leaving the scope destroyed every element.  */
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* Track live indirect_counter objects through std::deque fill/resize/push_back/pop_back on the device.  */
+bool deque_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	std::deque<indirect_counter> deque(42, indirect_counter(&counter));
+	VERIFY (counter == 42);  /* The fill-value temporary is already destroyed.  */
+	deque.resize(32, indirect_counter(&counter));
+	VERIFY (counter == 32);
+	deque.push_back(indirect_counter(&counter));
+	VERIFY (counter == 33);
+	deque.pop_back();
+	VERIFY (counter == 32);
+	deque.pop_back();
+	VERIFY (counter == 31);
+	deque.resize(100, indirect_counter(&counter));
+	VERIFY (counter == 100);
+      }
+      VERIFY (counter == 0);  /* Leaving the scope destroyed every element.  */
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* Track live indirect_counter objects through std::list fill/resize/push_back/pop_back on the device.  */
+bool list_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	std::list<indirect_counter> list(42, indirect_counter(&counter));
+	VERIFY (counter == 42);  /* The fill-value temporary is already destroyed.  */
+	list.resize(32, indirect_counter(&counter));
+	VERIFY (counter == 32);
+	list.push_back(indirect_counter(&counter));
+	VERIFY (counter == 33);
+	list.pop_back();
+	VERIFY (counter == 32);
+	list.pop_back();
+	VERIFY (counter == 31);
+	list.resize(100, indirect_counter(&counter));
+	VERIFY (counter == 100);
+      }
+      VERIFY (counter == 0);  /* Leaving the scope destroyed every element.  */
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* std::map on the device: one live indirect_counter per distinct key, none after destruction.  */
+bool map_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	std::map<int, indirect_counter> map;
+	map.insert(std::make_pair(1, indirect_counter(&counter)));
+	VERIFY (counter == 1);
+	map.insert(std::make_pair(1, indirect_counter(&counter)));
+	VERIFY (counter == 1);  /* Key 1 already present: nothing new is stored.  */
+	map.insert(std::make_pair(2, indirect_counter(&counter)));
+	VERIFY (counter == 2);
+      }
+      VERIFY (counter == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* std::set on the device: one live element per distinct tracked counter, none after destruction.  */
+bool set_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter0 = 0;
+      int counter1 = 0;
+      {
+	std::set<indirect_counter> set;
+	set.insert(indirect_counter(&counter0));
+	VERIFY (counter0 == 1);
+	set.insert(indirect_counter(&counter0));
+	VERIFY (counter0 == 1);  /* Same tracked address: treated as a duplicate.  */
+	set.insert(indirect_counter(&counter1));
+	VERIFY (counter0 == 1 && counter1 == 1);
+      }
+      VERIFY (counter0 == 0 && counter1 == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* std::multimap on the device: every insert stores an element, duplicates of a key included.  */
+bool multimap_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	std::multimap<int, indirect_counter> multimap;
+	multimap.insert(std::make_pair(1, indirect_counter(&counter)));
+	VERIFY (counter == 1);
+	multimap.insert(std::make_pair(1, indirect_counter(&counter)));
+	VERIFY (counter == 2);  /* Second entry for key 1 is kept.  */
+	multimap.insert(std::make_pair(2, indirect_counter(&counter)));
+	VERIFY (counter == 3);
+      }
+      VERIFY (counter == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* std::multiset on the device: every insert stores an element, equal ones included.  */
+bool multiset_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter0 = 0;
+      int counter1 = 0;
+      {
+	std::multiset<indirect_counter> multiset;
+	multiset.insert(indirect_counter(&counter0));
+	VERIFY (counter0 == 1);
+	multiset.insert(indirect_counter(&counter0));
+	VERIFY (counter0 == 2);  /* Equal element is kept as well.  */
+	multiset.insert(indirect_counter(&counter1));
+	VERIFY (counter0 == 2 && counter1 == 1);
+      }
+      VERIFY (counter0 == 0 && counter1 == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+
+#if __cplusplus >= 201103L
+/* std::array on the device: ten copies of IC are alive alongside IC itself, none afterwards.  */
+bool array_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	indirect_counter ic(&counter);
+	std::array<indirect_counter, 10> array{ic, ic, ic, ic, ic,
+					       ic, ic, ic, ic, ic};
+	VERIFY (counter == 11);  /* 10 array elements plus ic.  */
+      }
+      VERIFY (counter == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* Track live indirect_counter objects through std::forward_list fill/resize/push_front/pop_front.  */
+bool forward_list_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	std::forward_list<indirect_counter> forward_list(42, indirect_counter(&counter));
+	VERIFY (counter == 42);  /* The fill-value temporary is already destroyed.  */
+	forward_list.resize(32, indirect_counter(&counter));
+	VERIFY (counter == 32);
+	forward_list.push_front(indirect_counter(&counter));
+	VERIFY (counter == 33);
+	forward_list.pop_front();
+	VERIFY (counter == 32);
+	forward_list.pop_front();
+	VERIFY (counter == 31);
+	forward_list.resize(100, indirect_counter(&counter));
+	VERIFY (counter == 100);
+      }
+      VERIFY (counter == 0);  /* Leaving the scope destroyed every element.  */
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* std::unordered_map on the device: one live indirect_counter per distinct key, none afterwards.  */
+bool unordered_map_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	std::unordered_map<int, indirect_counter> unordered_map;
+	unordered_map.insert({1, indirect_counter(&counter)});
+	VERIFY (counter == 1);
+	unordered_map.insert({1, indirect_counter(&counter)});
+	VERIFY (counter == 1);  /* Key 1 already present: nothing new is stored.  */
+	unordered_map.insert({2, indirect_counter(&counter)});
+	VERIFY (counter == 2);
+      }
+      VERIFY (counter == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* std::unordered_set on the device: one live element per distinct tracked counter, none afterwards.  */
+bool unordered_set_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter0 = 0;
+      int counter1 = 0;
+      {
+	std::unordered_set<indirect_counter> unordered_set;
+	unordered_set.insert(indirect_counter(&counter0));
+	VERIFY (counter0 == 1);
+	unordered_set.insert(indirect_counter(&counter0));
+	VERIFY (counter0 == 1);  /* Same tracked address: treated as a duplicate.  */
+	unordered_set.insert(indirect_counter(&counter1));
+	VERIFY (counter0 == 1 && counter1 == 1);
+      }
+      VERIFY (counter0 == 0 && counter1 == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* std::unordered_multimap on the device: every insert stores an element, duplicate keys included.  */
+bool unordered_multimap_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter = 0;
+      {
+	std::unordered_multimap<int, indirect_counter> unordered_multimap;
+	unordered_multimap.insert({1, indirect_counter(&counter)});
+	VERIFY (counter == 1);
+	unordered_multimap.insert({1, indirect_counter(&counter)});
+	VERIFY (counter == 2);  /* Second entry for key 1 is kept.  */
+	unordered_multimap.insert({2, indirect_counter(&counter)});
+	VERIFY (counter == 3);
+      }
+      VERIFY (counter == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+/* std::unordered_multiset on the device: every insert stores an element, equal ones included.  */
+bool unordered_multiset_test()
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      bool inner_ok = true;
+      int counter0 = 0;
+      int counter1 = 0;
+      {
+	std::unordered_multiset<indirect_counter> unordered_multiset;
+	unordered_multiset.insert(indirect_counter(&counter0));
+	VERIFY (counter0 == 1);
+	unordered_multiset.insert(indirect_counter(&counter0));
+	VERIFY (counter0 == 2);  /* Equal element is kept as well.  */
+	unordered_multiset.insert(indirect_counter(&counter1));
+	VERIFY (counter0 == 2 && counter1 == 1);
+      }
+      VERIFY (counter0 == 0 && counter1 == 0);
+      end:
+      ok = inner_ok;
+    }
+  return ok;
+}
+
+#else /* __cplusplus < 201103L: these containers do not exist, so the stubs report success.  */
+bool array_test() { return true; }
+bool forward_list_test() { return true; }
+bool unordered_map_test() { return true; }
+bool unordered_set_test() { return true; }
+bool unordered_multimap_test() { return true; }
+bool unordered_multiset_test() { return true; }
+#endif /* __cplusplus >= 201103L */
+/* Run every test up front (no short-circuiting), print one PASS/FAIL line each, exit 0 only if all passed.  */
+int main()
+{
+  const bool auto_res               = automatic_lifetime_test();
+  const bool vec_res                = vector_test();
+  const bool deque_res              = deque_test();
+  const bool list_res               = list_test();
+  const bool map_res                = map_test();
+  const bool set_res                = set_test();
+  const bool multimap_res           = multimap_test();
+  const bool multiset_res           = multiset_test();
+  const bool array_res              = array_test();
+  const bool forward_list_res       = forward_list_test();
+  const bool unordered_map_res      = unordered_map_test();
+  const bool unordered_set_res      = unordered_set_test();
+  const bool unordered_multimap_res = unordered_multimap_test();
+  const bool unordered_multiset_res = unordered_multiset_test();
+  std::printf("sanity check      : %s\n", auto_res               ? "PASS" : "FAIL");
+  std::printf("vector            : %s\n", vec_res                ? "PASS" : "FAIL");
+  std::printf("deque             : %s\n", deque_res              ? "PASS" : "FAIL");
+  std::printf("list              : %s\n", list_res               ? "PASS" : "FAIL");
+  std::printf("map               : %s\n", map_res                ? "PASS" : "FAIL");
+  std::printf("set               : %s\n", set_res                ? "PASS" : "FAIL");
+  std::printf("multimap          : %s\n", multimap_res           ? "PASS" : "FAIL");
+  std::printf("multiset          : %s\n", multiset_res           ? "PASS" : "FAIL");
+  std::printf("array             : %s\n", array_res              ? "PASS" : "FAIL");
+  std::printf("forward_list      : %s\n", forward_list_res       ? "PASS" : "FAIL");
+  std::printf("unordered_map     : %s\n", unordered_map_res      ? "PASS" : "FAIL");
+  std::printf("unordered_set     : %s\n", unordered_set_res      ? "PASS" : "FAIL");
+  std::printf("unordered_multimap: %s\n", unordered_multimap_res ? "PASS" : "FAIL");
+  std::printf("unordered_multiset: %s\n", unordered_multiset_res ? "PASS" : "FAIL");
+  const bool ok = auto_res
+		  && vec_res
+		  && deque_res
+		  && list_res
+		  && map_res
+		  && set_res
+		  && multimap_res
+		  && multiset_res
+		  && array_res
+		  && forward_list_res
+		  && unordered_map_res
+		  && unordered_set_res
+		  && unordered_multimap_res
+		  && unordered_multiset_res;
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-12.C b/libgomp/testsuite/libgomp.c++/target-flex-12.C
new file mode 100644
index 0000000..024fb73
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-12.C
@@ -0,0 +1,736 @@
+/* Populate with mapped data, validate, mutate, validate again.
+   The cases using sets do not mutate.
+   Note: Some of the code in here really sucks due to being made to be
+   compatible with C++98.  */
+
+#include <vector>
+#include <deque>
+#include <list>
+#include <set>
+#include <map>
+#if __cplusplus >= 201103L
+#include <array>
+#include <forward_list>
+#include <unordered_set>
+#include <unordered_map>
+#endif
+
+#include <limits>
+#include <iterator>
+
+#include "target-flex-common.h"
+/* Local enable_if, usable under C++98; presumably a stand-in for std::enable_if.  */
+template<bool B, class T = void>
+struct enable_if {};
+/* Only the 'true' case provides ::type.  */
+template<class T>
+struct enable_if<true, T> { typedef T type; };
+/* Function object that hands back its argument unchanged; used as the default projection.  */
+struct identity_func
+{
+#if __cplusplus < 201103L
+  template<typename T>
+  T& operator()(T& arg) const BL_NOEXCEPT { return arg; }
+  template<typename T>
+  T const& operator()(T const& arg) const BL_NOEXCEPT { return arg; }
+#else
+  template<typename T>
+  constexpr T&& operator()(T&& arg) const BL_NOEXCEPT { return std::forward<T>(arg); }
+#endif
+};
+
+/* Compare [begin0, end0) element-wise against PROJ applied to [begin1, end1).  */
+template<typename It0, typename It1, typename Proj>
+bool validate_sequential_elements(const It0 begin0, const It0 end0,
+				  const It1 begin1, const It1 end1,
+				  Proj proj) BL_NOEXCEPT
+{
+  It0 lhs = begin0;
+  It1 rhs = begin1;
+  while (lhs != end0)
+    {
+      /* Second range ran out first, or the elements differ: fail the test
+	 rather than aborting.  */
+      if (rhs == end1 || *lhs != proj(*rhs))
+	return false;
+      ++lhs;
+      ++rhs;
+    }
+  /* Both ranges must be exhausted together; leftovers in the second range
+     mean the sizes differ.  */
+  return rhs == end1;
+}
+/* Convenience overload: compare the two ranges directly (identity projection).  */
+template<typename It0, typename It1>
+bool validate_sequential_elements(const It0 begin0, const It0 end0,
+				  const It1 begin1, const It1 end1) BL_NOEXCEPT
+{
+  return validate_sequential_elements(begin0, end0, begin1, end1, identity_func());
+}
+/* Element-wise copy of [begin, end) into OUT.  Inefficient, but simple.  */
+template<typename It, typename OutIt>
+void simple_copy(const It begin, const It end, OutIt out) BL_NOEXCEPT
+{
+  It src = begin;
+  while (src != end)
+    *out++ = *src++;
+}
+/* Replace every element of [begin, end) with MUT_FN applied to it.  */
+template<typename It, typename MutateFn>
+void simple_mutate(const It begin, const It end, MutateFn mut_fn) BL_NOEXCEPT
+{
+  for (It it = begin; it != end; ++it)
+    *it = mut_fn(*it);
+}
+/* std::vector round trip: validate and mutate on the device, then re-validate the copies on the host.  */
+template<typename MutationFunc, typename T, std::size_t Size>
+bool vector_test(const T (&arr)[Size])
+{
+  bool ok;
+  T out_arr[Size];
+  T out_mut_arr[Size];
+  #pragma omp target map(from: ok, out_arr[:Size], out_mut_arr[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::vector<T> vector(arr, arr + Size);
+	VERIFY (validate_sequential_elements(vector.begin(), vector.end(),
+					     arr, arr + Size));
+	simple_copy(vector.begin(), vector.end(), out_arr);
+	simple_mutate(vector.begin(), vector.end(), MutationFunc());
+	VERIFY (validate_sequential_elements(vector.begin(), vector.end(),
+					     arr, arr + Size, MutationFunc()));
+	simple_copy(vector.begin(), vector.end(), out_mut_arr);
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The output arrays may not have been filled in this case.  */
+    return false;
+  VERIFY_NON_TARGET (validate_sequential_elements(out_arr, out_arr + Size,
+						  arr, arr + Size));
+  VERIFY_NON_TARGET (validate_sequential_elements(out_mut_arr, out_mut_arr + Size,
+						  arr, arr + Size, MutationFunc()));
+  return true;
+}
+/* std::deque round trip: validate and mutate on the device, then re-validate the copies on the host.  */
+template<typename MutationFunc, typename T, std::size_t Size>
+bool deque_test(const T (&arr)[Size])
+{
+  bool ok;
+  T out_arr[Size];
+  T out_mut_arr[Size];
+  #pragma omp target map(from: ok, out_arr[:Size], out_mut_arr[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::deque<T> deque(arr, arr + Size);
+	VERIFY (validate_sequential_elements(deque.begin(), deque.end(),
+					     arr, arr + Size));
+	simple_copy(deque.begin(), deque.end(), out_arr);
+	simple_mutate(deque.begin(), deque.end(), MutationFunc());
+	VERIFY (validate_sequential_elements(deque.begin(), deque.end(),
+					     arr, arr + Size, MutationFunc()));
+	simple_copy(deque.begin(), deque.end(), out_mut_arr);
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The output arrays may not have been filled in this case.  */
+    return false;
+  VERIFY_NON_TARGET (validate_sequential_elements(out_arr, out_arr + Size,
+						  arr, arr + Size));
+  VERIFY_NON_TARGET (validate_sequential_elements(out_mut_arr, out_mut_arr + Size,
+						  arr, arr + Size, MutationFunc()));
+  return true;
+}
+/* std::list round trip: validate and mutate on the device, then re-validate the copies on the host.  */
+template<typename MutationFunc, typename T, std::size_t Size>
+bool list_test(const T (&arr)[Size])
+{
+  bool ok;
+  T out_arr[Size];
+  T out_mut_arr[Size];
+  #pragma omp target map(from: ok, out_arr[:Size], out_mut_arr[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::list<T> list(arr, arr + Size);
+	VERIFY (validate_sequential_elements(list.begin(), list.end(),
+					     arr, arr + Size));
+	simple_copy(list.begin(), list.end(), out_arr);
+	simple_mutate(list.begin(), list.end(), MutationFunc());
+	VERIFY (validate_sequential_elements(list.begin(), list.end(),
+					     arr, arr + Size, MutationFunc()));
+	simple_copy(list.begin(), list.end(), out_mut_arr);
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The output arrays may not have been filled in this case.  */
+    return false;
+  VERIFY_NON_TARGET (validate_sequential_elements(out_arr, out_arr + Size,
+						  arr, arr + Size));
+  VERIFY_NON_TARGET (validate_sequential_elements(out_mut_arr, out_mut_arr + Size,
+						  arr, arr + Size, MutationFunc()));
+  return true;
+}
+/* Key/value accessors: a plain element is its own key and value, a pair splits into first/second.  */
+template<typename T>
+const T& get_key(const T& arg) BL_NOEXCEPT
+  { return arg; }
+template<typename K, typename V>
+const K& get_key(const std::pair<K, V>& pair) BL_NOEXCEPT
+  { return pair.first; }
+template<typename T>
+const T& get_value(const T& arg) BL_NOEXCEPT
+  { return arg; }
+template<typename K, typename V>
+const V& get_value(const std::pair<K, V>& pair) BL_NOEXCEPT  /* V, not K: .second is the mapped value.  */
+  { return pair.second; }
+/* Maps an element type to its key type: T itself, or K for std::pair<K, V>.  */
+template<typename T>
+struct key_type { typedef T type; };
+template<typename K, typename V>
+struct key_type<std::pair<K, V> > { typedef K type; };
+/* Each element of [compare_begin, compare_end) must be found in CONTAINER by key, with value == PROJ(its value).  */
+template<typename Proj, typename Container, typename It>
+bool validate_associative(const Container& container,
+			  const It compare_begin,
+			  const It compare_end,
+			  Proj proj) BL_NOEXCEPT
+{
+  const typename Container::const_iterator elem_end = container.end();
+  for (It compare_it = compare_begin; compare_it != compare_end; ++compare_it)
+    {
+      const typename Container::const_iterator elem_it = container.find(get_key(*compare_it));
+      VERIFY_NON_TARGET (elem_it != elem_end);  /* Key must be present before *elem_it is read.  */
+      VERIFY_NON_TARGET (proj(get_value(*compare_it)) == get_value(*elem_it));
+    }
+  return true;
+}
+/* Convenience overload: compare values directly (identity projection).  */
+template<typename Container, typename It>
+bool validate_associative(const Container& container,
+			  const It compare_begin,
+			  const It compare_end) BL_NOEXCEPT
+{
+  return validate_associative(container, compare_begin, compare_end, identity_func());
+}
+/* Replace the mapped value (->second) of every element in [begin, end) with MUT_FN of it.  */
+template<typename It, typename MutateFn>
+void simple_mutate_map(const It begin, const It end, MutateFn mut_fn) BL_NOEXCEPT
+{
+  for (It it = begin; it != end; ++it)
+    it->second = mut_fn(it->second);
+}
+/* Copy to OUT the first element seen for each distinct key in [begin, end).  */
+template<typename It, typename OutIter>
+void simple_copy_unique(const It begin, const It end, OutIter out) BL_NOEXCEPT
+{
+  /* In case anyone reads this, I want it to be known that I hate c++98.  */
+  typedef typename key_type<typename std::iterator_traits<It>::value_type>::type key_t;
+  std::set<key_t> already_seen;
+  for (It it = begin; it != end; ++it)
+    {
+      /* insert() reports via .second whether the key was new.  Advance OUT
+	 only on a write, so an array destination gets no gaps for skipped
+	 duplicates.  */
+      if (already_seen.insert(get_key(*it)).second)
+	*out++ = *it;
+    }
+}
+/* std::map round trip: validate and mutate on the device, then check the copies against a host reference map.  */
+template<typename MutationFunc, typename K, typename V, std::size_t Size>
+bool map_test(const std::pair<K, V> (&arr)[Size])
+{
+  std::map<K, V> reference_map(arr, arr + Size);
+  bool ok;
+  /* Both sizes should be the same.  */
+  std::pair<K, V> out_pairs[Size];
+  std::size_t out_size;
+  std::pair<K, V> out_pairs_mut[Size];
+  std::size_t out_size_mut;
+  #pragma omp target map(from: ok, out_pairs[:Size], out_size, \
+			       out_pairs_mut[:Size], out_size_mut) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::vector<std::pair<K, V> > unique_elems;
+	simple_copy_unique(arr, arr + Size,
+			   std::back_insert_iterator<std::vector<std::pair<K, V> > >(unique_elems));
+
+	std::map<K, V> map(arr, arr + Size);
+	VERIFY (validate_associative(map, unique_elems.begin(), unique_elems.end()));
+	simple_copy(map.begin(), map.end(), out_pairs);
+	out_size = map.size();
+	simple_mutate_map(map.begin(), map.end(), MutationFunc());
+	VERIFY (validate_associative(map, unique_elems.begin(), unique_elems.end(),
+				     MutationFunc()));
+	simple_copy(map.begin(), map.end(), out_pairs_mut);
+	out_size_mut = map.size();
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The outputs and sizes may not have been written in this case.  */
+    return false;
+  VERIFY_NON_TARGET (out_size == out_size_mut);
+  VERIFY_NON_TARGET (validate_associative(reference_map,
+					  out_pairs, out_pairs + out_size));
+  simple_mutate_map(reference_map.begin(), reference_map.end(), MutationFunc());
+  VERIFY_NON_TARGET (validate_associative(reference_map,
+					  out_pairs_mut, out_pairs_mut + out_size_mut));
+  return true;
+}
+/* std::set round trip: validate on the device, then check the copied-out elements against a host reference set.  */
+template<typename T, std::size_t Size>
+bool set_test(const T (&arr)[Size])
+{
+  std::set<T> reference_set(arr, arr + Size);
+  bool ok;
+  /* OUT_SIZE is how many elements the device-side set held; at most Size.  */
+  T out_arr[Size];
+  std::size_t out_size;
+  #pragma omp target map(from: ok, out_arr[:Size], out_size) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::vector<T> unique_elems;
+	simple_copy_unique(arr, arr + Size,
+			   std::back_insert_iterator<std::vector<T> >(unique_elems));
+
+	std::set<T> set(arr, arr + Size);
+	VERIFY (validate_associative(set, unique_elems.begin(), unique_elems.end()));
+	simple_copy(set.begin(), set.end(), out_arr);
+	out_size = set.size();
+	/* Sets can't be mutated, we could create another set with mutated
+	   but it gets a little annoying and probably isn't an interesting test.  */
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The outputs may not have been written in this case.  */
+    return false;
+  VERIFY_NON_TARGET (validate_associative(reference_set,
+					  out_arr, out_arr + out_size));
+  return true;
+}
+/* For each element of [compare_begin, compare_end): CONTAINER must hold its key with the
+   same multiplicity, and PROJ of its value among the entries stored under that key.  */
+template<typename Proj, typename Container, typename It>
+bool validate_multi_associative(const Container& container,
+				const It compare_begin,
+				const It compare_end,
+				Proj proj) BL_NOEXCEPT
+{
+  /* Once again, for the poor soul reviewing these, I hate c++98.  */
+  typedef typename key_type<typename std::iterator_traits<It>::value_type>::type key_t;
+  typedef std::map<key_t, std::size_t> counter_map;
+  typedef typename Container::const_iterator elem_iterator;
+  /* Tally how often each key occurs in the comparison range; operator[]
+     value-initializes a missing count to zero.  */
+  counter_map key_count_map;
+  for (It it = compare_begin; it != compare_end; ++it)
+    ++key_count_map[get_key(*it)];
+  /* Now check every compared element against CONTAINER.  */
+  for (It compare_it = compare_begin; compare_it != compare_end; ++compare_it)
+    {
+      const key_t& key = get_key(*compare_it);
+      typename counter_map::iterator count_it = key_count_map.find(key);
+      std::size_t key_count = count_it != key_count_map.end() ? count_it->second
+							      : std::size_t(0);
+      VERIFY_NON_TARGET (key_count > std::size_t(0) && "this will never happen");
+      /* This gets tested multiple times but that should be fine.  */
+      VERIFY_NON_TARGET (key_count == container.count(key));
+      /* Only search the entries stored under KEY: scanning from find() to
+	 end() would also accept a matching value that belongs to a
+	 different key.  */
+      const std::pair<elem_iterator, elem_iterator> range
+	= container.equal_range(key);
+      /* This will never happen if the previous case passed.  */
+      VERIFY_NON_TARGET (range.first != range.second);
+      bool found_element = false;
+      for (elem_iterator elem_it = range.first; elem_it != range.second; ++elem_it)
+	if (proj(get_value(*compare_it)) == get_value(*elem_it))
+	  {
+	    found_element = true;
+	    break;
+	  }
+      VERIFY_NON_TARGET (found_element);
+    }
+  return true;
+}
+/* Convenience overload: compare values directly (identity projection).  */
+template<typename Container, typename It>
+bool validate_multi_associative(const Container& container,
+				const It compare_begin,
+				const It compare_end) BL_NOEXCEPT
+{
+  return validate_multi_associative(container, compare_begin, compare_end, identity_func());
+}
+/* std::multimap round trip: validate and mutate on the device, then check the copies against a host reference.  */
+template<typename MutationFunc, typename K, typename V, std::size_t Size>
+bool multimap_test(const std::pair<K, V> (&arr)[Size])
+{
+  std::multimap<K, V> reference_multimap(arr, arr + Size);
+  bool ok;
+  std::pair<K, V> out_pairs[Size];
+  std::pair<K, V> out_pairs_mut[Size];
+  #pragma omp target map(from: ok, out_pairs[:Size], out_pairs_mut[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::multimap<K, V> multimap(arr, arr + Size);
+	VERIFY (validate_multi_associative(multimap, arr, arr + Size));
+	simple_copy(multimap.begin(), multimap.end(), out_pairs);
+	simple_mutate_map(multimap.begin(), multimap.end(), MutationFunc());
+	VERIFY (validate_multi_associative(multimap, arr, arr + Size, MutationFunc()));
+	simple_copy(multimap.begin(), multimap.end(), out_pairs_mut);
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The output arrays may not have been filled in this case.  */
+    return false;
+  VERIFY_NON_TARGET (validate_multi_associative(reference_multimap,
+						out_pairs, out_pairs + Size));
+  simple_mutate_map(reference_multimap.begin(), reference_multimap.end(), MutationFunc());
+  VERIFY_NON_TARGET (validate_multi_associative(reference_multimap,
+						out_pairs_mut, out_pairs_mut + Size));
+  return true;
+}
+/* std::multiset round trip: validate on the device, then check the copied-out elements against a host reference.  */
+template<typename T, std::size_t Size>
+bool multiset_test(const T (&arr)[Size])
+{
+  std::multiset<T> reference_multiset(arr, arr + Size);
+  bool ok;
+  T out_arr[Size];
+  #pragma omp target map(from: ok, out_arr[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::multiset<T> set(arr, arr + Size);
+	VERIFY (validate_multi_associative(set, arr, arr + Size));
+	simple_copy(set.begin(), set.end(), out_arr);
+	/* Sets can't be mutated, we could create another set with mutated
+	   but it gets a little annoying and probably isn't an interesting test.  */
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The output array may not have been filled in this case.  */
+    return false;
+  VERIFY_NON_TARGET (validate_multi_associative(reference_multiset,
+						out_arr, out_arr + Size));
+  return true;
+}
+
+#if __cplusplus >= 201103L
+/* std::array round trip: check value-initialization, fill from ARR, validate, mutate, re-validate on the host.  */
+template<typename MutationFunc, typename T, std::size_t Size>
+bool array_test(const T (&arr)[Size])
+{
+  bool ok;
+  T out_arr[Size];
+  T out_mut_arr[Size];
+  #pragma omp target map(from: ok, out_arr[:Size], out_mut_arr[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::array<T, Size> std_array{};
+	/* Special case for std::array since it can't be initialized
+	   with iterators.  */
+	{
+	  T zero_val = T{};
+	  for (auto it = std_array.begin(); it != std_array.end(); ++it)
+	    VERIFY (*it == zero_val);
+	}
+	simple_copy(arr, arr + Size, std_array.begin());
+	VERIFY (validate_sequential_elements(std_array.begin(), std_array.end(),
+					     arr, arr + Size));
+	simple_copy(std_array.begin(), std_array.end(), out_arr);
+	simple_mutate(std_array.begin(), std_array.end(), MutationFunc());
+	VERIFY (validate_sequential_elements(std_array.begin(), std_array.end(),
+					     arr, arr + Size, MutationFunc()));
+	simple_copy(std_array.begin(), std_array.end(), out_mut_arr);
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The output arrays may not have been filled in this case.  */
+    return false;
+  VERIFY_NON_TARGET (validate_sequential_elements(out_arr, out_arr + Size,
+						  arr, arr + Size));
+  VERIFY_NON_TARGET (validate_sequential_elements(out_mut_arr, out_mut_arr + Size,
+						  arr, arr + Size, MutationFunc()));
+  return true;
+}
+/* std::forward_list round trip: validate and mutate on the device, then re-validate the copies on the host.  */
+template<typename MutationFunc, typename T, std::size_t Size>
+bool forward_list_test(const T (&arr)[Size])
+{
+  bool ok;
+  T out_arr[Size];
+  T out_mut_arr[Size];
+  #pragma omp target map(from: ok, out_arr[:Size], out_mut_arr[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::forward_list<T> fwd_list(arr, arr + Size);
+	VERIFY (validate_sequential_elements(fwd_list.begin(), fwd_list.end(),
+					     arr, arr + Size));
+	simple_copy(fwd_list.begin(), fwd_list.end(), out_arr);
+	simple_mutate(fwd_list.begin(), fwd_list.end(), MutationFunc());
+	VERIFY (validate_sequential_elements(fwd_list.begin(), fwd_list.end(),
+					     arr, arr + Size, MutationFunc()));
+	simple_copy(fwd_list.begin(), fwd_list.end(), out_mut_arr);
+      }
+      end:
+      ok = inner_ok;
+    }
+  if (!ok)  /* The output arrays may not have been filled in this case.  */
+    return false;
+  VERIFY_NON_TARGET (validate_sequential_elements(out_arr, out_arr + Size,
+						  arr, arr + Size));
+  VERIFY_NON_TARGET (validate_sequential_elements(out_mut_arr, out_mut_arr + Size,
+						  arr, arr + Size, MutationFunc()));
+  return true;
+}
+
+template<typename MutationFunc, typename K, typename V, std::size_t Size>
+bool unordered_map_test(const std::pair<K, V> (&arr)[Size])
+{ /* Exercise std::unordered_map<K, V> inside an OpenMP target region.  */
+  std::unordered_map<K, V> reference_map(arr, arr + Size); /* Host-built map the device output is compared against.  */
+  bool ok;
+  /* out_size and out_size_mut are checked for equality after the region.  */
+  std::pair<K, V> out_pairs[Size];
+  std::size_t out_size;
+  std::pair<K, V> out_pairs_mut[Size];
+  std::size_t out_size_mut;
+  #pragma omp target map(from: ok, out_pairs[:Size], out_size, \
+			       out_pairs_mut[:Size], out_size_mut) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::vector<std::pair<K, V> > unique_elems;
+	simple_copy_unique(arr, arr + Size,
+			   std::back_insert_iterator<std::vector<std::pair<K, V> > >(unique_elems));
+
+	std::unordered_map<K, V> map(arr, arr + Size);
+	VERIFY (validate_associative(map, unique_elems.begin(), unique_elems.end()));
+	simple_copy(map.begin(), map.end(), out_pairs);
+	out_size = map.size(); /* Number of entries written to out_pairs.  */
+	simple_mutate_map(map.begin(), map.end(), MutationFunc());
+	VERIFY (validate_associative(map, unique_elems.begin(), unique_elems.end(),
+				     MutationFunc()));
+	simple_copy(map.begin(), map.end(), out_pairs_mut);
+	out_size_mut = map.size(); /* Number of entries written to out_pairs_mut.  */
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in the macro definition.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; out_size/out_size_mut may be unset.  */
+    return false;
+  VERIFY_NON_TARGET (out_size == out_size_mut);
+  VERIFY_NON_TARGET (validate_associative(reference_map,
+					  out_pairs, out_pairs + out_size));
+  simple_mutate_map(reference_map.begin(), reference_map.end(), MutationFunc()); /* Apply the same mutation on the host.  */
+  VERIFY_NON_TARGET (validate_associative(reference_map,
+					  out_pairs_mut, out_pairs_mut + out_size_mut));
+  return true;
+}
+
+template<typename T, std::size_t Size>
+bool unordered_set_test(const T (&arr)[Size])
+{ /* Exercise std::unordered_set<T> inside an OpenMP target region.  */
+  std::unordered_set<T> reference_set(arr, arr + Size); /* Host-built set the device output is compared against.  */
+  bool ok;
+  /* out_size is the number of elements the device set copied to out_arr.  */
+  T out_arr[Size];
+  std::size_t out_size;
+  #pragma omp target map(from: ok, out_arr[:Size], out_size) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::vector<T> unique_elems;
+	simple_copy_unique(arr, arr + Size,
+			   std::back_insert_iterator<std::vector<T> >(unique_elems));
+
+	std::unordered_set<T> set(arr, arr + Size);
+	VERIFY (validate_associative(set, unique_elems.begin(), unique_elems.end()));
+	simple_copy(set.begin(), set.end(), out_arr);
+	out_size = set.size();
+	/* Set elements can't be mutated in place.  We could build a second set
+	   from mutated elements, but that probably isn't an interesting test.  */
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in the macro definition.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; out_size may be unset.  */
+    return false;
+  VERIFY_NON_TARGET (validate_associative(reference_set,
+					  out_arr, out_arr + out_size));
+  return true;
+}
+
+template<typename MutationFunc, typename K, typename V, std::size_t Size>
+bool unordered_multimap_test(const std::pair<K, V> (&arr)[Size])
+{ /* Exercise std::unordered_multimap<K, V> inside an OpenMP target region.  */
+  std::unordered_multimap<K, V> reference_multimap(arr, arr + Size); /* Host-built container the device output is compared against.  */
+  bool ok;
+  std::pair<K, V> out_pairs[Size]; /* Device-side contents before mutation.  */
+  std::pair<K, V> out_pairs_mut[Size]; /* Device-side contents after mutation.  */
+  #pragma omp target map(from: ok, out_pairs[:Size], out_pairs_mut[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::unordered_multimap<K, V> multimap(arr, arr + Size);
+	VERIFY (validate_multi_associative(multimap, arr, arr + Size));
+	simple_copy(multimap.begin(), multimap.end(), out_pairs);
+	simple_mutate_map(multimap.begin(), multimap.end(), MutationFunc());
+	VERIFY (validate_multi_associative(multimap, arr, arr + Size, MutationFunc()));
+	simple_copy(multimap.begin(), multimap.end(), out_pairs_mut);
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in the macro definition.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; skip the host-side checks.  */
+    return false;
+  VERIFY_NON_TARGET (validate_multi_associative(reference_multimap,
+						out_pairs, out_pairs + Size));
+  simple_mutate_map(reference_multimap.begin(), reference_multimap.end(), MutationFunc()); /* Apply the same mutation on the host.  */
+  VERIFY_NON_TARGET (validate_multi_associative(reference_multimap,
+						out_pairs_mut, out_pairs_mut + Size));
+  return true;
+}
+
+template<typename T, std::size_t Size>
+bool unordered_multiset_test(const T (&arr)[Size])
+{ /* Exercise std::unordered_multiset<T> inside an OpenMP target region.  */
+  std::unordered_multiset<T> reference_multiset(arr, arr + Size); /* Host-built container the device output is compared against.  */
+  bool ok;
+  T out_arr[Size]; /* Device-side contents of the multiset.  */
+  #pragma omp target map(from: ok, out_arr[:Size]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::unordered_multiset<T> set(arr, arr + Size);
+	VERIFY (validate_multi_associative(set, arr, arr + Size));
+	simple_copy(set.begin(), set.end(), out_arr);
+	/* Set elements can't be mutated in place.  We could build a second set
+	   from mutated elements, but that probably isn't an interesting test.  */
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in the macro definition.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; skip the host-side checks.  */
+    return false;
+  VERIFY_NON_TARGET (validate_multi_associative(reference_multiset,
+						out_arr, out_arr + Size));
+  return true;
+}
+
+#else /* __cplusplus < 201103L: stubs for the C++11-only container tests; each simply reports success.  */
+template<typename, typename T, std::size_t Size> bool array_test(const T (&arr)[Size]) { return true; }
+template<typename, typename T, std::size_t Size> bool forward_list_test(const T (&arr)[Size]) { return true; }
+template<typename, typename T, std::size_t Size> bool unordered_map_test(const T (&arr)[Size]) { return true; }
+template<typename T, std::size_t Size> bool unordered_set_test(const T (&arr)[Size]) { return true; }
+template<typename, typename T, std::size_t Size> bool unordered_multimap_test(const T (&arr)[Size]) { return true; }
+template<typename T, std::size_t Size> bool unordered_multiset_test(const T (&arr)[Size]) { return true; }
+#endif /* __cplusplus >= 201103L */
+
+/* Doubles its argument.  When std::numeric_limits is specialized for T the
+   result is clamped to the representable range instead of overflowing.  */
+struct multiply_by_2
+{
+  template<typename T>
+  typename enable_if<std::numeric_limits<T>::is_specialized, T>::type
+  operator()(T arg) const BL_NOEXCEPT {
+    const T hi = std::numeric_limits<T>::max();
+    /* min() is the smallest positive value for floating-point types, so
+       the most negative value is -max() there.  */
+    const T lo = std::numeric_limits<T>::is_integer
+		 ? std::numeric_limits<T>::min() : -hi;
+    const bool neg = arg < static_cast<T>(0);
+    if (neg && lo / static_cast<T>(2) >= arg)
+      return lo; /* Doubling would go below the lowest value.  */
+    if (!neg && hi / static_cast<T>(2) <= arg)
+      return hi; /* Doubling would reach or exceed the highest value.  */
+    return arg * 2;
+  }
+  template<typename T>
+  typename enable_if<!std::numeric_limits<T>::is_specialized, T>::type
+  operator()(T arg) const BL_NOEXCEPT { /* No numeric_limits: no clamping.  */
+    return arg * 2;
+  }
+};
+
+int main()
+{ /* Run every container test, print one PASS/FAIL line each, exit 0 only if all passed.  */
+  int data[8] = {0, 1, 2, 3, 4, 5, 6, 7}; /* Input for the sequence and set tests.  */
+  std::pair<int, int> pairs[10] = {std::pair<int, int>( 1,  2), /* Input for the map tests.  */
+				   std::pair<int, int>( 2,  4),
+				   std::pair<int, int>( 3,  6),
+				   std::pair<int, int>( 4,  8),
+				   std::pair<int, int>( 5, 10),
+				   std::pair<int, int>( 6, 12),
+				   std::pair<int, int>( 7, 14),
+				   std::pair<int, int>( 8, 16),
+				   std::pair<int, int>( 9, 18),
+				   std::pair<int, int>(10, 20)};
+  const bool vec_res                = vector_test<multiply_by_2>(data);
+  const bool deque_res              = deque_test<multiply_by_2>(data);
+  const bool list_res               = list_test<multiply_by_2>(data);
+  const bool map_res                = map_test<multiply_by_2>(pairs);
+  const bool set_res                = set_test(data);
+  const bool multimap_res           = multimap_test<multiply_by_2>(pairs);
+  const bool multiset_res           = multiset_test(data);
+  const bool array_res              = array_test<multiply_by_2>(data);
+  const bool forward_list_res       = forward_list_test<multiply_by_2>(data);
+  const bool unordered_map_res      = unordered_map_test<multiply_by_2>(pairs);
+  const bool unordered_set_res      = unordered_set_test(data);
+  const bool unordered_multimap_res = unordered_multimap_test<multiply_by_2>(pairs);
+  const bool unordered_multiset_res = unordered_multiset_test(data);
+  std::printf("vector            : %s\n", vec_res                ? "PASS" : "FAIL");
+  std::printf("deque             : %s\n", deque_res              ? "PASS" : "FAIL");
+  std::printf("list              : %s\n", list_res               ? "PASS" : "FAIL");
+  std::printf("map               : %s\n", map_res                ? "PASS" : "FAIL");
+  std::printf("set               : %s\n", set_res                ? "PASS" : "FAIL");
+  std::printf("multimap          : %s\n", multimap_res           ? "PASS" : "FAIL");
+  std::printf("multiset          : %s\n", multiset_res           ? "PASS" : "FAIL");
+  std::printf("array             : %s\n", array_res              ? "PASS" : "FAIL");
+  std::printf("forward_list      : %s\n", forward_list_res       ? "PASS" : "FAIL");
+  std::printf("unordered_map     : %s\n", unordered_map_res      ? "PASS" : "FAIL");
+  std::printf("unordered_set     : %s\n", unordered_set_res      ? "PASS" : "FAIL");
+  std::printf("unordered_multimap: %s\n", unordered_multimap_res ? "PASS" : "FAIL");
+  std::printf("unordered_multiset: %s\n", unordered_multiset_res ? "PASS" : "FAIL");
+  const bool ok = vec_res /* All tests have already run above, so every result line is printed even after a failure.  */
+		  && deque_res
+		  && list_res
+		  && map_res
+		  && set_res
+		  && multimap_res
+		  && multiset_res
+		  && array_res
+		  && forward_list_res
+		  && unordered_map_res
+		  && unordered_set_res
+		  && unordered_multimap_res
+		  && unordered_multiset_res;
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-2000.C b/libgomp/testsuite/libgomp.c++/target-flex-2000.C
new file mode 100644
index 0000000..688c014
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-2000.C
@@ -0,0 +1,32 @@
+/* Tiny tuple test.  */
+
+#include <tuple>
+
+#include "target-flex-common.h"
+
+bool test(int arg)
+{ /* Map a std::tuple into a target region and read two of its elements there.  */
+  bool ok;
+  int out; /* Receives std::get<1>(tup) from the device.  */
+  std::tuple tup = {'a', arg, 3.14f}; /* Element types are deduced (CTAD).  */
+  #pragma omp target map(from: ok, out) map(to: tup)
+    {
+      bool inner_ok = true;
+      {
+	VERIFY (std::get<0>(tup) == 'a');
+	out = std::get<1>(tup);
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; OUT may be unset.  */
+    return false;
+  VERIFY_NON_TARGET (out == arg);
+  return true;
+}
+
+int main()
+{ /* Exit status 0 when test() succeeds, 1 otherwise.  */
+  volatile int arg = 42; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  return test(arg) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-2001.C b/libgomp/testsuite/libgomp.c++/target-flex-2001.C
new file mode 100644
index 0000000..f1a6c12
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-2001.C
@@ -0,0 +1,61 @@
+/* { dg-additional-options "-std=c++20" } */
+
+/* Functional  */
+
+#include <functional>
+#include <utility>
+
+#include "target-flex-common.h"
+
+template<typename T,typename Fn>
+auto invoke_unary(T&& a, Fn&& fn) noexcept
+{ /* Perfect-forward FN and A to std::invoke and return its result.  */
+  return std::invoke(std::forward<Fn>(fn),
+		     std::forward<T>(a));
+}
+
+template<typename T, typename U, typename Fn>
+auto invoke_binary(T&& a, U&& b, Fn&& fn) noexcept
+{ /* Perfect-forward FN, A and B to std::invoke and return its result.  */
+  return std::invoke(std::forward<Fn>(fn),
+		     std::forward<T>(a),
+		     std::forward<U>(b));
+}
+
+bool test(unsigned arg)
+{ /* Use <functional> function objects and binders inside a target region.  */
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arg)
+    {
+      bool inner_ok = true;
+      {
+	VERIFY (std::plus{}(arg, 2) == arg + 2);
+	auto bound_plus_arg = std::bind_front(std::plus{}, arg);
+	VERIFY (bound_plus_arg(10) == arg + 10);
+	VERIFY (bound_plus_arg(20) == arg + 20);
+
+	VERIFY (std::not_fn(std::not_equal_to{})(arg, arg)); /* not_fn(not_equal_to) acts as an equality test.  */
+	VERIFY (invoke_binary(arg, arg, std::not_fn(std::not_equal_to{})));
+	auto bound_equals_arg = std::bind_front(std::not_fn(std::not_equal_to{}), arg);
+	VERIFY (bound_equals_arg(arg));
+	VERIFY (std::not_fn(bound_equals_arg)(arg + 1));
+	VERIFY (invoke_unary(arg, bound_equals_arg));
+
+	VERIFY (std::not_fn(std::ranges::not_equal_to{})(arg, arg)); /* Same checks with the std::ranges comparator.  */
+	VERIFY (invoke_binary(arg, arg, std::not_fn(std::ranges::not_equal_to{})));
+	auto bound_ranges_equals_arg = std::bind_front(std::not_fn(std::ranges::not_equal_to{}), arg);
+	VERIFY (bound_ranges_equals_arg(arg));
+	VERIFY (std::not_fn(bound_ranges_equals_arg)(arg + 1));
+	VERIFY (invoke_unary(arg, bound_ranges_equals_arg));
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  return ok; /* No host-side checks; the device result is the test result.  */
+}
+
+int main()
+{ /* Exit status 0 when test() succeeds, 1 otherwise.  */
+  volatile unsigned arg = 42u; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  return test(arg) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-2002.C b/libgomp/testsuite/libgomp.c++/target-flex-2002.C
new file mode 100644
index 0000000..f738806
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-2002.C
@@ -0,0 +1,97 @@
+/* { dg-additional-options "-std=c++23" } */
+
+/* expected/optional  */
+
+#include <optional>
+#include <expected>
+
+#include "target-flex-common.h"
+
+std::optional<unsigned> make_optional(bool b, unsigned arg = 0u) noexcept
+{ /* Return an optional holding ARG when B is true, otherwise an empty one.  */
+  if (!b)
+    return std::nullopt;
+  return {arg};
+}
+
+bool test_optional(unsigned arg)
+{ /* Use std::optional, including or_else/transform, inside a target region.  */
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arg)
+    {
+      bool inner_ok = true;
+      {
+	auto null_opt = make_optional(false); /* Empty optional.  */
+	VERIFY (!null_opt);
+	VERIFY (!null_opt.has_value());
+	VERIFY (null_opt.value_or(arg * 2u) == arg * 2u);
+	VERIFY (null_opt.or_else([&](){ return std::optional<unsigned>{arg}; })
+			.transform([](int a){ return a * 2u; })
+			.value_or(0) == arg * 2u);
+
+	auto opt = make_optional(true, arg); /* Optional holding ARG.  */
+	VERIFY (opt);
+	VERIFY (opt.has_value());
+	VERIFY (opt.value() == arg);
+	VERIFY (*opt == arg);
+	VERIFY (opt.value_or(arg + 42) == arg);
+	VERIFY (opt.or_else([&](){ return std::optional<unsigned>{arg + 42}; })
+		   .transform([](int a){ return a * 2u; })
+		   .value_or(0) == arg * 2u);
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  return ok; /* No host-side checks; the device result is the test result.  */
+}
+
+struct my_error
+{ /* Error type used as the E of std::expected<unsigned, my_error>.  */
+  int _e; /* Error code; make_expected uses -1.  */
+};
+
+std::expected<unsigned, my_error> make_expected(bool b, unsigned arg = 0u) noexcept
+{ /* Return ARG as the expected value when B is true, otherwise my_error{-1}.  */
+  if (!b)
+    return std::unexpected{my_error{-1}};
+  return {arg};
+}
+
+bool test_expected(unsigned arg)
+{ /* Use std::expected, including or_else/transform, inside a target region.  */
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arg)
+    {
+      bool inner_ok = true;
+      {
+	auto unexpected = make_expected(false); /* Holds my_error{-1}.  */
+	VERIFY (!unexpected);
+	VERIFY (!unexpected.has_value());
+	VERIFY (unexpected.error()._e == -1);
+	VERIFY (unexpected.value_or(arg * 2u) == arg * 2u);
+	VERIFY (unexpected.or_else([&](my_error e){ return std::expected<unsigned, my_error>{arg}; })
+			  .transform([](int a){ return a * 2u; })
+			  .value_or(0) == arg * 2u);
+
+	auto expected = make_expected(true, arg); /* Holds the value ARG.  */
+	VERIFY (expected);
+	VERIFY (expected.has_value());
+	VERIFY (expected.value() == arg);
+	VERIFY (*expected == arg);
+	VERIFY (expected.value_or(arg + 42) == arg);
+	VERIFY (expected.or_else([&](my_error e){ return std::expected<unsigned, my_error>{std::unexpected{e}}; })
+			.transform([](int a){ return a * 2u; })
+			.value_or(0) == arg * 2u);
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  return ok; /* No host-side checks; the device result is the test result.  */
+}
+
+int main()
+{ /* Exit status 0 only if both tests succeed.  */
+  volatile unsigned arg = 42; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  return test_optional(arg)
+	 && test_expected(arg) ? 0 : 1; /* test_expected is skipped if test_optional fails.  */
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-2003.C b/libgomp/testsuite/libgomp.c++/target-flex-2003.C
new file mode 100644
index 0000000..8e8ca8e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-2003.C
@@ -0,0 +1,176 @@
+/* { dg-additional-options "-std=c++20" } */
+
+/* bit_cast and memcpy  */
+
+#include <bit>
+#include <cstring>
+
+#include "target-flex-common.h"
+
+struct S0
+{ /* One side of the bit_cast round trip; declares the same members as S1.  */
+  int _v0;
+  char _v1;
+  long long _v2;
+};
+
+struct S1
+{ /* Other side of the bit_cast round trip; declares the same members as S0.  */
+  int _v0;
+  char _v1;
+  long long _v2;
+};
+
+bool test_bit_cast(int arg)
+{ /* std::bit_cast S0 -> S1 on the device, then S1 -> S0 on the host.  */
+  bool ok;
+  S1 s1_out; /* Receives the device-side bit_cast result.  */
+  #pragma omp target map(from: ok, s1_out) map(to: arg)
+    {
+      bool inner_ok = true;
+      {
+	long long v = static_cast<long long>(arg + 42ll);
+	S0 s = {arg, 'a', v};
+	VERIFY (std::bit_cast<S1>(s)._v0 == arg);
+	VERIFY (std::bit_cast<S1>(s)._v1 == 'a');
+	VERIFY (std::bit_cast<S1>(s)._v2 == v);
+	s1_out = std::bit_cast<S1>(s);
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; s1_out may be unset.  */
+    return false;
+  long long v = static_cast<long long>(arg + 42ll); /* Recompute the value the device stored in _v2.  */
+  VERIFY_NON_TARGET (std::bit_cast<S0>(s1_out)._v0 == arg);
+  VERIFY_NON_TARGET (std::bit_cast<S0>(s1_out)._v1 == 'a');
+  VERIFY_NON_TARGET (std::bit_cast<S0>(s1_out)._v2 == v);
+  return true;
+}
+
+
+struct OutStruct
+{ /* The (_id, _next) header every Extendable* struct starts with; filled via memcpy.  */
+  std::size_t _id;
+  void *_next;
+};
+
+struct Extendable1
+{ /* Chain node with _id 1 in test_memcpy; carries an int payload.  */
+  std::size_t _id;
+  void *_next; /* Next node in the chain, or null.  */
+  int _v;
+};
+
+struct Extendable2
+{ /* Chain node with _id 2 in test_memcpy; carries a NUL-terminated string.  */
+  std::size_t _id;
+  void *_next; /* Next node in the chain, or null.  */
+  char _str[256];
+};
+
+struct Extendable3
+{ /* Chain node with _id 3 in test_memcpy; carries a pointer/length pair.  */
+  std::size_t _id;
+  void *_next; /* Next node in the chain, or null.  */
+  const int *_nums;
+  std::size_t _size;
+};
+
+struct ExtendableUnknown
+{ /* Chain node whose _id the walker in test_memcpy has no case for; header only.  */
+  std::size_t _id;
+  void *_next;
+};
+
+template<typename To, std::size_t Id>
+To *get_extendable(void *p)
+{ /* Walk the _next chain from P; return the first node whose _id is Id, or nullptr.  */
+  while (p != nullptr)
+    {
+      OutStruct out;
+      std::memcpy(&out, p, sizeof(OutStruct)); /* Read the node header without casting P to OutStruct *.  */
+      if (out._id == Id)
+	return static_cast<To *>(p);
+      p = out._next;
+    }
+  return nullptr;
+}
+
+bool test_memcpy(int arg, const int *nums, std::size_t nums_size)
+{ /* Build a chain of Extendable* nodes on the device and walk it by _id.  */
+  bool ok;
+  Extendable2 e2_out; /* Receives a copy of the _id 2 node found on the device.  */
+  #pragma omp target map(from: ok, e2_out) map(to: arg, nums[:nums_size], nums_size)
+    {
+      bool inner_ok = true;
+      {
+	Extendable3 e3 = {3u, nullptr, nums, nums_size}; /* Last node: its _next is null.  */
+	ExtendableUnknown u1 = {100u, &e3};
+	Extendable2 e2 = {2u, &u1, {'H', 'e', 'l', 'l', 'o', '!', '\000'}};
+	ExtendableUnknown u2 = {101u, &e2};
+	ExtendableUnknown u3 = {102u, &u2};
+	ExtendableUnknown u4 = {142u, &u3};
+	Extendable1 e1 = {1u, &u4, arg}; /* Head of the chain.  */
+
+	void *p = &e1;
+	while (p != nullptr)
+	  {
+	    /* You can always cast a pointer to a struct to a pointer to
+	       the type of its first member.  */
+	    switch (*static_cast<std::size_t *>(p))
+	      {
+		case 1:
+		  {
+		    Extendable1 *e1_p = static_cast<Extendable1 *>(p);
+		    p = e1_p->_next;
+		    VERIFY (e1_p->_v == arg);
+		    break;
+		  }
+		case 2:
+		  {
+		    Extendable2 *e2_p = static_cast<Extendable2 *>(p);
+		    p = e2_p->_next;
+		    VERIFY (std::strcmp(e2_p->_str, "Hello!") == 0);
+		    break;
+		  }
+		case 3:
+		  {
+		    Extendable3 *e3_p = static_cast<Extendable3 *>(p);
+		    p = e3_p->_next;
+		    VERIFY (nums == e3_p->_nums);
+		    VERIFY (nums_size == e3_p->_size);
+		    break;
+		  }
+		default:
+		  {
+		    /* Casting to a pointer to OutStruct invokes undefined
+		       behavior though, memcpy is required to extract the _next
+		       member.  */
+		    OutStruct out;
+		    std::memcpy(&out, p, sizeof(OutStruct));
+		    p = out._next;
+		  }
+	      }
+	  }
+	Extendable2 *e2_p = get_extendable<Extendable2, 2u>(&e1); /* Same lookup through the memcpy-based helper.  */
+	VERIFY (e2_p != nullptr);
+	e2_out = *e2_p;
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; e2_out may be unset.  */
+    return false;
+  VERIFY_NON_TARGET (e2_out._id == 2u);
+  VERIFY_NON_TARGET (std::strcmp(e2_out._str, "Hello!") == 0);
+  return true;
+}
+
+int main()
+{ /* Exit status 0 only if both tests succeed.  */
+  volatile int arg = 42; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  int arr[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return test_bit_cast(arg)
+	 && test_memcpy(arg, arr, 8) ? 0 : 1; /* 8 is the element count of ARR.  */
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-30.C b/libgomp/testsuite/libgomp.c++/target-flex-30.C
new file mode 100644
index 0000000..c66075b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-30.C
@@ -0,0 +1,51 @@
+/* std::initializer_list in target region.  */
+
+#include <initializer_list>
+#include <array>
+
+#include "target-flex-common.h"
+
+bool test_initializer_list(int arg)
+{ /* Create and iterate a std::initializer_list<int> inside a target region.  */
+  static constexpr std::size_t out_arr_size = 7; /* Number of elements in the list below.  */
+  int out_arr[out_arr_size];
+  bool ok;
+  #pragma omp target map(from: ok, out_arr[:out_arr_size]) map(to: arg)
+    {
+      bool inner_ok = true;
+      {
+	auto il = {0, 1, 2, 3, 4, 5, arg};
+
+	int sum = 0;
+	for (auto const& e : il)
+	  sum += e;
+	VERIFY (sum == 0 + 1 + 2 + 3 + 4 + 5 + arg);
+
+	auto* out_it = out_arr;
+	const auto* const out_end = out_arr + out_arr_size;
+	for (auto const& e : il)
+	  {
+	    VERIFY (out_it != out_end); /* Never write past the end of out_arr.  */
+	    *out_it = e;
+	    ++out_it;
+	  }
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; out_arr may be incomplete.  */
+    return false;
+
+  std::array<int, out_arr_size> reference_array = {0, 1, 2, 3, 4, 5, arg};
+  const auto *out_arr_it = out_arr;
+  for (auto const& e : reference_array)
+    VERIFY_NON_TARGET (e == *(out_arr_it++));
+
+  return true;
+}
+
+int main()
+{ /* Exit status 0 when the test succeeds, 1 otherwise.  */
+  volatile int arg = 42; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  return test_initializer_list(arg) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-300.C b/libgomp/testsuite/libgomp.c++/target-flex-300.C
new file mode 100644
index 0000000..329a189
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-300.C
@@ -0,0 +1,51 @@
+/* { dg-additional-options -std=c++23 } */
+
+/* numerics  */
+
+#include <algorithm>
+#include <numeric>
+#include <ranges>
+#include <span>
+#include <vector>
+
+//TODO PR120454 "C++ constexpr vs. OpenMP implicit mapping"
+#pragma omp declare target(std::ranges::all_of, std::ranges::iota)
+
+#include "target-flex-common.h"
+
+namespace stdr = std::ranges;
+
+bool test(std::size_t arg)
+{ /* Run ranges/numeric algorithms on a std::span over mapped vector storage.  */
+  bool ok;
+  int midpoint_out; /* Receives the element at the midpoint of the span.  */
+  std::vector<int> vec(arg); /* ARG value-initialized (zero) elements.  */
+  int *data = vec.data();
+  std::size_t size = vec.size();
+  #pragma omp target defaultmap(none) map(from: ok, midpoint_out) map(tofrom: data[:size]) map(to: arg, size)
+  /* <https://baylibre.slack.com/archives/C06TTV7HMMG/p1748508583437829>
+     { dg-bogus {sorry, unimplemented: unsupported map expression '<lambda closure object>.*} TODO { xfail *-*-* } .-2 } */
+    {
+      std::span span = {data, size};
+      bool inner_ok = true;
+      {
+	VERIFY (stdr::all_of(span, [](int v){ return v == int{}; }));
+	stdr::iota(span, 0); /* Fill with 0, 1, 2, ...; copied back through the tofrom mapping.  */
+	midpoint_out = *std::midpoint(span.data(), span.data() + span.size());
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; skip the host-side checks.  */
+    return false;
+  VERIFY_NON_TARGET (stdr::equal(vec, std::views::iota(0, static_cast<int>(vec.size()))));
+  VERIFY_NON_TARGET (*std::midpoint(vec.data(), vec.data() + vec.size())
+		     == midpoint_out);
+  return true;
+}
+
+int main()
+{ /* Exit status 0 when test() succeeds, 1 otherwise.  */
+  volatile std::size_t arg = 42; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  return test(arg) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-31.C b/libgomp/testsuite/libgomp.c++/target-flex-31.C
new file mode 100644
index 0000000..adaf18f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-31.C
@@ -0,0 +1,80 @@
+/* std::initializer_list in target region.  */
+
+#include <initializer_list>
+
+#include "target-flex-common.h"
+
+struct S0
+{ /* Non-template class with a std::initializer_list<int> constructor.  */
+  int _v; /* Sum of the list elements.  */
+  S0(std::initializer_list<int> il)
+    : _v(0)
+  {
+    for (auto const& e : il)
+      _v += e;
+  }
+};
+
+struct S1
+{ /* Non-template class whose initializer_list constructor is itself a template.  */
+  int _v; /* Sum of the list elements.  */
+  template<typename T>
+  S1(std::initializer_list<T> il)
+    : _v(0)
+  {
+    for (auto const& e : il)
+      _v += e;
+  }
+};
+
+template<typename T>
+struct S2
+{ /* Class template with a std::initializer_list<T> constructor.  */
+  T _v; /* Sum of the list elements.  */
+  S2(std::initializer_list<T> il)
+    : _v(0)
+  {
+    for (auto const& e : il)
+      _v += e;
+  }
+};
+
+#if __cplusplus >= 201703L
+template<typename T>
+S2(std::initializer_list<T>) -> S2<T>; /* Explicit deduction guide for S2 from an initializer_list.  */
+#endif
+
+bool test_initializer_list(int arg)
+{ /* Construct S0, S1 and S2 from braced lists inside a target region.  */
+  bool ok;
+  #pragma omp target map(from: ok) map(to: arg)
+    {
+      bool inner_ok = true;
+      {
+	static constexpr int partial_sum = 0 + 1 + 2 + 3 + 4 + 5; /* Sum of the constant list elements.  */
+
+	S0 s0{0, 1, 2, 3, 4, 5, arg};
+	VERIFY (s0._v == partial_sum + arg);
+
+	S1 s1{0, 1, 2, 3, 4, 5, arg};
+	VERIFY (s1._v == partial_sum + arg);
+
+	S2<int> s2{0, 1, 2, 3, 4, 5, arg};
+	VERIFY (s2._v == partial_sum + arg);
+
+	#if __cplusplus >= 201703L
+	  S2 s2_ctad{0, 1, 2, 3, 4, 5, arg}; /* Class template argument deduction; C++17 and later only.  */
+	  VERIFY (s2_ctad._v == partial_sum + arg);
+	#endif
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  return ok; /* No host-side checks; the device result is the test result.  */
+}
+
+int main()
+{ /* Exit status 0 when the test succeeds, 1 otherwise.  */
+  volatile int arg = 42; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  return test_initializer_list(arg) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-32.C b/libgomp/testsuite/libgomp.c++/target-flex-32.C
new file mode 100644
index 0000000..7f74401a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-32.C
@@ -0,0 +1,50 @@
+/* std::initializer_list constructor of std::vector (explicit template arg) */
+
+#include <vector>
+#include <array>
+
+#include "target-flex-common.h"
+
+bool test_initializer_list(int arg)
+{ /* Build a std::vector<int> from a braced list inside a target region.  */
+  static constexpr std::size_t out_arr_size = 7; /* Number of elements in the list below.  */
+  int out_arr[out_arr_size];
+  bool ok;
+  #pragma omp target map(from: ok, out_arr[:out_arr_size]) map(to: arg)
+    {
+      bool inner_ok = true;
+      {
+	std::vector<int> vec{0, 1, 2, 3, 4, 5, arg};
+	int sum = 0;
+	for (auto const& e : vec)
+	  sum += e;
+	VERIFY (sum == 0 + 1 + 2 + 3 + 4 + 5 + arg);
+
+	auto* out_it = out_arr;
+	const auto* const out_end = out_arr + out_arr_size;
+	for (auto const& e : vec)
+	  {
+	    VERIFY (out_it != out_end); /* Never write past the end of out_arr.  */
+	    *out_it = e;
+	    ++out_it;
+	  }
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; out_arr may be incomplete.  */
+    return false;
+
+  std::array<int, out_arr_size> reference_array = {0, 1, 2, 3, 4, 5, arg};
+  const auto *out_arr_it = out_arr;
+  for (auto const& e : reference_array)
+    VERIFY_NON_TARGET (e == *(out_arr_it++));
+
+  return true;
+}
+
+int main()
+{ /* Exit status 0 when the test succeeds, 1 otherwise.  */
+  volatile int arg = 42; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  return test_initializer_list(arg) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-33.C b/libgomp/testsuite/libgomp.c++/target-flex-33.C
new file mode 100644
index 0000000..bb8a39b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-33.C
@@ -0,0 +1,52 @@
+/* { dg-additional-options "-std=c++17" } */
+
+/* deduced std::initializer_list constructor of std::vector (CTAD) */
+
+#include <vector>
+#include <array>
+
+#include "target-flex-common.h"
+
+bool test_initializer_list(int arg)
+{ /* Build a std::vector via CTAD from a braced list inside a target region.  */
+  static constexpr std::size_t out_arr_size = 7; /* Number of elements in the list below.  */
+  int out_arr[out_arr_size];
+  bool ok;
+  #pragma omp target map(from: ok, out_arr[:out_arr_size]) map(to: arg)
+    {
+      bool inner_ok = true;
+      {
+	std::vector vec{0, 1, 2, 3, 4, 5, arg}; /* Element type is deduced.  */
+	int sum = 0;
+	for (auto const& e : vec)
+	  sum += e;
+	VERIFY (sum == 0 + 1 + 2 + 3 + 4 + 5 + arg);
+
+	auto* out_it = out_arr;
+	const auto* const out_end = out_arr + out_arr_size;
+	for (auto const& e : vec)
+	  {
+	    VERIFY (out_it != out_end); /* Never write past the end of out_arr.  */
+	    *out_it = e;
+	    ++out_it;
+	  }
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; out_arr may be incomplete.  */
+    return false;
+
+  std::array<int, out_arr_size> reference_array = {0, 1, 2, 3, 4, 5, arg};
+  const auto *out_arr_it = out_arr;
+  for (auto const& e : reference_array)
+    VERIFY_NON_TARGET (e == *(out_arr_it++));
+
+  return true;
+}
+
+int main()
+{ /* Exit status 0 when the test succeeds, 1 otherwise.  */
+  volatile int arg = 42; /* NOTE(review): volatile presumably keeps ARG from being constant-folded -- confirm.  */
+  return test_initializer_list(arg) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-41.C b/libgomp/testsuite/libgomp.c++/target-flex-41.C
new file mode 100644
index 0000000..4d36341
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-41.C
@@ -0,0 +1,94 @@
+/* { dg-additional-options "-std=c++20" } */
+
+/* <iterator> c++20  */
+
+/* std::common_iterator uses std::variant.  */
+
+#include <vector>
+#include <iterator>
+#include <span>
+
+//TODO PR120454 "C++ constexpr vs. OpenMP implicit mapping"
+#pragma omp declare target(std::ranges::distance, std::ranges::next)
+
+#include "target-flex-common.h"
+
+namespace stdr = std::ranges;
+
+template<typename It0, typename It1>
+bool simple_equal(const It0 begin0, const It0 end0,
+		  const It1 begin1, const It1 end1) BL_NOEXCEPT
+{ /* True if every element of [begin0, end0) equals its counterpart in [begin1, end1).  */
+  It0 it0 = begin0;
+  It1 it1 = begin1;
+  for (; it0 != end0; ++it0, ++it1)
+    if (it1 == end1 || *it0 != *it1) /* Second range too short, or elements differ.  */
+      return false;
+  return true; /* Elements of the second range beyond the first's length are not examined.  */
+}
+
+template<typename It, typename OutIt>
+void simple_copy(const It begin, const It end, OutIt out) BL_NOEXCEPT
+{ /* Copy [begin, end) to OUT element by element; no bounds check on OUT.  */
+  for (It it = begin; it != end; ++it, ++out)
+    *out = *it;
+}
+
+template<typename T, std::size_t Size>
+bool test(const T (&arr)[Size])
+{ /* Use C++20 iterator adaptors and std::ranges iterator helpers in a target region.  */
+  bool ok;
+  T out_rev_arr[Size]; /* ARR reversed, as produced on the device.  */
+  T out_fwd_arr[Size]; /* The reversed vector reversed again, i.e. ARR.  */
+  T out_first_half_arr[Size / 2]; /* First Size / 2 elements of ARR.  */
+  #pragma omp target defaultmap(none) \
+		     map(from: ok, out_rev_arr[:Size], out_fwd_arr[:Size], \
+			       out_first_half_arr[:Size / 2]) \
+		     map(to: arr[:Size])
+    {
+      bool inner_ok = true;
+      {
+	std::span<const T> span = {arr, Size};
+	std::vector<T> rev_vec(std::reverse_iterator{span.end()},
+			       std::reverse_iterator{span.begin()});
+	VERIFY (std::distance(span.begin(), span.end())
+		== std::distance(rev_vec.begin(), rev_vec.end()));
+	VERIFY (stdr::distance(span.begin(), span.end())
+		== stdr::distance(rev_vec.begin(), rev_vec.end()));
+	VERIFY (stdr::distance(span) == stdr::distance(rev_vec));
+	VERIFY (simple_equal(span.begin(), span.end(),
+			     std::reverse_iterator{rev_vec.end()},
+			     std::reverse_iterator{rev_vec.begin()}));
+	simple_copy(rev_vec.begin(), rev_vec.end(), out_rev_arr);
+	simple_copy(std::reverse_iterator{rev_vec.end()},
+		    std::reverse_iterator{rev_vec.begin()}, out_fwd_arr);
+	using counted_iter = std::counted_iterator<decltype(span.begin())>;
+	using common_iter = std::common_iterator<counted_iter,
+						 std::default_sentinel_t>;
+	std::vector<T> front_half;
+	simple_copy(common_iter{counted_iter{span.begin(), Size / 2}},
+		    common_iter{std::default_sentinel},
+		    std::back_insert_iterator{front_half});
+	VERIFY (simple_equal(span.begin(), stdr::next(span.begin(), Size / 2),
+			     front_half.begin(), front_half.end()));
+	simple_copy(front_half.begin(), front_half.end(), out_first_half_arr);
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed: the out arrays may not have been written, so don't read them.  */
+    return false;
+  VERIFY_NON_TARGET (simple_equal(std::reverse_iterator{arr + Size},
+				  std::reverse_iterator{arr},
+				  out_rev_arr, out_rev_arr + Size));
+  VERIFY_NON_TARGET (simple_equal(arr, arr + Size, out_fwd_arr, out_fwd_arr + Size));
+  VERIFY_NON_TARGET (simple_equal(arr, arr + Size / 2,
+				  out_first_half_arr, out_first_half_arr + Size / 2));
+  return true;
+}
+
+int main()
+{ /* Exit status 0 when test() succeeds, 1 otherwise.  */
+  int arr[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return test(arr) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-60.C b/libgomp/testsuite/libgomp.c++/target-flex-60.C
new file mode 100644
index 0000000..393bb3c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-60.C
@@ -0,0 +1,48 @@
+/* algorithms pre c++20 */
+
+#include <algorithm>
+#include <vector>
+
+#include "target-flex-common.h"
+
+template<typename T, std::size_t Size>
+bool test(const T (&arr)[Size])
+{ /* Run std::copy/equal/transform/rotate inside a target region.  */
+  bool ok;
+  T out_2x_arr[Size]; /* Each element of ARR multiplied by 2.  */
+  T out_shifted_arr[Size]; /* ARR rotated left by Size / 2.  */
+  #pragma omp target map(from: ok, out_2x_arr[:Size], out_shifted_arr[:Size]) \
+		     map(to: arr[:Size])
+  /* <https://baylibre.slack.com/archives/C06TTV7HMMG/p1748508583437829>
+     { dg-bogus {sorry, unimplemented: unsupported map expression '<lambda closure object>.*} TODO { xfail *-*-* } .-3 } */
+    {
+      std::vector<T> vec(Size);
+      std::vector<T> mutated(Size);
+      bool inner_ok = true;
+      {
+	std::copy(arr, arr + Size, vec.begin());
+	VERIFY (std::equal(arr, arr + Size, vec.begin()));
+	std::transform(vec.begin(), vec.end(), mutated.begin(),
+		       [](const T& v){ return v * 2; });
+	std::copy(mutated.begin(), mutated.end(), out_2x_arr);
+	std::rotate(vec.begin(), std::next(vec.begin(), Size / 2), vec.end());
+	std::copy(vec.begin(), vec.end(), out_shifted_arr);
+      }
+      end: /* NOTE(review): presumably where a failing VERIFY jumps to; confirm in target-flex-common.h.  */
+      ok = inner_ok;
+    }
+  if (!ok) /* Device side failed; skip the host-side checks.  */
+    return false;
+  VERIFY_NON_TARGET (std::equal(arr, arr + Size, out_2x_arr,
+				[](const T& a, const T& b){ return a * 2 == b; }));
+  std::vector<T> shifted(arr, arr + Size); /* Host reference for the rotation.  */
+  std::rotate(shifted.begin(), std::next(shifted.begin(), Size / 2), shifted.end());
+  VERIFY_NON_TARGET (std::equal(out_shifted_arr, out_shifted_arr + Size, shifted.begin()));
+  return true;
+}
+
+int main()
+{ /* Exit status 0 when test() succeeds, 1 otherwise.  */
+  int arr[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return test(arr) ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-61.C b/libgomp/testsuite/libgomp.c++/target-flex-61.C
new file mode 100644
index 0000000..e06133a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-61.C
@@ -0,0 +1,56 @@
+/* { dg-additional-options "-std=c++20" } */
+
+/* ranged algorithms c++20 */
+
+#include <algorithm>
+#include <ranges>
+#include <vector>
+
+//TODO PR120454 "C++ constexpr vs. OpenMP implicit mapping"
+#pragma omp declare target(std::ranges::copy, std::ranges::equal, std::ranges::rotate, std::ranges::transform)
+
+#include "target-flex-common.h"
+
+namespace stdr = std::ranges;
+
+template<typename T, std::size_t Size>
+bool test(const T (&arr)[Size])  // Runs C++20 std::ranges algorithms on ARR in a target region, then re-checks on the host.
+{
+  bool ok;  // Receives inner_ok when the target region finishes.
+  T out_2x_arr[Size];  // Filled in the target region with each element of ARR doubled.
+  T out_shifted_arr[Size];  // Filled in the target region with ARR rotated left by Size / 2.
+  #pragma omp target defaultmap(none) \
+		     map(from: ok, out_2x_arr[:Size], out_shifted_arr[:Size]) \
+		     map(to: arr[:Size])
+  /* <https://baylibre.slack.com/archives/C06TTV7HMMG/p1748508583437829>
+     { dg-bogus {sorry, unimplemented: unsupported map expression '<lambda closure object>.*} TODO { xfail *-*-* } .-4 } */
+    {
+      std::vector<T> vec(Size);
+      std::vector<T> mutated(Size);
+      bool inner_ok = true;  // Cleared by VERIFY on the first failed check.
+      {
+	stdr::copy(arr, vec.begin());
+	VERIFY (stdr::equal(arr, vec));
+	stdr::transform(vec, mutated.begin(),
+			[](const T& v){ return v * 2; });
+	stdr::copy(mutated, out_2x_arr);
+	stdr::rotate(vec, std::next(vec.begin(), Size / 2));
+	stdr::copy(vec, out_shifted_arr);
+      }
+      end:  // VERIFY jumps here after a failed check.
+      ok = inner_ok;
+    }
+  if (!ok)
+    return false;
+  VERIFY_NON_TARGET (stdr::equal(arr, out_2x_arr, stdr::equal_to{}, [](const T& v){ return v * 2; }));
+  std::vector<T> shifted(arr, arr + Size);  // Recompute the expected rotation on the host.
+  stdr::rotate(shifted, std::next(shifted.begin(), Size / 2));
+  VERIFY_NON_TARGET (stdr::equal(out_shifted_arr, shifted));
+  return true;
+}
+
+int main()
+{
+  int arr[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return test(arr) ? 0 : 1;  // Exit status 0 only if test returned true.
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-62.C b/libgomp/testsuite/libgomp.c++/target-flex-62.C
new file mode 100644
index 0000000..2e74b20
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-62.C
@@ -0,0 +1,52 @@
+/* { dg-additional-options -std=c++23 } */
+
+/* std::views stuff.  Also tests std::tuple with std::views::zip.  */
+
+#include <algorithm>
+#include <ranges>
+#include <span>
+
+//TODO PR120454 "C++ constexpr vs. OpenMP implicit mapping"
+#pragma omp declare target(std::ranges::all_of, std::ranges::equal, std::ranges::fold_left, std::views::reverse, std::views::zip)
+
+#include "target-flex-common.h"
+
+namespace stdr = std::ranges;
+namespace stdv = std::views;
+
+bool f()  // Checks std::views adaptors and std::ranges algorithms on two spans inside a target region.
+{
+  const int arr_fwd[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  const int arr_rev[8] = {7, 6, 5, 4, 3, 2, 1, 0};  // arr_fwd reversed; pairwise sums are all 7.
+
+  bool ok;  // Receives inner_ok when the target region finishes.
+  #pragma omp target defaultmap(none) map(from: ok) map(to: arr_fwd[:8], arr_rev[:8])
+  /* <https://baylibre.slack.com/archives/C06TTV7HMMG/p1748508583437829>
+     { dg-bogus {sorry, unimplemented: unsupported map expression '<lambda closure object>.*} TODO { xfail *-*-* } .-2 } */
+    {
+      std::span<const int> fwd = {arr_fwd, 8};
+      std::span<const int> rev = {arr_rev, 8};
+      bool inner_ok = true;  // Cleared by VERIFY on the first failed check.
+      {
+	VERIFY(stdr::equal(fwd, rev | stdv::reverse));
+	VERIFY(stdr::equal(fwd | stdv::drop(4) | stdv::reverse,
+			   rev | stdv::take(4)));
+	for (auto [first, second] : stdv::zip(fwd, rev))
+	  VERIFY(first + second == 7);
+	auto plus = [](int a, int b){ return a + b; };
+	auto is_even = [](int v){ return v % 2 == 0; };
+	VERIFY(stdr::fold_left(fwd | stdv::filter(is_even), 0, plus)
+	       == 12);
+	VERIFY(stdr::all_of(fwd | stdv::transform([](int v){ return v * 2; }),
+			    is_even));
+      }
+      end:  // VERIFY jumps here after a failed check.
+      ok = inner_ok;
+    }
+  return ok;
+}
+
+int main()
+{
+  return f() ? 0 : 1;  // Exit status 0 only if f returned true.
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-70.C b/libgomp/testsuite/libgomp.c++/target-flex-70.C
new file mode 100644
index 0000000..9e9383d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-70.C
@@ -0,0 +1,26 @@
+/* CTAD in target regions.  */
+
+template<typename T>
+struct S  // Single-member class template used as the subject of class template argument deduction.
+{
+  T _v;
+};
+
+template<typename T>
+S(T) -> S<T>;  // Deduction guide: an S initialized from a T deduces S<T>.
+
+bool f()  // Returns true if CTAD on S works inside a target region.
+{
+  bool ok;
+  #pragma omp target map(from: ok)
+    {
+      S s{42};  // No template argument is written; it is deduced from the initializer.
+      ok = s._v == 42;
+    }
+  return ok;
+}
+
+int main()
+{
+  return f() ? 0 : 1;  // Exit status 0 only if f returned true.
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-80.C b/libgomp/testsuite/libgomp.c++/target-flex-80.C
new file mode 100644
index 0000000..f41a1bb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-80.C
@@ -0,0 +1,49 @@
+// { dg-additional-options "-std=c++20" }
+
+/* std::span  */
+
+#include <span>
+
+#include "target-flex-common.h"
+
+template<typename It0, typename It1>
+bool simple_equal(It0 it0, const It0 end0,  // True if every element of [it0, end0) equals its counterpart from it1.
+		  It1 it1, const It1 end1) noexcept
+{
+  for (; it0 != end0; ++it0, ++it1)
+    if (it1 == end1 || *it0 != *it1)  // Second range ended early, or elements differ.
+      return false;
+  return true;  // Any surplus elements left in [it1, end1) are not examined.
+}
+
+template<typename T, std::size_t Size>
+bool test(const T (&arr)[Size])  // Builds a std::span over ARR in a target region and copies it out element by element.
+{
+  bool ok;  // Receives inner_ok when the target region finishes.
+  T out_arr[Size];  // NOTE(review): not named in any map clause below; presumably relies on implicit mapping - confirm.
+  #pragma omp target map(from: ok) map(to: arr[:Size])
+    {
+      std::span span = {arr, Size};
+      bool inner_ok = true;  // Cleared by VERIFY on the first failed check.
+      {
+	VERIFY (!span.empty());
+	VERIFY (span.size() == Size);
+	auto out_it = out_arr;
+	for (auto elem : span)
+	  *(out_it++) = elem;
+      }
+      end:  // VERIFY jumps here after a failed check.
+      ok = inner_ok;
+    }
+  if (!ok)
+    return false;
+  VERIFY_NON_TARGET (simple_equal(arr, arr + Size,
+				  out_arr, out_arr + Size));
+  return true;
+}
+
+int main()
+{
+  int arr[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return test(arr) ? 0 : 1;  // Exit status 0 only if test returned true.
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-81.C b/libgomp/testsuite/libgomp.c++/target-flex-81.C
new file mode 100644
index 0000000..950c122
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-81.C
@@ -0,0 +1,77 @@
+/* { dg-additional-options "-std=c++20" } */
+
+#include <ranges>
+#include <span>
+#include <type_traits>
+#include <vector>
+
+#include "target-flex-common.h"
+
+namespace stdr = std::ranges;
+
+template<typename It0, typename It1>
+bool simple_equal(It0 it0, const It0 end0,  // True if every element of [it0, end0) equals its counterpart from it1.
+		  It1 it1, const It1 end1) noexcept
+{
+  for (; it0 != end0; ++it0, ++it1)
+    if (it1 == end1 || *it0 != *it1)  // Second range ended early, or elements differ.
+      return false;
+  return true;  // Any surplus elements left in [it1, end1) are not examined.
+}
+
+template<typename Rn0, typename Rn1>
+bool simple_equal(Rn0&& rn0, Rn1&& rn1) noexcept  // Range overload; forwards to the iterator overload.
+{
+  return simple_equal(stdr::begin(rn0), stdr::end(rn0),
+		      stdr::begin(rn1), stdr::end(rn1));
+}
+
+template<typename Rn>
+bool test(Rn&& range)  // Doubles a copy of RANGE through a std::span in a target region and checks the result.
+{
+  using value_type = stdr::range_value_t<std::remove_cvref_t<Rn>>;
+  std::vector<value_type> vec = {stdr::begin(range), stdr::end(range)};  // Working copy; RANGE itself is left unmodified.
+  value_type *data = vec.data();
+  std::size_t size = vec.size();
+  bool ok;  // Receives inner_ok when the target region finishes.
+  #pragma omp target map(from: ok) map(tofrom: data[:size]) map(to: size)
+  /* <https://baylibre.slack.com/archives/C06TTV7HMMG/p1748508583437829>
+     { dg-bogus {sorry, unimplemented: unsupported map expression '<lambda closure object>.*} TODO { xfail *-*-* } .-2 } */
+    {
+      std::vector<value_type> orig = {data, data + size};  // Snapshot taken before the span mutates DATA.
+      std::span<value_type> span = {data, size};
+      bool inner_ok = true;  // Cleared by VERIFY on the first failed check.
+      {
+	auto mul_by_2 = [](const value_type& v){ return v * 2; };
+	VERIFY (simple_equal(orig, span));
+	for (auto& elem : span)
+	  elem = mul_by_2(elem);
+	VERIFY (simple_equal(orig | std::views::transform(mul_by_2), span));
+      }
+      end:  // VERIFY jumps here after a failed check.
+      ok = inner_ok;
+    }
+  if (!ok)
+    return false;
+  auto mul_by_2 = [](const value_type& v){ return v * 2; };
+  VERIFY_NON_TARGET (simple_equal(range | std::views::transform(mul_by_2), vec));
+  return true;
+}
+
+struct my_int  // Minimal user-defined element type: wraps an int, supports == and multiplication by int.
+{
+  int _v;
+  bool operator==(my_int const&) const = default;
+  my_int operator*(int rhs) const noexcept {
+    return {_v * rhs};
+  }
+};
+
+int main()
+{
+  std::vector<int> ints = {1, 2, 3, 4, 5};
+  const bool ints_res = test(ints);
+  std::vector<my_int> my_ints = {my_int{1}, my_int{2}, my_int{3}, my_int{4}, my_int{5}};
+  const bool my_ints_res = test(my_ints);
+  return ints_res && my_ints_res ? 0 : 1;  // Both instantiations always run; exit status 0 only if both passed.
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-90.C b/libgomp/testsuite/libgomp.c++/target-flex-90.C
new file mode 100644
index 0000000..b3f1197
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-90.C
@@ -0,0 +1,107 @@
+/* structured bindings  */
+
+#include <array>
+#include <tuple>
+
+#include "target-flex-common.h"
+
+template<typename Array, typename Tuple, typename Struct>
+bool test(Array array, Tuple tuple, Struct s)  // Checks structured bindings (by value, by reference, by const reference) in a target region.
+{
+  bool ok;  // Receives inner_ok when the target region finishes.
+  auto array_2nd_in = std::get<2>(array);  // Reference values read on the host before offloading.
+  auto tuple_2nd_in = std::get<2>(tuple);
+  auto s_2nd_in = s._2;
+  decltype(array_2nd_in) array_2nd_out_0;  // The _0 outputs come from the by-value bindings.
+  decltype(tuple_2nd_in) tuple_2nd_out_0;
+  decltype(s_2nd_in) s_2nd_out_0;
+  decltype(array_2nd_in) array_2nd_out_1;  // The _1 outputs come from the 'auto&' bindings.
+  decltype(tuple_2nd_in) tuple_2nd_out_1;
+  decltype(s_2nd_in) s_2nd_out_1;
+  decltype(array_2nd_in) array_2nd_out_2;  // The _2 outputs come from the 'const auto&' bindings.
+  decltype(tuple_2nd_in) tuple_2nd_out_2;
+  decltype(s_2nd_in) s_2nd_out_2;
+  #pragma omp target map(from: ok, \
+			       array_2nd_out_0, tuple_2nd_out_0, s_2nd_out_0, \
+			       array_2nd_out_1, tuple_2nd_out_1, s_2nd_out_1, \
+			       array_2nd_out_2, tuple_2nd_out_2, s_2nd_out_2) \
+		     map(to: array_2nd_in, tuple_2nd_in, s_2nd_in, array, tuple, s)
+    {
+      bool inner_ok = true;  // Cleared by VERIFY on the first failed check.
+      {
+	{  // Bindings declared with plain 'auto'.
+	  auto [array_0th, array_1st, array_2nd] = array;
+	  VERIFY (array_2nd_in == array_2nd);
+	  VERIFY (std::get<2>(array) == array_2nd);
+	  array_2nd_out_0 = array_2nd;
+	  auto [tuple_0th, tuple_1st, tuple_2nd] = tuple;
+	  VERIFY (tuple_2nd_in == tuple_2nd);
+	  VERIFY (std::get<2>(tuple) == tuple_2nd);
+	  tuple_2nd_out_0 = tuple_2nd;
+	  auto [s_0th, s_1st, s_2nd] = s;
+	  VERIFY (s_2nd_in == s_2nd);
+	  VERIFY (s._2 == s_2nd);
+	  s_2nd_out_0 = s_2nd;
+	}
+	{  // Bindings declared with 'auto&'.
+	  auto& [array_0th, array_1st, array_2nd] = array;
+	  VERIFY (array_2nd_in == array_2nd);
+	  VERIFY (std::get<2>(array) == array_2nd);
+	  array_2nd_out_1 = array_2nd;
+	  auto& [tuple_0th, tuple_1st, tuple_2nd] = tuple;
+	  VERIFY (tuple_2nd_in == tuple_2nd);
+	  VERIFY (std::get<2>(tuple) == tuple_2nd);
+	  tuple_2nd_out_1 = tuple_2nd;
+	  auto& [s_0th, s_1st, s_2nd] = s;
+	  VERIFY (s_2nd_in == s_2nd);
+	  VERIFY (s._2 == s_2nd);
+	  s_2nd_out_1 = s_2nd;
+	}
+	{  // Bindings declared with 'const auto&'.
+	  const auto& [array_0th, array_1st, array_2nd] = array;
+	  VERIFY (array_2nd_in == array_2nd);
+	  VERIFY (std::get<2>(array) == array_2nd);
+	  array_2nd_out_2 = array_2nd;
+	  const auto& [tuple_0th, tuple_1st, tuple_2nd] = tuple;
+	  VERIFY (tuple_2nd_in == tuple_2nd);
+	  VERIFY (std::get<2>(tuple) == tuple_2nd);
+	  tuple_2nd_out_2 = tuple_2nd;
+	  const auto& [s_0th, s_1st, s_2nd] = s;
+	  VERIFY (s_2nd_in == s_2nd);
+	  VERIFY (s._2 == s_2nd);
+	  s_2nd_out_2 = s_2nd;
+	}
+      }
+      end:  // VERIFY jumps here after a failed check.
+      ok = inner_ok;
+    }
+  if (!ok)
+    return false;
+  VERIFY_NON_TARGET (array_2nd_out_0 == array_2nd_in);  // Host-side check of the values mapped back.
+  VERIFY_NON_TARGET (tuple_2nd_out_0 == tuple_2nd_in);
+  VERIFY_NON_TARGET (s_2nd_out_0 == s_2nd_in);
+  VERIFY_NON_TARGET (array_2nd_out_1 == array_2nd_in);
+  VERIFY_NON_TARGET (tuple_2nd_out_1 == tuple_2nd_in);
+  VERIFY_NON_TARGET (s_2nd_out_1 == s_2nd_in);
+  VERIFY_NON_TARGET (array_2nd_out_2 == array_2nd_in);
+  VERIFY_NON_TARGET (tuple_2nd_out_2 == tuple_2nd_in);
+  VERIFY_NON_TARGET (s_2nd_out_2 == s_2nd_in);
+
+  return true;
+}
+
+struct S  // Three-member aggregate decomposed by the structured bindings in test; test reads member _2.
+{
+  char _0;
+  float _1;
+  int _2;
+};
+
+int main()
+{
+  const bool test_res
+    = test(std::array{0, 1, 2},
+	   std::tuple{'a', 3.14f, 42},
+	   S{'a', 3.14f, 42});
+  return test_res ? 0 : 1;  // Exit status 0 only if test returned true.
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-flex-common.h b/libgomp/testsuite/libgomp.c++/target-flex-common.h
new file mode 100644
index 0000000..14523c4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-flex-common.h
@@ -0,0 +1,40 @@
+#include <cstdio>
+
+#if __cplusplus >= 201103L  /* C++11 and later: use the noexcept specifier.  */
+  #define BL_NOEXCEPT noexcept
+#else  /* Older dialects: fall back to an empty dynamic exception specification.  */
+  #define BL_NOEXCEPT throw()
+#endif
+
+#if defined __has_builtin  /* VERIFY_LINE: line number printed by the VERIFY macros.  */
+# if __has_builtin (__builtin_LINE)
+#  define VERIFY_LINE __builtin_LINE ()
+# endif
+#endif
+#if !defined VERIFY_LINE  /* No __builtin_LINE available: use the standard macro.  */
+# define VERIFY_LINE __LINE__
+#endif
+
+/* I'm not a huge fan of macros but in the interest of keeping the code that
+   isn't being tested as simple as possible, we use them.  VERIFY needs a 'bool inner_ok' variable and an 'end:' label in scope.  */
+
+#define VERIFY(EXPR) /* On failure: print a message, clear inner_ok, jump to 'end'.  */ \
+  do {										\
+    if (!(EXPR))								\
+      {										\
+	std::printf("VERIFY ln: %d `" #EXPR "` evaluated to false\n",		\
+		    VERIFY_LINE);						\
+	inner_ok = false;							\
+	goto end;								\
+      }										\
+  } while (false)
+
+#define VERIFY_NON_TARGET(EXPR) /* On failure: print a message and 'return false' from the enclosing function.  */ \
+  do {										\
+    if (!(EXPR))								\
+      {										\
+	std::printf("VERIFY ln: %d `" #EXPR "` evaluated to false\n",		\
+		    VERIFY_LINE);						\
+	return false;								\
+      }										\
+  } while (false)
diff --git a/libgomp/testsuite/libgomp.c++/target-std__array-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__array-concurrent-usm.C
new file mode 100644
index 0000000..9923783
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__array-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__array-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__array-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__array-concurrent.C
new file mode 100644
index 0000000..c42105a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__array-concurrent.C
@@ -0,0 +1,62 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <array>
+#include <algorithm>
+
+#define N 50000
+
+void init (int data[])  // Fills data[0..N-1] with pseudo-random values in [0, 0x7fff].
+{
+  for (int i = 0; i < N; ++i)
+    data[i] = rand () % 0x8000;  // Bounded so that data[i] * data[i], computed as 'int' by validate and main, cannot overflow.
+}
+
+#pragma omp declare target
+bool validate (const std::array<int,N> &arr, int data[])  // True if arr[i] == data[i] * data[i] for every i in [0, N).
+{
+  for (int i = 0; i < N; ++i)
+    if (arr[i] != data[i] * data[i])
+      return false;
+  return true;
+}
+#pragma omp end declare target
+
+int main (void)
+{
+  int data[N];
+  bool ok;  // Written by the last target region below.
+  std::array<int,N> arr;
+
+  srand (time (NULL));  // Input differs from run to run.
+  init (data);
+
+#ifndef MEM_SHARED  /* Without MEM_SHARED, arr is only allocated (map alloc), not copied.  */
+  #pragma omp target data map (to: data[:N]) map (alloc: arr)
+#endif
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&arr) std::array<int,N> ();  // Construct the mapped object in place inside the target region.
+#endif
+	  std::copy (data, data + N, arr.begin ());
+	}
+
+      #pragma omp target teams distribute parallel for
+	for (int i = 0; i < N; ++i)
+	  arr[i] *= arr[i];  // Square each element in place.
+
+      #pragma omp target map (from: ok)
+	{
+	  ok = validate (arr, data);
+#ifndef MEM_SHARED
+	  arr.~array ();  // Pairs with the placement new above.
+#endif
+	}
+    }
+
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__bitset-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__bitset-concurrent-usm.C
new file mode 100644
index 0000000..9023ef8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__bitset-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__bitset-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__bitset-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__bitset-concurrent.C
new file mode 100644
index 0000000..4fcce93
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__bitset-concurrent.C
@@ -0,0 +1,69 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <bitset>
+#include <set>
+#include <algorithm>
+
+#define N 4000
+#define MAX 16384
+
+void init (int data[])  // Fills data[0..N-1] with distinct pseudo-random values in [0, MAX).
+{
+  std::set<int> _set;  // Values already handed out.
+  for (int i = 0; i < N; ++i)
+    {
+      // Avoid duplicates in data array.
+      do
+	data[i] = rand () % MAX;
+      while (_set.find (data[i]) != _set.end ());  // Redraw until the value is unused.
+      _set.insert (data[i]);
+    }
+}
+
+bool validate (int sum, int data[])  // True if SUM equals the sum of data[0..N-1].
+{
+  int total = 0;
+  for (int i = 0; i < N; ++i)
+    total += data[i];
+  return sum == total;
+}
+
+int main (void)
+{
+  int data[N];
+  std::bitset<MAX> _set;
+  int sum = 0;  // Accumulates the indices of all set bits.
+
+  srand (time (NULL));  // Input differs from run to run.
+  init (data);
+
+#ifndef MEM_SHARED  /* Without MEM_SHARED, _set is only allocated (map alloc), not copied.  */
+  #pragma omp target data map (to: data[:N]) map (alloc: _set)
+#endif
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&_set) std::bitset<MAX> ();  // Construct the mapped object in place inside the target region.
+#endif
+	  for (int i = 0; i < N; ++i)
+	    _set[data[i]] = true;  // Set one bit per data value.
+	}
+
+      #pragma omp target teams distribute parallel for reduction (+:sum)
+	for (int i = 0; i < MAX; ++i)
+	  if (_set[i])
+	    sum += i;
+
+#ifndef MEM_SHARED
+      #pragma omp target
+	_set.~bitset ();  // Pairs with the placement new above.
+#endif
+    }
+
+  bool ok = validate (sum, data);  // Checked on the host.
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__cmath.C b/libgomp/testsuite/libgomp.c++/target-std__cmath.C
new file mode 100644
index 0000000..aaf7152
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__cmath.C
@@ -0,0 +1,340 @@
+// { dg-do run }
+// { dg-additional-options "-std=c++20" }
+
+#include <cmath>
+#include <numbers>
+
+#define FP_EQUAL(x,y) (std::abs ((x) - (y)) < 1E-6)  /* True if x and y differ by less than the absolute tolerance 1E-6.  */
+
+#pragma omp declare target
+template<typename T> bool test_basic ()  // Checks abs, fmod, remainder, fma, fmax, fmin and fdim; false on the first mismatch.
+{
+  T x = -3.456789;
+  T y = 1.234567;
+  T z = 5.678901;
+
+  if (std::abs (x) != -x)
+    return false;
+  if (!FP_EQUAL (std::trunc (x / y) * y + std::fmod (x, y), x))
+    return false;
+  if (!FP_EQUAL (x - std::round (x / y) * y, std::remainder (x, y)))
+    return false;
+  if (!FP_EQUAL (std::fma (x, y, z), x * y + z))
+    return false;
+  if (std::fmax (x, y) != (x > y ? x : y))
+    return false;
+  if (std::fmin (x, y) != (x < y ? x : y))
+    return false;
+  if (std::fdim (x, y) != std::max(x - y, (T) 0.0))
+    return false;
+  if (std::fdim (y, x) != std::max(y - x, (T) 0.0))
+    return false;
+  return true;
+}
+
+template<typename T> bool test_exp ()  // Checks exp, exp2, expm1 and the log family against each other within FP_EQUAL tolerance.
+{
+  T x = -4.567890;
+  T y = 2.345678;
+
+  if (!FP_EQUAL (std::exp (x), std::pow (std::numbers::e_v<T>, x)))
+    return false;
+  if (!FP_EQUAL (std::exp2 (y), std::pow ((T) 2.0, y)))
+    return false;
+  if (!FP_EQUAL (std::expm1 (y), std::exp (y) - (T) 1.0))
+    return false;
+  if (!FP_EQUAL (std::log (std::exp (x)), x))
+    return false;
+  if (!FP_EQUAL (std::log10 (std::pow ((T) 10.0, y)), y))
+    return false;
+  if (!FP_EQUAL (std::log2 (std::exp2 (y)), y))
+    return false;
+  if (!FP_EQUAL (std::log1p (std::expm1 (y)), y))
+    return false;
+  return true;
+}
+
+template<typename T> bool test_power ()  // Checks pow, sqrt, cbrt and hypot via round-trip identities within FP_EQUAL tolerance.
+{
+  T x = 7.234251;
+  T y = 0.340128;
+
+  if (!FP_EQUAL (std::log (std::pow (x, y)) / std::log (x), y))
+    return false;
+  if (!FP_EQUAL (std::sqrt (x) * std::sqrt (x), x))
+    return false;
+  if (!FP_EQUAL (std::cbrt (x) * std::cbrt (x) * std::cbrt (x), x))
+    return false;
+  if (!FP_EQUAL (std::hypot (x, y), std::sqrt (x * x + y * y)))
+    return false;
+  return true;
+}
+
+template<typename T> bool test_trig ()  // Checks sin/cos/tan at pi/4 and pi/6, then the inverse functions by round trip.
+{
+  T theta = std::numbers::pi / 4;
+  T phi = std::numbers::pi / 6;
+
+  if (!FP_EQUAL (std::sin (theta), std::sqrt ((T) 2) / 2))
+    return false;
+  if (!FP_EQUAL (std::sin (phi), 0.5))
+    return false;
+  if (!FP_EQUAL (std::cos (theta), std::sqrt ((T) 2) / 2))
+    return false;
+  if (!FP_EQUAL (std::cos (phi), std::sqrt ((T) 3) / 2))
+    return false;
+  if (!FP_EQUAL (std::tan (theta), 1.0))
+    return false;
+  if (!FP_EQUAL (std::tan (phi), std::sqrt ((T) 3) / 3))
+    return false;
+
+  T x = 0.33245623;  // Argument for the inverse-function round trips below.
+
+  if (!FP_EQUAL (std::asin (std::sin (x)), x))
+    return false;
+  if (!FP_EQUAL (std::acos (std::cos (x)), x))
+    return false;
+  if (!FP_EQUAL (std::atan (std::tan (x)), x))
+    return false;
+  if (!FP_EQUAL (std::atan2 (std::sin (x), std::cos (x)), x))
+    return false;
+  return true;
+}
+
+template<typename T> bool test_hyperbolic ()  // Checks sinh/cosh/tanh against their exp definitions, then the inverses by round trip.
+{
+  T x = 0.7423532;
+
+  if (!FP_EQUAL (std::sinh (x), (std::exp (x) - std::exp (-x)) / (T) 2.0))
+    return false;
+  if (!FP_EQUAL (std::cosh (x), (std::exp (x) + std::exp (-x)) / (T) 2.0))
+    return false;
+  if (!FP_EQUAL (std::tanh (x), std::sinh (x) / std::cosh (x)))
+    return false;
+  if (!FP_EQUAL (std::asinh (std::sinh (x)), x))
+    return false;
+  if (!FP_EQUAL (std::acosh (std::cosh (x)), x))
+    return false;
+  if (!FP_EQUAL (std::atanh (std::tanh (x)), x))
+    return false;
+  return true;
+}
+
+template<typename T> bool test_erf ()  // Checks erf and erfc at 0 and at +/-infinity for type T.
+{
+  if (!FP_EQUAL (std::erf ((T) 0), 0))
+    return false;
+  if (!FP_EQUAL (std::erf ((T) INFINITY), 1))
+    return false;
+  if (!FP_EQUAL (std::erf ((T) -INFINITY), -1))
+    return false;
+
+  if (!FP_EQUAL (std::erfc ((T) 0), 1))  // Cast so that the T overload is the one exercised, as for erf above.
+    return false;
+  if (!FP_EQUAL (std::erfc ((T) INFINITY), 0))
+    return false;
+  if (!FP_EQUAL (std::erfc ((T) -INFINITY), 2))
+    return false;
+
+  return true;
+}
+
+template<typename T> bool test_gamma ()  // Checks tgamma and lgamma at 5 and at half-integer arguments against closed forms.
+{
+  if (!FP_EQUAL (std::tgamma ((T) 5), 4*3*2*1))
+    return false;
+  if (!FP_EQUAL (std::tgamma ((T) 0.5), std::sqrt (std::numbers::pi_v<T>)))
+    return false;
+  if (!FP_EQUAL (std::tgamma ((T) -0.5), (T) -2 * std::sqrt (std::numbers::pi_v<T>)))
+    return false;
+  if (!FP_EQUAL (std::tgamma ((T) 2.5), (T) 0.75 * std::sqrt (std::numbers::pi_v<T>)))
+    return false;
+  if (!FP_EQUAL (std::tgamma ((T) -2.5), (T) -8.0/15 * std::sqrt (std::numbers::pi_v<T>)))
+    return false;
+
+  if (!FP_EQUAL (std::lgamma ((T) 5), std::log ((T) 4*3*2*1)))
+    return false;
+  if (!FP_EQUAL (std::lgamma ((T) 0.5), std::log (std::sqrt (std::numbers::pi_v<T>))))
+    return false;
+  if (!FP_EQUAL (std::lgamma ((T) 2.5),
+		 std::log ((T) 0.75 * std::sqrt (std::numbers::pi_v<T>))))
+    return false;
+
+  return true;
+}
+
+template<typename T> bool test_rounding ()  // Checks ceil, floor, trunc and round on one negative and one positive value.
+{
+  T x = -2.5678;
+  T y = 3.6789;
+
+  if (std::ceil (x) != -2)
+    return false;
+  if (std::floor (x) != -3)
+    return false;
+  if (std::trunc (x) != -2)
+    return false;
+  if (std::round (x) != -3)
+    return false;
+
+  if (std::ceil (y) != 4)
+    return false;
+  if (std::floor (y) != 3)
+    return false;
+  if (std::trunc (y) != 3)
+    return false;
+  if (std::round (y) != 4)
+    return false;
+
+  /* Not testing std::rint and std::nearbyint due to dependence on
+     floating-point environment.  */
+
+  return true;
+}
+
+template<typename T> bool test_fpmanip ()  // Checks frexp/ldexp, logb, ilogb, scalbn, nextafter and copysign.
+{
+  T x = -2.3456789;
+  T y = 3.6789012;
+  int exp;  // Binary exponent written by frexp.
+
+  T mantissa = std::frexp (x, &exp);
+  if (std::ldexp (mantissa, exp) != x)
+    return false;
+  if (std::logb (x) + 1 != exp)
+    return false;
+  if (std::ilogb (x) + 1 != exp)
+    return false;
+  if (std::scalbn (x, -exp) != mantissa)
+    return false;
+
+  T next = std::nextafter (x, y);
+  if (!(next > x && next < y))
+    return false;
+
+#if 0
+  /* TODO Due to 'std::nexttoward' using 'long double to', this triggers a
+     '80-bit-precision floating-point numbers unsupported (mode ‘XF’)' error
+     with x86_64 host and nvptx, GCN offload compilers, or
+     '128-bit-precision floating-point numbers unsupported (mode ‘TF’)' error
+     with powerpc64le host and nvptx offload compiler, for example;
+     PR71064 'nvptx offloading: "long double" data type'.
+     It ought to work on systems where the host's 'long double' is the same as
+     'double' ('DF'): aarch64, for example?  */
+  next = std::nexttoward (x, y);
+  if (!(next > x && next < y))
+    return false;
+#endif
+
+  if (std::copysign (x, y) != std::abs (x))
+    return false;
+  if (std::copysign (y, x) != -y)
+    return false;
+
+  return true;
+}
+
+template<typename T> bool test_classify ()  // Checks fpclassify, isfinite, isinf, isnan, isnormal and signbit for type T.
+{
+  T x = -2.3456789;
+  T y = 3.6789012;
+
+  if (std::fpclassify (x) != FP_NORMAL || std::fpclassify (y) != FP_NORMAL)
+    return false;
+  if (std::fpclassify ((T) INFINITY) != FP_INFINITE
+      || std::fpclassify ((T) -INFINITY) != FP_INFINITE)
+    return false;
+  if (std::fpclassify ((T) 0.0) != FP_ZERO)
+    return false;
+  if (std::fpclassify ((T) NAN) != FP_NAN)
+    return false;
+  if (!std::isfinite (x) || !std::isfinite (y))
+    return false;
+  if (std::isfinite ((T) INFINITY) || std::isfinite ((T) -INFINITY))
+    return false;
+  if (std::isinf (x) || std::isinf (y))
+    return false;
+  if (!std::isinf ((T) INFINITY) || !std::isinf ((T) -INFINITY))
+    return false;
+  if (std::isnan (x) || std::isnan (y))
+    return false;
+  if (!std::isnan ((T) 0.0 / (T) 0.0))
+    return false;
+  if (!std::isnan ((T) NAN))  // Positive isnan check on the NAN macro itself, complementing the 0/0 case above.
+    return false;
+  if (!std::isnormal (x) || !std::isnormal (y))
+    return false;
+  if (std::isnormal ((T) 0.0) || std::isnormal ((T) INFINITY) || std::isnormal ((T) NAN))
+    return false;
+  if (!std::signbit (x) || std::signbit (y))
+    return false;
+
+  return true;
+}
+
+template<typename T> bool test_compare ()  // Checks the quiet comparison functions on x < y, on x against itself, and against NAN.
+{
+  T x = 5.6789012;
+  T y = 8.9012345;
+
+  if (std::isgreater (x, y))
+    return false;
+  if (std::isgreater (x, x))
+    return false;
+  if (std::isgreaterequal (x, y))
+    return false;
+  if (!std::isgreaterequal (x, x))
+    return false;
+  if (!std::isless (x, y))
+    return false;
+  if (std::isless (x, x))
+    return false;
+  if (!std::islessequal (x, y))
+    return false;
+  if (!std::islessequal (x, x))
+    return false;
+  if (!std::islessgreater (x, y))
+    return false;
+  if (std::islessgreater (x, x))
+    return false;
+  if (std::isunordered (x, y))
+    return false;
+  if (!std::isunordered (x, NAN))
+    return false;
+  return true;
+}
+#pragma omp end declare target
+
+#define RUN_TEST(func) /* Runs test_<func> for float, then double; on failure stores the pass number in 'result' and breaks out of the enclosing loop.  */ \
+{ \
+  pass++; \
+  bool ok = test_##func<float> (); \
+  if (!ok) { result = pass; break; } \
+  pass++; \
+  ok = test_##func<double> (); \
+  if (!ok) { result = pass; break; } \
+}
+
+int main (void)
+{
+  int result = 0;  // 0 if everything passes, otherwise the 1-based number of the first failing pass.
+
+  #pragma omp target map (tofrom: result)
+    do {  // Single-iteration loop so that RUN_TEST can 'break' out on the first failure.
+      int pass = 0;  // Incremented by RUN_TEST before each float and each double run.
+
+      RUN_TEST (basic);
+      RUN_TEST (exp);
+      RUN_TEST (power);
+      RUN_TEST (trig);
+      RUN_TEST (hyperbolic);
+      RUN_TEST (erf);
+      RUN_TEST (gamma);
+      RUN_TEST (rounding);
+      RUN_TEST (fpmanip);
+      RUN_TEST (classify);
+      RUN_TEST (compare);
+    } while (false);
+
+  return result;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__complex.C b/libgomp/testsuite/libgomp.c++/target-std__complex.C
new file mode 100644
index 0000000..e392d17
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__complex.C
@@ -0,0 +1,175 @@
+// { dg-do run }
+// { dg-additional-options "-std=c++20" }
+
+#include <cmath>
+#include <complex>
+#include <numbers>
+
+using namespace std::complex_literals;
+
+#define FP_EQUAL(x,y) (std::abs ((x) - (y)) < 1E-6)  /* True if x and y differ by less than the absolute tolerance 1E-6.  */
+#define COMPLEX_EQUAL(x,y) (FP_EQUAL ((x).real (), (y).real ()) \
+			    && FP_EQUAL ((x).imag (), (y).imag ()))  /* FP_EQUAL applied to the real and imaginary parts separately.  */
+
+#pragma omp declare target
+template<typename T> bool test_complex ()  // Checks real, imag, abs, arg, norm, conj, proj and polar for std::complex<T>.
+{
+  std::complex<T> z (-1.334, 5.763);
+
+  if (!FP_EQUAL (z.real (), (T) -1.334))
+    return false;
+  if (!FP_EQUAL (z.imag (), (T) 5.763))
+    return false;
+  if (!FP_EQUAL (std::abs (z),
+		 std::sqrt (z.real () * z.real () + z.imag () * z.imag ())))
+    return false;
+  if (!FP_EQUAL (std::arg (z), std::atan2 (z.imag (), z.real ())))
+    return false;
+  if (!FP_EQUAL (std::norm (z), z.real () * z.real () + z.imag () * z.imag ()))
+    return false;
+
+  auto conj = std::conj (z);
+  if (!FP_EQUAL (conj.real (), z.real ())
+      || !FP_EQUAL (conj.imag (), -z.imag ()))
+    return false;
+
+  if (std::proj (z) != z)  // A finite value projects to itself.
+    return false;
+
+  auto infz1 = std::proj (std::complex<T> (INFINITY, -1));  // Use T so each instantiation tests its own type.
+  if (infz1.real () != INFINITY || infz1.imag () != (T) -0.0)  // NOTE: '!=' against -0.0 does not distinguish the sign of zero.
+    return false;
+  auto infz2 = std::proj (std::complex<T> (0, -INFINITY));  // An infinite imaginary part also projects to real infinity.
+  if (infz2.real () != INFINITY || infz2.imag () != (T) -0.0)
+    return false;
+
+  auto polarz = std::polar ((T) 1.5, std::numbers::pi_v<T> / 4);
+  if (!FP_EQUAL (polarz.real (), (T) 1.5 * std::cos (std::numbers::pi_v<T> / 4))
+      || !FP_EQUAL (polarz.imag (),
+		    (T) 1.5* std::sin (std::numbers::pi_v<T> / 4)))
+    return false;
+
+  return true;
+}
+
+template<typename T> bool test_complex_exp_log ()  // Checks complex exp and log10 against formulas built from real-valued functions.
+{
+  std::complex<T> z (-1.724, -3.763);
+
+  // Euler's identity
+  auto eulerz = std::exp (std::complex<T> (0, std::numbers::pi));
+  eulerz += 1.0;  // exp (i*pi) + 1 is compared with zero below.
+  if (!COMPLEX_EQUAL (eulerz, std::complex<T> ()))
+    return false;
+
+  auto my_exp_z
+    = std::complex<T> (std::exp (z.real ()) * std::cos (z.imag ()),
+		       std::exp (z.real ()) * std::sin (z.imag ()));
+  if (!COMPLEX_EQUAL (std::exp (z), my_exp_z))
+    return false;
+
+  if (!COMPLEX_EQUAL (std::log10 (z),
+		      std::log (z) / std::log (std::complex<T> (10))))
+    return false;
+
+  return true;
+}
+
+template<typename T> bool test_complex_trig ()  // Checks complex trigonometric, hyperbolic and inverse functions against hand-written formulas.
+{
+  std::complex<T> z (std::numbers::pi / 8, std::numbers::pi / 10);
+  const std::complex<T> i (0, 1);  // Imaginary unit.
+
+  auto my_sin_z
+    = std::complex<T> (std::sin (z.real ()) * std::cosh (z.imag ()),
+		       std::cos (z.real ()) * std::sinh (z.imag ()));
+  if (!COMPLEX_EQUAL (std::sin (z), my_sin_z))
+    return false;
+
+  auto my_cos_z
+    = std::complex<T> (std::cos (z.real ()) * std::cosh (z.imag ()),
+		       -std::sin (z.real ()) * std::sinh (z.imag ()));
+  if (!COMPLEX_EQUAL (std::cos (z), my_cos_z))
+    return false;
+
+  auto my_tan_z
+    = std::complex<T> (std::sin (2*z.real ()), std::sinh (2*z.imag ()))
+      / (std::cos (2*z.real ()) + std::cosh (2*z.imag ()));
+  if (!COMPLEX_EQUAL (std::tan (z), my_tan_z))
+    return false;
+
+  auto my_sinh_z
+    = std::complex<T> (std::sinh (z.real ()) * std::cos (z.imag ()),
+		       std::cosh (z.real ()) * std::sin (z.imag ()));
+  if (!COMPLEX_EQUAL (std::sinh (z), my_sinh_z))
+    return false;
+
+  auto my_cosh_z
+    = std::complex<T> (std::cosh (z.real ()) * std::cos (z.imag ()),
+		       std::sinh (z.real ()) * std::sin (z.imag ()));
+  if (!COMPLEX_EQUAL (std::cosh (z), my_cosh_z))
+    return false;
+
+  auto my_tanh_z
+    = std::complex<T> (std::sinh (2*z.real ()),
+		       std::sin (2*z.imag ()))
+		       / (std::cosh (2*z.real ()) + std::cos (2*z.imag ()));
+  if (!COMPLEX_EQUAL (std::tanh (z), my_tanh_z))
+    return false;
+
+  auto my_asin_z = -i * std::log (i * z + std::sqrt ((T) 1.0 - z*z));  // Inverse functions are expressed through log and sqrt from here on.
+  if (!COMPLEX_EQUAL (std::asin (z), my_asin_z))
+    return false;
+
+  auto my_acos_z
+    = std::complex<T> (std::numbers::pi / 2)
+		       + i * std::log (i * z + std::sqrt ((T) 1.0 - z*z));
+  if (!COMPLEX_EQUAL (std::acos (z), my_acos_z))
+    return false;
+
+  auto my_atan_z = std::complex<T> (0, -0.5) * (std::log ((i - z) / (i + z)));
+  if (!COMPLEX_EQUAL (std::atan (z), my_atan_z))
+    return false;
+
+  auto my_asinh_z = std::log (z + std::sqrt (z*z + (T) 1.0));
+  if (!COMPLEX_EQUAL (std::asinh (z), my_asinh_z))
+    return false;
+
+  auto my_acosh_z = std::log (z + std::sqrt (z*z - (T) 1.0));
+  if (!COMPLEX_EQUAL (std::acosh (z), my_acosh_z))
+    return false;
+
+  auto my_atanh_z
+    = std::complex<T> (0.5) * (std::log ((T) 1.0 + z) - std::log ((T) 1.0 - z));
+  if (!COMPLEX_EQUAL (std::atanh (z), my_atanh_z))
+    return false;
+
+  return true;
+}
+#pragma omp end declare target
+
+#define RUN_TEST(func) /* Runs test_<func> for float, then double; on failure stores the pass number in 'result' and breaks out of the enclosing loop.  */ \
+{ \
+  pass++; \
+  bool ok = test_##func<float> (); \
+  if (!ok) { result = pass; break; } \
+  pass++; \
+  ok = test_##func<double> (); \
+  if (!ok) { result = pass; break; } \
+}
+
+int main (void)
+{
+  int result = 0;  // 0 if everything passes, otherwise the 1-based number of the first failing pass.
+
+  #pragma omp target map (tofrom: result)
+    do {  // Single-iteration loop so that RUN_TEST can 'break' out on the first failure.
+      int pass = 0;  // Incremented by RUN_TEST before each float and each double run.
+
+      RUN_TEST (complex);
+      RUN_TEST (complex_exp_log);
+      RUN_TEST (complex_trig);
+    } while (false);
+
+  return result;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__deque-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__deque-concurrent-usm.C
new file mode 100644
index 0000000..863a1de
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__deque-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__deque-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__deque-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__deque-concurrent.C
new file mode 100644
index 0000000..9c2d6fa
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__deque-concurrent.C
@@ -0,0 +1,64 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <deque>
+#include <algorithm>
+
+#define N 50000
+
+void init (int data[])  // Fill DATA[0..N) with small pseudo-random values.
+{
+  for (int i = 0; i < N; ++i)
+    data[i] = rand () % 46341;  // <= 46340, so data[i]^2 fits a 32-bit 'int'
+}
+
+#pragma omp declare target
+bool validate (const std::deque<int> &_deque, int data[])
+{
+  bool ok = _deque.size () == N;  // a short deque must fail, not be indexed
+  for (int i = 0; ok && i < N; ++i)
+    ok = _deque[i] == data[i] * data[i];  // element I must be DATA[I] squared
+  return ok;
+}
+#pragma omp end declare target
+
+int main (void)
+{
+  int data[N];
+  bool ok;
+  // Host-side reference values, seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // MEM_SHARED: build the deque on the host; otherwise build it on the device.
+#ifdef MEM_SHARED
+  std::deque<int> _deque (std::begin (data), std::end (data));
+#else
+  std::deque<int> _deque;
+#endif
+
+#ifndef MEM_SHARED
+  #pragma omp target data map (to: data[:N]) map (alloc: _deque)
+#endif
+    {
+#ifndef MEM_SHARED
+      #pragma omp target
+	new (&_deque) std::deque<int> (std::begin (data), std::end (data));
+#endif
+      // Square each element; iterations are spread over teams and threads.
+      #pragma omp target teams distribute parallel for
+	for (int i = 0; i < N; ++i)
+	  _deque[i] *= _deque[i];
+      // Verify on the device; only OK is copied back to the host.
+      #pragma omp target map (from: ok)
+	{
+	  ok = validate (_deque, data);
+#ifndef MEM_SHARED
+	  _deque.~deque ();
+#endif
+	}
+    }
+
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__flat_map-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__flat_map-concurrent.C
new file mode 100644
index 0000000..9e59907
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__flat_map-concurrent.C
@@ -0,0 +1,71 @@
+// { dg-do run }
+// { dg-additional-options "-std=c++23" }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+/* { dg-ice {TODO PR120450} { offload_target_amdgcn && { ! offload_device_shared_as } } }
+   { dg-excess-errors {'mkoffload' failure etc.} { xfail { offload_target_amdgcn && { ! offload_device_shared_as } } } }
+   (For effective-target 'offload_device_shared_as', we've got '-DMEM_SHARED', and therefore don't invoke the constructor with placement new.)  */
+
+#include <stdlib.h>
+#include <time.h>
+#include <set>
+#include <flat_map>
+
+#define N 3000
+
+void init (int data[], bool unique)
+{
+  std::set<int> _set;
+  for (int i = 0; i < N; ++i)
+    {
+      // Draw values below 2^24 (N products then fit in 'long long').
+      do
+	data[i] = rand () % (1 << 24);
+      while (unique && _set.count (data[i]) > 0);  // redraw duplicates
+      _set.insert (data[i]);
+    }
+}
+
+bool validate (long long sum, int keys[], int data[])
+{
+  long long expected = 0;  // host-side sum of KEYS[k] * DATA[k]
+  for (int k = 0; k != N; ++k)
+    expected += static_cast<long long> (keys[k]) * data[k];
+  return expected == sum;
+}
+
+int main (void)
+{
+  int keys[N], data[N];
+  std::flat_map<int,int> _map;
+  // Keys are made unique; values may repeat.
+  srand (time (NULL));
+  init (keys, true);
+  init (data, false);
+  // Copy KEYS and DATA to the device; _map is only allocated there.
+  #pragma omp target enter data map (to: keys[:N], data[:N]) map (alloc: _map)
+
+  #pragma omp target
+    {
+#ifndef MEM_SHARED
+      new (&_map) std::flat_map<int,int> ();
+#endif
+      for (int i = 0; i < N; ++i)
+	_map[keys[i]] = data[i];
+    }
+  // Look every key up concurrently and sum KEY * VALUE.
+  long long sum = 0;
+  #pragma omp target teams distribute parallel for reduction (+:sum)
+    for (int i = 0; i < N; ++i)
+      sum += (long long) keys[i] * _map[keys[i]];
+
+#ifndef MEM_SHARED
+  #pragma omp target
+    _map.~flat_map ();
+#endif
+  // NOTE(review): KEYS and DATA were mapped above but are not released here.
+  #pragma omp target exit data map (release: _map)
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, keys, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__flat_multimap-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__flat_multimap-concurrent.C
new file mode 100644
index 0000000..1dc60c8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__flat_multimap-concurrent.C
@@ -0,0 +1,70 @@
+// { dg-do run }
+// { dg-additional-options "-std=c++23" }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+/* { dg-ice {TODO PR120450} { offload_target_amdgcn && { ! offload_device_shared_as } } }
+   { dg-excess-errors {'mkoffload' failure etc.} { xfail { offload_target_amdgcn && { ! offload_device_shared_as } } } }
+   (For effective-target 'offload_device_shared_as', we've got '-DMEM_SHARED', and therefore don't invoke the constructor with placement new.)  */
+
+#include <stdlib.h>
+#include <time.h>
+#include <flat_map>
+
+// Make sure that KEY_MAX is less than N to ensure some duplicate keys.
+#define N 3000
+#define KEY_MAX 1000
+
+void init (int data[], int max)
+{
+  for (int *slot = data; slot != data + N; ++slot)
+    *slot = static_cast<int> (slot - data) % max;  // index modulo MAX
+}
+
+bool validate (long long sum, int keys[], int data[])
+{
+  long long expected = 0;  // host-side sum of KEYS[k] * DATA[k]
+  for (int k = 0; k != N; ++k)
+    expected += static_cast<long long> (keys[k]) * data[k];
+  return expected == sum;
+}
+
+int main (void)
+{
+  int keys[N], data[N];
+  std::flat_multimap<int,int> _map;
+  // NOTE(review): init () in this file never calls rand (); the seed is unused.
+  srand (time (NULL));
+  init (keys, KEY_MAX);
+  init (data, RAND_MAX);
+  // Copy KEYS and DATA to the device; _map is only allocated there.
+  #pragma omp target enter data map (to: keys[:N], data[:N]) map (alloc: _map)
+
+  #pragma omp target
+    {
+#ifndef MEM_SHARED
+      new (&_map) std::flat_multimap<int,int> ();
+#endif
+      for (int i = 0; i < N; ++i)
+	_map.insert({keys[i], data[i]});
+    }
+  // For every possible key, sum KEY * VALUE over all entries with that key.
+  long long sum = 0;
+  #pragma omp target teams distribute parallel for reduction (+:sum)
+    for (int i = 0; i < KEY_MAX; ++i)
+      {
+	auto range = _map.equal_range (i);
+	for (auto it = range.first; it != range.second; ++it) {
+	  sum += (long long) it->first * it->second;
+	}
+      }
+
+#ifndef MEM_SHARED
+  #pragma omp target
+    _map.~flat_multimap ();
+#endif
+  // NOTE(review): KEYS and DATA were mapped above but are not released here.
+  #pragma omp target exit data map (release: _map)
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, keys, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__flat_multiset-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__flat_multiset-concurrent.C
new file mode 100644
index 0000000..59b59bf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__flat_multiset-concurrent.C
@@ -0,0 +1,60 @@
+// { dg-do run }
+// { dg-additional-options "-std=c++23" }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <flat_set>
+#include <algorithm>
+
+// MAX should be less than N to ensure that some duplicates occur.
+#define N 4000
+#define MAX 1000
+
+void init (int data[])
+{
+  for (int *slot = data; slot != data + N; ++slot)
+    *slot = rand () % MAX;  // pseudo-random value below MAX
+}
+
+bool validate (int sum, int data[])
+{
+  int expected = 0;  // host-side sum of all N entries of DATA
+  for (const int *p = data; p != data + N; ++p)
+    expected += *p;
+  return expected == sum;
+}
+
+int main (void)
+{
+  int data[N];
+  std::flat_multiset<int> set;
+  int sum = 0;
+  // Host-side input values, seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // Copy DATA to the device; SET is only allocated there.
+  #pragma omp target data map (to: data[:N]) map (alloc: set)
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&set) std::flat_multiset<int> ();
+#endif
+	  for (int i = 0; i < N; ++i)
+	    set.insert (data[i]);
+	}
+      // Value I counts once per occurrence, so SUM should total DATA.
+      #pragma omp target teams distribute parallel for reduction (+:sum)
+	for (int i = 0; i < MAX; ++i)
+	  sum += i * set.count (i);
+
+#ifndef MEM_SHARED
+      #pragma omp target
+	set.~flat_multiset ();
+#endif
+    }
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__flat_set-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__flat_set-concurrent.C
new file mode 100644
index 0000000..b255cd5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__flat_set-concurrent.C
@@ -0,0 +1,67 @@
+// { dg-do run }
+// { dg-additional-options "-std=c++23" }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <flat_set>
+#include <algorithm>
+
+#define N 4000
+#define MAX 16384
+
+void init (int data[])
+{
+  std::flat_set<int> seen;  // values already stored in DATA
+  for (int i = 0; i < N; ++i)
+    {
+      int candidate = rand () % MAX;
+      while (seen.count (candidate) != 0)  // redraw until unused
+	candidate = rand () % MAX;
+      data[i] = candidate;
+      seen.insert (candidate);
+    }
+}
+
+bool validate (int sum, int data[])
+{
+  int expected = 0;  // host-side sum of all N entries of DATA
+  for (const int *p = data; p != data + N; ++p)
+    expected += *p;
+  return expected == sum;
+}
+
+int main (void)
+{
+  int data[N];
+  std::flat_set<int> _set;
+  int sum = 0;
+  // Host-side input values (made duplicate-free by init), seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // Copy DATA to the device; _set is only allocated there.
+  #pragma omp target data map (to: data[:N]) map (alloc: _set)
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&_set) std::flat_set<int> ();
+#endif
+	  for (int i = 0; i < N; ++i)
+	    _set.insert (data[i]);
+	}
+      // Add up every value below MAX that is present in the set.
+      #pragma omp target teams distribute parallel for reduction (+:sum)
+	for (int i = 0; i < MAX; ++i)
+	  if (_set.count (i) > 0)
+	    sum += i;
+
+#ifndef MEM_SHARED
+      #pragma omp target
+	_set.~flat_set ();
+#endif
+    }
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__forward_list-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__forward_list-concurrent-usm.C
new file mode 100644
index 0000000..60d5cee
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__forward_list-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__forward_list-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__forward_list-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__forward_list-concurrent.C
new file mode 100644
index 0000000..6b0ee65
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__forward_list-concurrent.C
@@ -0,0 +1,83 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+#include <forward_list>
+#include <algorithm>
+
+#define N 3000
+
+void init (int data[])  // Fill DATA[0..N) with small pseudo-random values.
+{
+  for (int i = 0; i < N; ++i)
+    data[i] = rand () % 46341;  // <= 46340, so data[i]^2 fits a 32-bit 'int'
+}
+
+#pragma omp declare target
+bool validate (const std::forward_list<int> &list, int data[])
+{
+  int i = 0;  // number of elements verified so far
+  for (auto &v : list)
+    {
+      if (i >= N || v != data[i] * data[i])  // also rejects surplus elements
+	return false;
+      ++i;
+    }
+  return i == N;  // a list with fewer than N elements must fail, too
+}
+#pragma omp end declare target
+
+int main (void)
+{
+  int data[N];
+  bool ok;
+  // Host-side reference values, seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // MEM_SHARED: build the list on the host; otherwise build it on the device.
+#ifdef MEM_SHARED
+  std::forward_list<int> list (std::begin (data), std::end (data));
+#else
+  std::forward_list<int> list;
+#endif
+
+#ifndef MEM_SHARED
+  #pragma omp target data map (to: data[:N]) map (alloc: list)
+#endif
+    {
+#ifndef MEM_SHARED
+      #pragma omp target
+	new (&list) std::forward_list<int> (std::begin (data), std::end (data));
+#endif
+      // Each team squares its own chunk of ceil (N / teams) consecutive elements.
+      #pragma omp target teams
+	do
+	  {
+	    int len = N / omp_get_num_teams () + (N % omp_get_num_teams () > 0);
+	    int start = len * omp_get_team_num ();  // first index of this chunk
+	    if (start >= N)  // chunk lies entirely past the end: nothing to do
+	      break;
+	    if (start + len >= N)  // clamp the last chunk to the list length
+	      len = N - start;
+	    auto it = list.begin ();
+	    std::advance (it, start);
+	    for (int i = 0; i < len; ++i)
+	      {
+		*it *= *it;
+		++it;
+	      }
+	  } while (false);
+      // Verify on the device; only OK is copied back to the host.
+      #pragma omp target map (from: ok)
+	{
+	  ok = validate (list, data);
+#ifndef MEM_SHARED
+	  list.~forward_list ();
+#endif
+	}
+    }
+
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__list-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__list-concurrent-usm.C
new file mode 100644
index 0000000..5057bf9
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__list-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__list-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__list-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__list-concurrent.C
new file mode 100644
index 0000000..1f44a17
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__list-concurrent.C
@@ -0,0 +1,83 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+#include <list>
+#include <algorithm>
+
+#define N 3000
+
+void init (int data[])  // Fill DATA[0..N) with small pseudo-random values.
+{
+  for (int i = 0; i < N; ++i)
+    data[i] = rand () % 46341;  // <= 46340, so data[i]^2 fits a 32-bit 'int'
+}
+
+#pragma omp declare target
+bool validate (const std::list<int> &_list, int data[])
+{
+  int i = 0;  // number of elements verified so far
+  for (auto &v : _list)
+    {
+      if (i >= N || v != data[i] * data[i])  // also rejects surplus elements
+	return false;
+      ++i;
+    }
+  return i == N;  // a list with fewer than N elements must fail, too
+}
+#pragma omp end declare target
+
+int main (void)
+{
+  int data[N];
+  bool ok;
+  // Host-side reference values, seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // MEM_SHARED: build the list on the host; otherwise build it on the device.
+#ifdef MEM_SHARED
+  std::list<int> _list (std::begin (data), std::end (data));
+#else
+  std::list<int> _list;
+#endif
+
+#ifndef MEM_SHARED
+  #pragma omp target data map (to: data[:N]) map (alloc: _list)
+#endif
+    {
+#ifndef MEM_SHARED
+      #pragma omp target
+	new (&_list) std::list<int> (std::begin (data), std::end (data));
+#endif
+      // Each team squares its own chunk of ceil (N / teams) consecutive elements.
+      #pragma omp target teams
+	do
+	  {
+	    int len = N / omp_get_num_teams () + (N % omp_get_num_teams () > 0);
+	    int start = len * omp_get_team_num ();  // first index of this chunk
+	    if (start >= N)  // chunk lies entirely past the end: nothing to do
+	      break;
+	    if (start + len >= N)  // clamp the last chunk to the list length
+	      len = N - start;
+	    auto it = _list.begin ();
+	    std::advance (it, start);
+	    for (int i = 0; i < len; ++i)
+	      {
+		*it *= *it;
+		++it;
+	      }
+	  } while (false);
+      // Verify on the device; only OK is copied back to the host.
+      #pragma omp target map (from: ok)
+	{
+	  ok = validate (_list, data);
+#ifndef MEM_SHARED
+	  _list.~list ();
+#endif
+	}
+    }
+
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__map-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__map-concurrent-usm.C
new file mode 100644
index 0000000..fe37426
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__map-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__map-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__map-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__map-concurrent.C
new file mode 100644
index 0000000..36556ef
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__map-concurrent.C
@@ -0,0 +1,70 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <set>
+#include <map>
+
+#define N 3000
+
+void init (int data[], bool unique)
+{
+  std::set<int> _set;
+  for (int i = 0; i < N; ++i)
+    {
+      // Draw values below 2^24 (N products then fit in 'long long').
+      do
+	data[i] = rand () % (1 << 24);
+      while (unique && _set.find (data[i]) != _set.end ());  // redraw duplicates
+      _set.insert (data[i]);
+    }
+}
+
+bool validate (long long sum, int keys[], int data[])
+{
+  long long expected = 0;  // host-side sum of KEYS[k] * DATA[k]
+  for (int k = 0; k != N; ++k)
+    expected += static_cast<long long> (keys[k]) * data[k];
+  return expected == sum;
+}
+
+int main (void)
+{
+  int keys[N], data[N];
+  std::map<int,int> _map;
+  // Keys are made unique; values may repeat.
+  srand (time (NULL));
+  init (keys, true);
+  init (data, false);
+  // Without MEM_SHARED: copy KEYS and DATA over; _map is only allocated.
+#ifndef MEM_SHARED
+  #pragma omp target enter data map (to: keys[:N], data[:N]) map (alloc: _map)
+#endif
+
+  #pragma omp target
+    {
+#ifndef MEM_SHARED
+      new (&_map) std::map<int,int> ();
+#endif
+      for (int i = 0; i < N; ++i)
+	_map[keys[i]] = data[i];
+    }
+  // Look every key up concurrently and sum KEY * VALUE.
+  long long sum = 0;
+  #pragma omp target teams distribute parallel for reduction (+:sum)
+    for (int i = 0; i < N; ++i)
+      sum += (long long) keys[i] * _map[keys[i]];
+
+#ifndef MEM_SHARED
+  #pragma omp target
+    _map.~map ();
+#endif
+  // NOTE(review): KEYS and DATA were mapped above but are not released here.
+#ifndef MEM_SHARED
+  #pragma omp target exit data map (release: _map)
+#endif
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, keys, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__multimap-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__multimap-concurrent-usm.C
new file mode 100644
index 0000000..79f9245
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__multimap-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__multimap-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__multimap-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__multimap-concurrent.C
new file mode 100644
index 0000000..6a4a4e8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__multimap-concurrent.C
@@ -0,0 +1,68 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <map>
+
+// Make sure that KEY_MAX is less than N to ensure some duplicate keys.
+#define N 3000
+#define KEY_MAX 1000
+
+void init (int data[], int max)
+{
+  for (int *slot = data; slot != data + N; ++slot)
+    *slot = rand () % max;  // pseudo-random value below MAX
+}
+
+bool validate (long long sum, int keys[], int data[])
+{
+  long long expected = 0;  // host-side sum of KEYS[k] * DATA[k]
+  for (int k = 0; k != N; ++k)
+    expected += static_cast<long long> (keys[k]) * data[k];
+  return expected == sum;
+}
+
+int main (void)
+{
+  int keys[N], data[N];
+  std::multimap<int,int> _map;
+  // Keys are drawn below KEY_MAX, so with N > KEY_MAX some keys repeat.
+  srand (time (NULL));
+  init (keys, KEY_MAX);
+  init (data, RAND_MAX);
+  // Without MEM_SHARED: copy KEYS and DATA over; _map is only allocated.
+#ifndef MEM_SHARED
+  #pragma omp target enter data map (to: keys[:N], data[:N]) map (alloc: _map)
+#endif
+
+  #pragma omp target
+    {
+#ifndef MEM_SHARED
+      new (&_map) std::multimap<int,int> ();
+#endif
+      for (int i = 0; i < N; ++i)
+	_map.insert({keys[i], data[i]});
+    }
+  // For every possible key, sum KEY * VALUE over all entries with that key.
+  long long sum = 0;
+  #pragma omp target teams distribute parallel for reduction (+:sum)
+    for (int i = 0; i < KEY_MAX; ++i)
+      {
+	auto range = _map.equal_range (i);
+	for (auto it = range.first; it != range.second; ++it)
+	  sum += (long long) it->first * it->second;
+      }
+
+#ifndef MEM_SHARED
+  #pragma omp target
+    _map.~multimap ();
+#endif
+  // NOTE(review): KEYS and DATA were mapped above but are not released here.
+#ifndef MEM_SHARED
+  #pragma omp target exit data map (release: _map)
+#endif
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, keys, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__multiset-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__multiset-concurrent-usm.C
new file mode 100644
index 0000000..2d80756
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__multiset-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__multiset-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__multiset-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__multiset-concurrent.C
new file mode 100644
index 0000000..b12402e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__multiset-concurrent.C
@@ -0,0 +1,62 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <set>
+#include <algorithm>
+
+// MAX should be less than N to ensure that some duplicates occur.
+#define N 4000
+#define MAX 1000
+
+void init (int data[])
+{
+  for (int *slot = data; slot != data + N; ++slot)
+    *slot = rand () % MAX;  // pseudo-random value below MAX
+}
+
+bool validate (int sum, int data[])
+{
+  int expected = 0;  // host-side sum of all N entries of DATA
+  for (const int *p = data; p != data + N; ++p)
+    expected += *p;
+  return expected == sum;
+}
+
+int main (void)
+{
+  int data[N];
+  std::multiset<int> set;
+  int sum = 0;
+  // Host-side input values, seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // Without MEM_SHARED: copy DATA over; SET is only allocated on the device.
+#ifndef MEM_SHARED
+  #pragma omp target data map (to: data[:N]) map (alloc: set)
+#endif
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&set) std::multiset<int> ();
+#endif
+	  for (int i = 0; i < N; ++i)
+	    set.insert (data[i]);
+	}
+      // Value I counts once per occurrence, so SUM should total DATA.
+      #pragma omp target teams distribute parallel for reduction (+:sum)
+	for (int i = 0; i < MAX; ++i)
+	  sum += i * set.count (i);
+
+#ifndef MEM_SHARED
+      #pragma omp target
+	set.~multiset ();
+#endif
+    }
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__numbers.C b/libgomp/testsuite/libgomp.c++/target-std__numbers.C
new file mode 100644
index 0000000..a6b3665
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__numbers.C
@@ -0,0 +1,93 @@
+// { dg-do run }
+// { dg-additional-options "-std=c++20" }
+
+#include <cmath>
+#include <numbers>
+
+#define FP_EQUAL(x,y) (std::abs ((x) - (y)) < 1E-6)
+
+#pragma omp declare target
+template<typename T> bool test_pi ()
+{
+  constexpr T pi = std::numbers::pi_v<T>;
+  constexpr T inv_pi = std::numbers::inv_pi_v<T>;
+  constexpr T inv_sqrtpi = std::numbers::inv_sqrtpi_v<T>;
+
+  // Identities checked, in order: sin (pi) = 0, cos (pi) = -1,
+  // pi * (1 / pi) = 1 and pi * (1 / sqrt (pi))^2 = 1.
+  return FP_EQUAL (std::sin (pi), (T) 0.0)
+	 && FP_EQUAL (std::cos (pi), (T) -1.0)
+	 && FP_EQUAL (pi * inv_pi, (T) 1.0)
+	 && FP_EQUAL (pi * inv_sqrtpi * inv_sqrtpi, (T) 1.0);
+}
+
+template<typename T> bool test_sqrt ()
+{
+  // Squaring each tabulated root must give back the radicand.
+  constexpr T root2 = std::numbers::sqrt2_v<T>;
+  constexpr T root3 = std::numbers::sqrt3_v<T>;
+  return FP_EQUAL (root2 * root2, (T) 2.0)
+	 && FP_EQUAL (root3 * root3, (T) 3.0);
+}
+
+template<typename T> bool test_phi ()
+{
+  // Golden ratio from its closed form, (1 + sqrt (5)) / 2.
+  const T sqrt5 = std::sqrt ((T) 5.0);
+  const T closed_form = ((T) 1.0 + sqrt5) / (T) 2.0;
+  return FP_EQUAL (closed_form, std::numbers::phi_v<T>);
+}
+
+template<typename T> bool test_log ()
+{
+  // Euler's number, converted from double to T as in each check below.
+  const T e = (T) std::numbers::e;
+
+  // Natural logarithms of 2 and 10 ...
+  bool ln_ok = FP_EQUAL (std::log ((T) 2.0), std::numbers::ln2_v<T>)
+	       && FP_EQUAL (std::log ((T) 10.0), std::numbers::ln10_v<T>);
+  // ... then the base-2 and base-10 logarithms of e.
+  return ln_ok && FP_EQUAL (std::log2 (e), std::numbers::log2e_v<T>)
+	 && FP_EQUAL (std::log10 (e), std::numbers::log10e_v<T>);
+}
+
+template<typename T> bool test_egamma ()
+{
+  T myegamma = 0.0;  // accumulates the sum over k of (zeta (k) - 1) / k
+  #pragma omp parallel for reduction(+:myegamma)
+    for (int k = 2; k < 100000; ++k)
+      myegamma += (std::riemann_zeta (k) - 1) / k;
+  myegamma = (T) 1 - myegamma;  // compared below with the tabulated constant
+  if (!FP_EQUAL (myegamma, std::numbers::egamma_v<T>))
+    return false;
+  return true;
+}
+#pragma omp end declare target
+
+#define RUN_TEST(func) \
+{ \
+  /* Run test_FUNC for float, then double.  PASS numbers the runs; on the  */ \
+  /* first failure its number goes to RESULT and 'break' exits the loop.  */ \
+  ++pass; \
+  if (!test_##func<float> ()) { result = pass; break; } \
+  ++pass; \
+  if (!test_##func<double> ()) { result = pass; break; } \
+}
+
+int main (void)
+{
+  int result = 0;
+  // Run every check in one target region; RESULT is copied back afterwards.
+  #pragma omp target map (tofrom: result)
+    do {
+      int pass = 0;
+      // Each RUN_TEST advances PASS twice and breaks out on a failure.
+      RUN_TEST (pi);
+      RUN_TEST (sqrt);
+      RUN_TEST (phi);
+      RUN_TEST (log);
+      RUN_TEST (egamma);
+    } while (false);
+  // Exit status: 0 if nothing failed, else the number of the failing pass.
+  return result;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__set-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__set-concurrent-usm.C
new file mode 100644
index 0000000..54f62e3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__set-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__set-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__set-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__set-concurrent.C
new file mode 100644
index 0000000..cd23128
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__set-concurrent.C
@@ -0,0 +1,68 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <set>
+#include <algorithm>
+
+#define N 4000
+#define MAX 16384
+
+void init (int data[])
+{
+  std::set<int> seen;  // values already stored in DATA
+  for (int i = 0; i < N; ++i)
+    {
+      int candidate = rand () % MAX;
+      while (seen.find (candidate) != seen.end ())  // redraw until unused
+	candidate = rand () % MAX;
+      data[i] = candidate;
+      seen.insert (candidate);
+    }
+}
+
+bool validate (int sum, int data[])
+{
+  int expected = 0;  // host-side sum of all N entries of DATA
+  for (const int *p = data; p != data + N; ++p)
+    expected += *p;
+  return expected == sum;
+}
+
+int main (void)
+{
+  int data[N];
+  std::set<int> _set;
+  int sum = 0;
+  // Host-side input values (made duplicate-free by init), seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // Without MEM_SHARED: copy DATA over; _set is only allocated on the device.
+#ifndef MEM_SHARED
+  #pragma omp target data map (to: data[:N]) map (alloc: _set)
+#endif
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&_set) std::set<int> ();
+#endif
+	  for (int i = 0; i < N; ++i)
+	    _set.insert (data[i]);
+	}
+      // Add up every value below MAX that is present in the set.
+      #pragma omp target teams distribute parallel for reduction (+:sum)
+	for (int i = 0; i < MAX; ++i)
+	  if (_set.find (i) != _set.end ())
+	    sum += i;
+
+#ifndef MEM_SHARED
+      #pragma omp target
+	_set.~set ();
+#endif
+    }
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__span-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__span-concurrent-usm.C
new file mode 100644
index 0000000..7ef16bf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__span-concurrent-usm.C
@@ -0,0 +1,7 @@
+// { dg-additional-options "-std=c++20" }
+
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__span-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__span-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__span-concurrent.C
new file mode 100644
index 0000000..046b3c1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__span-concurrent.C
@@ -0,0 +1,66 @@
+// { dg-do run }
+// { dg-additional-options "-std=c++20" }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <span>
+
+#define N 64
+
+void init (int data[])  // Fill DATA[0..N) with small pseudo-random values.
+{
+  for (int i = 0; i < N; ++i)
+    data[i] = rand () % 46341;  // <= 46340, so data[i]^2 fits a 32-bit 'int'
+}
+
+#pragma omp declare target
+bool validate (const std::span<int, N> &span, int data[])
+{
+  bool match = true;  // stays true while SPAN[i] equals DATA[i] squared
+  for (int i = 0; match && i < N; ++i)
+    match = span[i] == data[i] * data[i];
+  return match;
+}
+#pragma omp end declare target
+
+int main (void)
+{
+  int data[N];
+  bool ok;
+  int elements[N];
+  std::span<int, N> span(elements);
+  // Host-side reference values, seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // Without MEM_SHARED: copy DATA over; ELEMENTS and SPAN are only allocated.
+#ifndef MEM_SHARED
+  #pragma omp target enter data map (to: data[:N]) map (alloc: elements, span)
+#endif
+  // NOTE(review): std::copy lives in <algorithm>, which is not included here.
+  #pragma omp target
+    {
+#ifndef MEM_SHARED
+      new (&span) std::span<int, N> (elements);
+#endif
+      std::copy (data, data + N, span.begin ());
+    }
+  // Square each element; iterations are spread over teams and threads.
+  #pragma omp target teams distribute parallel for
+    for (int i = 0; i < N; ++i)
+      span[i] *= span[i];
+  // Verify on the device; only OK is copied back to the host.
+  #pragma omp target map (from: ok)
+    {
+      ok = validate (span, data);
+#ifndef MEM_SHARED
+      span.~span ();
+#endif
+    }
+  // NOTE(review): DATA was mapped above but is not released here.
+#ifndef MEM_SHARED
+  #pragma omp target exit data map (release: elements, span)
+#endif
+
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__unordered_map-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__unordered_map-concurrent.C
new file mode 100644
index 0000000..00d7943
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__unordered_map-concurrent.C
@@ -0,0 +1,66 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <set>
+#include <unordered_map>
+
+#define N 3000
+
+void init (int data[], bool unique)
+{
+  std::set<int> _set;
+  for (int i = 0; i < N; ++i)
+    {
+      // Draw values below 2^24 (N products then fit in 'long long').
+      do
+	data[i] = rand () % (1 << 24);
+      while (unique && _set.count (data[i]) > 0);  // redraw duplicates
+      _set.insert (data[i]);
+    }
+}
+
+bool validate (long long sum, int keys[], int data[])
+{
+  long long expected = 0;  // host-side sum of KEYS[k] * DATA[k]
+  for (int k = 0; k != N; ++k)
+    expected += static_cast<long long> (keys[k]) * data[k];
+  return expected == sum;
+}
+
+int main (void)
+{
+  int keys[N], data[N];
+  std::unordered_map<int,int> _map;
+  // Keys are made unique; values may repeat.
+  srand (time (NULL));
+  init (keys, true);
+  init (data, false);
+  // Copy KEYS and DATA to the device; _map is only allocated there.
+  #pragma omp target enter data map (to: keys[:N], data[:N]) map (alloc: _map)
+
+  #pragma omp target
+    {
+#ifndef MEM_SHARED
+      new (&_map) std::unordered_map<int,int> ();
+#endif
+      for (int i = 0; i < N; ++i)
+	_map[keys[i]] = data[i];
+    }
+  // Look every key up concurrently and sum KEY * VALUE.
+  long long sum = 0;
+  #pragma omp target teams distribute parallel for reduction (+:sum)
+    for (int i = 0; i < N; ++i)
+      sum += (long long) keys[i] * _map[keys[i]];
+
+#ifndef MEM_SHARED
+  #pragma omp target
+    _map.~unordered_map ();
+#endif
+  // NOTE(review): KEYS and DATA were mapped above but are not released here.
+  #pragma omp target exit data map (release: _map)
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, keys, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__unordered_multimap-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__unordered_multimap-concurrent.C
new file mode 100644
index 0000000..2567634
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__unordered_multimap-concurrent.C
@@ -0,0 +1,65 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <unordered_map>
+
+// Make sure that KEY_MAX is less than N to ensure some duplicate keys.
+#define N 3000
+#define KEY_MAX 1000
+
+void init (int data[], int max)
+{
+  for (int i = N - 1; i >= 0; --i)
+    data[i] = i % max;  // deterministic: the index modulo MAX
+}
+
+bool validate (long long sum, int keys[], int data[])
+{
+  long long expected = 0;  // host-side sum of KEYS[k] * DATA[k]
+  for (int k = 0; k != N; ++k)
+    expected += static_cast<long long> (keys[k]) * data[k];
+  return expected == sum;
+}
+
+int main (void)
+{
+  int keys[N], data[N];
+  std::unordered_multimap<int,int> _map;
+  // NOTE(review): init () in this file never calls rand (); the seed is unused.
+  srand (time (NULL));
+  init (keys, KEY_MAX);
+  init (data, RAND_MAX);
+  // Copy KEYS and DATA to the device; _map is only allocated there.
+  #pragma omp target enter data map (to: keys[:N], data[:N]) map (alloc: _map)
+
+  #pragma omp target
+    {
+#ifndef MEM_SHARED
+      new (&_map) std::unordered_multimap<int,int> ();
+#endif
+      for (int i = 0; i < N; ++i)
+	_map.insert({keys[i], data[i]});
+    }
+  // For every possible key, sum KEY * VALUE over all entries with that key.
+  long long sum = 0;
+  #pragma omp target teams distribute parallel for reduction (+:sum)
+    for (int i = 0; i < KEY_MAX; ++i)
+      {
+	auto range = _map.equal_range (i);
+	for (auto it = range.first; it != range.second; ++it) {
+	  sum += (long long) it->first * it->second;
+	}
+      }
+
+#ifndef MEM_SHARED
+  #pragma omp target
+    _map.~unordered_multimap ();
+#endif
+  // NOTE(review): KEYS and DATA were mapped above but are not released here.
+  #pragma omp target exit data map (release: _map)
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, keys, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__unordered_multiset-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__unordered_multiset-concurrent.C
new file mode 100644
index 0000000..da6c875
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__unordered_multiset-concurrent.C
@@ -0,0 +1,59 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <unordered_set>
+#include <algorithm>
+
+// MAX should be less than N to ensure that some duplicates occur.
+#define N 4000
+#define MAX 1000
+
+void init (int data[])
+{
+  for (int *slot = data; slot != data + N; ++slot)
+    *slot = rand () % MAX;  // pseudo-random value below MAX
+}
+
+bool validate (int sum, int data[])
+{
+  int expected = 0;  // host-side sum of all N entries of DATA
+  for (const int *p = data; p != data + N; ++p)
+    expected += *p;
+  return expected == sum;
+}
+
+int main (void)
+{
+  int data[N];
+  std::unordered_multiset<int> set;
+  int sum = 0;
+  // Host-side input values, seeded from the clock.
+  srand (time (NULL));
+  init (data);
+  // Copy DATA to the device; SET is only allocated there.
+  #pragma omp target data map (to: data[:N]) map (alloc: set)
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&set) std::unordered_multiset<int> ();
+#endif
+	  for (int i = 0; i < N; ++i)
+	    set.insert (data[i]);
+	}
+      // Value I counts once per occurrence, so SUM should total DATA.
+      #pragma omp target teams distribute parallel for reduction (+:sum)
+	for (int i = 0; i < MAX; ++i)
+	  sum += i * set.count (i);
+
+#ifndef MEM_SHARED
+      #pragma omp target
+	set.~unordered_multiset ();
+#endif
+    }
+  // Recompute the expected sum on the host.
+  bool ok = validate (sum, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__unordered_set-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__unordered_set-concurrent.C
new file mode 100644
index 0000000..b7bd935
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__unordered_set-concurrent.C
@@ -0,0 +1,66 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <unordered_set>
+#include <algorithm>
+
+#define N 4000
+#define MAX 16384
+
+void init (int data[])
+{
+  std::unordered_set<int> _set;
+  for (int i = 0; i < N; ++i)
+    {
+      // Avoid duplicates in data array.
+      do
+	data[i] = rand () % MAX;
+      while (_set.count (data[i]) != 0);
+      _set.insert (data[i]);
+    }
+}
+
+bool validate (int sum, int data[])
+{
+  int total = 0;
+  for (int i = 0; i < N; ++i)
+    total += data[i];
+  return sum == total;
+}
+
+int main (void)
+{
+  int data[N];
+  std::unordered_set<int> _set;
+  int sum = 0;
+
+  srand (time (NULL));
+  init (data);
+
+  #pragma omp target data map (to: data[:N]) map (alloc: _set)
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&_set) std::unordered_set<int> ();
+#endif
+	  for (int i = 0; i < N; ++i)
+	    _set.insert (data[i]);
+	}
+
+      #pragma omp target teams distribute parallel for reduction (+:sum)
+	for (int i = 0; i < MAX; ++i)
+	  if (_set.count (i) > 0)
+	    sum += i;
+
+#ifndef MEM_SHARED
+      #pragma omp target
+	_set.~unordered_set ();
+#endif
+    }
+
+  bool ok = validate (sum, data);
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__valarray-1.C b/libgomp/testsuite/libgomp.c++/target-std__valarray-1.C
new file mode 100644
index 0000000..865cde2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__valarray-1.C
@@ -0,0 +1,179 @@
+// { dg-additional-options -std=c++20 }
+// { dg-output-file target-std__valarray-1.output }
+
+#include <valarray>
+#include <ostream>
+#include <sstream>
+
+
+/*TODO Work around PR118484 "ICE during IPA pass: cp, segfault in determine_versionability ipa-cp.cc:467".
+
+We can't:
+
+    #pragma omp declare target(std::basic_streambuf<char, std::char_traits<char>>::basic_streambuf)
+
+... because:
+
+    error: overloaded function name ‘std::basic_streambuf<char>::__ct ’ in clause ‘enter’
+
+Therefore, use dummy classes in '#pragma omp declare target':
+*/
+
+#pragma omp declare target
+
+// For 'std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf':
+
+class dummy_basic_streambuf__char
+  : public std::basic_streambuf<char>
+{
+public:
+  dummy_basic_streambuf__char() {}
+};
+
+// For 'std::basic_ios<char, std::char_traits<char> >::basic_ios()':
+
+class dummy_basic_ios__char
+  : public std::basic_ios<char>
+{
+public:
+  dummy_basic_ios__char() {}
+};
+
+#pragma omp end declare target
+
+
+int main()
+{
+  // Due to PR120021 "Offloading vs. C++ 'std::initializer_list'", we can't construct these on the device.
+  std::initializer_list<int> v1_i = {10, 20, 30, 40, 50};
+  const int *v1_i_data = std::data(v1_i);
+  size_t v1_i_size = v1_i.size();
+  std::initializer_list<int> v2_i = {5, 4, 3, 2, 1};
+  const int *v2_i_data = std::data(v2_i);
+  size_t v2_i_size = v2_i.size();
+  std::initializer_list<int> shiftData_i = {1, 2, 3, 4, 5};
+  const int *shiftData_i_data = std::data(shiftData_i);
+  size_t shiftData_i_size = shiftData_i.size();
+#pragma omp target \
+  defaultmap(none) \
+  map(to: v1_i_data[:v1_i_size], v1_i_size, \
+          v2_i_data[:v2_i_size], v2_i_size, \
+          shiftData_i_data[:shiftData_i_size], shiftData_i_size)
+  {
+    /* Manually set up a buffer we can stream into, similar to 'cout << [...]', and print it at the end of region.  */
+    std::stringbuf out_b;
+    std::ostream out(&out_b);
+
+    std::valarray<int> v1(v1_i_data, v1_i_size);
+    out << "\nv1:";
+    for (auto val : v1)
+      out << " " << val;
+
+    std::valarray<int> v2(v2_i_data, v2_i_size);
+    out << "\nv2:";
+    for (auto val : v2)
+      out << " " << val;
+
+    std::valarray<int> sum = v1 + v2;
+    out << "\nv1 + v2:";
+    for (auto val : sum)
+      out << " " << val;
+
+    std::valarray<int> diff = v1 - v2;
+    out << "\nv1 - v2:";
+    for (auto val : diff)
+      out << " " << val;
+
+    std::valarray<int> product = v1 * v2;
+    out << "\nv1 * v2:";
+    for (auto val : product)
+      out << " " << val;
+
+    std::valarray<int> quotient = v1 / v2;
+    out << "\nv1 / v2:";
+    for (auto val : quotient)
+      out << " " << val;
+
+    std::valarray<int> squares = pow(v1, 2);
+    out << "\npow(v1, 2):";
+    for (auto val : squares)
+      out << " " << val;
+
+    std::valarray<int> sinhs = sinh(v2);
+    out << "\nsinh(v2):";
+    for (auto val : sinhs)
+      out << " " << val;
+
+    std::valarray<int> logs = log(v1 * v2);
+    out << "\nlog(v1 * v2):";
+    for (auto val : logs)
+      out << " " << val;
+
+    std::valarray<int> data(12);
+    for (size_t i = 0; i < data.size(); ++i)
+      data[i] = i;
+    out << "\nOriginal array:";
+    for (auto val : data)
+      out << " " << val;
+
+    std::slice slice1(2, 5, 1);
+    std::valarray<int> sliced1 = data[slice1];
+    out << "\nSlice(2, 5, 1):";
+    for (auto val : sliced1)
+      out << " " << val;
+
+    std::slice slice2(1, 4, 3);
+    std::valarray<int> sliced2 = data[slice2];
+    out << "\nSlice(1, 4, 3):";
+    for (auto val : sliced2)
+      out << " " << val;
+
+    data[slice1] = 99;
+    out << "\nArray after slice modification:";
+    for (auto val : data)
+      out << " " << val;
+
+    std::valarray<bool> mask = (v1 > 20);
+    out << "\nElements of v1 > 20:";
+    for (size_t i = 0; i < v1.size(); ++i)
+      {
+	if (mask[i])
+	  out << " " << v1[i];
+      }
+
+    std::valarray<int> masked = v1[mask];
+    out << "\nMasked array:";
+    for (auto val : masked)
+      out << " " << val;
+
+    std::valarray<int> shiftData(shiftData_i_data, shiftData_i_size);
+    out << "\nOriginal shiftData:";
+    for (auto val : shiftData)
+      out << " " << val;
+
+    std::valarray<int> shifted = shiftData.shift(2);
+    out << "\nshift(2):";
+    for (auto val : shifted)
+      out << " " << val;
+
+    std::valarray<int> cshifted = shiftData.cshift(-1);
+    out << "\ncshift(-1):";
+    for (auto val : cshifted)
+      out << " " << val;
+
+    out << "\nSum(v1): " << v1.sum();
+    out << "\nMin(v1): " << v1.min();
+    out << "\nMax(v1): " << v1.max();
+
+    out << "\n";
+
+    /* Terminate with a NUL.  Otherwise, we'd have to use:
+           __builtin_printf("%.*s", (int) out_b_sv.size(), out_b_sv.data());
+       ... which nvptx 'printf', as implemented via PTX 'vprintf', doesn't support (TODO).  */
+    out << '\0';
+    std::string_view out_b_sv = out_b.view();
+    __builtin_printf("%s", out_b_sv.data());
+  }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__valarray-1.output b/libgomp/testsuite/libgomp.c++/target-std__valarray-1.output
new file mode 100644
index 0000000..c441e06
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__valarray-1.output
@@ -0,0 +1,22 @@
+
+v1: 10 20 30 40 50
+v2: 5 4 3 2 1
+v1 + v2: 15 24 33 42 51
+v1 - v2: 5 16 27 38 49
+v1 * v2: 50 80 90 80 50
+v1 / v2: 2 5 10 20 50
+pow(v1, 2): 100 400 900 1600 2500
+sinh(v2): 74 27 10 3 1
+log(v1 * v2): 3 4 4 4 3
+Original array: 0 1 2 3 4 5 6 7 8 9 10 11
+Slice(2, 5, 1): 2 3 4 5 6
+Slice(1, 4, 3): 1 4 7 10
+Array after slice modification: 0 1 99 99 99 99 99 7 8 9 10 11
+Elements of v1 > 20: 30 40 50
+Masked array: 30 40 50
+Original shiftData: 1 2 3 4 5
+shift(2): 3 4 5 0 0
+cshift(-1): 5 1 2 3 4
+Sum(v1): 150
+Min(v1): 10
+Max(v1): 50
diff --git a/libgomp/testsuite/libgomp.c++/target-std__valarray-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__valarray-concurrent-usm.C
new file mode 100644
index 0000000..41ec80e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__valarray-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__valarray-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__valarray-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__valarray-concurrent.C
new file mode 100644
index 0000000..8933072b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__valarray-concurrent.C
@@ -0,0 +1,66 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <valarray>
+
+#define N 50000
+
+void init (int data[])
+{
+  for (int i = 0; i < N; ++i)
+    data[i] = rand ();
+}
+
+#pragma omp declare target
+bool validate (const std::valarray<int> &arr, int data[])
+{
+  for (int i = 0; i < N; ++i)
+    if (arr[i] != data[i] * data[i] + i)
+      return false;
+  return true;
+}
+#pragma omp end declare target
+
+int main (void)
+{
+  int data[N];
+  bool ok;
+
+  srand (time (NULL));
+  init (data);
+
+#ifdef MEM_SHARED
+  std::valarray<int> arr (data, N);
+#else
+  std::valarray<int> arr;
+#endif
+
+#ifndef MEM_SHARED
+  #pragma omp target data map (to: data[:N]) map (alloc: arr)
+#endif
+    {
+      #pragma omp target
+	{
+#ifndef MEM_SHARED
+	  new (&arr) std::valarray<int> (data, N);
+#endif
+	  arr *= arr;
+	}
+
+      #pragma omp target teams distribute parallel for
+	for (int i = 0; i < N; ++i)
+	  arr[i] += i;
+
+      #pragma omp target map (from: ok)
+	{
+	  ok = validate (arr, data);
+#ifndef MEM_SHARED
+	  arr.~valarray ();
+#endif
+	}
+    }
+
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-std__vector-concurrent-usm.C b/libgomp/testsuite/libgomp.c++/target-std__vector-concurrent-usm.C
new file mode 100644
index 0000000..967bff3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__vector-concurrent-usm.C
@@ -0,0 +1,5 @@
+#pragma omp requires unified_shared_memory self_maps
+
+#define MEM_SHARED
+
+#include "target-std__vector-concurrent.C"
diff --git a/libgomp/testsuite/libgomp.c++/target-std__vector-concurrent.C b/libgomp/testsuite/libgomp.c++/target-std__vector-concurrent.C
new file mode 100644
index 0000000..a94b4cf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-std__vector-concurrent.C
@@ -0,0 +1,63 @@
+// { dg-do run }
+// { dg-additional-options -DMEM_SHARED { target offload_device_shared_as } }
+
+#include <stdlib.h>
+#include <time.h>
+#include <vector>
+
+#define N 50000
+
+void init (int data[])
+{
+  for (int i = 0; i < N; ++i)
+    data[i] = rand ();
+}
+
+#pragma omp declare target
+bool validate (const std::vector<int> &vec, int data[])
+{
+  for (int i = 0; i < N; ++i)
+    if (vec[i] != data[i] * data[i])
+      return false;
+  return true;
+}
+#pragma omp end declare target
+
+int main (void)
+{
+  int data[N];
+  bool ok;
+
+  srand (time (NULL));
+  init (data);
+
+#ifdef MEM_SHARED
+  std::vector<int> vec (data, data + N);
+#else
+  std::vector<int> vec;
+#endif
+
+#ifndef MEM_SHARED
+  #pragma omp target data map (to: data[:N]) map (alloc: vec)
+#endif
+    {
+#ifndef MEM_SHARED
+      #pragma omp target
+	new (&vec) std::vector<int> (data, data + N);
+#endif
+
+      #pragma omp target teams distribute parallel for
+	for (int i = 0; i < N; ++i)
+	  vec[i] *= vec[i];
+
+      #pragma omp target map (from: ok)
+	{
+	  ok = validate (vec, data);
+#ifndef MEM_SHARED
+	  vec.~vector ();
+#endif
+	}
+    }
+
+  return ok ? 0 : 1;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/alloc-pinned-1.c b/libgomp/testsuite/libgomp.c-c++-common/alloc-pinned-1.c
new file mode 100644
index 0000000..7733395
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/alloc-pinned-1.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-additional-options "-foffload-memory=pinned" } */
+/* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
+
+#if __cplusplus
+#define EXTERNC extern "C"
+#else
+#define EXTERNC
+#endif
+
+/* Intercept the libgomp initialization call to check it happens.  */
+
+int good = 0;
+
+EXTERNC void
+GOMP_enable_pinned_mode ()
+{
+  good = 1;
+}
+
+int
+main ()
+{
+  if (!good)
+    __builtin_exit (1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/allocate-4.c b/libgomp/testsuite/libgomp.c-c++-common/allocate-4.c
index e81cc40..706c851 100644
--- a/libgomp/testsuite/libgomp.c/allocate-4.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/allocate-4.c
@@ -1,6 +1,3 @@
-/* TODO: move to ../libgomp.c-c++-common once C++ is implemented. */
-/* NOTE: { target c } is unsupported with with the C compiler.  */
-
 /* { dg-do run } */
 /* { dg-additional-options "-fdump-tree-gimple" } */
 
diff --git a/libgomp/testsuite/libgomp.c/allocate-5.c b/libgomp/testsuite/libgomp.c-c++-common/allocate-5.c
index beaf164..3bbe78d 100644
--- a/libgomp/testsuite/libgomp.c/allocate-5.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/allocate-5.c
@@ -1,6 +1,3 @@
-/* TODO: move to ../libgomp.c-c++-common once C++ is implemented. */
-/* NOTE: { target c } is unsupported with with the C compiler.  */
-
 /* { dg-do run } */
 /* { dg-additional-options "-fdump-tree-gimple" } */
 
diff --git a/libgomp/testsuite/libgomp.c/allocate-6.c b/libgomp/testsuite/libgomp.c-c++-common/allocate-6.c
index 6d7278c..669581b 100644
--- a/libgomp/testsuite/libgomp.c/allocate-6.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/allocate-6.c
@@ -1,6 +1,3 @@
-/* TODO: move to ../libgomp.c-c++-common once C++ is implemented. */
-/* NOTE: { target c } is unsupported with with the C compiler.  */
-
 /* { dg-do run } */
 /* { dg-additional-options "-fdump-tree-omplower" } */
 
diff --git a/libgomp/testsuite/libgomp.c-c++-common/array-shaping-14.c b/libgomp/testsuite/libgomp.c-c++-common/array-shaping-14.c
new file mode 100644
index 0000000..4ca6f79
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/array-shaping-14.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target offload_device_nonshared_as } } */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef struct {
+  int *ptr;
+} S;
+
+int main(void)
+{
+  S q;
+  q.ptr = (int *) calloc (9 * 11, sizeof (int));
+
+#pragma omp target enter data map(to: q.ptr, q.ptr[0:9*11])
+
+#pragma omp target
+  for (int i = 0; i < 9*11; i++)
+    q.ptr[i] = i;
+
+#pragma omp target update from(([9][11]) q.ptr[3:3:2][1:4:3])
+
+  for (int j = 0; j < 9; j++)
+    for (int i = 0; i < 11; i++)
+      if (j >= 3 && j <= 7 && ((j - 3) % 2) == 0
+	  && i >= 1 && i <= 10 && ((i - 1) % 3) == 0)
+	assert (q.ptr[j * 11 + i] == j * 11 + i);
+      else
+	assert (q.ptr[j * 11 + i] == 0);
+
+#pragma omp target exit data map(release: q.ptr, q.ptr[0:9*11])
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/collapse-4.c b/libgomp/testsuite/libgomp.c-c++-common/collapse-4.c
new file mode 100644
index 0000000..c0af29f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/collapse-4.c
@@ -0,0 +1,23 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+int
+main (void)
+{
+  int i, j;
+  int count = 0;
+
+  #pragma omp parallel for collapse(2)
+    for (i = 0; i < 80000; i++)
+      for (j = 0; j < 80000; j++)
+	if (i == 66666 && j == 77777)
+	  /* In the collapsed loop space, this is iteration
+	     66666*80000+77777==5,333,357,777.  If the type of the iterator
+	     for the collapsed loop is only a 32-bit unsigned int, then this
+	     iteration will exceed its maximum range and be skipped.  */
+	  count++;
+
+  if (count != 1)
+    abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-10.c b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-10.c
new file mode 100644
index 0000000..ca5aef4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-10.c
@@ -0,0 +1,60 @@
+/* { dg-do run } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define N 64
+
+typedef struct {
+  int *arr;
+  int size;
+} B;
+
+#pragma omp declare mapper (mapB : B myb) map(to: myb.size, myb.arr) \
+					  map(tofrom: myb.arr[0:myb.size])
+
+struct A {
+  int *arr1;
+  B *arr2;
+  int arr3[N];
+};
+
+int
+main (int argc, char *argv[])
+{
+  struct A var;
+
+  memset (&var, 0, sizeof var);
+  var.arr1 = (int *) calloc (N, sizeof (int));
+  var.arr2 = (B *) malloc (sizeof (B));
+  var.arr2->arr = (int *) calloc (N, sizeof (int));
+  var.arr2->size = N;
+
+  {
+    #pragma omp declare mapper (struct A x) map(to: x.arr1, x.arr2) \
+			  map(tofrom: x.arr1[0:N]) \
+			  map(mapper(mapB), tofrom: x.arr2[0:1])
+    #pragma omp target
+    {
+      for (int i = 0; i < N; i++)
+	{
+	  var.arr1[i]++;
+	  var.arr2->arr[i]++;
+	}
+    }
+  }
+
+  for (int i = 0; i < N; i++)
+    {
+      assert (var.arr1[i] == 1);
+      assert (var.arr2->arr[i] == 1);
+      assert (var.arr3[i] == 0);
+    }
+
+  free (var.arr1);
+  free (var.arr2->arr);
+  free (var.arr2);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-11.c b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-11.c
new file mode 100644
index 0000000..942d6a5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-11.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define N 64
+
+typedef struct B_tag {
+  int *arr;
+  int size;
+} B;
+
+#pragma omp declare mapper (B myb) map(to: myb.size, myb.arr) \
+				   map(tofrom: myb.arr[0:myb.size])
+
+struct A {
+  int *arr1;
+  B *arr2;
+  int arr3[N];
+};
+
+int
+main (int argc, char *argv[])
+{
+  struct A var;
+
+  memset (&var, 0, sizeof var);
+  var.arr1 = (int *) calloc (N, sizeof (int));
+  var.arr2 = (B *) malloc (sizeof (B));
+  var.arr2->arr = (int *) calloc (N, sizeof (int));
+  var.arr2->size = N;
+
+  {
+    #pragma omp declare mapper (struct A x) map(to: x.arr1, x.arr2) \
+			map(tofrom: x.arr1[0:N]) map(tofrom: x.arr2[0:1])
+    #pragma omp target
+    {
+      for (int i = 0; i < N; i++)
+	{
+	  var.arr1[i]++;
+	  var.arr2->arr[i]++;
+	}
+    }
+  }
+
+  for (int i = 0; i < N; i++)
+    {
+      assert (var.arr1[i] == 1);
+      assert (var.arr2->arr[i] == 1);
+      assert (var.arr3[i] == 0);
+    }
+
+  free (var.arr1);
+  free (var.arr2->arr);
+  free (var.arr2);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-12.c b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-12.c
new file mode 100644
index 0000000..cbedee5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-12.c
@@ -0,0 +1,87 @@
+/* { dg-do run } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define N 64
+
+typedef struct {
+  int *arr;
+  int size;
+} B;
+
+#pragma omp declare mapper (samename : B myb) map(to: myb.size, myb.arr) \
+					      map(tofrom: myb.arr[0:myb.size])
+
+typedef struct {
+  int *arr;
+  int size;
+} C;
+
+
+struct A {
+  int *arr1;
+  B *arr2;
+  C *arr3;
+};
+
+int
+main (int argc, char *argv[])
+{
+  struct A var;
+
+  memset (&var, 0, sizeof var);
+  var.arr1 = (int *) calloc (N, sizeof (int));
+  var.arr2 = (B *) malloc (sizeof (B));
+  var.arr2->arr = (int *) calloc (N, sizeof (int));
+  var.arr2->size = N;
+  var.arr3 = (C *) malloc (sizeof (C));
+  var.arr3->arr = (int *) calloc (N, sizeof (int));
+  var.arr3->size = N;
+
+  {
+    #pragma omp declare mapper (struct A x) map(to: x.arr1, x.arr2) \
+			map(tofrom: x.arr1[0:N]) \
+			map(mapper(samename), tofrom: x.arr2[0:1])
+    #pragma omp target
+    {
+      for (int i = 0; i < N; i++)
+	{
+	  var.arr1[i]++;
+	  var.arr2->arr[i]++;
+	}
+    }
+  }
+
+  {
+    #pragma omp declare mapper (samename : C myc) map(to: myc.size, myc.arr) \
+			map(tofrom: myc.arr[0:myc.size])
+    #pragma omp declare mapper (struct A x) map(to: x.arr1, x.arr3) \
+			map(tofrom: x.arr1[0:N]) \
+			map(mapper(samename), tofrom: *x.arr3)
+    #pragma omp target
+    {
+      for (int i = 0; i < N; i++)
+	{
+	  var.arr1[i]++;
+	  var.arr3->arr[i]++;
+	}
+    }
+  }
+
+  for (int i = 0; i < N; i++)
+    {
+      assert (var.arr1[i] == 2);
+      assert (var.arr2->arr[i] == 1);
+      assert (var.arr3->arr[i] == 1);
+    }
+
+  free (var.arr1);
+  free (var.arr2->arr);
+  free (var.arr2);
+  free (var.arr3->arr);
+  free (var.arr3);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-13.c b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-13.c
new file mode 100644
index 0000000..c4784eb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-13.c
@@ -0,0 +1,55 @@
+/* { dg-do run } */
+
+#include <assert.h>
+
+struct T {
+  int a;
+  int b;
+  int c;
+};
+
+void foo (void)
+{
+  struct T x;
+  x.a = x.b = x.c = 0;
+
+#pragma omp target
+  {
+    x.a++;
+    x.c++;
+  }
+
+  assert (x.a == 1);
+  assert (x.b == 0);
+  assert (x.c == 1);
+}
+
+// An identity mapper.  This should do the same thing as the default!
+#pragma omp declare mapper (struct T v) map(v)
+
+void bar (void)
+{
+  struct T x;
+  x.a = x.b = x.c = 0;
+
+#pragma omp target
+  {
+    x.b++;
+  }
+
+#pragma omp target map(x)
+  {
+    x.a++;
+  }
+
+  assert (x.a == 1);
+  assert (x.b == 1);
+  assert (x.c == 0);
+}
+
+int main (int argc, char *argv[])
+{
+  foo ();
+  bar ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-14.c b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-14.c
new file mode 100644
index 0000000..3e6027e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-14.c
@@ -0,0 +1,57 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+#include <assert.h>
+
+struct Z {
+  int *arr;
+};
+
+void baz (struct Z *zarr, int len)
+{
+#pragma omp declare mapper (struct Z myvar) map(to: myvar.arr) \
+					    map(tofrom: myvar.arr[0:len])
+  zarr[0].arr = (int *) calloc (len, sizeof (int));
+  zarr[5].arr = (int *) calloc (len, sizeof (int));
+
+#pragma omp target map(zarr, *zarr)
+  {
+    for (int i = 0; i < len; i++)
+      zarr[0].arr[i]++;
+  }
+
+#pragma omp target map(zarr, zarr[5])
+  {
+    for (int i = 0; i < len; i++)
+      zarr[5].arr[i]++;
+  }
+
+#pragma omp target map(zarr[5])
+  {
+    for (int i = 0; i < len; i++)
+      zarr[5].arr[i]++;
+  }
+
+#pragma omp target map(zarr, zarr[5:1])
+  {
+    for (int i = 0; i < len; i++)
+      zarr[5].arr[i]++;
+  }
+
+  for (int i = 0; i < len; i++)
+    assert (zarr[0].arr[i] == 1);
+
+  for (int i = 0; i < len; i++)
+    assert (zarr[5].arr[i] == 3);
+
+  free (zarr[5].arr);
+  free (zarr[0].arr);
+}
+
+int
+main (int argc, char *argv[])
+{
+  struct Z myzarr[10];
+  baz (myzarr, 256);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-18.c b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-18.c
new file mode 100644
index 0000000..50f37cb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-18.c
@@ -0,0 +1,33 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef struct {
+  int *ptr;
+} S;
+
+int main(void)
+{
+#pragma omp declare mapper(grid: S x) map(([9][11]) x.ptr[3:3:2][1:4:3])
+  S q;
+  q.ptr = (int *) calloc (9 * 11, sizeof (int));
+
+#pragma omp target enter data map(to: q.ptr, q.ptr[0:9*11])
+
+#pragma omp target
+  for (int i = 0; i < 9*11; i++)
+    q.ptr[i] = i;
+
+#pragma omp target update from(mapper(grid): q)
+
+  for (int j = 0; j < 9; j++)
+    for (int i = 0; i < 11; i++)
+      if (j >= 3 && j <= 7 && ((j - 3) % 2) == 0
+	  && i >= 1 && i <= 10 && ((i - 1) % 3) == 0)
+	assert (q.ptr[j * 11 + i] == j * 11 + i);
+      else
+	assert (q.ptr[j * 11 + i] == 0);
+
+#pragma omp target exit data map(release: q.ptr, q.ptr[0:9*11])
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-9.c b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-9.c
new file mode 100644
index 0000000..324d535
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/declare-mapper-9.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define N 64
+
+struct A {
+  int *arr1;
+  float *arr2;
+  int arr3[N];
+};
+
+int
+main (int argc, char *argv[])
+{
+  struct A var;
+
+  memset (&var, 0, sizeof var);
+  var.arr1 = (int *) calloc (N, sizeof (int));
+  var.arr2 = (float *) calloc (N, sizeof (float));
+
+  {
+    #pragma omp declare mapper (struct A x) map(to: x.arr1) \
+					    map(tofrom: x.arr1[0:N])
+    #pragma omp target
+    {
+      for (int i = 0; i < N; i++)
+	var.arr1[i]++;
+    }
+  }
+
+  {
+    #pragma omp declare mapper (struct A x) map(to: x.arr2) \
+					    map(tofrom: x.arr2[0:N])
+    #pragma omp target
+    {
+      for (int i = 0; i < N; i++)
+	var.arr2[i]++;
+    }
+  }
+
+  {
+    #pragma omp declare mapper (struct A x) map(tofrom: x.arr3[0:N])
+    #pragma omp target
+    {
+      for (int i = 0; i < N; i++)
+	var.arr3[i]++;
+    }
+  }
+
+  for (int i = 0; i < N; i++)
+    {
+      assert (var.arr1[i] == 1);
+      assert (var.arr2[i] == 1);
+      assert (var.arr3[i] == 1);
+    }
+
+  free (var.arr1);
+  free (var.arr2);
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/delim-declare-variant-1.c b/libgomp/testsuite/libgomp.c-c++-common/delim-declare-variant-1.c
new file mode 100644
index 0000000..916f8a6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/delim-declare-variant-1.c
@@ -0,0 +1,45 @@
+/* Check basic functionality for the delimited form of "declare variant"
+   - no error re duplicate definitions
+   - variants are registered and correctly resolved at call site.  */
+
+int foo (int a)
+{
+  return a;
+}
+
+int bar (int x)
+{
+  return x;
+}
+
+#pragma omp begin declare variant match (construct={target})
+int foo (int a)
+{
+  return a + 1;
+}
+
+int bar (int x)
+{
+  return x * 2;
+}
+#pragma omp end declare variant
+
+/* Because of the high score value, this variant for "bar" should always be
+   selected even when the one above also matches.  */
+#pragma omp begin declare variant match (implementation={vendor(score(10000):"gnu")})
+int bar (int x)
+{
+  return x * 4;
+}
+#pragma omp end declare variant
+
+int main (void)
+{
+  if (foo (42) != 42) __builtin_abort ();
+  if (bar (3) != 12) __builtin_abort ();
+#pragma omp target
+  {
+    if (foo (42) != 43) __builtin_abort ();
+    if (bar (3) != 12) __builtin_abort ();
+  }
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/dispatch-3.c b/libgomp/testsuite/libgomp.c-c++-common/dispatch-3.c
new file mode 100644
index 0000000..2c41e3c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/dispatch-3.c
@@ -0,0 +1,35 @@
+/* { dg-additional-options "-fdump-tree-gimple" }  */
+
+/* PR c++/118859  */
+
+void f_var(int *y) {
+ #pragma omp target is_device_ptr(y)
+ {
+   if (*y != 5)
+     __builtin_abort ();
+   *y += 10;
+ }
+}
+#pragma omp declare variant(f_var) match(construct={dispatch}) adjust_args(need_device_ptr : 1)
+void f(int *);
+
+static void test()
+{
+ int x = 5;
+ #pragma omp target enter data map(x)
+
+ #pragma omp dispatch
+   f(&x);
+
+ #pragma omp target exit data map(x)
+ if (x != 15)
+   __builtin_abort ();
+}
+
+int main()
+{
+ test();
+}
+
+// { dg-final { scan-tree-dump "D\\.\[0-9\]+ = __builtin_omp_get_mapped_ptr \\(&x, D\\.\[0-9\]+\\);" "gimple" } }
+// { dg-final { scan-tree-dump "f_var \\(D\\.\[0-9\]+\\);" "gimple" } }
diff --git a/libgomp/testsuite/libgomp.c-c++-common/for-17.c b/libgomp/testsuite/libgomp.c-c++-common/for-17.c
new file mode 100644
index 0000000..9771aaf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/for-17.c
@@ -0,0 +1,69 @@
+/* { dg-options "-fopenmp-target=acc" } */
+/* { dg-additional-options "-std=gnu99" { target c } } */
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#define DO_PRAGMA(x) _Pragma (#x)
+
+#undef OMPFROM
+#undef OMPTO
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#pragma omp declare target
+
+#define OMPTGT DO_PRAGMA (omp target)
+#define F parallel for
+#define G pf
+#define S
+#define N(x) M(x, G, ompacc)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#undef OMPTGT
+
+#pragma omp end declare target
+
+#define F target parallel for
+#define G tpf
+#define S
+#define N(x) M(x, G, ompacc)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute
+#define G ttd
+#define S
+#define N(x) M(x, G, ompacc)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute parallel for
+#define G ttdpf
+#define S
+#define N(x) M(x, G, ompacc)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+int
+main ()
+{
+  if (test_pf_ompacc ()
+      || test_tpf_ompacc ()
+      || test_ttd_ompacc ()
+      || test_ttdpf_ompacc ())
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/for-18.c b/libgomp/testsuite/libgomp.c-c++-common/for-18.c
new file mode 100644
index 0000000..2486d3a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/for-18.c
@@ -0,0 +1,5 @@
+/* { dg-options "-fopenmp-target=acc" } */
+/* { dg-additional-options "-std=gnu99" {target c } } */
+
+#define CONDNE
+#include "for-17.c"
diff --git a/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-1.c b/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-1.c
new file mode 100644
index 0000000..1938237
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-1.c
@@ -0,0 +1,83 @@
+/* { dg-do run } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#define DIM1 4
+#define DIM2 16
+
+struct S {
+  int *arr1;
+  float *arr2;
+  size_t len;
+};
+
+size_t
+mkarray (struct S arr[])
+{
+  size_t sum = 0;
+
+  for (int i = 0; i < DIM1; i++)
+    {
+      memset (&arr[i], 0, sizeof (struct S));
+      arr[i].len = DIM2;
+      arr[i].arr1 = (int *) calloc (arr[i].len, sizeof (int));
+      for (int j = 0; j < DIM2; j++)
+	{
+	  size_t value = (i + 1) * (j + 1);
+	  sum += value;
+	  arr[i].arr1[j] = value;
+	}
+    }
+
+  return sum;
+}
+
+int main ()
+{
+  struct S arr[DIM1];
+  size_t sum = 0xdeadbeef;
+  size_t expected = mkarray (arr);
+
+  #pragma omp declare mapper (struct S x) \
+	map(to: x.arr1[0:DIM2]) \
+	map(to: x.arr2[0:DIM2]) \
+	map(to: x.len)
+
+  #pragma omp target map(iterator(int i=0:DIM1), to: arr[i]) map(from: sum)
+    {
+      sum = 0;
+#ifdef DEBUG
+      __builtin_printf ("&sum: %p\n", &sum);
+#endif
+      for (int i = 0; i < DIM1; i++)
+	{
+#ifdef DEBUG
+	  __builtin_printf ("&arr[%d] = %p\n", i, &arr[i]);
+	  __builtin_printf ("arr[%d].len = %d\n", i, (int) arr[i].len);
+	  __builtin_printf ("arr[%d].arr1 = %p\n", i, arr[i].arr1);
+	  __builtin_printf ("arr[%d].arr2 = %p\n", i, arr[i].arr2);
+#endif
+	  for (int j = 0; j < DIM2; j++)
+	    {
+#ifdef DEBUG
+	      __builtin_printf ("(i=%d,j=%d): %p\n", i, j, &arr[i].arr1[j]);
+	      __builtin_printf ("(i=%d,j=%d): %d\n", i, j, arr[i].arr1[j]);
+#endif
+	      sum += arr[i].arr1[j];
+#ifdef DEBUG
+	      __builtin_printf ("sum: %ld\n", (long) sum);
+#endif
+	    }
+	}
+    }
+
+#ifdef DEBUG
+  __builtin_printf ("&sum: %p\n", &sum);
+  __builtin_printf ("sum:%zu (expected: %zu)\n", sum, expected);
+#endif
+
+  return sum != expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-2.c b/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-2.c
new file mode 100644
index 0000000..76f00fb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-2.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#define DIM1 4
+#define DIM2 16
+
+#ifdef DEBUG
+#undef DEBUG
+#define DEBUG(...) __builtin_printf (__VA_ARGS__)
+#else
+#define DEBUG(...)
+#endif
+
+struct S {
+  int *arr1;
+  float *arr2;
+  size_t len;
+};
+
+size_t
+mkarray (struct S arr[])
+{
+  size_t sum = 0;
+
+  for (int i = 0; i < DIM1; i++)
+    {
+      memset (&arr[i], 0, sizeof (struct S));
+      arr[i].len = DIM2;
+      arr[i].arr1 = (int *) calloc (arr[i].len, sizeof (int));
+      for (int j = 0; j < DIM2; j++)
+	{
+	  size_t value = (i + 1) * (j + 1);
+	  sum += value;
+	  arr[i].arr1[j] = value;
+	}
+    }
+
+  return sum;
+}
+
+int main ()
+{
+  struct S arr[DIM1];
+  size_t sum = 0xdeadbeef;
+  size_t expected = mkarray (arr);
+
+  #pragma omp declare mapper (struct S x) \
+	map(to: x.arr1[0:DIM2]) \
+	map(to: x.arr2[0:DIM2]) \
+	map(to: x.len)
+
+  /* This should be equivalent to map(iterator(int i=0:DIM1), to: arr[i])  */
+  #pragma omp target map(iterator(int i=0:DIM1:2, j=0:2), to: arr[i+j]) map(from: sum)
+    {
+      sum = 0;
+      DEBUG ("&sum: %p\n", &sum);
+      for (int i = 0; i < DIM1; i++)
+	{
+	  DEBUG ("&arr[%d] = %p\n", i, &arr[i]);
+	  DEBUG ("arr[%d].len = %d\n", i, (int) arr[i].len);
+	  DEBUG ("arr[%d].arr1 = %p\n", i, arr[i].arr1);
+	  DEBUG ("arr[%d].arr2 = %p\n", i, arr[i].arr2);
+	  for (int j = 0; j < DIM2; j++)
+	    {
+	      DEBUG ("(i=%d,j=%d): %p\n", i, j, &arr[i].arr1[j]);
+	      DEBUG ("(i=%d,j=%d): %d\n", i, j, arr[i].arr1[j]);
+	      sum += arr[i].arr1[j];
+	      DEBUG ("sum: %ld\n", (long) sum);
+	    }
+	}
+    }
+
+  DEBUG ("&sum: %p\n", &sum);
+  DEBUG ("sum:%zu (expected: %zu)\n", sum, expected);
+
+  return sum != expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-3.c b/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-3.c
new file mode 100644
index 0000000..9d67c38
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/mapper-iterators-3.c
@@ -0,0 +1,98 @@
+/* { dg-do run } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define N 64
+#define DIM 4
+
+#ifdef DEBUG
+#undef DEBUG
+#define DEBUG(...) __builtin_printf (__VA_ARGS__)
+#else
+#define DEBUG(...)
+#endif
+
+typedef struct {
+  int *arr;
+  int size;
+} B;
+
+#pragma omp declare mapper (mapB : B myb) map(to: myb.size, myb.arr) \
+					  map(tofrom: myb.arr[0:myb.size+1])
+
+struct A {
+  int *arr1;
+  B *arr2;
+  int arr3[N];
+};
+
+int
+main (int argc, char *argv[])
+{
+  struct A var[DIM];
+
+  for (int i=0; i < DIM; i++)
+    {
+      memset (&var[i], 0, sizeof var[i]);
+      var[i].arr1 = (int *) calloc (N, sizeof (int));
+      var[i].arr2 = (B *) malloc (sizeof (B));
+      var[i].arr2->arr = (int *) calloc (N+1, sizeof (int));
+      var[i].arr2->size = N;  /* mapB maps arr[0:size+1], i.e. the N+1 elements allocated above.  */
+      DEBUG ("host &var[%d]:%p\n", i, &var[i]);
+      DEBUG ("host var[%d].arr1:%p\n", i, var[i].arr1);
+      DEBUG ("host var[%d].arr2:%p\n", i, var[i].arr2);
+      DEBUG ("host var[%d].arr2->arr:%p\n", i, var[i].arr2->arr);
+      DEBUG ("host var[%d].arr2->size:%d\n", i, var[i].arr2->size);
+    }
+
+  {
+    #pragma omp declare mapper (struct A x) map(to: x.arr1, x.arr2) \
+			  map(tofrom: x.arr1[0:N]) \
+			  map(mapper(mapB), tofrom: x.arr2[0:1])
+    #pragma omp target map(iterator(int i=0:DIM), tofrom: var[i])
+    {
+      for (int i = 0; i < DIM; i++)
+	{
+	  DEBUG ("&var[%d]:%p\n", i, &var[i]);
+	  DEBUG ("var[%d].arr1:%p\n", i, var[i].arr1);
+	  DEBUG ("var[%d].arr2:%p\n", i, var[i].arr2);
+	  if (var[i].arr2)
+	    {
+	      DEBUG ("var[%d].arr2->arr:%p\n", i, var[i].arr2->arr);
+	      DEBUG ("var[%d].arr2->size:%d\n", i, var[i].arr2->size);
+	    }
+	  for (int j = 0; j < N; j++)
+	    {
+	      DEBUG ("&var[%d].arr1[%d]:%p\n", i, j, &var[i].arr1[j]);
+	      var[i].arr1[j]++;
+	      if (var[i].arr2)
+		{
+		  DEBUG ("&var[%d].arr2->arr[%d]:%p\n", i, j, &var[i].arr2->arr[j]);
+		  var[i].arr2->arr[j]++;
+		}
+	      else
+		DEBUG ("SKIP arr2\n");
+	    }
+	}
+    }
+  }
+
+  for (int i = 0; i < DIM; i++)
+    for (int j = 0; j < N; j++)
+      {
+	assert (var[i].arr1[j] == 1);
+	assert (var[i].arr2->arr[j] == 1);
+	assert (var[i].arr3[j] == 0);
+      }
+
+  for (int i = 0; i < DIM; i++)
+    {
+      free (var[i].arr1);
+      free (var[i].arr2->arr);
+      free (var[i].arr2);
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-abi-struct-1-O0.c b/libgomp/testsuite/libgomp.c-c++-common/target-abi-struct-1-O0.c
new file mode 100644
index 0000000..9bf949a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-abi-struct-1-O0.c
@@ -0,0 +1,3 @@
+/* { dg-additional-options -O0 } */
+
+#include "target-abi-struct-1.c"
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-abi-struct-1.c b/libgomp/testsuite/libgomp.c-c++-common/target-abi-struct-1.c
new file mode 100644
index 0000000..d9268af
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-abi-struct-1.c
@@ -0,0 +1 @@
+#include "../libgomp.oacc-c-c++-common/abi-struct-1.c"
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-cdtor-1.c b/libgomp/testsuite/libgomp.c-c++-common/target-cdtor-1.c
new file mode 100644
index 0000000..e6099cf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-cdtor-1.c
@@ -0,0 +1,89 @@
+/* Offloaded 'constructor' and 'destructor' functions.  */
+
+#include <omp.h>
+
+#pragma omp declare target
+
+static void
+__attribute__((constructor))
+initHD1()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+static void
+__attribute__((constructor))
+initHD2()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+static void
+__attribute__((destructor))
+finiHD1()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+static void
+__attribute__((destructor))
+finiHD2()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+#pragma omp end declare target
+
+static void
+__attribute__((constructor))
+initH1()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+static void
+__attribute__((destructor))
+finiH2()
+{
+  __builtin_printf("%s, %d\n", __FUNCTION__, omp_is_initial_device());
+}
+
+int main()
+{
+  int c = 0;
+
+  __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+
+#pragma omp target map(c)
+  {
+    __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+  }
+
+#pragma omp target map(c)
+  {
+    __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+  }
+
+  __builtin_printf("%s:%d, %d\n", __FUNCTION__, ++c, omp_is_initial_device());
+
+  return 0;
+}
+
+/* The order in which same-priority 'constructor' functions and 'destructor' functions are run is undefined.
+   { dg-output {init[^,]+, 1[\r\n]+} }
+   { dg-output {init[^,]+, 1[\r\n]+} }
+   { dg-output {init[^,]+, 1[\r\n]+} }
+   { dg-output {main:1, 1[\r\n]+} }
+   { dg-output {initHD[^,]+, 0[\r\n]+} { target offload_device } }
+   { dg-output {initHD[^,]+, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:2, 1[\r\n]+} { target { ! offload_device } } }
+   { dg-output {main:2, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:3, 1[\r\n]+} { target  { ! offload_device } } }
+   { dg-output {main:3, 0[\r\n]+} { target offload_device } }
+   { dg-output {main:4, 1[\r\n]+} }
+   { dg-output {finiHD[^,]+, 0[\r\n]+} { target offload_device } }
+   { dg-output {finiHD[^,]+, 0[\r\n]+} { target offload_device } }
+   { dg-output {fini[^,]+, 1[\r\n]+} }
+   { dg-output {fini[^,]+, 1[\r\n]+} }
+   { dg-output {fini[^,]+, 1[\r\n]+} }
+*/
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-1.c b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-1.c
new file mode 100644
index 0000000..b3d87f2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-1.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+/* Test transfer of dynamically-allocated arrays to target using map
+   iterators.  */
+
+#include <stdlib.h>
+
+#define DIM1 8
+#define DIM2 15
+
+int mkarray (int *x[])
+{
+  int expected = 0;
+
+  for (int i = 0; i < DIM1; i++)
+    {
+      x[i] = (int *) malloc (DIM2 * sizeof (int));
+      for (int j = 0; j < DIM2; j++)
+	{
+	  x[i][j] = rand () % 1000;  /* Bounded, so the int sums cannot overflow.  */
+	  expected += x[i][j];
+	}
+    }
+
+  return expected;
+}
+
+int main (void)
+{
+  int *x[DIM1];
+  int y;
+
+  int expected = mkarray (x);
+
+  #pragma omp target enter data map(to: x)
+  #pragma omp target map(iterator(i=0:DIM1), to: x[i][:DIM2]) \
+		     map(from: y)
+    {
+      y = 0;
+      for (int i = 0; i < DIM1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  y += x[i][j];
+    }
+
+  return y - expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-2.c b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-2.c
new file mode 100644
index 0000000..8569b55
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-2.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+/* Test transfer of dynamically-allocated arrays from target using map
+   iterators.  */
+
+#include <stdlib.h>
+
+#define DIM1 8
+#define DIM2 15
+
+void mkarray (int *x[])
+{
+  for (int i = 0; i < DIM1; i++)
+    x[i] = (int *) malloc (DIM2 * sizeof (int));
+}
+
+int main (void)
+{
+  int *x[DIM1];
+  int y, expected;
+
+  mkarray (x);
+
+  #pragma omp target enter data map(alloc: x)
+  #pragma omp target map(iterator(i=0:DIM1), from: x[i][:DIM2]) \
+		     map(from: expected)
+    {
+      expected = 0;
+      for (int i = 0; i < DIM1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  {
+	    x[i][j] = (i+1) * (j+1);
+	    expected += x[i][j];
+	  }
+    }
+
+  y = 0;
+  for (int i = 0; i < DIM1; i++)
+    for (int j = 0; j < DIM2; j++)
+      y += x[i][j];
+
+  return y - expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-3.c b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-3.c
new file mode 100644
index 0000000..be30fa65d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-3.c
@@ -0,0 +1,56 @@
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+/* Test transfer of dynamically-allocated arrays to target using map
+   iterators, with multiple iterators and function calls in the iterator
+   expression.  */
+
+#include <stdlib.h>
+
+#define DIM1 16
+#define DIM2 15
+
+int mkarrays (int *x[], int *y[])
+{
+  int expected = 0;
+
+  for (int i = 0; i < DIM1; i++)
+    {
+      x[i] = (int *) malloc (DIM2 * sizeof (int));
+      y[i] = (int *) malloc (sizeof (int));
+      *y[i] = rand () % 1000;  /* Bounded, so products and sums fit in int.  */
+      for (int j = 0; j < DIM2; j++)
+	{
+	  x[i][j] = rand () % 1000;
+	  expected += x[i][j] * *y[i];
+	}
+    }
+
+  return expected;
+}
+
+int f (int i, int j)
+{
+  return i * 4 + j;
+}
+
+int main (void)
+{
+  int *x[DIM1], *y[DIM1];
+  int sum;
+
+  int expected = mkarrays (x, y);
+
+  #pragma omp target enter data map(to: x, y)
+  #pragma omp target map(iterator(i=0:DIM1/4, j=0:4), to: x[f(i, j)][:DIM2]) \
+		     map(iterator(i=0:DIM1), to: y[i][:1]) \
+		     map(from: sum)
+    {
+      sum = 0;
+      for (int i = 0; i < DIM1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  sum += x[i][j] * y[i][0];
+    }
+
+  return sum - expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-4.c b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-4.c
new file mode 100644
index 0000000..6217367
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-4.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+/* Test transfer of dynamically-allocated arrays to target using map
+   iterators with non-constant bounds.  */
+
+#include <stdlib.h>
+
+#define DIM1 8
+#define DIM2 15
+
+int mkarray (int *x[], int *dim1)
+{
+  int expected = 0;
+  *dim1 = DIM1;
+  for (int i = 0; i < DIM1; i++)
+    {
+      x[i] = (int *) malloc (DIM2 * sizeof (int));
+      for (int j = 0; j < DIM2; j++)
+	{
+	  x[i][j] = rand () % 1000;  /* Bounded, so the int sums cannot overflow.  */
+	  expected += x[i][j];
+	}
+    }
+
+  return expected;
+}
+
+int main (void)
+{
+  int *x[DIM1];
+  int y;
+  int dim1;
+
+  int expected = mkarray (x, &dim1);
+
+  #pragma omp target enter data map(to: x)
+  #pragma omp target map(iterator(i=0:dim1), to: x[i][:DIM2]) \
+		     map(from: y)
+    {
+      y = 0;
+      for (int i = 0; i < dim1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  y += x[i][j];
+    }
+
+  return y - expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-5.c b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-5.c
new file mode 100644
index 0000000..54b4818
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-map-iterators-5.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+/* Test transfer of dynamically-allocated arrays to target using map
+   iterators, with multiple iterators, function calls and non-constant
+   bounds in the iterator expression.  */
+
+#include <stdlib.h>
+
+#define DIM1 16
+#define DIM2 15
+
+int mkarrays (int *x[], int *y[], int *dim1)
+{
+  int expected = 0;
+
+  *dim1 = DIM1;
+  for (int i = 0; i < DIM1; i++)
+    {
+      x[i] = (int *) malloc (DIM2 * sizeof (int));
+      y[i] = (int *) malloc (sizeof (int));
+      *y[i] = rand () % 1000;  /* Bounded, so products and sums fit in int.  */
+      for (int j = 0; j < DIM2; j++)
+	{
+	  x[i][j] = rand () % 1000;
+	  expected += x[i][j] * *y[i];
+	}
+    }
+
+  return expected;
+}
+
+int f (int i, int j)
+{
+  return i * 4 + j;
+}
+
+int main (void)
+{
+  int *x[DIM1], *y[DIM1];
+  int sum;
+
+  int dim1;
+  int expected = mkarrays (x, y, &dim1);
+  int dim1_4 = dim1 / 4;
+
+  #pragma omp target enter data map(to: x, y)
+  #pragma omp target map(iterator(i=0:dim1_4, j=0:4), to: x[f(i, j)][:DIM2]) \
+		     map(iterator(i=0:dim1), to: y[i][:1]) \
+		     map(from: sum)
+    {
+      sum = 0;
+      for (int i = 0; i < dim1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  sum += x[i][j] * y[i][0];
+    }
+
+  return sum - expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-1.c b/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-1.c
new file mode 100644
index 0000000..5a4cad5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-1.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+
+/* Test target enter data and target update to the target using map
+   iterators.  */
+
+#include <stdlib.h>
+
+#define DIM1 8
+#define DIM2 15
+
+int mkarray (int *x[])
+{
+  int expected = 0;
+  for (int i = 0; i < DIM1; i++)
+    {
+      x[i] = (int *) malloc (DIM2 * sizeof (int));
+      for (int j = 0; j < DIM2; j++)
+	{
+	  x[i][j] = rand () % 1000;  /* Bounded, so the int sums cannot overflow.  */
+	  expected += x[i][j];
+	}
+    }
+
+  return expected;
+}
+
+int main (void)
+{
+  int *x[DIM1];
+  int sum;
+  int expected = mkarray (x);
+
+  #pragma omp target enter data map(to: x[:DIM1])
+  #pragma omp target enter data map(iterator(i=0:DIM1), to: x[i][:DIM2])
+  #pragma omp target map(from: sum)
+    {
+      sum = 0;
+      for (int i = 0; i < DIM1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  sum += x[i][j];
+    }
+
+  if (sum != expected)
+    return 1;
+
+  expected = 0;
+  for (int i = 0; i < DIM1; i++)
+    for (int j = 0; j < DIM2; j++)
+      {
+	x[i][j] = rand () % 1000;  /* Fresh bounded host value; avoids int overflow.  */
+	expected += x[i][j];
+      }
+
+  #pragma omp target update to(iterator(i=0:DIM1): x[i][:DIM2])
+
+  #pragma omp target map(from: sum)
+    {
+      sum = 0;
+      for (int i = 0; i < DIM1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  sum += x[i][j];
+    }
+
+  return sum != expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-2.c b/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-2.c
new file mode 100644
index 0000000..93438d0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-2.c
@@ -0,0 +1,58 @@
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+/* Test target enter data and target update from the target using map
+   iterators.  */
+
+#include <stdlib.h>
+
+#define DIM1 8
+#define DIM2 15
+
+void mkarray (int *x[])
+{
+  for (int i = 0; i < DIM1; i++)
+    {
+      x[i] = (int *) malloc (DIM2 * sizeof (int));
+      for (int j = 0; j < DIM2; j++)
+	x[i][j] = 0;
+    }
+}
+
+int main (void)
+{
+  int *x[DIM1];
+  int sum, expected;
+
+  mkarray (x);
+
+  #pragma omp target enter data map(alloc: x[:DIM1])
+  #pragma omp target enter data map(iterator(i=0:DIM1), to: x[i][:DIM2])
+  #pragma omp target map(from: expected)
+    {
+      expected = 0;
+      for (int i = 0; i < DIM1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  {
+	    x[i][j] = (i + 1) * (j + 2);
+	    expected += x[i][j];
+	  }
+    }
+
+  /* Host copy of x should remain unchanged.  */
+  sum = 0;
+  for (int i = 0; i < DIM1; i++)
+    for (int j = 0; j < DIM2; j++)
+      sum += x[i][j];
+  if (sum != 0)
+    return 1;
+
+  #pragma omp target update from(iterator(i=0:DIM1): x[i][:DIM2])
+
+  /* Host copy should now be updated.  */
+  sum = 0;
+  for (int i = 0; i < DIM1; i++)
+    for (int j = 0; j < DIM2; j++)
+      sum += x[i][j];
+  return sum - expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-3.c b/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-3.c
new file mode 100644
index 0000000..a70b21c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-3.c
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+/* Test target enter data and target update to the target using map
+   iterators with a function.  */
+
+#include <stdlib.h>
+
+#define DIM1 8
+#define DIM2 15
+
+void mkarray (int *x[])
+{
+  for (int i = 0; i < DIM1; i++)
+    {
+      x[i] = (int *) malloc (DIM2 * sizeof (int));
+      for (int j = 0; j < DIM2; j++)
+	x[i][j] = rand () % 1000;  /* Bounded, so later int sums cannot overflow.  */
+    }
+}
+
+int f (int i)
+{
+  return i * 2;
+}
+
+int main (void)
+{
+  int *x[DIM1], x_new[DIM1][DIM2];
+  int sum, expected;
+
+  mkarray (x);
+
+  #pragma omp target enter data map(alloc: x[:DIM1])
+  #pragma omp target enter data map(iterator(i=0:DIM1), to: x[i][:DIM2])
+
+  /* Update x on host.  */
+  for (int i = 0; i < DIM1; i++)
+    for (int j = 0; j < DIM2; j++)
+      {
+	x_new[i][j] = x[i][j];
+	x[i][j] = (i + 1) * (j + 2);
+      }
+
+  /* Update a subset of x on target.  */
+  #pragma omp target update to(iterator(i=0:DIM1/2): x[f (i)][:DIM2])
+
+  #pragma omp target map(from: sum)
+    {
+      sum = 0;
+      for (int i = 0; i < DIM1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  sum += x[i][j];
+    }
+
+  /* Calculate expected value on host.  */
+  for (int i = 0; i < DIM1/2; i++)
+    for (int j = 0; j < DIM2; j++)
+      x_new[f (i)][j] = x[f (i)][j];
+
+  expected = 0;
+  for (int i = 0; i < DIM1; i++)
+    for (int j = 0; j < DIM2; j++)
+      expected += x_new[i][j];
+
+  return sum - expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-4.c b/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-4.c
new file mode 100644
index 0000000..810b881
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-4.c
@@ -0,0 +1,66 @@
+/* { dg-do run } */
+
+/* Test target enter data and target update to the target using map
+   iterators with non-constant bounds.  */
+
+#include <stdlib.h>
+
+#define DIM1 8
+#define DIM2 15
+
+int mkarray (int *x[], int *dim1)
+{
+  int expected = 0;
+  *dim1 = DIM1;
+  for (int i = 0; i < DIM1; i++)
+    {
+      x[i] = (int *) malloc (DIM2 * sizeof (int));
+      for (int j = 0; j < DIM2; j++)
+	{
+	  x[i][j] = rand () % 1000;  /* Bounded, so the int sums cannot overflow.  */
+	  expected += x[i][j];
+	}
+    }
+
+  return expected;
+}
+
+int main (void)
+{
+  int *x[DIM1];
+  int sum, dim1;
+  int expected = mkarray (x, &dim1);
+
+  #pragma omp target enter data map(to: x[:DIM1])
+  #pragma omp target enter data map(iterator(i=0:dim1), to: x[i][:DIM2])
+  #pragma omp target map(from: sum)
+    {
+      sum = 0;
+      for (int i = 0; i < dim1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  sum += x[i][j];
+    }
+
+  if (sum != expected)
+    return 1;
+
+  expected = 0;
+  for (int i = 0; i < dim1; i++)
+    for (int j = 0; j < DIM2; j++)
+      {
+	x[i][j] = rand () % 1000;  /* Fresh bounded host value; avoids int overflow.  */
+	expected += x[i][j];
+      }
+
+  #pragma omp target update to(iterator(i=0:dim1): x[i][:DIM2])
+
+  #pragma omp target map(from: sum)
+    {
+      sum = 0;
+      for (int i = 0; i < dim1; i++)
+	for (int j = 0; j < DIM2; j++)
+	  sum += x[i][j];
+    }
+
+  return sum != expected;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-1.c b/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-1.c
new file mode 100644
index 0000000..21074a3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fdump-tree-original -fdump-tree-gimple" } */
+
+#include <omp.h>
+
+omp_alloctrait_key_t k;
+omp_alloctrait_value_t v;
+
+int main (void)
+{
+  omp_allocator_handle_t foo, bar;
+  const omp_alloctrait_t foo_traits[] = { { omp_atk_pinned,    omp_atv_true },
+					  { omp_atk_partition, omp_atv_nearest } };
+  #pragma omp target
+    ;
+  #pragma omp target uses_allocators (bar)
+    ;
+  #pragma omp target uses_allocators (foo (foo_traits))
+    ;
+  #pragma omp target uses_allocators (foo (foo_traits), bar (foo_traits))
+    ;
+  #pragma omp target uses_allocators (memspace(omp_high_bw_mem_space) : foo)
+    ;
+  #pragma omp target uses_allocators (traits(foo_traits) : bar)
+    ;
+  #pragma omp target parallel uses_allocators (memspace(omp_high_bw_mem_space), traits(foo_traits) : bar)
+    ;
+  #pragma omp target parallel uses_allocators (traits(foo_traits), memspace(omp_high_bw_mem_space) : bar) uses_allocators(foo)
+  {
+    void *p = omp_alloc ((unsigned long) 32, bar);
+    omp_free (p, bar);
+  }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(\\), traits\\(\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(foo: memspace\\(\\), traits\\(foo_traits\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(\\), traits\\(foo_traits\\)\\) uses_allocators\\(foo: memspace\\(\\), traits\\(foo_traits\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(foo: memspace\\(omp_high_bw_mem_space\\), traits\\(\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(\\), traits\\(foo_traits\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(omp_high_bw_mem_space\\), traits\\(foo_traits\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(omp_high_bw_mem_space\\), traits\\(foo_traits\\)\\) uses_allocators\\(foo: memspace\\(\\), traits\\(\\)\\)" "original" } } */
+
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(\\), traits\\(\\)\\) private\\(bar\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(foo: memspace\\(\\), traits\\(foo_traits\\)\\) private\\(foo\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(\\), traits\\(foo_traits\\)\\) uses_allocators\\(foo: memspace\\(\\), traits\\(foo_traits\\)\\) private\\(bar\\) private\\(foo\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(foo: memspace\\(omp_high_bw_mem_space\\), traits\\(\\)\\) private\\(foo\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(\\), traits\\(foo_traits\\)\\) private\\(bar\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(omp_high_bw_mem_space\\), traits\\(foo_traits\\)\\) private\\(bar\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(omp_high_bw_mem_space\\), traits\\(foo_traits\\)\\) uses_allocators\\(foo: memspace\\(\\), traits\\(\\)\\) private\\(bar\\) private\\(foo\\)" "gimple" } } */
+
+/* { dg-final { scan-tree-dump-times "__builtin_omp_init_allocator" 9 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_omp_destroy_allocator" 9 "gimple" } } */
diff --git a/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-2.c b/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-2.c
new file mode 100644
index 0000000..f350c0a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-2.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+
+#include <omp.h>
+
+omp_alloctrait_key_t k;
+omp_alloctrait_value_t v;
+
+int main (void)
+{
+  omp_allocator_handle_t foo, bar;
+  const omp_alloctrait_t traits_array[] = { { omp_atk_pinned,    omp_atv_true },
+					    { omp_atk_partition, omp_atv_nearest } };
+
+  #pragma omp target uses_allocators (baz) /* { dg-error "'baz' undeclared .first use in this function." "" { target c } } */
+    ;                                      /* { dg-error "'baz' has not been declared" "" { target c++ } .-1 } */
+  #pragma omp target uses_allocators (foo (xyz)) /* { dg-error "'xyz' undeclared .first use in this function." "" { target c } } */
+    ;                                            /* { dg-error "'xyz' has not been declared" "" { target c++ } .-1 } */
+  #pragma omp target uses_allocators (foo (traits_array), baz (traits_array)) /* { dg-error "'baz' has not been declared" "" { target c++ } } */
+    ;
+  #pragma omp target uses_allocators (memspace(omp_no_such_space) : foo) /* { dg-error "'omp_no_such_space' undeclared .first use in this function." "" { target c } } */
+    ;                                                                    /* { dg-error "'omp_no_such_space' has not been declared" "" { target c++ } .-1 } */
+  #pragma omp target uses_allocators (memspace(1) : foo) /* { dg-error "expected identifier before numeric constant" } */
+    ;                                                    /* { dg-error "expected '\\\)' before ':' token" "" { target c } .-1 } */
+  #pragma omp target uses_allocators (memspace(omp_no_such_space) : foo, bar) /* { dg-error "'uses_allocators' clause only accepts a single allocator when using modifiers" } */
+    ;                                                                         /* { dg-error "'omp_no_such_space' has not been declared" "" { target c++ } .-1 } */
+  #pragma omp target uses_allocators (traits(xyz) : bar) /* { dg-error "traits array must be of 'const omp_alloctrait_t \\\[\\\]' type" "" { target c } } */
+    ;                                                    /* { dg-error "'xyz' has not been declared" "" { target c++ } .-1 } */
+  #pragma omp target uses_allocators (memspace(omp_high_bw_mem_space), traits(traits_array), memspace (omp_no_such_space) : bar) /* { dg-error "duplicate 'memspace' modifier" } */
+    ;
+  #pragma omp target uses_allocators (traitz(traits_array), memspace(omp_high_bw_mem_space) : bar) /* { dg-error "unknown modifier 'traitz'" } */
+    ;
+  #pragma omp target uses_allocators (omp_null_allocator) /* { dg-error "'omp_null_allocator' cannot be used in 'uses_allocators' clause" } */
+    ;
+  #pragma omp target uses_allocators (memspace(omp_high_bw_mem_space) : foo, bar) /* { dg-error "'uses_allocators' clause only accepts a single allocator when using modifiers" } */
+    ;
+  #pragma omp target uses_allocators (memspace(omp_high_bw_mem_space) : foo(foo_traits)) /* { dg-error "legacy 'foo\\\(foo_traits\\\)' traits syntax not allowed in 'uses_allocators' clause when using modifiers" } */
+    ;
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-3.c b/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-3.c
new file mode 100644
index 0000000..de9ab92
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-3.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fdump-tree-original -fdump-tree-gimple" } */
+
+#include <omp.h>
+
+int main (void)
+{
+  omp_allocator_handle_t memspace, traits;
+  const omp_alloctrait_t mytraits[] = { { omp_atk_pinned,    omp_atv_true },
+					{ omp_atk_partition, omp_atv_nearest } };
+  #pragma omp target uses_allocators (memspace)
+    ;
+  #pragma omp target uses_allocators (traits)
+    ;
+  #pragma omp target uses_allocators (traits, memspace)
+    ;
+  #pragma omp target uses_allocators (traits (mytraits))
+    ;
+  #pragma omp target uses_allocators (memspace (mytraits), omp_default_mem_alloc)
+    ;
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(memspace: memspace\\(\\), traits\\(\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(traits: memspace\\(\\), traits\\(\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(memspace: memspace\\(\\), traits\\(\\)\\) uses_allocators\\(traits: memspace\\(\\), traits\\(\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(traits: memspace\\(\\), traits\\(mytraits\\)\\)" "original" } } */
+/* { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(memspace: memspace\\(\\), traits\\(mytraits\\)\\)" "original" } } */
+
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(memspace: memspace\\(\\), traits\\(\\)\\) private\\(memspace\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(traits: memspace\\(\\), traits\\(\\)\\) private\\(traits\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(memspace: memspace\\(\\), traits\\(\\)\\) uses_allocators\\(traits: memspace\\(\\), traits\\(\\)\\) private\\(traits\\) private\\(memspace\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(traits: memspace\\(\\), traits\\(mytraits\\)\\) private\\(traits\\)" "gimple" } } */
+/* { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(memspace: memspace\\(\\), traits\\(mytraits\\)\\) private\\(memspace\\)" "gimple" } } */
+
+/* { dg-final { scan-tree-dump-times "__builtin_omp_init_allocator" 6 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_omp_destroy_allocator" 6 "gimple" } } */
diff --git a/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-4.c b/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-4.c
new file mode 100644
index 0000000..5942a0d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/uses_allocators-4.c
@@ -0,0 +1,53 @@
+/* { dg-additional-options "-fdump-tree-gimple" } */
+
+#include <stdint.h>
+#include <omp.h>
+
+int
+main ()
+{
+  int x, *xbuf[10];
+  omp_allocator_handle_t my_alloc;
+  const omp_alloctrait_t trait[1]= {{omp_atk_alignment,128}};
+
+  #pragma omp target uses_allocators(omp_low_lat_mem_alloc) map(tofrom: x, xbuf) defaultmap(none)
+    #pragma omp parallel allocate(allocator(omp_low_lat_mem_alloc), align(128): x, xbuf) if(0) firstprivate(x, xbuf)
+      {
+	if ((uintptr_t) &x % 128 != 0)
+	  __builtin_abort ();
+	if ((uintptr_t) xbuf % 128 != 0)
+	  __builtin_abort ();
+      }
+
+  my_alloc = (omp_allocator_handle_t) 0xABCD;
+
+  #pragma omp target uses_allocators(traits(trait): my_alloc) defaultmap(none) map(tofrom: x, xbuf)
+    #pragma omp parallel allocate(allocator(my_alloc): x, xbuf) if(0) firstprivate(x, xbuf)
+      {
+	if ((uintptr_t) &x % 128 != 0)
+	  __builtin_abort ();
+	if ((uintptr_t) xbuf % 128 != 0)
+	  __builtin_abort ();
+      }
+
+  if (my_alloc != (omp_allocator_handle_t) 0xABCD)
+    __builtin_abort ();
+
+  /* The following creates an allocator with empty traits + default mem space. */
+  #pragma omp target uses_allocators(my_alloc) map(tofrom: x, xbuf) defaultmap(none)
+    #pragma omp parallel allocate(allocator(my_alloc), align(128): x, xbuf) if(0) firstprivate(x, xbuf)
+      {
+	if ((uintptr_t) &x % 128 != 0)
+	  __builtin_abort ();
+	if ((uintptr_t) xbuf % 128 != 0)
+	  __builtin_abort ();
+      }
+
+  if (my_alloc != (omp_allocator_handle_t) 0xABCD)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "#pragma omp target .*private\\(my_alloc\\).*uses_allocators\\(my_alloc: memspace\\(\\), traits\\(trait\\)\\)" 1 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "#pragma omp target .*private\\(my_alloc\\).*uses_allocators\\(my_alloc: memspace\\(\\), traits\\(\\)\\)" 1 "gimple" } } */
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
index 672f245..693f903 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
@@ -2,6 +2,8 @@
 
 /* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory works.  */
 
 #include <stdio.h>
@@ -63,10 +65,16 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.
+     The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, allowing space for overhead,
      but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE - 128;
   CHECK_SIZE (SIZE * 5);  // This is intended to help diagnose failures
+#endif
 
   const omp_alloctrait_t traits[] = {
       { omp_atk_pinned, 1 }
@@ -88,21 +96,39 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE * 2, allocator, allocator);
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   /* SIZE*2 ensures that it doesn't slot into the space possibly
      vacated by realloc.  */
   p = omp_calloc (1, SIZE * 2, allocator);
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE * 2);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
index b6d1d83..e7ac64e 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
@@ -2,6 +2,8 @@
 
 /* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory works (pool_size code path).  */
 
 #include <stdio.h>
@@ -63,10 +65,16 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.
+     The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, allowing space for overhead,
      but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE - 128;
   CHECK_SIZE (SIZE * 5);  // This is intended to help diagnose failures
+#endif
 
   const omp_alloctrait_t traits[] = {
       { omp_atk_pinned, 1 },
@@ -89,16 +97,28 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE * 2, allocator, allocator);
   if (!p)
     abort ();
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   /* SIZE*2 ensures that it doesn't slot into the space possibly
      vacated by realloc.  */
@@ -106,8 +126,14 @@ main ()
   if (!p)
     abort ();
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE * 2);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
index 11dc818..250cb55 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory fails correctly.  */
 
 #include <stdio.h>
@@ -75,8 +77,15 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.
+     The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* This needs to be large enough to cover multiple pages.  */
   const int SIZE = PAGE_SIZE * 4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE * 2;
 
   /* Pinned memory, no fallback.  */
   const omp_alloctrait_t traits1[] = {
@@ -101,23 +110,34 @@ main ()
 #endif
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE / 2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p1 = omp_alloc (SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p1)
+    abort ();
+#else
+  // Should fail
   if (p1)
     abort ();
+#endif
 
-  // Should fail
   void *p2 = omp_calloc (1, SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p2)
+    abort ();
+#else
+  // Should fail
   if (p2)
     abort ();
+#endif
 
-  // Should fall back
   void *p3 = omp_alloc (SIZE, allocator2);
   if (!p3)
     abort ();
@@ -128,16 +148,29 @@ main ()
     abort ();
   verify0 (p4, SIZE);
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   void *p5 = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p5 || p5 == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p5)
     abort ();
+#endif
 
-  // Should fall back to no realloc needed
+#ifdef OFFLOAD_DEVICE_NVPTX
+  void *p6 = omp_realloc (p5, SIZE, allocator2, allocator1);
+  // Does reallocate.
+  if (p5 == p6)
+    abort ();
+#else
   void *p6 = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
+  // Should fall back to no realloc needed
   if (p6 != notpinned)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
index 2ecd01f..b7a9966 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory fails correctly, pool_size code path.  */
 
 #include <stdio.h>
@@ -75,8 +77,15 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.
+     The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* This needs to be large enough to cover multiple pages.  */
   const int SIZE = PAGE_SIZE * 4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE * 2;
 
   /* Pinned memory, no fallback.  */
   const omp_alloctrait_t traits1[] = {
@@ -103,21 +112,33 @@ main ()
 #endif
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE / 2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p = omp_alloc (SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail
   p = omp_calloc (1, SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
   // Should fall back
   p = omp_alloc (SIZE, allocator2);
@@ -130,16 +151,29 @@ main ()
     abort ();
   verify0 (p, SIZE);
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   p = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p || p == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p)
     abort ();
+#endif
 
-  // Should fall back to no realloc needed
+#ifdef OFFLOAD_DEVICE_NVPTX
+  void *p_ = omp_realloc (p, SIZE, allocator2, allocator1);
+  // Does reallocate.
+  if (p_ == p)
+    abort ();
+#else
   p = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
+  // Should fall back to no realloc needed
   if (p != notpinned)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-5.c b/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
index 0ba2feb..cc77764 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
@@ -2,6 +2,8 @@
 
 /* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that ompx_gnu_pinned_mem_alloc works.  */
 
 #include <stdio.h>
@@ -63,10 +65,16 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.
+     The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, allowing space for overhead,
      but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE - 128;
   CHECK_SIZE (SIZE * 5);
+#endif
 
   // Sanity check
   if (get_pinned_mem () != 0)
@@ -77,22 +85,40 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE * 2, ompx_gnu_pinned_mem_alloc,
 		   ompx_gnu_pinned_mem_alloc);
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   /* SIZE*2 ensures that it doesn't slot into the space possibly
      vacated by realloc.  */
   p = omp_calloc (1, SIZE * 2, ompx_gnu_pinned_mem_alloc);
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE * 2);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-6.c b/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
index 99f1269..6dd5544 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
@@ -1,4 +1,5 @@
 /* { dg-do run } */
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
 
 /* Test that ompx_gnu_pinned_mem_alloc fails correctly.  */
 
@@ -66,32 +67,57 @@ set_pin_limit (int size)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.
+     The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE * 4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE*2;
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE / 2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p = omp_alloc (SIZE, ompx_gnu_pinned_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail
   p = omp_calloc (1, SIZE, ompx_gnu_pinned_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   p = omp_realloc (notpinned, SIZE, ompx_gnu_pinned_mem_alloc,
 		   omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p || p == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-7.c b/libgomp/testsuite/libgomp.c/alloc-pinned-7.c
new file mode 100644
index 0000000..44652aa
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-7.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-additional-options "-foffload-memory=pinned" } */
+
+/* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
+
+/* Test that -foffload-memory=pinned works.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+
+int
+get_pinned_mem ()
+{
+  int pid = getpid ();
+  char buf[100];
+  sprintf (buf, "/proc/%d/status", pid);  /* Status file of this process.  */
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, sizeof (buf), proc))  /* 'buf' is reused as line buffer.  */
+    {
+      int val;
+      if (sscanf (buf, "VmLck: %d", &val) == 1)  /* EOF (-1) is not a match.  */
+	{
+	  fclose (proc);
+	  return val;
+	}
+    }
+  abort ();  /* No 'VmLck' line was found.  */
+}
+#else
+int
+get_pinned_mem ()
+{
+  return 0;
+}
+
+#define mlockall(...) 0
+#endif
+
+#include <omp.h>
+
+int
+main ()
+{
+  // Sanity check
+  if (get_pinned_mem () == 0)
+    {
+      /* -foffload-memory=pinned has failed, but maybe that's because
+	 insufficient pinned memory was available.  */
+      if (mlockall (MCL_CURRENT | MCL_FUTURE) == 0)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-8.c b/libgomp/testsuite/libgomp.c/alloc-pinned-8.c
new file mode 100644
index 0000000..0fc737b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-8.c
@@ -0,0 +1,122 @@
+/* { dg-do run } */
+
+/* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
+
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
+/* Test that pinned memory works for small allocations.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/resource.h>
+
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+#define CHECK_SIZE(SIZE) { \
+  struct rlimit limit; \
+  if (getrlimit (RLIMIT_MEMLOCK, &limit) \
+      || limit.rlim_cur <= SIZE) \
+    fprintf (stderr, "insufficient lockable memory; please increase ulimit\n"); \
+  }
+
+int
+get_pinned_mem ()
+{
+  int pid = getpid ();
+  char buf[100];
+  sprintf (buf, "/proc/%d/status", pid);  /* Status file of this process.  */
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, sizeof (buf), proc))  /* 'buf' is reused as line buffer.  */
+    {
+      int val;
+      if (sscanf (buf, "VmLck: %d", &val) == 1)  /* EOF (-1) is not a match.  */
+	{
+	  fclose (proc);
+	  return val;
+	}
+    }
+  abort ();  /* No 'VmLck' line was found.  */
+}
+#else
+#error "OS unsupported"
+#endif
+
+static void
+verify0 (char *p, size_t s)
+{
+  for (size_t i = 0; i < s; ++i)
+    if (p[i] != 0)
+      abort ();
+}
+
+#include <omp.h>
+
+int
+main ()
+{
+  /* Choose a small size where all our allocations fit on one page.  */
+  const int SIZE = 10;
+#ifndef OFFLOAD_DEVICE_NVPTX
+  CHECK_SIZE (SIZE*4);
+#endif
+
+  const omp_alloctrait_t traits[] = {
+      { omp_atk_pinned, 1 }
+  };
+  omp_allocator_handle_t allocator = omp_init_allocator (omp_default_mem_space, 1, traits);
+
+  // Sanity check
+  if (get_pinned_mem () != 0)
+    abort ();
+
+  void *p = omp_alloc (SIZE, allocator);
+  if (!p)
+    abort ();
+
+  int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
+  if (amount == 0)
+    abort ();
+#endif
+
+  p = omp_realloc (p, SIZE * 2, allocator, allocator);
+
+  int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
+  /* A small allocation should not allocate another page.  */
+  if (amount2 != amount)
+    abort ();
+#endif
+
+  p = omp_calloc (1, SIZE, allocator);
+
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
+  /* A small allocation should not allocate another page.  */
+  if (get_pinned_mem () != amount2)
+    abort ();
+#endif
+
+  verify0 (p, SIZE);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/array-shaping-1.c b/libgomp/testsuite/libgomp.c/array-shaping-1.c
new file mode 100644
index 0000000..808c5f9
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/array-shaping-1.c
@@ -0,0 +1,236 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+
+volatile int yy = 4, zz = 2, str_str = 2;
+
+int main()
+{
+  int *arr;
+  int x = 5;
+  int arr2d[10][10];
+
+  arr = calloc (100, sizeof (int));
+
+  /* Update whole reshaped array.  */
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < x; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = i ^ j;
+
+#pragma omp target update to(([10][x]) arr)
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j < x)
+	assert (arr[j * 10 + i] == (i ^ j));
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* Strided update.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      arr[j * 5 + i] = i + j;
+
+#pragma omp target update to(([5][5]) arr[0:3][0:3:2])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      if (j < 3 && (i & 1) == 0 && i < 6)
+	assert (arr[j * 5 + i] == i + j);
+      else
+	assert (arr[j * 5 + i] == 0);
+
+
+  /* Reshaped update, contiguous.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      arr[j * 5 + i] = 2 * j + i;
+
+#pragma omp target update to(([5][5]) arr[0:5][0:5])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 20; j++)
+    for (int i = 0; i < 5; i++)
+      if (j < 5 && i < 5)
+	assert (arr[j * 5 + i] == 2 * j + i);
+      else
+	assert (arr[j * 5 + i] == 0);
+
+
+  /* Strided update on actual array.  */
+
+  memset (arr2d, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr2d)
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr2d[j][i] = j + 2 * i;
+
+#pragma omp target update to(arr2d[0:5:2][5:2])
+
+#pragma omp target exit data map(from: arr2d)
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if ((j & 1) == 0 && i >= 5 && i < 7)
+	assert (arr2d[j][i] == j + 2 * i);
+      else
+	assert (arr2d[j][i] == 0);
+
+
+  /* Update with non-constant bounds.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = (2 * j) ^ i;
+
+  x = 3;
+  int y = yy, z = zz, str = str_str;
+  /* This is actually [0:3:2] [4:2:2].  */
+#pragma omp target update to(([10][10]) arr[0:x:2][y:z:str])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if ((j & 1) == 0 && j < 6 && (i & 1) == 0 && i >= 4 && i < 8)
+	assert (arr[j * 10 + i] == ((2 * j) ^ i));
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* Update with full "major" dimension.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = i + j;
+
+#pragma omp target update to(([10][10]) arr[0:10][3:1])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (i == 3)
+	assert (arr[j * 10 + i] == i + j);
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* Update with full "minor" dimension.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = 3 * (i + j);
+
+#pragma omp target update to(([10][10]) arr[3:2][0:10])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5)
+	assert (arr[j * 10 + i] == 3 * (i + j));
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* Rectangle update.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      arr[j * 10 + i] = 5 * (i + j);
+
+#pragma omp target update to(([10][10]) arr[3:2][0:9])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int j = 0; j < 10; j++)
+    for (int i = 0; i < 10; i++)
+      if (j >= 3 && j < 5 && i < 9)
+	assert (arr[j * 10 + i] == 5 * (i + j));
+      else
+	assert (arr[j * 10 + i] == 0);
+
+
+  /* One-dimensional strided update.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    arr[i] = i + 99;
+
+#pragma omp target update to(([100]) arr[3:33:3])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    if (i >= 3 && ((i - 3) % 3) == 0)
+      assert (arr[i] == i + 99);
+    else
+      assert (arr[i] == 0);
+
+
+  /* One-dimensional strided update without explicit array shape.  */
+
+  memset (arr, 0, 100 * sizeof (int));
+
+#pragma omp target enter data map(to: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    arr[i] = i + 121;
+
+#pragma omp target update to(arr[3:33:3])
+
+#pragma omp target exit data map(from: arr[:100])
+
+  for (int i = 0; i < 100; i++)
+    if (i >= 3 && ((i - 3) % 3) == 0)
+      assert (arr[i] == i + 121);
+    else
+      assert (arr[i] == 0);
+
+  free (arr);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/array-shaping-2.c b/libgomp/testsuite/libgomp.c/array-shaping-2.c
new file mode 100644
index 0000000..42a6e0c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/array-shaping-2.c
@@ -0,0 +1,39 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <stdlib.h>
+
+typedef struct {
+  int *aptr;
+} C;
+
+int main()
+{
+  C cvar;
+
+  cvar.aptr = calloc (100, sizeof (int));  /* Zeroed; indexed below as 10x10.  */
+
+#pragma omp target enter data map(to: cvar.aptr, cvar.aptr[:100])
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	cvar.aptr[i * 10 + j] = i + j;
+  }
+
+#pragma omp target update from(([10][10]) cvar.aptr[4:3][4:3])
+
+  for (int i = 0; i < 10; i++)
+    for (int j = 0; j < 10; j++)
+      if (i >= 4 && i < 7 && j >= 4 && j < 7)
+	assert (cvar.aptr[i * 10 + j] == i + j);
+      else
+	assert (cvar.aptr[i * 10 + j] == 0);
+
+#pragma omp target exit data map(delete: cvar.aptr, cvar.aptr[:100])
+
+  free (cvar.aptr);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/array-shaping-3.c b/libgomp/testsuite/libgomp.c/array-shaping-3.c
new file mode 100644
index 0000000..5dda2e3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/array-shaping-3.c
@@ -0,0 +1,42 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define N 10
+
+typedef struct {
+  int arr[N][N];
+} B;
+
+int main()
+{
+  B *bvar = malloc (sizeof (B));
+
+  memset (bvar, 0, sizeof (B));
+
+#pragma omp target enter data map(to: bvar->arr)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	bvar->arr[i][j] = i + j;
+  }
+
+#pragma omp target update from(bvar->arr[4:3][4:3])
+
+  for (int i = 0; i < 10; i++)
+    for (int j = 0; j < 10; j++)
+      if (i >= 4 && i < 7 && j >= 4 && j < 7)
+	assert (bvar->arr[i][j] == i + j);
+      else
+	assert (bvar->arr[i][j] == 0);
+
+#pragma omp target exit data map(delete: bvar->arr)
+
+  free (bvar);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/array-shaping-4.c b/libgomp/testsuite/libgomp.c/array-shaping-4.c
new file mode 100644
index 0000000..2b9e694
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/array-shaping-4.c
@@ -0,0 +1,36 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+#define N 10
+
+int main ()
+{
+  int iarr[N * N];
+
+  memset (iarr, 0, N * N * sizeof (int));
+
+#pragma omp target enter data map(to: iarr)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	iarr[i * 10 + j] = i + j;
+  }
+
+  /* An array, but cast to a pointer, then reshaped.  */
+#pragma omp target update from(([10][10]) ((int *) &iarr[0])[4:3][4:3])
+
+  for (int i = 0; i < 10; i++)
+    for (int j = 0; j < 10; j++)
+      if (i >= 4 && i < 7 && j >= 4 && j < 7)
+	assert (iarr[i * 10 + j] == i + j);
+      else
+	assert (iarr[i * 10 + j] == 0);
+
+#pragma omp target exit data map(delete: iarr)
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/array-shaping-5.c b/libgomp/testsuite/libgomp.c/array-shaping-5.c
new file mode 100644
index 0000000..1034682
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/array-shaping-5.c
@@ -0,0 +1,38 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <string.h>
+
+#define N 10
+
+int main ()
+{
+  int iarr_real[N * N];
+  int *iarrp = &iarr_real[0];
+  int **iarrpp = &iarrp;
+
+  memset (iarrp, 0, N * N * sizeof (int));
+
+#pragma omp target enter data map(to: iarr_real)
+
+#pragma omp target
+  {
+    for (int i = 0; i < 10; i++)
+      for (int j = 0; j < 10; j++)
+	iarrp[i * 10 + j] = i + j;
+  }
+
+  /* A pointer with an extra indirection.  */
+#pragma omp target update from(([10][10]) (*iarrpp)[4:3][4:3])
+
+  for (int i = 0; i < 10; i++)
+    for (int j = 0; j < 10; j++)
+      if (i >= 4 && i < 7 && j >= 4 && j < 7)
+	assert (iarrp[i * 10 + j] == i + j);
+      else
+	assert (iarrp[i * 10 + j] == 0);
+
+#pragma omp target exit data map(delete: iarr_real)
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/array-shaping-6.c b/libgomp/testsuite/libgomp.c/array-shaping-6.c
new file mode 100644
index 0000000..5938823
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/array-shaping-6.c
@@ -0,0 +1,45 @@
+// { dg-do run { target offload_device_nonshared_as } }
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define N 10
+
+int main ()
+{
+  int *iptr = calloc (N * N * N, sizeof (int));
+
+#pragma omp target enter data map(to: iptr[0:N*N*N])
+
+#pragma omp target
+  {
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++)
+	iptr[i * N * N + 4 * N + j] = i + j;
+  }
+
+  /* An array ref between two array sections.  */
+#pragma omp target update from(([N][N][N]) iptr[2:3][4][6:3])
+
+  for (int i = 2; i < 5; i++)
+    for (int j = 6; j < 9; j++)
+      assert (iptr[i * N * N + 4 * N + j] == i + j);
+
+  memset (iptr, 0, N * N * N * sizeof (int));
+
+  for (int i = 0; i < N; i++)
+    iptr[2 * N * N + i * N + 4] = 3 * i;
+
+  /* Array section between two array refs.  */
+#pragma omp target update to(([N][N][N]) iptr[2][3:6][4])
+
+#pragma omp target exit data map(from: iptr[0:N*N*N])
+
+  for (int i = 3; i < 9; i++)
+    assert (iptr[2 * N * N + i * N + 4] == 3 * i);
+
+  free (iptr);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/c.exp b/libgomp/testsuite/libgomp.c/c.exp
index aae2824..4b59957 100644
--- a/libgomp/testsuite/libgomp.c/c.exp
+++ b/libgomp/testsuite/libgomp.c/c.exp
@@ -3,6 +3,14 @@ load_gcc_lib gcc-dg.exp
 
 lappend ALWAYS_CFLAGS "compiler=$GCC_UNDER_TEST"
 
+proc check_effective_target_c { } {
+    return 1
+}
+
+proc check_effective_target_c++ { } {
+    return 0
+}
+
 # If a testcase doesn't have special options, use these.
 if ![info exists DEFAULT_CFLAGS] then {
     set DEFAULT_CFLAGS "-O2"
diff --git a/libgomp/testsuite/libgomp.c/reverse-offload-threads-1.c b/libgomp/testsuite/libgomp.c/reverse-offload-threads-1.c
new file mode 100644
index 0000000..fa74a8e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/reverse-offload-threads-1.c
@@ -0,0 +1,26 @@
+/* { dg-do run }  */
+/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
+
+/* Test that the reverse offload message buffers can cope with a lot of
+   requests.  */
+
+#pragma omp requires reverse_offload
+
+int main ()
+{
+  #pragma omp target teams distribute parallel for collapse(2)
+  for (int i=0; i < 100; i++)
+    for (int j=0; j < 16; j++)
+      {
+	int val = 0;
+	#pragma omp target device ( ancestor:1 ) firstprivate(i,j) map(from:val)
+	{
+	  val = i + j;
+	}
+
+	if (val != i + j)
+	  __builtin_abort ();
+      }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/reverse-offload-threads-2.c b/libgomp/testsuite/libgomp.c/reverse-offload-threads-2.c
new file mode 100644
index 0000000..05a2571
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/reverse-offload-threads-2.c
@@ -0,0 +1,31 @@
+/* { dg-do run }  */
+/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
+
+/* Test that the reverse offload message buffers can cope with multiple
+   requests from multiple kernels.  */
+
+#pragma omp requires reverse_offload
+
+int main ()
+{
+  for (int n=0; n < 5; n++)
+    {
+      #pragma omp target teams distribute parallel for nowait collapse(2)
+      for (int i=0; i < 32; i++)
+	for (int j=0; j < 16; j++)
+	  {
+	    int val = 0;
+	    #pragma omp target device ( ancestor:1 ) firstprivate(i,j) map(from:val)
+	    {
+	      val = i + j;
+	    }
+
+	    if (val != i + j)
+	      __builtin_abort ();
+	  }
+    }
+
+#pragma omp taskwait
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.fortran/adjust-args-array-descriptor.f90 b/libgomp/testsuite/libgomp.fortran/adjust-args-array-descriptor.f90
new file mode 100644
index 0000000..dd9b57b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/adjust-args-array-descriptor.f90
@@ -0,0 +1,89 @@
+! Test array descriptor handling with the need_device_addr modifier to adjust_args
+
+module m
+  use iso_c_binding
+  implicit none (type, external)
+
+  integer :: case = 0
+contains
+  subroutine var_array_alloc(x)
+    integer, allocatable :: x(:)
+    !$omp target has_device_addr(x)
+    block
+      if (size(x) /= 3) stop 1
+      if (any (x /= [1,2,3])) stop 2
+      x = x * (-1)
+    end block
+  end
+
+  subroutine base_array_alloc(x)
+    !$omp declare variant(var_array_alloc) match(construct={dispatch}) adjust_args(need_device_addr : x)
+    integer, allocatable :: x(:)
+    error stop
+  end
+
+  subroutine var_array_nonalloc(x)
+    integer :: x(:)
+    !$omp target has_device_addr(x)
+    block
+      if (size(x) /= 4) stop 3
+      if (any (x /= [11,22,33,44])) stop 4
+      x = x * (-1)
+    end block
+  end
+
+  subroutine base_array_nonalloc(x)
+    !$omp declare variant(var_array_nonalloc) match(construct={dispatch}) adjust_args(need_device_addr : x)
+    integer :: x(:)
+    error stop
+  end
+
+  subroutine test_array_alloc(y)
+    integer, allocatable :: y(:)
+    !$omp target enter data map(y)
+
+
+  ! Direct call (for testing; value check fails if both are enabled)
+  !  !$omp target data use_device_addr(y)
+  !    call var_array_alloc (y)
+  !  !$omp end target data
+
+    !$omp dispatch
+      call base_array_alloc (y)
+
+    !$omp target exit data map(y)
+
+    if (size(y) /= 3) stop 3
+    if (any (y /= [-1,-2,-3])) stop 1
+  end
+
+  subroutine test_array_nonalloc()
+    integer :: y(4)
+    y = [11,22,33,44]
+
+    !$omp target enter data map(y)
+
+    ! Direct call (for testing; value check fails if both are enabled)
+    !!$omp target data use_device_addr(y)
+    !  call var_array_nonalloc (y)
+    !!$omp end target data
+
+    !$omp dispatch
+      call base_array_nonalloc (y)
+
+    !$omp target exit data map(y)
+
+    if (size(y) /= 4) stop 3
+    if (any (y /= [-11,-22,-33,-44])) stop 1
+  end
+end module
+
+use m
+implicit none
+integer, allocatable :: z(:)
+
+z = [1,2,3]
+call test_array_alloc(z)
+call test_array_nonalloc()
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/allocatable-comp-iterators.f90 b/libgomp/testsuite/libgomp.fortran/allocatable-comp-iterators.f90
new file mode 100644
index 0000000..120236a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/allocatable-comp-iterators.f90
@@ -0,0 +1,61 @@
+implicit none
+integer, parameter :: N = 16
+type t
+  integer, allocatable :: a, b(:)
+end type t
+type(t) :: x(N), y(N), z(N)
+integer :: i, j  ! NOTE(review): j is not referenced in this file.
+integer :: lo = 3, hi = N
+! Each target region maps array elements through a map-clause iterator and checks their components.
+!$omp target map(iterator (it=1:N), to: x(it))
+  do i = 1, N
+    if (allocated(x(i)%a)) stop 1  ! Nothing has been allocated yet.
+    if (allocated(x(i)%b)) stop 2
+  end do
+!$omp end target
+
+do i = 1, N
+  allocate(x(i)%a, x(i)%b(-4:6))
+  x(i)%b(:) = [(i, i=-4,6)]  ! The implied-do index is scoped to the constructor; values are -4..6.
+end do
+
+!$omp target map(iterator (it=2:N), to: x(it))
+  do i = 2, N
+    if (.not. allocated(x(i)%a)) stop 3
+    if (.not. allocated(x(i)%b)) stop 4
+    if (lbound(x(i)%b,1) /= -4) stop 5  ! Bounds must match the host allocation b(-4:6).
+    if (ubound(x(i)%b,1) /= 6) stop 6
+    if (any (x(i)%b /= [(i, i=-4,6)])) stop 7
+  end do
+!$omp end target
+
+!$omp target enter data map(iterator (it=3:N), to: y(it), z(it))
+
+!$omp target map(iterator (it=3:N), to: y(it), z(it))
+  do i = 3, N
+    if (allocated(y(i)%b)) stop 8
+    if (allocated(z(i)%b)) stop 9
+  end do
+!$omp end target
+
+do i = 1, N  ! Host-side allocation, done after y(3:N) and z(3:N) were entered above.
+  allocate(y(i)%b(5), z(i)%b(3))
+  y(i)%b = 42
+  z(i)%b = 99
+end do
+
+!$omp target map(iterator (it=3:N), to: y(it))
+  do i = 3, N
+    if (.not.allocated(y(i)%b)) stop 10
+    if (any (y(i)%b /= 42)) stop 11
+  end do
+!$omp end target
+
+!$omp target map(iterator (it=lo:hi), always, tofrom: z(it))
+  do i = 3, N  ! Same range as lo:hi, which are initialized to 3 and N.
+    if (.not.allocated(z(i)%b)) stop 12
+    if (any (z(i)%b /= 99)) stop 13
+  end do
+!$omp end target
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/collapse5.f90 b/libgomp/testsuite/libgomp.fortran/collapse5.f90
new file mode 100644
index 0000000..5632d9b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/collapse5.f90
@@ -0,0 +1,23 @@
+! { dg-do run }
+
+program collapse5
+  implicit none
+
+  integer :: i, j
+  integer :: count = 0
+  ! Only the single (i, j) pair tested below ever updates count.
+  !$omp parallel do collapse (2)
+    do i = 1, 80000
+      do j = 1, 80000
+        if (i .eq. 66666 .and. j .eq. 77777) then
+	  ! In the collapsed loop space, this is iteration
+	  ! 66666*80000+77777==5,333,357,777.  If the type of the iterator
+	  ! for the collapsed loop is only a 32-bit unsigned int, then this
+	  ! iteration will exceed its maximum range and be skipped.
+	  count = count + 1
+	end if
+      end do
+    end do
+
+  if (count .ne. 1) stop 1  ! The selected iteration must have run exactly once.
+end
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-10.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-10.f90
new file mode 100644
index 0000000..801becc
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-10.f90
@@ -0,0 +1,40 @@
+! { dg-do run }
+
+program myprog
+type t
+  integer, dimension (8) :: arr1
+end type t
+type u
+  type(t), dimension (:), pointer :: tarr
+end type u
+
+type(u) :: myu
+type(t), dimension (12), target :: myarray
+! Mapper for t maps arr1(1:4) only; mapper for u maps the tarr pointer plus its first element.
+!$omp declare mapper (t :: x) map(x%arr1(1:4))
+!$omp declare mapper (u :: x) map(to: x%tarr) map(x%tarr(1))
+
+myu%tarr => myarray
+
+myu%tarr(1)%arr1(1) = 1
+
+! We can't do this: we have a mapper for "t" elements, and this implicitly maps
+! the whole array.
+!!$omp target map(tofrom:myu%tarr)
+!myu%tarr(1)%arr1(1) = myu%tarr(1)%arr1(1) + 1
+!!$omp end target
+
+! ...but we can do this, because we're just mapping an element of the "t"
+! array.  We still need to map the actual "myu%tarr" descriptor.
+!$omp target map(to:myu%tarr) map(myu%tarr(1)%arr1(1:4))
+myu%tarr(1)%arr1(1) = myu%tarr(1)%arr1(1) + 1
+!$omp end target
+
+!$omp target
+myu%tarr(1)%arr1(1) = myu%tarr(1)%arr1(1) + 1
+!$omp end target
+
+if (myu%tarr(1)%arr1(1).ne.3) stop 1  ! 1 initially, plus one per active target region.
+
+end program myprog
+
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-11.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-11.f90
new file mode 100644
index 0000000..0fc424a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-11.f90
@@ -0,0 +1,38 @@
+! { dg-do run }
+
+program myprog
+type t
+  integer, dimension (8) :: arr1
+end type t
+type u
+  type(t) :: t_elem
+end type u
+
+type(u) :: myu
+! The unnamed t mapper covers arr1(5:8); the named tmapper covers arr1(1:4) and is used by the u mapper.
+!$omp declare mapper (t :: x) map(x%arr1(5:8))
+!$omp declare mapper (tmapper: t :: x) map(x%arr1(1:4))
+!$omp declare mapper (u :: x) map(mapper(tmapper), tofrom: x%t_elem)
+
+myu%t_elem%arr1(1) = 1
+myu%t_elem%arr1(5) = 1
+
+! Different ways of invoking nested mappers, named vs. unnamed
+
+!$omp target map(tofrom:myu%t_elem)
+myu%t_elem%arr1(5) = myu%t_elem%arr1(5) + 1
+!$omp end target
+
+!$omp target map(tofrom:myu)
+myu%t_elem%arr1(1) = myu%t_elem%arr1(1) + 1
+!$omp end target
+
+!$omp target
+myu%t_elem%arr1(1) = myu%t_elem%arr1(1) + 1
+!$omp end target
+
+if (myu%t_elem%arr1(1).ne.3) stop 1  ! 1 initially, incremented in the second and third regions.
+if (myu%t_elem%arr1(5).ne.2) stop 2  ! 1 initially, incremented in the first region only.
+
+end program myprog
+
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-12.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-12.f90
new file mode 100644
index 0000000..a475501
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-12.f90
@@ -0,0 +1,33 @@
+! { dg-do run }
+
+program myprog
+type t
+  integer, dimension (8) :: arr1
+end type t
+type u
+  type(t) :: t_elem
+end type u
+
+type(u) :: myu
+! Only a named mapper exists for t here; the u mapper invokes it explicitly.
+!$omp declare mapper (tmapper: t :: x) map(x%arr1(1:4))
+!$omp declare mapper (u :: x) map(mapper(tmapper), tofrom: x%t_elem)
+
+myu%t_elem%arr1(1) = 1
+
+!$omp target map(tofrom:myu%t_elem)
+myu%t_elem%arr1(1) = myu%t_elem%arr1(1) + 1
+!$omp end target
+
+!$omp target map(tofrom:myu)
+myu%t_elem%arr1(1) = myu%t_elem%arr1(1) + 1
+!$omp end target
+
+!$omp target
+myu%t_elem%arr1(1) = myu%t_elem%arr1(1) + 1
+!$omp end target
+
+if (myu%t_elem%arr1(1).ne.4) stop 1  ! 1 initially, plus one per target region.
+
+end program myprog
+
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-13.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-13.f90
new file mode 100644
index 0000000..3cae0fe
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-13.f90
@@ -0,0 +1,49 @@
+! { dg-do run }
+
+module mymod
+type S
+integer :: a
+integer :: b
+integer :: c
+end type S
+! Default mapper for S, declared at module scope: maps component c only.
+!$omp declare mapper (S :: x) map(x%c)
+end module mymod
+
+program myprog
+use mymod
+type T
+integer :: a
+integer :: b
+integer :: c
+end type T
+
+type(S) :: mys
+type(T) :: myt
+! The default mapper for T is declared here; the one for S comes from mymod.
+!$omp declare mapper (T :: x) map(x%b)
+
+myt%a = 0
+myt%b = 0
+myt%c = 0
+mys%a = 0
+mys%b = 0
+mys%c = 0
+
+!$omp target
+myt%b = myt%b + 1
+!$omp end target
+
+!$omp target
+mys%c = mys%c + 1
+!$omp end target
+
+!$omp target
+myt%b = myt%b + 2
+mys%c = mys%c + 3
+!$omp end target
+
+if (myt%b.ne.3) stop 1  ! 0 + 1 + 2
+if (mys%c.ne.4) stop 2  ! 0 + 1 + 3
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-15.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-15.f90
new file mode 100644
index 0000000..eb0dd5f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-15.f90
@@ -0,0 +1,24 @@
+! { dg-do run }
+
+program myprog
+
+type A
+character(len=20) :: string1
+character(len=:), pointer :: string2
+end type A
+! Mapper over a fixed-length and a deferred-length pointer character component.
+!$omp declare mapper (A :: x) map(to:x%string1) map(from:x%string2)
+
+type(A) :: var
+
+allocate(character(len=20) :: var%string2)
+
+var%string1 = "hello world"
+
+!$omp target map(to:var%string1) map(from:var%string2)
+var%string2 = var%string1
+!$omp end target
+
+if (var%string2.ne."hello world") stop 1  ! string2 must hold the value assigned in the target region.
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-17.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-17.f90
new file mode 100644
index 0000000..c215971
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-17.f90
@@ -0,0 +1,92 @@
+! { dg-do run }
+
+program myprog
+
+type A
+integer :: x
+integer :: y(20)
+integer, dimension(:), pointer :: z
+end type A
+
+integer, target :: arr1(20), arr2(20)
+type(A) :: p, q
+! Each mysub* routine declares its own mapper for A and copies y into z inside a target region.
+p%y = 0
+q%y = 0
+
+p%z => arr1
+q%z => arr2
+
+call mysub (p, q)
+
+if (p%z(1).ne.1) stop 1  ! z(1) receives y(1), which the target region raised from 0 to 1.
+if (q%z(1).ne.1) stop 2
+
+p%y = 0  ! Reset before the next variant.
+q%y = 0
+p%z = 0
+q%z = 0
+
+call mysub2 (p, q)
+
+if (p%z(1).ne.1) stop 3
+if (q%z(1).ne.1) stop 4
+
+p%y = 0  ! Reset before the next variant.
+q%y = 0
+p%z = 0
+q%z = 0
+
+call mysub3 (p, q)
+
+if (p%z(1).ne.1) stop 5
+if (q%z(1).ne.1) stop 6
+
+contains
+
+subroutine mysub(arg1, arg2)  ! Implicit mapping with an always/to + tofrom mapper.
+implicit none
+type(A), intent(inout) :: arg1
+type(A), intent(inout) :: arg2
+
+!$omp declare mapper (A :: x) map(always, to:x) map(tofrom:x%z(:))
+
+!$omp target
+arg1%y(1) = arg1%y(1) + 1
+arg1%z = arg1%y
+arg2%y(1) = arg2%y(1) + 1
+arg2%z = arg2%y
+!$omp end target
+end subroutine mysub
+
+subroutine mysub2(arg1, arg2)  ! Implicit mapping with a to + from mapper.
+implicit none
+type(A), intent(inout) :: arg1
+type(A), intent(inout) :: arg2
+
+!$omp declare mapper (A :: x) map(to:x) map(from:x%z(:))
+
+!$omp target
+arg1%y(1) = arg1%y(1) + 1
+arg1%z = arg1%y
+arg2%y(1) = arg2%y(1) + 1
+arg2%z = arg2%y
+!$omp end target
+end subroutine mysub2
+
+subroutine mysub3(arg1, arg2)  ! Same mapper as mysub2, but the arguments are mapped explicitly.
+implicit none
+type(A), intent(inout) :: arg1
+type(A), intent(inout) :: arg2
+
+!$omp declare mapper (A :: x) map(to:x) map(from:x%z(:))
+
+!$omp target map(arg1, arg2)
+arg1%y(1) = arg1%y(1) + 1
+arg1%z = arg1%y
+arg2%y(1) = arg2%y(1) + 1
+arg2%z = arg2%y
+!$omp end target
+end subroutine mysub3
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-18.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-18.f90
new file mode 100644
index 0000000..a333b68
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-18.f90
@@ -0,0 +1,46 @@
+! { dg-do run }
+
+module mymod
+type F
+integer :: a, b, c
+integer, dimension(10) :: d
+end type F
+
+type G
+integer :: x, y
+type(F), pointer :: myf
+integer :: z
+end type G
+
+! Check that nested mappers work inside modules.
+! The G mapper maps the myf pointer component, whose type F has its own mapper.
+!$omp declare mapper (F :: f) map(to: f%b) map(f%d)
+!$omp declare mapper (G :: g) map(tofrom: g%myf)
+
+end module mymod
+
+program myprog
+use mymod
+
+type(F), target :: ftmp
+type(G) :: gvar
+
+gvar%myf => ftmp
+
+gvar%myf%d = 0
+! Three ways of mapping: the pointer component, the whole variable, and implicit mapping.
+!$omp target map(gvar%myf)
+gvar%myf%d(1) = gvar%myf%d(1) + 1
+!$omp end target
+
+!$omp target map(gvar)
+gvar%myf%d(1) = gvar%myf%d(1) + 1
+!$omp end target
+
+!$omp target
+gvar%myf%d(1) = gvar%myf%d(1) + 1
+!$omp end target
+
+if (gvar%myf%d(1).ne.3) stop 1  ! 0 initially, plus one per target region.
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-19.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-19.f90
new file mode 100644
index 0000000..d864975
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-19.f90
@@ -0,0 +1,29 @@
+! { dg-do run }
+
+program myprog
+type F
+integer :: a, b, c
+integer, dimension(10) :: d
+end type F
+
+type(F), pointer :: myf
+! Mapper applied to a scalar pointer variable of type F; only component d is mapped.
+!$omp declare mapper (F :: f) map(f%d)
+
+allocate(myf)
+
+myf%d = 0
+
+!$omp target map(myf)
+myf%d(1) = myf%d(1) + 1
+!$omp end target
+
+!$omp target
+myf%d(1) = myf%d(1) + 1
+!$omp end target
+
+if (myf%d(1).ne.2) stop 1  ! 0 initially, plus one per target region.
+
+deallocate(myf)
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-2.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-2.f90
new file mode 100644
index 0000000..ec1c0ec
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-2.f90
@@ -0,0 +1,32 @@
+! { dg-do run }
+
+program myprog
+type s
+  integer :: c
+  integer :: d(99)
+end type s
+
+type t
+  type(s) :: mys
+end type t
+
+type u
+  type(t) :: myt
+end type u
+
+type(u) :: myu
+! The mapped section of d is bounded by the component c of the same object.
+!$omp declare mapper (t :: x) map(tofrom: x%mys%c) map(x%mys%d(1:x%mys%c))
+
+myu%myt%mys%c = 1
+myu%myt%mys%d = 0
+
+!$omp target map(tofrom: myu%myt)
+myu%myt%mys%d(1) = myu%myt%mys%d(1) + 1
+myu%myt%mys%c = myu%myt%mys%c + 2
+!$omp end target
+
+if (myu%myt%mys%d(1).ne.1) stop 1  ! 0 + 1
+if (myu%myt%mys%c.ne.3) stop 2  ! 1 + 2
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-20.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-20.f90
new file mode 100644
index 0000000..2068828
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-20.f90
@@ -0,0 +1,29 @@
+! { dg-do run }
+
+program myprog
+type F
+integer :: a, b, c
+integer, dimension(10) :: d
+end type F
+
+type(F), allocatable :: myf
+! Mapper that maps the whole object, applied to a scalar allocatable of type F.
+!$omp declare mapper (F :: f) map(f)
+
+allocate(myf)
+
+myf%d = 0
+
+!$omp target map(myf)
+myf%d(1) = myf%d(1) + 1
+!$omp end target
+
+!$omp target
+myf%d(1) = myf%d(1) + 1
+!$omp end target
+
+if (myf%d(1).ne.2) stop 1  ! 0 initially, plus one per target region.
+
+deallocate(myf)
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-21.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-21.f90
new file mode 100644
index 0000000..4b8db8b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-21.f90
@@ -0,0 +1,24 @@
+! { dg-do run }
+
+program myprog
+
+type A
+character(len=20) :: string1
+character(len=:), allocatable :: string2
+end type A
+! Mapper over a fixed-length and a deferred-length allocatable character component.
+!$omp declare mapper (A :: x) map(to:x%string1) map(from:x%string2)
+
+type(A) :: var
+
+allocate(character(len=20) :: var%string2)
+
+var%string1 = "hello world"
+
+!$omp target
+var%string2 = var%string1
+!$omp end target
+
+if (var%string2.ne."hello world") stop 1  ! string2 must hold the value assigned in the target region.
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-25.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-25.f90
new file mode 100644
index 0000000..dc1f527
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-25.f90
@@ -0,0 +1,44 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+type t
+integer, allocatable :: arr(:)
+end type t
+! Two named mappers selecting the odd- and even-indexed elements of arr.
+!$omp declare mapper(odd: T :: tv) map(tv%arr(1::2))
+!$omp declare mapper(even: T :: tv) map(tv%arr(2::2))
+
+type(t) :: var
+integer :: i
+
+allocate(var%arr(100))
+
+var%arr = 0
+
+!$omp target enter data map(to: var)
+
+var%arr = 1  ! Host-only change; only the elements selected by the odd mapper are pushed next.
+
+!$omp target update to(mapper(odd): var)
+
+!$omp target
+do i=1,100
+  if (mod(i,2).eq.0.and.var%arr(i).ne.0) stop 1  ! Even elements still hold the initial 0.
+  if (mod(i,2).eq.1.and.var%arr(i).ne.1) stop 2  ! Odd elements were updated to 1.
+end do
+!$omp end target
+
+var%arr = 2  ! Host-only change; only the elements selected by the even mapper are pushed next.
+
+!$omp target update to(mapper(even): var)
+
+!$omp target
+do i=1,100
+  if (mod(i,2).eq.0.and.var%arr(i).ne.2) stop 3  ! Even elements were updated to 2.
+  if (mod(i,2).eq.1.and.var%arr(i).ne.1) stop 4  ! Odd elements keep the earlier 1.
+end do
+!$omp end target
+
+!$omp target exit data map(delete: var)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-28.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-28.f90
new file mode 100644
index 0000000..6561dec
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-28.f90
@@ -0,0 +1,38 @@
+! { dg-do run }
+
+program p
+! Maps a derived-type function result variable using a locally declared mapper.
+type t
+integer :: x, y
+end type t
+
+type(t) :: var
+
+var%x = 0
+var%y = 0
+
+var = sub(7)
+if (var%x.ne.5 .or. var%y.ne.7) stop 3  ! The host-side function result must be returned intact.
+contains
+
+type(t) function sub(arg)
+integer :: arg
+
+!$omp declare mapper (t :: tvar) map(tvar%x, tvar%y)
+
+!$omp target enter data map(alloc: sub)
+
+sub%x = 5
+sub%y = arg
+
+!$omp target update to(sub)
+
+!$omp target
+if (sub%x.ne.5) stop 1  ! Device copy must match the host values pushed by the update.
+if (sub%y.ne.7) stop 2
+!$omp end target
+
+!$omp target exit data map(release: sub)
+
+end function sub
+end program p
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-3.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-3.f90
new file mode 100644
index 0000000..517096d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-3.f90
@@ -0,0 +1,33 @@
+program myprog
+type s
+  integer :: c
+  integer :: d(99)
+end type s
+
+type t
+  type(s) :: mys
+end type t
+
+type u
+  type(t) :: myt
+end type u
+
+type(u) :: myu
+! Mappers are chained u -> t -> s; the s mapper bounds the section of d by component c.
+!$omp declare mapper (s :: x) map(tofrom: x%c, x%d(1:x%c))
+!$omp declare mapper (t :: x) map(tofrom: x%mys)
+!$omp declare mapper (u :: x) map(tofrom: x%myt)
+
+myu%myt%mys%c = 1
+myu%myt%mys%d = 0
+
+! Nested mappers.
+
+!$omp target map(tofrom: myu)
+myu%myt%mys%d(1) = myu%myt%mys%d(1) + 1
+!$omp end target
+
+if (myu%myt%mys%c.ne.1) stop 1  ! c is not modified in the target region.
+if (myu%myt%mys%d(1).ne.1) stop 2  ! 0 + 1
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-30.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-30.f90
new file mode 100644
index 0000000..bfac28c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-30.f90
@@ -0,0 +1,24 @@
+! { dg-do run }
+
+type t
+integer :: x, y
+integer, allocatable :: arr(:)
+end type t
+! Unnamed mapper for t: maps the allocatable component arr.
+!$omp declare mapper (t :: x) map(x%arr)
+
+type(t) :: var
+
+allocate(var%arr(1:20))
+
+var%arr = 0
+
+! The mapper named literally 'default' should be the default mapper, i.e.
+! the same as the unnamed mapper defined above.
+!$omp target map(mapper(default), tofrom: var)
+var%arr(5) = 5
+!$omp end target
+
+if (var%arr(5).ne.5) stop 1  ! The device-side store must be visible on the host.
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-4.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-4.f90
new file mode 100644
index 0000000..266845f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-4.f90
@@ -0,0 +1,40 @@
+! { dg-do run }
+
+program myprog
+type s
+  integer :: c
+  integer, allocatable :: d(:)
+end type s
+
+type t
+  type(s) :: mys
+end type t
+
+type u
+  type(t) :: myt
+end type u
+
+type(u) :: myu
+
+! Here, the mappers are declared out of order, but earlier ones can still
+! trigger mappers defined later.  Implementation-wise, this happens during
+! resolution, but from the user perspective it appears to happen at
+! instantiation time -- at which point all mappers are visible.  I think
+! that makes sense.
+!$omp declare mapper (u :: x) map(tofrom: x%myt)
+!$omp declare mapper (t :: x) map(tofrom: x%mys)
+!$omp declare mapper (s :: x) map(tofrom: x%c, x%d(1:x%c))
+
+allocate(myu%myt%mys%d(1:20))
+
+myu%myt%mys%c = 1  ! With c = 1 the s mapper maps the section d(1:1).
+myu%myt%mys%d = 0
+
+!$omp target map(tofrom: myu)
+myu%myt%mys%d(1) = myu%myt%mys%d(1) + 1
+!$omp end target
+
+! Note: we only mapped the first element of the array 'd'.
+if (myu%myt%mys%d(1).ne.1) stop 1
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-6.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-6.f90
new file mode 100644
index 0000000..9ebf8da
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-6.f90
@@ -0,0 +1,28 @@
+! { dg-do run }
+
+program myprog
+type bounds
+  integer :: lo
+  integer :: hi
+end type bounds
+
+integer, allocatable :: myarr(:)
+type(bounds) :: b
+
+! Use the placeholder variable, but not at the top level.
+!$omp declare mapper (bounds :: x) map(tofrom: myarr(x%lo:x%hi))
+
+allocate (myarr(1:100))
+
+b%lo = 4  ! With lo = 4 and hi = 6 the mapper maps myarr(4:6).
+b%hi = 6
+
+myarr = 0
+
+!$omp target map(tofrom: b)
+myarr(5) = myarr(5) + 1
+!$omp end target
+
+if (myarr(5).ne.1) stop 1  ! Element 5 lies inside the mapped section 4:6.
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-7.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-7.f90
new file mode 100644
index 0000000..6297c8e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-7.f90
@@ -0,0 +1,29 @@
+! { dg-do run }
+
+program myprog
+type s
+  integer :: a
+  integer :: b
+end type s
+
+type t
+  type(s) :: mys
+end type t
+
+type(t) :: myt
+
+! Identity mapper
+! The s mapper maps its own placeholder x as a whole; the t mapper maps the mys component.
+!$omp declare mapper (s :: x) map(tofrom: x)
+!$omp declare mapper (t :: x) map(tofrom: x%mys)
+
+myt%mys%a = 0
+myt%mys%b = 0
+
+!$omp target map(tofrom: myt)
+myt%mys%a = myt%mys%a + 1
+!$omp end target
+
+if (myt%mys%a.ne.1) stop 1  ! 0 + 1
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-8.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-8.f90
new file mode 100644
index 0000000..254486b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-8.f90
@@ -0,0 +1,115 @@
+! { dg-do run }
+
+program myprog
+type t
+  integer, dimension (8) :: arr1
+end type t
+type u
+  integer, dimension (9) :: arr1
+end type u
+type v
+  integer, dimension (10) :: arr1
+end type v
+type w
+  integer, dimension (11) :: arr1
+end type w
+type y
+  integer, dimension(:), pointer :: ptr1
+end type y
+type z
+  integer, dimension(:), pointer :: ptr1
+end type z
+! One mapper per type, each using a different form of array or pointer map clause.
+!$omp declare mapper (t::x) map(tofrom:x%arr1)
+!$omp declare mapper (u::x) map(tofrom:x%arr1(:))
+!$omp declare mapper (v::x) map(always,tofrom:x%arr1(1:3))
+!$omp declare mapper (w::x) map(tofrom:x%arr1(1))
+!$omp declare mapper (y::x) map(tofrom:x%ptr1)
+!$omp declare mapper (z::x) map(to:x%ptr1) map(tofrom:x%ptr1(1:3))
+
+type(t) :: myt
+type(u) :: myu
+type(v) :: myv
+type(w) :: myw
+type(y) :: myy
+integer, target, dimension(8) :: arrtgt
+type(z) :: myz
+integer, target, dimension(8) :: arrtgt2
+
+myy%ptr1 => arrtgt
+myz%ptr1 => arrtgt2
+
+myt%arr1 = 0
+
+!$omp target map(myt)
+myt%arr1(1) = myt%arr1(1) + 1
+!$omp end target
+
+!$omp target
+myt%arr1(1) = myt%arr1(1) + 1
+!$omp end target
+
+if (myt%arr1(1).ne.2) stop 1  ! 0 initially, plus one per target region.
+
+myu%arr1 = 0
+
+!$omp target map(tofrom:myu%arr1(:))
+myu%arr1(1) = myu%arr1(1) + 1
+!$omp end target
+
+!$omp target
+myu%arr1(1) = myu%arr1(1) + 1
+!$omp end target
+
+if (myu%arr1(1).ne.2) stop 2
+
+myv%arr1 = 0
+
+!$omp target map(always,tofrom:myv%arr1(1:3))
+myv%arr1(1) = myv%arr1(1) + 1
+!$omp end target
+
+!$omp target
+myv%arr1(1) = myv%arr1(1) + 1
+!$omp end target
+
+if (myv%arr1(1).ne.2) stop 3
+
+myw%arr1 = 0
+
+!$omp target map(tofrom:myw%arr1(1))
+myw%arr1(1) = myw%arr1(1) + 1
+!$omp end target
+
+!$omp target
+myw%arr1(1) = myw%arr1(1) + 1
+!$omp end target
+
+if (myw%arr1(1).ne.2) stop 4
+
+myy%ptr1 = 0
+
+!$omp target map(tofrom:myy%ptr1)
+myy%ptr1(1) = myy%ptr1(1) + 1
+!$omp end target
+
+!$omp target map(to:myy%ptr1) map(tofrom:myy%ptr1(1:2))
+myy%ptr1(1) = myy%ptr1(1) + 1
+!$omp end target
+
+!$omp target
+myy%ptr1(1) = myy%ptr1(1) + 1
+!$omp end target
+
+if (myy%ptr1(1).ne.3) stop 5  ! Three target regions incremented element 1.
+
+myz%ptr1(1) = 0
+
+!$omp target
+myz%ptr1(1) = myz%ptr1(1) + 1
+!$omp end target
+
+if (myz%ptr1(1).ne.1) stop 6  ! Only one (implicitly mapped) target region for myz.
+
+end program myprog
+
diff --git a/libgomp/testsuite/libgomp.fortran/declare-mapper-9.f90 b/libgomp/testsuite/libgomp.fortran/declare-mapper-9.f90
new file mode 100644
index 0000000..deaf30b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/declare-mapper-9.f90
@@ -0,0 +1,27 @@
+! { dg-do run }
+
+type t
+  integer, dimension (8) :: arr1
+end type t
+type u
+  type(t), dimension (:), pointer :: tarr
+end type u
+
+type(u) :: myu
+type(t), dimension (1), target :: myarray
+! The unnamed u mapper maps the tarr pointer and applies the mapper called "named" to its first element.
+!$omp declare mapper (named: t :: x) map(x%arr1(1:4))
+!$omp declare mapper (u :: x) map(to: x%tarr) map(mapper(named), tofrom: x%tarr(1))
+
+myu%tarr => myarray
+myu%tarr(1)%arr1 = 0
+
+! Unnamed mapper invoking named mapper
+
+!$omp target
+myu%tarr(1)%arr1(1) = myu%tarr(1)%arr1(1) + 1
+!$omp end target
+
+if (myu%tarr(1)%arr1(1).ne.1) stop 1  ! 0 + 1
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/mapper-iterators-1.f90 b/libgomp/testsuite/libgomp.fortran/mapper-iterators-1.f90
new file mode 100644
index 0000000..d0f2bc3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/mapper-iterators-1.f90
@@ -0,0 +1,38 @@
+program myprog
+  type t
+    integer :: size
+    integer :: arr(99)
+  end type t
+
+  type u
+    type(t) :: myt
+  end type u
+
+  integer :: i, j
+  integer, parameter :: N = 10
+  type(u) :: x(N)
+  ! Nested mappers (u -> t) applied to every element selected by a map-clause iterator.
+  !$omp declare mapper (t :: x) map(tofrom: x%size, x%arr(1:x%size))
+  !$omp declare mapper (u :: x) map(tofrom: x%myt)
+
+  do i = 1, N
+    x(i)%myt%size = 99  ! Full length of arr, so the t mapper maps arr(1:99).
+    do j = 1, 99
+      x(i)%myt%arr(j) = i*j
+    end do
+  end do
+
+  !$omp target map(iterator(i=1:N), tofrom: x(i))
+    do i = 1, N
+      do j = 1, 99
+        x(i)%myt%arr(j) = x(i)%myt%arr(j) + 1
+      end do
+    end do
+  !$omp end target
+
+  do i = 1, N
+    do j = 1, 99
+      if (x(i)%myt%arr(j) /= i*j + 1) stop 1  ! Every element must have been incremented once.
+    end do
+  end do
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/mapper-iterators-2.f90 b/libgomp/testsuite/libgomp.fortran/mapper-iterators-2.f90
new file mode 100644
index 0000000..a28f7cb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/mapper-iterators-2.f90
@@ -0,0 +1,49 @@
+! { dg-do run }
+
+program myprog
+type t
+  integer, dimension (8) :: arr1
+end type t
+type u
+  type(t) :: t_elem
+end type u
+
+integer :: i
+integer, parameter :: N = 10
+type(u) :: myu(N)
+! The unnamed t mapper covers arr1(5:8); the named tmapper covers arr1(1:4) and is used by the u mapper.
+!$omp declare mapper (t :: x) map(x%arr1(5:8))
+!$omp declare mapper (tmapper: t :: x) map(x%arr1(1:4))
+!$omp declare mapper (u :: x) map(mapper(tmapper), tofrom: x%t_elem)
+
+do i = 1, N
+  myu(i)%t_elem%arr1(1) = 1
+  myu(i)%t_elem%arr1(5) = 1
+end do
+
+! Different ways of invoking nested mappers, named vs. unnamed
+! NOTE(review): Fortran is case-insensitive, so iterator n and parameter N share a name - confirm intended.
+!$omp target map(iterator (n=1:N) tofrom:myu(n)%t_elem)
+do i = 1, N
+  myu(i)%t_elem%arr1(5) = myu(i)%t_elem%arr1(5) + 1
+end do
+!$omp end target
+
+!$omp target map(iterator (n=1:N) tofrom:myu(n))
+do i = 1, N
+  myu(i)%t_elem%arr1(1) = myu(i)%t_elem%arr1(1) + 1
+end do
+!$omp end target
+
+!$omp target
+do i = 1, N
+  myu(i)%t_elem%arr1(1) = myu(i)%t_elem%arr1(1) + 1
+end do
+!$omp end target
+
+do i = 1, N
+  if (myu(i)%t_elem%arr1(1).ne.3) stop 1  ! 1 initially, incremented in the second and third regions.
+  if (myu(i)%t_elem%arr1(5).ne.2) stop 2  ! 1 initially, incremented in the first region only.
+end do
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/mapper-iterators-3.f90 b/libgomp/testsuite/libgomp.fortran/mapper-iterators-3.f90
new file mode 100644
index 0000000..c550e73
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/mapper-iterators-3.f90
@@ -0,0 +1,33 @@
+! { dg-do run }
+
+program myprog
+
+type A
+character(len=20) :: string1
+character(len=:), pointer :: string2
+end type A
+
+integer, parameter :: N = 8
+integer :: i
+! Mapper over a fixed-length and a deferred-length pointer character component.
+!$omp declare mapper (A :: x) map(to:x%string1) map(from:x%string2)
+
+type(A) :: var(N)
+
+do i = 1, N
+  allocate(character(len=20) :: var(i)%string2)
+
+  var(i)%string1 = "hello world"
+end do
+! NOTE(review): Fortran is case-insensitive, so iterator n and parameter N share a name - confirm intended.
+!$omp target map(iterator (n=1:N) to:var(n)%string1) map(iterator (n=1:N) from:var(n)%string2)
+do i = 1, N
+  var(i)%string2 = var(i)%string1
+end do
+!$omp end target
+
+do i = 1, N
+  if (var(i)%string2.ne."hello world") stop 1  ! Each string2 must hold the value assigned on the device.
+end do
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/mapper-iterators-4.f90 b/libgomp/testsuite/libgomp.fortran/mapper-iterators-4.f90
new file mode 100644
index 0000000..21db835
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/mapper-iterators-4.f90
@@ -0,0 +1,52 @@
+! { dg-do run }
+
+module mymod
+type F
+integer :: a, b, c
+integer, dimension(10) :: d
+end type F
+
+type G
+integer :: x, y
+type(F), pointer :: myf
+integer :: z
+end type G
+
+! Check that nested mappers work inside modules.
+! The G mapper maps the myf pointer component, whose type F has its own mapper.
+!$omp declare mapper (F :: f) map(to: f%b) map(f%d)
+!$omp declare mapper (G :: g) map(tofrom: g%myf)
+
+end module mymod
+
+program myprog
+use mymod
+
+integer, parameter :: N = 8
+integer :: i
+
+type(F), target :: ftmp(N)
+type(G) :: gvar(N)
+
+do i = 1, N
+  gvar(i)%myf => ftmp(i)
+  gvar(i)%myf%d = 0
+end do
+! NOTE(review): Fortran is case-insensitive, so iterator n and parameter N share a name - confirm intended.
+!$omp target map(iterator (n=1:N) tofrom: gvar(n)%myf)
+do i = 1, N
+  gvar(i)%myf%d(1) = gvar(i)%myf%d(1) + 1
+end do
+!$omp end target
+
+!$omp target map(iterator (n=1:N) tofrom: gvar(n))
+do i = 1, N
+  gvar(i)%myf%d(1) = gvar(i)%myf%d(1) + 1
+end do
+!$omp end target
+
+do i = 1, N
+  if (gvar(i)%myf%d(1).ne.2) stop 1  ! 0 initially, plus one per target region.
+end do
+
+end program myprog
diff --git a/libgomp/testsuite/libgomp.fortran/need-device-ptr.f90 b/libgomp/testsuite/libgomp.fortran/need-device-ptr.f90
new file mode 100644
index 0000000..c75688c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/need-device-ptr.f90
@@ -0,0 +1,132 @@
+! Comprehensive non-array testcase for need_device_ptr / need_device_addr
+
+module m
+  use iso_c_binding
+  implicit none (type, external)
+
+  integer :: case = 0
+contains
+  subroutine var_ptr_f(n, x, y, z)  ! Variant of base_ptr_f; n is folded into the stop codes.
+    integer, value :: n
+    type(c_ptr) :: x
+    type(c_ptr), value :: y
+    type(c_ptr), optional :: z  ! NOTE(review): used below without a present() check.
+    !$omp target is_device_ptr(x,y,z)
+    block
+      integer, pointer :: ix, iy, iz
+      call c_f_pointer(x, ix)
+      call c_f_pointer(y, iy)
+      call c_f_pointer(z, iz)
+      if (ix /= 52) stop n*10 + 1
+      if (iy /= 85) stop n*10 + 2
+      if (iz /= 52) stop n*10 + 5  ! Every dispatch call in test_f passes the first pointer again as z.
+    end block
+  end
+  subroutine base_ptr_f(n, x, y, z)  ! Base routine; declares var_ptr_f as its dispatch variant.
+    !$omp declare variant(var_ptr_f) match(construct={dispatch}) adjust_args(need_device_ptr : x, y, z)
+    integer, value :: n
+    type(c_ptr) :: x
+    type(c_ptr), value :: y
+    type(c_ptr), optional :: z
+    error stop n  ! Fails the test, reporting the call number, if the base body itself is executed.
+  end
+
+  subroutine var_caddr_f(x, y)  ! Variant intended for base_caddr_f, which is currently commented out.
+    type(c_ptr) :: x
+    type(c_ptr), optional :: y  ! NOTE(review): used below without a present() check.
+    !$omp target has_device_addr(x, y)
+    block
+      integer, pointer :: ix, iy
+      call c_f_pointer(x, ix)
+      call c_f_pointer(y, iy)  ! iy must come from y, not x, for the 85 check below to be meaningful.
+      if (ix /= 52) stop 3
+      if (iy /= 85) stop 6
+    end block
+  end
+! FIXME: optional args give a "sorry".
+!  subroutine base_caddr_f(x, y)
+!    !$omp declare variant(var_caddr_f) match(construct={dispatch}) adjust_args(need_device_addr : x, y)
+!    type(c_ptr) :: x
+!    type(c_ptr), optional :: y
+!    error stop
+!  end
+
+  subroutine var_iaddr_f(x,y)  ! Variant intended for base_iaddr_f, which is currently commented out.
+    integer :: x
+    integer, optional :: y  ! NOTE(review): used below without a present() check.
+    !$omp target has_device_addr(x, y)
+    block
+      if (x /= 52) stop 4
+      if (y /= 85) stop 7  ! Distinct code so a failure in y is not mistaken for one in x.
+    end block
+  end
+
+! FIXME: optional args give a "sorry".
+!  subroutine base_iaddr_f(x,y)
+!    !$omp declare variant(var_iaddr_f) match(construct={dispatch}) adjust_args(need_device_addr : x, y)
+!    integer :: x
+!    integer, optional :: y
+!    error stop
+!  end
+
+  subroutine test_f(carg1, carg2, carg1v, carg2v, iarg1, iarg2)
+    type(c_ptr) :: carg1, carg2
+    type(c_ptr), value :: carg1v, carg2v
+    integer, target :: iarg1, iarg2
+    type(c_ptr) :: cptr1, cptr2
+    integer, target :: ivar1, ivar2
+
+    ! Calls base_ptr_f under dispatch with several kinds of c_ptr actual arguments.
+    ivar1 = 52
+    ivar2 = 85
+
+    !$omp target enter data map(to: ivar1, ivar2)
+
+    cptr1 = c_loc(ivar1)
+    cptr2 = c_loc(ivar2)
+
+    !$omp dispatch
+       call base_ptr_f (1, carg1, carg2, carg1)
+    !$omp dispatch
+       call base_ptr_f (2, carg1v, carg2v, carg1v)
+    !$omp dispatch
+       call base_ptr_f (3, cptr1, cptr2, cptr1)
+    !$omp dispatch
+       call base_ptr_f (4, c_loc(iarg1), c_loc(iarg2), c_loc(iarg1))
+    !$omp dispatch
+       call base_ptr_f (6, c_loc(ivar1), c_loc(ivar2), c_loc(ivar1))
+
+! FIXME: optional argument functions not supported yet.
+!    !$omp dispatch
+!       call base_caddr_f (carg1, carg2)
+!    !$omp dispatch
+!       call base_caddr_f (carg1v, carg2v)
+!    !$omp dispatch
+!       call base_caddr_f (cptr1, cptr2)
+!    !$omp dispatch
+!       call base_caddr_f (c_loc(iarg1), c_loc(iarg2))
+!    !$omp dispatch
+!       call base_caddr_f (c_loc(ivar1), c_loc(ivar2))
+!    !$omp dispatch
+!       call base_iaddr_f (iarg1, iarg2)
+!    !$omp dispatch
+!       call base_iaddr_f (ivar1, iarg2)
+
+    !$omp target exit data map(release: ivar1, ivar2)
+  end
+end module m
+
+use m
+implicit none
+integer, target :: mx, my
+type(c_ptr) :: cptr1, cptr2
+mx = 52  ! 52 and 85 are the values var_ptr_f checks for.
+my = 85
+
+cptr1 = c_loc(mx)
+cptr2 = c_loc(my)
+
+!$omp target data map(to: mx, my)
+  call test_f (cptr1, cptr2, cptr1, cptr2, mx, my)
+!$omp end target data
+end
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-1.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-1.f90
new file mode 100644
index 0000000..6ee87e8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-1.f90
@@ -0,0 +1,54 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+implicit none
+integer, allocatable, target :: arr(:), arr2(:,:)
+integer, pointer :: ap(:), ap2(:,:)
+integer :: i, j
+
+allocate(arr(1:20))
+
+arr = 0
+
+!$omp target enter data map(to: arr)
+
+ap => arr(1:20:2)  ! Strided view of the odd-indexed elements of arr.
+ap = 5
+
+!$omp target update to(ap)
+
+!$omp target exit data map(from: arr)
+
+do i=1,20
+  if (mod(i,2).eq.1.and.arr(i).ne.5) stop 1  ! Odd elements were set through ap.
+  if (mod(i,2).eq.0.and.arr(i).ne.0) stop 2  ! Even elements keep the initial 0.
+end do
+
+allocate(arr2(1:20,1:20))
+
+ap2 => arr2(2:10:2,3:12:3)  ! Strided 2-D view: rows 2,4,..,10 and columns 3,6,9,12.
+
+arr2 = 1
+
+!$omp target enter data map(to: arr2)
+
+!$omp target
+ap2 = 5
+!$omp end target
+
+!$omp target update from(ap2)
+
+do i=1,20
+  do j=1,20
+    if (i.ge.2.and.i.le.10.and.mod(i-2,2).eq.0.and.&
+        &j.ge.3.and.j.le.12.and.mod(j-3,3).eq.0) then
+      if (arr2(i,j).ne.5) stop 3  ! Element belongs to the ap2 section.
+    else
+      if (arr2(i,j).ne.1) stop 4  ! Element outside the section keeps the host value.
+    end if
+  end do
+end do
+
+!$omp target exit data map(delete: arr2)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-10.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-10.f90
new file mode 100644
index 0000000..c47ce38
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-10.f90
@@ -0,0 +1,29 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+! Strided 'target update from' on an allocatable array of fixed-length strings.
+character(len=8), allocatable, dimension(:) :: lines
+integer :: i
+
+allocate(lines(10))
+
+lines = "OMPHELLO"
+
+!$omp target enter data map(to: lines)
+
+!$omp target
+lines = "NEWVALUE"
+!$omp end target
+! Only elements 5 and 7 are fetched; every other element must keep its host value.
+!$omp target update from(lines(5:7:2))
+
+do i=1,10
+  if (i.eq.5.or.i.eq.7) then
+    if (lines(i).ne."NEWVALUE") stop 1
+  else
+    if (lines(i).ne."OMPHELLO") stop 2
+  end if
+end do
+
+!$omp target exit data map(delete: lines)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-11.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-11.f90
new file mode 100644
index 0000000..a93acf2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-11.f90
@@ -0,0 +1,51 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+! Update a sub-block of a rank-4 explicit-shape dummy array with variable bounds.
+program p
+implicit none
+real(kind=4) :: arr(10,10,10,10)
+
+call s(arr,9,9,9,9)
+
+contains
+! s sees arr with zero-based bounds 0:m etc.; the caller passes 9 for each upper bound.
+subroutine s(arr,m,n,o,p)
+implicit none
+integer :: i,m,n,o,p
+integer :: a,b,c,d
+real(kind=4) :: arr(0:m,0:n,0:o,0:p)
+
+arr = 0
+
+!$omp target enter data map(to: arr)
+
+!$omp target
+do i=0,9
+  arr(i,i,i,i) = i
+end do
+!$omp end target
+! Fetch only the 3x3x3x3 corner; of the diagonal, only entries 0..2 lie inside it.
+!$omp target update from(arr(0:2,0:2,0:2,0:2))
+
+do a=0,9
+  do b=0,9
+    do c=0,9
+      do d=0,9
+        if (a.le.2.and.b.le.2.and.c.le.2.and.d.le.2) then
+          if (a.eq.b.and.b.eq.c.and.c.eq.d) then
+            if (arr(a,b,c,d).ne.a) stop 1
+          else
+            if (arr(a,b,c,d).ne.0) stop 2
+          end if
+        else
+          if (arr(a,b,c,d).ne.0) stop 3
+        end if
+      end do
+    end do
+  end do
+end do
+
+!$omp target exit data map(delete: arr)
+
+end subroutine s
+end program p
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-12.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-12.f90
new file mode 100644
index 0000000..c47fbdb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-12.f90
@@ -0,0 +1,59 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+! Test plain, fixed-size arrays, and also pointers to same.
+
+implicit none
+integer(kind=8) :: arr(10,30)
+integer, target :: arr2(9,11,13)
+integer, pointer :: parr(:,:,:)
+integer :: i, j, k
+
+arr = 0
+!$omp target enter data map(to: arr)
+
+!$omp target
+arr = 99
+!$omp end target
+! Strided in both dimensions: rows 1,4,7,10 and columns 5,12,19,26.
+!$omp target update from(arr(1:10:3,5:30:7))
+
+do i=1,10
+  do j=1,30
+    if (mod(i-1,3).eq.0.and.mod(j-5,7).eq.0) then
+      if (arr(i,j).ne.99) stop 1
+    else
+      if (arr(i,j).ne.0) stop 2
+    endif
+  end do
+end do
+
+!$omp target exit data map(delete: arr)
+! Same again through a rank-3 pointer associated with a fixed-size array.
+arr2 = 0
+parr => arr2
+!$omp target enter data map(to: parr)
+
+!$omp target
+parr = 99
+!$omp end target
+! Selects i = 7,9; j = 5,7; k = 3,6.
+!$omp target update from(parr(7:9:2,5:7:2,3:6:3))
+
+do i=1,9
+  do j=1,11
+    do k=1,13
+      if (i.ge.7.and.j.ge.5.and.k.ge.3.and.&
+          &i.le.9.and.j.le.7.and.k.le.6.and.&
+          &mod(i-7,2).eq.0.and.mod(j-5,2).eq.0.and.mod(k-3,3).eq.0) then
+        if (parr(i,j,k).ne.99) stop 3
+      else
+        if (parr(i,j,k).ne.0) stop 4
+      end if
+    end do
+  end do
+end do
+
+!$omp target exit data map(delete: parr)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-13.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-13.f90
new file mode 100644
index 0000000..42f867e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-13.f90
@@ -0,0 +1,42 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+! Strided 'target update to' on a rank-5 allocatable array, verified on the device.
+implicit none
+integer, allocatable :: arr(:,:,:,:,:)
+integer :: i, j, k, l, m
+
+allocate (arr(18,19,20,21,22))
+
+arr = 0
+
+!$omp target enter data map(to: arr)
+! Change the host copy, then push just two elements per dimension.
+arr = 10
+
+!$omp target update to(arr(1:3:2,1:4:3,1:5:4,1:6:5,1:7:6))
+! Verify on the device: exactly the 2**5 selected elements must now be 10.
+!$omp target
+do i=1,18
+  do j=1,19
+    do k=1,20
+      do l=1,21
+        do m=1,22
+          if ((i.eq.1.or.i.eq.3).and.&
+              &(j.eq.1.or.j.eq.4).and.&
+              &(k.eq.1.or.k.eq.5).and.&
+              &(l.eq.1.or.l.eq.6).and.&
+              &(m.eq.1.or.m.eq.7)) then
+            if (arr(i,j,k,l,m).ne.10) stop 1
+          else
+            if (arr(i,j,k,l,m).ne.0) stop 2
+          end if
+        end do
+      end do
+    end do
+  end do
+end do
+!$omp end target
+
+!$omp target exit data map(delete: arr)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-2.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-2.f90
new file mode 100644
index 0000000..2d3efb8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-2.f90
@@ -0,0 +1,101 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+! Rank-3 'target update from' via pointers strided in one or all dimensions.
+program p
+implicit none
+integer, allocatable, target :: arr3(:,:,:)
+integer, pointer :: ap3(:,:,:)
+integer :: i, j, k
+
+allocate(arr3(1:10,1:10,1:10))
+
+! CHECK 1: stride 2 in the third dimension only
+
+arr3 = 0
+ap3 => arr3(1:10,1:10,1:10:2)
+
+!$omp target enter data map(to: arr3)
+
+!$omp target
+ap3 = 5
+!$omp end target
+
+!$omp target update from(ap3)
+
+call check(arr3, 0, 1, 1, 2)
+
+!$omp target exit data map(delete: arr3)
+
+! CHECK 2: stride 2 in the second dimension only
+
+arr3 = 0
+ap3 => arr3(1:10,1:10:2,1:10)
+
+!$omp target enter data map(to: arr3)
+
+!$omp target
+ap3 = 5
+!$omp end target
+
+!$omp target update from(ap3)
+
+call check(arr3, 2, 1, 2, 1)
+
+!$omp target exit data map(delete: arr3)
+
+! CHECK 3: stride 2 in the first dimension only
+
+arr3 = 0
+ap3 => arr3(1:10:2,1:10,1:10)
+
+!$omp target enter data map(to: arr3)
+
+!$omp target
+ap3 = 5
+!$omp end target
+
+!$omp target update from(ap3)
+
+call check(arr3, 4, 2, 1, 1)
+
+!$omp target exit data map(delete: arr3)
+
+! CHECK 4: stride 2 in all three dimensions
+
+arr3 = 0
+ap3 => arr3(1:10:2,1:10:2,1:10:2)
+
+!$omp target enter data map(to: arr3)
+
+!$omp target
+ap3 = 5
+!$omp end target
+
+!$omp target update from(ap3)
+
+call check(arr3, 6, 2, 2, 2)
+
+!$omp target exit data map(delete: arr3)
+
+contains
+! Stop unless elements on the (s1,s2,s3) stride grid are 5 and all others are 0.
+subroutine check(arr,cb,s1,s2,s3)
+implicit none
+integer :: arr(:,:,:)
+integer :: cb, s1, s2, s3
+! cb offsets the stop codes per caller; i, j and k are host-associated from p.
+do i=1,10
+  do j=1,10
+    do k=1,10
+      if (mod(k-1,s1).eq.0.and.mod(j-1,s2).eq.0.and.mod(i-1,s3).eq.0) then
+        if (arr(k,j,i).ne.5) stop cb+1
+      else
+        if (arr(k,j,i).ne.0) stop cb+2
+      end if
+    end do
+  end do
+end do
+
+end subroutine check
+
+end program p
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-3.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-3.f90
new file mode 100644
index 0000000..14f1288
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-3.f90
@@ -0,0 +1,47 @@
+program p
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+! Strided updates of a dummy argument that aliases part of a partially mapped array.
+integer :: A(200)
+A = [(i, i=1,200)]
+!$omp target enter data map(to: A(40:200))
+call foo(A(101:))
+
+contains
+
+subroutine foo(x)
+integer, target :: x(100)
+integer, pointer :: p(:,:)
+integer :: i, j
+! Remap the stride-2 view of x onto a 6x5 pointer with non-default lower bounds.
+p(0:5,-5:-1) => x(::2)
+
+!$omp target
+x = x * 2
+!$omp end target
+
+!$omp target update from(x(1:20:2))
+
+do i=1,20
+if (mod(i,2).eq.1 .and. x(i).ne.(100+i)*2) stop 1
+if (mod(i,2).eq.0 .and. x(i).ne.100+i) stop 2
+end do
+
+!$omp target
+p = 0
+!$omp end target
+
+!$omp target update from(p(::3,::2))
+
+do i=0,5
+  do j=-5,-1
+    if (mod(i,3).eq.0 .and. mod(j+5,2).eq.0) then
+      if (p(i,j).ne.0) stop 3
+    else
+      if (p(i,j).eq.0) stop 4
+    end if
+  end do
+end do
+
+end subroutine foo
+end program p
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-4.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-4.f90
new file mode 100644
index 0000000..46e8c23
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-4.f90
@@ -0,0 +1,78 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+! Update one component of the elements of a derived-type array via a strided pointer.
+type t
+  complex(kind=8) :: c
+  integer :: i
+end type t
+
+type u
+  integer :: i, j
+  complex(kind=8) :: c
+  integer :: k
+end type u
+
+type(t), target :: var(10)
+type(u), target :: var2(10)
+complex(kind=8), pointer :: ptr(:)
+integer :: i
+
+do i=1,10
+  var(i)%c = dcmplx(i,0)
+  var(i)%i = i
+end do
+! ptr selects only the complex component of each element (a strided view of var).
+ptr => var(:)%c
+
+!$omp target enter data map(to: var)
+
+!$omp target
+var(:)%c = dcmplx(0,0)
+var(:)%i = 0
+!$omp end target
+
+!$omp target update from(ptr)
+
+do i=1,10
+  if (var(i)%c.ne.dcmplx(0,0)) stop 1
+  if (var(i)%i.ne.i) stop 2
+end do
+
+!$omp target exit data map(delete: var)
+
+! Now do it again with a differently-ordered derived type and every other element.
+
+do i=1,10
+  var2(i)%c = dcmplx(0,i)
+  var2(i)%i = i
+  var2(i)%j = i * 2
+  var2(i)%k = i * 3
+end do
+
+ptr => var2(::2)%c
+
+!$omp target enter data map(to: var2)
+
+!$omp target
+var2(:)%c = dcmplx(0,0)
+var2(:)%i = 0
+var2(:)%j = 0
+var2(:)%k = 0
+!$omp end target
+
+!$omp target update from(ptr)
+
+do i=1,10
+  if (mod(i,2).eq.1) then
+    if (var2(i)%c.ne.dcmplx(0,0)) stop 3
+  else
+    if (var2(i)%c.ne.dcmplx(0,i)) stop 4
+  end if
+  if (var2(i)%i.ne.i) stop 5
+  if (var2(i)%j.ne.i * 2) stop 6
+  if (var2(i)%k.ne.i * 3) stop 7
+end do
+
+!$omp target exit data map(delete: var2)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-5.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-5.f90
new file mode 100644
index 0000000..9cc20fa3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-5.f90
@@ -0,0 +1,55 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+! Only part of the array (elements 51:100) is mapped on the target.
+
+integer, target :: arr(100)
+integer, pointer :: ptr(:)
+
+arr = [(i * 2, i=1,100)]
+
+!$omp target enter data map(to: arr(51:100))
+
+!$omp target
+arr(51:100) = arr(51:100) + 1
+!$omp end target
+! Fetch only the odd-indexed elements of the mapped part.
+!$omp target update from(arr(51:100:2))
+
+do i=1,100
+  if (i.le.50) then
+    if (arr(i).ne.i*2) stop 1
+  else
+    if (mod(i,2).eq.1 .and. arr(i).ne.i*2+1) stop 2
+    if (mod(i,2).eq.0 .and. arr(i).ne.i*2) stop 3
+  end if
+end do
+
+!$omp target exit data map(delete: arr)
+
+arr = [(i * 2, i=1,100)]
+
+! Similar, but update via pointer.
+
+ptr => arr(51:100)
+
+!$omp target enter data map(to: ptr(1:50))
+
+!$omp target
+ptr = ptr + 1
+!$omp end target
+
+!$omp target update from(ptr(::2))
+! Distinct stop codes so a failure here is not mistaken for one in the first pass.
+do i=1,100
+  if (i.le.50) then
+    if (arr(i).ne.i*2) stop 4
+  else
+    if (mod(i,2).eq.1 .and. arr(i).ne.i*2+1) stop 5
+    if (mod(i,2).eq.0 .and. arr(i).ne.i*2) stop 6
+  end if
+end do
+
+!$omp target exit data map(delete: ptr)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-6.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-6.f90
new file mode 100644
index 0000000..5c42b90
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-6.f90
@@ -0,0 +1,34 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+! Assumed-size arrays
+program p
+implicit none
+integer, dimension(100) :: parr
+integer :: i
+
+parr = [(i, i=1,100)]
+
+!$omp target enter data map(to: parr)
+
+call s(parr)
+
+do i=1,100
+  if (mod(i,3).eq.1 .and. parr(i).ne.999) stop 1
+  if (mod(i,3).ne.1 .and. parr(i).ne.i) stop 2
+end do
+
+!$omp target exit data map(delete: parr)
+
+contains
+subroutine s(arr)
+implicit none
+integer, intent(inout) :: arr(*)
+! arr has no extent of its own, so the region names an explicit section to map.
+!$omp target map(alloc: arr(1:100))
+arr(1:100) = 999
+!$omp end target
+
+!$omp target update from(arr(1:100:3))
+
+end subroutine s
+end program p
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-7.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-7.f90
new file mode 100644
index 0000000..120fd9c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-7.f90
@@ -0,0 +1,36 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+! Assumed-shape arrays
+
+program p
+implicit none
+integer, dimension(100) :: parr
+integer :: i
+
+parr = [(i, i=1,100)]
+
+!$omp target enter data map(to: parr)
+
+call s(parr)
+
+do i=1,100
+  if (mod(i,3).eq.1 .and. parr(i).ne.999) stop 1
+  if (mod(i,3).ne.1 .and. parr(i).ne.i) stop 2
+end do
+
+!$omp target exit data map(delete: parr)
+
+contains
+subroutine s(arr)
+implicit none
+integer, intent(inout) :: arr(:)
+
+!$omp target
+arr = 999
+!$omp end target
+! Fetch every third element only; the caller checks that the rest kept their old value.
+!$omp target update from(arr(1:100:3))
+
+end subroutine s
+end program p
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-8.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-8.f90
new file mode 100644
index 0000000..d9b3c9c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-8.f90
@@ -0,0 +1,39 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+! Test biasing for target-region lookup.
+
+implicit none
+integer, allocatable, target :: var(:,:,:)
+integer, pointer :: p(:,:,:)
+integer :: i, j, k
+
+allocate(var(1:20,5:25,10:30))
+
+var = 0
+
+!$omp target enter data map(to: var)
+
+!$omp target
+var = 99
+!$omp end target
+! Two elements (i = 1 and 3) of an array whose lower bounds differ per dimension.
+p => var(1:3:2,5:5,10:10)
+
+!$omp target update from(p)
+
+do i=1,20
+  do j=5,25
+    do k=10,30
+      if ((i.eq.1.or.i.eq.3).and.j.eq.5.and.k.eq.10) then
+        if (var(i,j,k).ne.99) stop 1
+      else
+        if (var(i,j,k).ne.0) stop 2
+      end if
+    end do
+  end do
+end do
+
+!$omp target exit data map(delete: var)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/noncontig-updates-9.f90 b/libgomp/testsuite/libgomp.fortran/noncontig-updates-9.f90
new file mode 100644
index 0000000..689a46a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/noncontig-updates-9.f90
@@ -0,0 +1,34 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+! This test case hits the problem described in:
+! https://gcc.gnu.org/pipermail/gcc-patches/2023-February/612219.html
+
+! { dg-xfail-run-if "'enter data' bug" { offload_device_nonshared_as } }
+! Strided update of an allocatable array of deferred-length strings.
+character(len=:), allocatable, dimension(:) :: lines
+integer :: i
+
+allocate(character(len=8) :: lines(10))
+
+lines = "OMPHELLO"
+
+!$omp target enter data map(to: lines)
+
+!$omp target
+lines = "NEWVALUE"
+!$omp end target
+! Only elements 5 and 7 are fetched; every other element must keep its host value.
+!$omp target update from(lines(5:7:2))
+
+do i=1,10
+  if (i.eq.5.or.i.eq.7) then
+    if (lines(i).ne."NEWVALUE") stop 1
+  else
+    if (lines(i).ne."OMPHELLO") stop 2
+  end if
+end do
+
+!$omp target exit data map(delete: lines)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/target-13.f90 b/libgomp/testsuite/libgomp.fortran/target-13.f90
index 6aacc77..e6334a5 100644
--- a/libgomp/testsuite/libgomp.fortran/target-13.f90
+++ b/libgomp/testsuite/libgomp.fortran/target-13.f90
@@ -76,7 +76,7 @@ var3a = var3
 
 ! ---------------
 
-!$omp target update from(var1%at(2:3))
+!$omp target update from(var1%at(::2))
 
 if (var1a /= var1) error stop
 if (any (var2a /= var2)) error stop
@@ -134,17 +134,20 @@ var1a%at(2)%a = var1a%at(2)%a * 7
 var1a%at(3)%s = var1a%at(3)%s * (-3)
 
 block
-  integer, volatile :: i1,i2,i3,i4
+  integer, volatile :: i1,i2,i3,i4,i5,i6
   i1 = 1
   i2 = 2
   i3 = 1
-  i4 = 2
-  !$omp target update from(var3(i1:i2)) from(var1%at(i3:i4))
+  i4 = 1
+  i5 = 2
+  i6 = 1
+  !$omp target update from(var3(i1:i2:i3)) from(var1%at(i4:i5:i6))
   i1 = 3
   i2 = 3
   i3 = 1
   i4 = 5
-  !$omp target update from(var1%at(i1)%s) from(var1%at(i2)%a(i3:i4))
+  i5 = 1
+  !$omp target update from(var1%at(i1)%s) from(var1%at(i1)%a(i3:i4:i5))
 end block
 
 if (var1 /= var1) error stop
diff --git a/libgomp/testsuite/libgomp.fortran/target-enter-data-3a.f90 b/libgomp/testsuite/libgomp.fortran/target-enter-data-3a.f90
new file mode 100644
index 0000000..1fe3f03
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-enter-data-3a.f90
@@ -0,0 +1,567 @@
+! { dg-additional-options "-cpp" }
+
+! FIXME: Some tests do not work yet. Those are for now in '#if 0'
+
+! Check that 'map(alloc:' properly works with
+! - deferred-length character strings
+! - arrays with array descriptors
+! For those, the array descriptor / string length must be mapped with 'to:'
+
+program main
+implicit none
+
+type t
+  integer :: ic(2:5), ic2
+  character(len=11) :: ccstr(3:4), ccstr2
+  character(len=11,kind=4) :: cc4str(3:7), cc4str2
+  integer, pointer :: pc(:), pc2
+  character(len=:), pointer :: pcstr(:), pcstr2
+  character(len=:,kind=4), pointer :: pc4str(:), pc4str2
+end type t
+
+type(t) :: dt
+
+integer :: ii(5), ii2
+character(len=11) :: clstr(-1:1), clstr2
+character(len=11,kind=4) :: cl4str(0:3), cl4str2
+integer, pointer :: ip(:), ip2
+integer, allocatable :: ia(:), ia2
+character(len=:), pointer :: pstr(:), pstr2
+character(len=:), allocatable :: astr(:), astr2
+character(len=:,kind=4), pointer :: p4str(:), p4str2
+character(len=:,kind=4), allocatable :: a4str(:), a4str2
+
+! Allocate every pointer and allocatable entity with a known shape and length.
+allocate(dt%pc(5), dt%pc2)
+allocate(character(len=2) :: dt%pcstr(2))
+allocate(character(len=4) :: dt%pcstr2)
+
+allocate(character(len=3,kind=4) :: dt%pc4str(2:3))
+allocate(character(len=5,kind=4) :: dt%pc4str2)
+
+allocate(ip(5), ip2, ia(8), ia2)
+allocate(character(len=2) :: pstr(-2:0))
+allocate(character(len=4) :: pstr2)
+allocate(character(len=6) :: astr(3:5))
+allocate(character(len=8) :: astr2)
+
+allocate(character(len=3,kind=4) :: p4str(2:4))
+allocate(character(len=5,kind=4) :: p4str2)
+allocate(character(len=7,kind=4) :: a4str(-2:3))
+allocate(character(len=9,kind=4) :: a4str2)
+
+! Pattern for each entity: enter data alloc, check and assign on device, exit data from, recheck on host.
+! integer :: ic(2:5), ic2
+
+!$omp target enter data map(alloc: dt%ic)
+!$omp target map(alloc: dt%ic)
+  if (size(dt%ic) /= 4) error stop
+  if (lbound(dt%ic, 1) /= 2) error stop
+  if (ubound(dt%ic, 1) /= 5) error stop
+  dt%ic = [22, 33, 44, 55]
+!$omp end target
+!$omp target exit data map(from: dt%ic)
+if (size(dt%ic) /= 4) error stop
+if (lbound(dt%ic, 1) /= 2) error stop
+if (ubound(dt%ic, 1) /= 5) error stop
+if (any (dt%ic /= [22, 33, 44, 55])) error stop
+
+!$omp target enter data map(alloc: dt%ic2)
+!$omp target map(alloc: dt%ic2)
+  dt%ic2 = 42
+!$omp end target
+!$omp target exit data map(from: dt%ic2)
+if (dt%ic2 /= 42) error stop
+
+
+! character(len=11) :: ccstr(3:4), ccstr2
+
+!$omp target enter data map(alloc: dt%ccstr)
+!$omp target map(alloc: dt%ccstr)
+  if (len(dt%ccstr) /= 11) error stop
+  if (size(dt%ccstr) /= 2) error stop
+  if (lbound(dt%ccstr, 1) /= 3) error stop
+  if (ubound(dt%ccstr, 1) /= 4) error stop
+  dt%ccstr = ["12345678901", "abcdefghijk"]
+!$omp end target
+!$omp target exit data map(from: dt%ccstr)
+if (len(dt%ccstr) /= 11) error stop
+if (size(dt%ccstr) /= 2) error stop
+if (lbound(dt%ccstr, 1) /= 3) error stop
+if (ubound(dt%ccstr, 1) /= 4) error stop
+if (any (dt%ccstr /= ["12345678901", "abcdefghijk"])) error stop
+
+!$omp target enter data map(alloc: dt%ccstr2)
+!$omp target map(alloc: dt%ccstr2)
+  if (len(dt%ccstr2) /= 11) error stop
+  dt%ccstr2 = "ABCDEFGHIJK"
+!$omp end target
+!$omp target exit data map(from: dt%ccstr2)
+if (len(dt%ccstr2) /= 11) error stop
+if (dt%ccstr2 /= "ABCDEFGHIJK") error stop
+
+
+! character(len=11,kind=4) :: cc4str(3:7), cc4str2
+
+#if 0
+! Value check fails.  NOTE(review): no 'target enter data' precedes this region, unlike every other section.
+!$omp target map(alloc: dt%cc4str)
+  if (len(dt%cc4str) /= 11) error stop
+  if (size(dt%cc4str) /= 5) error stop
+  if (lbound(dt%cc4str, 1) /= 3) error stop
+  if (ubound(dt%cc4str, 1) /= 7) error stop
+  dt%cc4str = [4_"12345678901", 4_"abcdefghijk", &
+               4_"qerftcea6ds", 4_"a1f9g37ga4.", &
+               4_"45ngwj56sj2"]
+!$omp end target
+!$omp target exit data map(from: dt%cc4str)
+if (len(dt%cc4str) /= 11) error stop
+if (size(dt%cc4str) /= 5) error stop
+if (lbound(dt%cc4str, 1) /= 3) error stop
+if (ubound(dt%cc4str, 1) /= 7) error stop
+if (dt%cc4str(3) /= 4_"12345678901") error stop
+if (dt%cc4str(4) /= 4_"abcdefghijk") error stop
+if (dt%cc4str(5) /= 4_"qerftcea6ds") error stop
+if (dt%cc4str(6) /= 4_"a1f9g37ga4.") error stop
+if (dt%cc4str(7) /= 4_"45ngwj56sj2") error stop
+#endif
+
+!$omp target enter data map(alloc: dt%cc4str2)
+!$omp target map(alloc: dt%cc4str2)
+  if (len(dt%cc4str2) /= 11) error stop
+  dt%cc4str2 = 4_"ABCDEFGHIJK"
+!$omp end target
+!$omp target exit data map(from: dt%cc4str2)
+if (len(dt%cc4str2) /= 11) error stop
+if (dt%cc4str2 /= 4_"ABCDEFGHIJK") error stop
+
+! Pointer components of dt; only the scalar integer pointer dt%pc2 is currently enabled.
+! integer, pointer :: pc(:), pc2
+! allocate(dt%pc(5), dt%pc2)
+
+#if 0
+! libgomp: GOMP_target_enter_exit_data unhandled kind 0x00
+
+!$omp target enter data map(alloc: dt%pc)
+!$omp target map(alloc: dt%pc)
+  if (.not. associated(dt%pc)) error stop
+  if (size(dt%pc) /= 5) error stop
+  if (lbound(dt%pc, 1) /= 1) error stop
+  if (ubound(dt%pc, 1) /= 5) error stop
+  dt%pc = [11, 22, 33, 44, 55]
+!$omp end target
+!$omp target exit data map(from: dt%pc)
+if (.not. associated(dt%pc)) error stop
+if (size(dt%pc) /= 5) error stop
+if (lbound(dt%pc, 1) /= 1) error stop
+if (ubound(dt%pc, 1) /= 5) error stop
+if (any (dt%pc /= [11, 22, 33, 44, 55])) error stop
+#endif
+
+!$omp target enter data map(alloc: dt%pc2)
+!$omp target map(alloc: dt%pc2)
+  if (.not. associated(dt%pc2)) error stop
+  dt%pc2 = 99
+!$omp end target
+!$omp target exit data map(from: dt%pc2)
+if (dt%pc2 /= 99) error stop
+if (.not. associated(dt%pc2)) error stop
+
+
+! character(len=:), pointer :: pcstr(:), pcstr2
+! allocate(character(len=2) :: dt%pcstr(2))
+! allocate(character(len=4) :: dt%pcstr2)
+
+#if 0
+! libgomp: GOMP_target_enter_exit_data unhandled kind 0x00
+
+!$omp target enter data map(alloc: dt%pcstr)
+!$omp target map(alloc: dt%pcstr)
+  if (.not. associated(dt%pcstr)) error stop
+  if (len(dt%pcstr) /= 2) error stop
+  if (size(dt%pcstr) /= 2) error stop
+  if (lbound(dt%pcstr, 1) /= 1) error stop
+  if (ubound(dt%pcstr, 1) /= 2) error stop
+  dt%pcstr = ["01", "jk"]
+!$omp end target
+!$omp target exit data map(from: dt%pcstr)
+if (.not. associated(dt%pcstr)) error stop
+if (len(dt%pcstr) /= 2) error stop
+if (size(dt%pcstr) /= 2) error stop
+if (lbound(dt%pcstr, 1) /= 1) error stop
+if (ubound(dt%pcstr, 1) /= 2) error stop
+if (any (dt%pcstr /= ["01", "jk"])) error stop
+#endif
+
+#if 0
+! libgomp: GOMP_target_enter_exit_data unhandled kind 0x01
+
+!$omp target enter data map(alloc: dt%pcstr2)
+!$omp target map(alloc: dt%pcstr2)
+  if (.not. associated(dt%pcstr2)) error stop
+  if (len(dt%pcstr2) /= 4) error stop
+  dt%pcstr2 = "HIJK"
+!$omp end target
+!$omp target exit data map(from: dt%pcstr2)
+if (.not. associated(dt%pcstr2)) error stop
+if (len(dt%pcstr2) /= 4) error stop
+if (dt%pcstr2 /= "HIJK") error stop
+#endif
+
+
+! character(len=:,kind=4), pointer :: pc4str(:), pc4str2
+! allocate(character(len=3,kind=4) :: dt%pc4str(2:3))
+! allocate(character(len=5,kind=4) :: dt%pc4str2)
+
+#if 0
+! libgomp: GOMP_target_enter_exit_data unhandled kind 0x00
+
+!$omp target enter data map(alloc: dt%pc4str)
+!$omp target map(alloc: dt%pc4str)
+  if (.not. associated(dt%pc4str)) error stop
+  if (len(dt%pc4str) /= 3) error stop
+  if (size(dt%pc4str) /= 2) error stop
+  if (lbound(dt%pc4str, 1) /= 2) error stop
+  if (ubound(dt%pc4str, 1) /= 3) error stop
+  dt%pc4str = [4_"456", 4_"tzu"]
+!$omp end target
+!$omp target exit data map(from: dt%pc4str)
+if (.not. associated(dt%pc4str)) error stop
+if (len(dt%pc4str) /= 3) error stop
+if (size(dt%pc4str) /= 2) error stop
+if (lbound(dt%pc4str, 1) /= 2) error stop
+if (ubound(dt%pc4str, 1) /= 3) error stop
+if (dt%pc4str(2) /= 4_"456") error stop
+if (dt%pc4str(3) /= 4_"tzu") error stop
+#endif
+
+#if 0
+! libgomp: GOMP_target_enter_exit_data unhandled kind 0x01
+
+!$omp target enter data map(alloc: dt%pc4str2)
+!$omp target map(alloc: dt%pc4str2)
+  if (.not. associated(dt%pc4str2)) error stop
+  if (len(dt%pc4str2) /= 5) error stop
+  dt%pc4str2 = 4_"98765"
+!$omp end target
+!$omp target exit data map(from: dt%pc4str2)
+if (.not. associated(dt%pc4str2)) error stop
+if (len(dt%pc4str2) /= 5) error stop
+if (dt%pc4str2 /= 4_"98765") error stop
+#endif
+
+! From here on: the same checks for plain variables instead of derived-type components.
+! integer :: ii(5), ii2
+
+!$omp target enter data map(alloc: ii)
+!$omp target map(alloc: ii)
+  if (size(ii) /= 5) error stop
+  if (lbound(ii, 1) /= 1) error stop
+  if (ubound(ii, 1) /= 5) error stop
+  ii = [-1, -2, -3, -4, -5]
+!$omp end target
+!$omp target exit data map(from: ii)
+if (size(ii) /= 5) error stop
+if (lbound(ii, 1) /= 1) error stop
+if (ubound(ii, 1) /= 5) error stop
+if (any (ii /= [-1, -2, -3, -4, -5])) error stop
+
+!$omp target enter data map(alloc: ii2)
+!$omp target map(alloc: ii2)
+  ii2 = -410
+!$omp end target
+!$omp target exit data map(from: ii2)
+if (ii2 /= -410) error stop
+
+
+! character(len=11) :: clstr(-1:1), clstr2
+
+!$omp target enter data map(alloc: clstr)
+!$omp target map(alloc: clstr)
+  if (len(clstr) /= 11) error stop
+  if (size(clstr) /= 3) error stop
+  if (lbound(clstr, 1) /= -1) error stop
+  if (ubound(clstr, 1) /= 1) error stop
+  clstr = ["12345678901", "abcdefghijk", "ABCDEFGHIJK"]
+!$omp end target
+!$omp target exit data map(from: clstr)
+if (len(clstr) /= 11) error stop
+if (size(clstr) /= 3) error stop
+if (lbound(clstr, 1) /= -1) error stop
+if (ubound(clstr, 1) /= 1) error stop
+if (any (clstr /= ["12345678901", "abcdefghijk", "ABCDEFGHIJK"])) error stop
+
+!$omp target enter data map(alloc: clstr2)
+!$omp target map(alloc: clstr2)
+  if (len(clstr2) /= 11) error stop
+  clstr2 = "ABCDEFghijk"
+!$omp end target
+!$omp target exit data map(from: clstr2)
+if (len(clstr2) /= 11) error stop
+if (clstr2 /= "ABCDEFghijk") error stop
+
+
+! character(len=11,kind=4) :: cl4str(0:3), cl4str2
+
+!$omp target enter data map(alloc: cl4str)
+!$omp target map(alloc: cl4str)
+  if (len(cl4str) /= 11) error stop
+  if (size(cl4str) /= 4) error stop
+  if (lbound(cl4str, 1) /= 0) error stop
+  if (ubound(cl4str, 1) /= 3) error stop
+  cl4str = [4_"12345678901", 4_"abcdefghijk", &
+            4_"qerftcea6ds", 4_"a1f9g37ga4."]
+!$omp end target
+!$omp target exit data map(from: cl4str)
+if (len(cl4str) /= 11) error stop
+if (size(cl4str) /= 4) error stop
+if (lbound(cl4str, 1) /= 0) error stop
+if (ubound(cl4str, 1) /= 3) error stop
+if (cl4str(0) /= 4_"12345678901") error stop
+if (cl4str(1) /= 4_"abcdefghijk") error stop
+if (cl4str(2) /= 4_"qerftcea6ds") error stop
+if (cl4str(3) /= 4_"a1f9g37ga4.") error stop
+
+!$omp target enter data map(alloc: cl4str2)
+!$omp target map(alloc: cl4str2)
+  if (len(cl4str2) /= 11) error stop
+  cl4str2 = 4_"ABCDEFGHIJK"
+!$omp end target
+!$omp target exit data map(from: cl4str2)
+if (len(cl4str2) /= 11) error stop
+if (cl4str2 /= 4_"ABCDEFGHIJK") error stop
+
+
+! integer, pointer :: ip(:), ip2  -- allocate(ip(5), ip2)
+
+!$omp target enter data map(alloc: ip)
+!$omp target map(alloc: ip)
+  if (.not. associated(ip)) error stop
+  if (size(ip) /= 5) error stop
+  if (lbound(ip, 1) /= 1) error stop
+  if (ubound(ip, 1) /= 5) error stop
+  ip = [11, 22, 33, 44, 55]
+!$omp end target
+!$omp target exit data map(from: ip)
+if (.not. associated(ip)) error stop
+if (size(ip) /= 5) error stop
+if (lbound(ip, 1) /= 1) error stop
+if (ubound(ip, 1) /= 5) error stop
+if (any (ip /= [11, 22, 33, 44, 55])) error stop
+
+!$omp target enter data map(alloc: ip2)
+!$omp target map(alloc: ip2)
+  if (.not. associated(ip2)) error stop
+  ip2 = 99
+!$omp end target
+!$omp target exit data map(from: ip2)
+if (ip2 /= 99) error stop
+if (.not. associated(ip2)) error stop
+
+
+! integer, allocatable :: ia(:), ia2  -- allocate(ia(8), ia2)
+
+!$omp target enter data map(alloc: ia)
+!$omp target map(alloc: ia)
+  if (.not. allocated(ia)) error stop
+  if (size(ia) /= 8) error stop
+  if (lbound(ia, 1) /= 1) error stop
+  if (ubound(ia, 1) /= 8) error stop
+  ia = [1,2,3,4,5,6,7,8]
+!$omp end target
+!$omp target exit data map(from: ia)
+if (.not. allocated(ia)) error stop
+if (size(ia) /= 8) error stop
+if (lbound(ia, 1) /= 1) error stop
+if (ubound(ia, 1) /= 8) error stop
+if (any (ia /= [1,2,3,4,5,6,7,8])) error stop
+
+!$omp target enter data map(alloc: ia2)
+!$omp target map(alloc: ia2)
+  if (.not. allocated(ia2)) error stop
+  ia2 = 102
+!$omp end target
+!$omp target exit data map(from: ia2)
+if (ia2 /= 102) error stop
+if (.not. allocated(ia2)) error stop
+
+! Deferred-length strings: the checks below also compare len() on the device and on the host.
+! character(len=:), pointer :: pstr(:), pstr2
+! allocate(character(len=2) :: pstr(-2:0))
+! allocate(character(len=4) :: pstr2)
+
+#if 0
+! libgomp: nvptx_alloc error: out of memory
+
+!$omp target enter data map(alloc: pstr)
+!$omp target map(alloc: pstr)
+  if (.not. associated(pstr)) error stop
+  if (len(pstr) /= 2) error stop
+  if (size(pstr) /= 3) error stop
+  if (lbound(pstr, 1) /= -2) error stop
+  if (ubound(pstr, 1) /= 0) error stop
+  pstr = ["01", "jk", "aq"]
+!$omp end target
+!$omp target exit data map(from: pstr)
+if (.not. associated(pstr)) error stop
+if (len(pstr) /= 2) error stop
+if (size(pstr) /= 3) error stop
+if (lbound(pstr, 1) /= -2) error stop
+if (ubound(pstr, 1) /= 0) error stop
+if (any (pstr /= ["01", "jk", "aq"])) error stop
+#endif
+
+!$omp target enter data map(alloc: pstr2)
+!$omp target map(alloc: pstr2)
+  if (.not. associated(pstr2)) error stop
+  if (len(pstr2) /= 4) error stop
+  pstr2 = "HIJK"
+!$omp end target
+!$omp target exit data map(from: pstr2)
+if (.not. associated(pstr2)) error stop
+if (len(pstr2) /= 4) error stop
+if (pstr2 /= "HIJK") error stop
+
+
+! character(len=:), allocatable :: astr(:), astr2
+! allocate(character(len=6) :: astr(3:5))
+! allocate(character(len=8) :: astr2)
+
+#if 0
+! libgomp: nvptx_alloc error: out of memory
+
+!$omp target enter data map(alloc: astr)
+!$omp target map(alloc: astr)
+  if (.not. allocated(astr)) error stop
+  if (len(astr) /= 6) error stop
+  if (size(astr) /= 3) error stop
+  if (lbound(astr, 1) /= 3) error stop
+  if (ubound(astr, 1) /= 5) error stop
+  astr = ["01db45", "jk$D%S", "zutg47"]
+!$omp end target
+!$omp target exit data map(from: astr)
+if (.not. allocated(astr)) error stop
+if (len(astr) /= 6) error stop
+if (size(astr) /= 3) error stop
+if (lbound(astr, 1) /= 3) error stop
+if (ubound(astr, 1) /= 5) error stop
+if (any (astr /= ["01db45", "jk$D%S", "zutg47"])) error stop
+#endif
+
+#if 0
+! libgomp: nvptx_alloc error: out of memory
+
+!$omp target enter data map(alloc: astr2)
+!$omp target map(alloc: astr2)
+  if (.not. allocated(astr2)) error stop
+  if (len(astr2) /= 8) error stop
+  astr2 = "HIJKhijk"
+!$omp end target
+!$omp target exit data map(from: astr2)
+if (.not. allocated(astr2)) error stop
+if (len(astr2) /= 8) error stop
+if (astr2 /= "HIJKhijk") error stop
+#endif
+
+! Same again for kind=4 deferred-length strings.
+! character(len=:,kind=4), pointer :: p4str(:), p4str2
+! allocate(character(len=3,kind=4) :: p4str(2:4))
+! allocate(character(len=5,kind=4) :: p4str2)
+
+#if 0
+! FAILS with value check
+
+!$omp target enter data map(alloc: p4str)
+!$omp target map(alloc: p4str)
+  if (.not. associated(p4str)) error stop
+  if (len(p4str) /= 3) error stop
+  if (size(p4str) /= 3) error stop
+  if (lbound(p4str, 1) /= 2) error stop
+  if (ubound(p4str, 1) /= 4) error stop
+  p4str(:) = [4_"f85", 4_"8af", 4_"A%F"]
+!$omp end target
+!$omp target exit data map(from: p4str)
+if (.not. associated(p4str)) error stop
+if (len(p4str) /= 3) error stop
+if (size(p4str) /= 3) error stop
+if (lbound(p4str, 1) /= 2) error stop
+if (ubound(p4str, 1) /= 4) error stop
+if (p4str(2)  /= 4_"f85") error stop
+if (p4str(3)  /= 4_"8af") error stop
+if (p4str(4)  /= 4_"A%F") error stop
+#endif
+
+!$omp target enter data map(alloc: p4str2)
+!$omp target map(alloc: p4str2)
+  if (.not. associated(p4str2)) error stop
+  if (len(p4str2) /= 5) error stop
+  p4str2 = 4_"9875a"
+!$omp end target
+!$omp target exit data map(from: p4str2)
+if (.not. associated(p4str2)) error stop
+if (len(p4str2) /= 5) error stop
+if (p4str2 /= 4_"9875a") error stop
+
+
+! character(len=:,kind=4), allocatable :: a4str(:), a4str2
+! allocate(character(len=7,kind=4) :: a4str(-2:3))
+! allocate(character(len=9,kind=4) :: a4str2)
+
+#if 0
+! libgomp: Trying to map into device [0x1027ba0..0x251050bb9c9ebba0) object when [0x7ffd026e6708..0x7ffd026e6710) is already mapped
+
+!$omp target enter data map(alloc: a4str)
+!$omp target map(alloc: a4str)
+  if (.not. allocated(a4str)) error stop
+  if (len(a4str) /= 7) error stop
+  if (size(a4str) /= 6) error stop
+  if (lbound(a4str, 1) /= -2) error stop
+  if (ubound(a4str, 1) /= 3) error stop
+  ! See PR fortran/107508 why '(:)' is required
+  a4str(:) = [4_"sf456aq", 4_"3dtzu24", 4_"_4fh7sm", 4_"=ff85s7", 4_"j=8af4d", 4_".,A%Fsz"]
+!$omp end target
+!$omp target exit data map(from: a4str)
+if (.not. allocated(a4str)) error stop
+if (len(a4str) /= 7) error stop
+if (size(a4str) /= 6) error stop
+if (lbound(a4str, 1) /= -2) error stop
+if (ubound(a4str, 1) /= 3) error stop
+if (a4str(-2) /= 4_"sf456aq") error stop
+if (a4str(-1) /= 4_"3dtzu24") error stop
+if (a4str(0)  /= 4_"_4fh7sm") error stop
+if (a4str(1)  /= 4_"=ff85s7") error stop
+if (a4str(2)  /= 4_"j=8af4d") error stop
+if (a4str(3)  /= 4_".,A%Fsz") error stop
+#endif
+
+!$omp target enter data map(alloc: a4str2)
+!$omp target map(alloc: a4str2)
+  if (.not. allocated(a4str2)) error stop
+  if (len(a4str2) /= 9) error stop
+  a4str2 = 4_"98765a23d"
+!$omp end target
+!$omp target exit data map(from: a4str2)
+if (.not. allocated(a4str2)) error stop
+if (len(a4str2) /= 9) error stop
+if (a4str2 /= 4_"98765a23d") error stop
+
+! Release everything allocated at the top, including entities whose tests are disabled.
+deallocate(dt%pc, dt%pc2)
+deallocate(dt%pcstr)
+deallocate(dt%pcstr2)
+
+deallocate(dt%pc4str)
+deallocate(dt%pc4str2)
+
+deallocate(ip, ip2, ia, ia2)
+deallocate(pstr)
+deallocate(pstr2)
+deallocate(astr)
+deallocate(astr2)
+
+deallocate(p4str)
+deallocate(p4str2)
+deallocate(a4str)
+deallocate(a4str2)
+
+end
diff --git a/libgomp/testsuite/libgomp.fortran/target-map-iterators-1.f90 b/libgomp/testsuite/libgomp.fortran/target-map-iterators-1.f90
new file mode 100644
index 0000000..80e077e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-map-iterators-1.f90
@@ -0,0 +1,45 @@
+! { dg-do run }
+
+! Test transfer of dynamically-allocated arrays to target using map
+! iterators: the host fills x(i)%arr and the target region sums them.
+
+program test
+  implicit none
+
+  integer, parameter :: DIM1 = 8   ! number of elements of x
+  integer, parameter :: DIM2 = 15  ! length of each x(i)%arr
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1)
+  integer :: expected, sum, i, j
+
+  expected = mkarray ()  ! allocates and fills x, returns the host-side sum
+  ! The iterator yields one 'to' map of x(i)%arr(:) for each i in 1..DIM1.
+  !$omp target map(iterator(i=1:DIM1), to: x(i)%arr(:)) map(from: sum)
+    sum = 0
+    do i = 1, DIM1
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  if (sum .ne. expected) stop 1  ! target sum must equal the host sum
+contains
+  integer function mkarray ()  ! works on host-associated x, i and j
+    integer :: exp = 0  ! initializer implies SAVE; the function is called once
+
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+      do j = 1, DIM2
+        x(i)%arr(j) = i * j
+	exp = exp + x(i)%arr(j)
+      end do
+    end do
+
+    mkarray = exp
+  end function
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/target-map-iterators-2.f90 b/libgomp/testsuite/libgomp.fortran/target-map-iterators-2.f90
new file mode 100644
index 0000000..cf0e7fb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-map-iterators-2.f90
@@ -0,0 +1,45 @@
+! { dg-do run }
+
+! Test transfer of dynamically-allocated arrays from target using map
+! iterators: the target region fills x(i)%arr and the host sums them.
+
+program test
+  implicit none
+
+  integer, parameter :: DIM1 = 8   ! number of elements of x
+  integer, parameter :: DIM2 = 15  ! length of each x(i)%arr
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1)
+  integer :: expected, sum, i, j
+
+  call mkarray  ! only allocates x(i)%arr; the values are written on the target
+  ! The iterator yields one 'from' map of x(i)%arr(:) for each i in 1..DIM1.
+  !$omp target map(iterator(i=1:DIM1), from: x(i)%arr(:)) map(from: expected)
+    expected = 0
+    do i = 1, DIM1
+      do j = 1, DIM2
+	x(i)%arr(j) = (i+1) * (j+1)
+	expected = expected + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+  ! Recompute the sum on the host from the arrays after the target region.
+  sum = 0
+  do i = 1, DIM1
+    do j = 1, DIM2
+      sum = sum + x(i)%arr(j)
+    end do
+  end do
+
+  if (sum .ne. expected) stop 1  ! host view must match what the target wrote
+contains
+  subroutine mkarray  ! works on host-associated x and i
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+    end do
+  end subroutine
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/target-map-iterators-3.f90 b/libgomp/testsuite/libgomp.fortran/target-map-iterators-3.f90
new file mode 100644
index 0000000..d62fc1d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-map-iterators-3.f90
@@ -0,0 +1,56 @@
+! { dg-do run }
+
+! Test transfer of dynamically-allocated arrays to target using map
+! iterators, with multiple iterators and function calls in the iterator
+! expression.
+
+program test
+  implicit none
+
+  integer, parameter :: DIM1 = 16  ! number of elements of x and y
+  integer, parameter :: DIM2 = 4   ! length of each arr component
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1), y(DIM1)
+  integer :: expected, sum, i, j, k
+
+  expected = mkarrays ()  ! allocates and fills x and y, returns the host sum
+  ! f flattens (i, j) in 0..DIM1/4-1 x 0..3 to the indices 1..DIM1 of x.
+  !$omp target map(iterator(i=0:DIM1/4-1, j=0:3), to: x(f (i, j))%arr(:)) &
+  !$omp        map(iterator(k=1:DIM1), to: y(k)%arr(:)) &
+  !$omp        map(from: sum)
+    sum = 0
+    do i = 1, DIM1
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j) * y(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  if (sum .ne. expected) stop 1  ! target sum must equal the host sum
+contains
+  integer function mkarrays ()  ! works on host-associated x, y, i and j
+    integer :: exp = 0  ! initializer implies SAVE; the function is called once
+
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+      allocate (y(i)%arr(DIM2))
+      do j = 1, DIM2
+	x(i)%arr(j) = i * j
+	y(i)%arr(j) = i + j
+	exp = exp + x(i)%arr(j) * y(i)%arr(j)
+      end do
+    end do
+
+    mkarrays = exp
+  end function
+
+  integer function f (i, j)  ! index function used inside the map iterator
+    integer, intent(in) :: i, j
+
+    f = i * 4 + j + 1
+  end function
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/target-map-iterators-4.f90 b/libgomp/testsuite/libgomp.fortran/target-map-iterators-4.f90
new file mode 100644
index 0000000..85f6287
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-map-iterators-4.f90
@@ -0,0 +1,48 @@
+! { dg-do run }
+
+! Test transfer of dynamically-allocated arrays to target using map
+! iterators with variable bounds.
+
+program test
+  implicit none
+
+  integer, parameter :: DIM1 = 8   ! number of elements of x
+  integer, parameter :: DIM2 = 15  ! length of each x(i)%arr
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1)
+  integer :: expected, sum, i, j
+  integer :: i_ubound  ! iterator upper bound, a variable set by mkarray
+
+  expected = mkarray (i_ubound)  ! fills x, returns the host sum and the bound
+  ! Same as a constant-bound iterator map, but the upper bound is a variable.
+  !$omp target map(iterator(i=1:i_ubound), to: x(i)%arr(:)) map(from: sum)
+    sum = 0
+    do i = 1, i_ubound
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  if (sum .ne. expected) stop 1  ! target sum must equal the host sum
+contains
+  integer function mkarray (ubound)  ! dummy name shadows intrinsic UBOUND here
+    integer, intent(out) :: ubound
+    integer :: exp = 0  ! initializer implies SAVE; the function is called once
+
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+      do j = 1, DIM2
+	x(i)%arr(j) = i * j
+	exp = exp + x(i)%arr(j)
+      end do
+    end do
+
+    ubound = DIM1  ! returned through a variable instead of using DIM1 directly
+    mkarray = exp
+  end function
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/target-map-iterators-5.f90 b/libgomp/testsuite/libgomp.fortran/target-map-iterators-5.f90
new file mode 100644
index 0000000..4c47ee5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-map-iterators-5.f90
@@ -0,0 +1,61 @@
+! { dg-do run }
+
+! Test transfer of dynamically-allocated arrays to target using map
+! iterators, with multiple iterators, function calls and non-constant
+! bounds in the iterator expression.
+
+program test
+  implicit none
+
+  integer, parameter :: DIM1 = 16  ! number of elements of x and y
+  integer, parameter :: DIM2 = 4   ! length of each arr component
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1), y(DIM1)
+  integer :: expected, sum, i, j, k
+  integer :: i_ubound  ! upper bound of iterator i, derived from k_ubound
+  integer :: k_ubound  ! upper bound of iterator k, set by mkarrays
+
+  expected = mkarrays (k_ubound)
+  i_ubound = k_ubound / 4 - 1  ! so that f (i, j) with j in 0..3 covers 1..k_ubound
+
+  !$omp target map(iterator(i=0:i_ubound, j=0:3), to: x(f (i, j))%arr(:)) &
+  !$omp        map(iterator(k=1:k_ubound), to: y(k)%arr(:)) &
+  !$omp        map(from: sum)
+    sum = 0
+    do i = 1, DIM1
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j) * y(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  if (sum .ne. expected) stop 1  ! target sum must equal the host sum
+contains
+  integer function mkarrays (ubound)  ! dummy name shadows intrinsic UBOUND here
+    integer, intent(out) :: ubound
+    integer :: exp = 0  ! initializer implies SAVE; the function is called once
+
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+      allocate (y(i)%arr(DIM2))
+      do j = 1, DIM2
+	x(i)%arr(j) = i * j
+	y(i)%arr(j) = i + j
+	exp = exp + x(i)%arr(j) * y(i)%arr(j)
+      end do
+    end do
+
+    ubound = DIM1  ! returned through a variable instead of using DIM1 directly
+    mkarrays = exp
+  end function
+
+  integer function f (i, j)  ! index function used inside the map iterator
+    integer, intent(in) :: i, j
+
+    f = i * 4 + j + 1
+  end function
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/target-update-iterators-1.f90 b/libgomp/testsuite/libgomp.fortran/target-update-iterators-1.f90
new file mode 100644
index 0000000..e9a13a3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-update-iterators-1.f90
@@ -0,0 +1,68 @@
+! { dg-do run }
+
+! Test target enter data and target update to the target using map
+! iterators.
+
+program test
+  integer, parameter :: DIM1 = 8   ! number of elements of x
+  integer, parameter :: DIM2 = 15  ! length of each x(i)%arr
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1)
+  integer :: expected, sum, i, j
+
+  expected = mkarray (x)  ! allocates and fills x, returns the host-side sum
+  ! Map x itself first, then each x(i)%arr through the iterator.
+  !$omp target enter data map(to: x)
+  !$omp target enter data map(iterator(i=1:DIM1), to: x(i)%arr(:))
+  !$omp target map(from: sum)
+    sum = 0
+    do i = 1, DIM1
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  print *, sum, expected  ! diagnostic output; the actual check is the next line
+  if (sum .ne. expected) stop 1
+  ! Change the host data and compute the sum the target should see afterwards.
+  expected = 0
+  do i = 1, DIM1
+    do j = 1, DIM2
+      x(i)%arr(j) = x(i)%arr(j) * i * j
+      expected = expected + x(i)%arr(j)
+    end do
+  end do
+  ! Push the modified host arrays with an iterator in the 'to' clause.
+  !$omp target update to(iterator(i=1:DIM1): x(i)%arr(:))
+
+  !$omp target map(from: sum)
+    sum = 0
+    do i = 1, DIM1
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  if (sum .ne. expected) stop 2  ! target must see the updated values
+contains
+  integer function mkarray (x)  ! i and j are host-associated
+    type (array_ptr), intent(inout) :: x(DIM1)
+    integer :: exp = 0  ! initializer implies SAVE; the function is called once
+
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+      do j = 1, DIM2
+	x(i)%arr(j) = i * j
+	exp = exp + x(i)%arr(j)
+      end do
+    end do
+
+    mkarray = exp
+  end function
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/target-update-iterators-2.f90 b/libgomp/testsuite/libgomp.fortran/target-update-iterators-2.f90
new file mode 100644
index 0000000..2e982bc
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-update-iterators-2.f90
@@ -0,0 +1,63 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+! Test target enter data and target update from the target using map
+! iterators.
+
+program test
+  integer, parameter :: DIM1 = 8   ! number of elements of x
+  integer, parameter :: DIM2 = 15  ! length of each x(i)%arr
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1)
+  integer :: sum, expected  ! i and j are implicitly typed: no IMPLICIT NONE here
+
+  call mkarray (x)  ! allocates every x(i)%arr and zero-fills it
+  ! Map a section of x first, then each x(i)%arr through the iterator.
+  !$omp target enter data map(to: x(:DIM1))
+  !$omp target enter data map(iterator(i=1:DIM1), to: x(i)%arr(:))
+  !$omp target map(from: expected)
+    expected = 0
+    do i = 1, DIM1
+      do j = 1, DIM2
+	x(i)%arr(j) = (i + 1) * (j + 2)
+	expected = expected + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  ! Host copy of x should remain unchanged.
+  sum = 0
+  do i = 1, DIM1
+    do j = 1, DIM2
+      sum = sum + x(i)%arr(j)
+    end do
+  end do
+  if (sum .ne. 0) stop 1  ! mkarray zero-filled the host arrays
+  ! Pull the target values back with an iterator in the 'from' clause.
+  !$omp target update from(iterator(i=1:DIM1): x(i)%arr(:))
+
+  ! Host copy should now be updated.
+  sum = 0
+  do i = 1, DIM1
+    do j = 1, DIM2
+      sum = sum + x(i)%arr(j)
+    end do
+  end do
+
+  if (sum .ne. expected) stop 2  ! host must now hold what the target wrote
+contains
+  subroutine mkarray (x)  ! i and j are host-associated
+    type (array_ptr), intent(inout) :: x(DIM1)
+
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+      do j = 1, DIM2
+	x(i)%arr(j) = 0
+      end do
+    end do
+  end subroutine
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/target-update-iterators-3.f90 b/libgomp/testsuite/libgomp.fortran/target-update-iterators-3.f90
new file mode 100644
index 0000000..54b2a6c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-update-iterators-3.f90
@@ -0,0 +1,78 @@
+! { dg-do run }
+! { dg-require-effective-target offload_device_nonshared_as }
+
+! Test target enter data and target update to the target using map
+! iterators with a function.
+
+program test
+  implicit none
+
+  integer, parameter :: DIM1 = 8   ! number of elements of x
+  integer, parameter :: DIM2 = 15  ! length of each x(i)%arr
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1)
+  integer :: x_new(DIM1, DIM2)  ! host model of the values the target should hold
+  integer :: expected, sum, i, j
+
+  call mkarray (x)  ! allocates every x(i)%arr and fills it with i * j
+  ! Map a section of x first, then each x(i)%arr through the iterator.
+  !$omp target enter data map(to: x(:DIM1))
+  !$omp target enter data map(iterator(i=1:DIM1), to: x(i)%arr(:))
+
+  ! Update x on host.
+  do i = 1, DIM1
+    do j = 1, DIM2
+      x_new(i, j) = x(i)%arr(j)  ! remember the value that was mapped
+      x(i)%arr(j) = (i + 1) * (j + 2)
+    end do
+  end do
+
+  ! Update a subset of x on target.
+  !$omp target update to(iterator(i=1:DIM1/2): x(f (i))%arr(:))
+
+  !$omp target map(from: sum)
+    sum = 0
+    do i = 1, DIM1
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  ! Calculate expected value on host.
+  do i = 1, DIM1/2
+    do j = 1, DIM2
+      x_new(f (i), j) = x(f (i))%arr(j)  ! only the updated elements get new values
+    end do
+  end do
+
+  expected = 0
+  do i = 1, DIM1
+    do j = 1, DIM2
+      expected = expected + x_new(i, j)
+    end do
+  end do
+
+  if (sum .ne. expected) stop 1  ! target must hold a mix of old and new values
+contains
+  subroutine mkarray (x)  ! i and j are host-associated
+    type (array_ptr), intent(inout) :: x(DIM1)
+
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+      do j = 1, DIM2
+	x(i)%arr(j) = i * j
+      end do
+    end do
+  end subroutine
+
+  integer function f (i)  ! selects the even elements 2, 4, ... of x
+    integer, intent(in) :: i
+
+    f = i * 2
+  end function
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/target-update-iterators-4.f90 b/libgomp/testsuite/libgomp.fortran/target-update-iterators-4.f90
new file mode 100644
index 0000000..9f138aa
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-update-iterators-4.f90
@@ -0,0 +1,70 @@
+! { dg-do run }
+
+! Test target enter data and target update to the target using map
+! iterators with non-constant bounds.
+
+program test
+  integer, parameter :: DIM1 = 8   ! number of elements of x
+  integer, parameter :: DIM2 = 15  ! length of each x(i)%arr
+
+  type :: array_ptr                ! wrapper around one pointer array
+    integer, pointer :: arr(:)
+  end type
+
+  type (array_ptr) :: x(DIM1)
+  integer :: expected, sum, i, j, ubound  ! ubound: variable iterator bound
+
+  expected = mkarray (x, ubound)  ! fills x, returns the host sum and the bound
+  ! Map x itself first, then each x(i)%arr through the iterator.
+  !$omp target enter data map(to: x)
+  !$omp target enter data map(iterator(i=1:ubound), to: x(i)%arr(:))
+  !$omp target map(from: sum)
+    sum = 0
+    do i = 1, ubound
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  print *, sum, expected  ! diagnostic output; the actual check is the next line
+  if (sum .ne. expected) stop 1
+  ! Change the host data and compute the sum the target should see afterwards.
+  expected = 0
+  do i = 1, ubound
+    do j = 1, DIM2
+      x(i)%arr(j) = x(i)%arr(j) * i * j
+      expected = expected + x(i)%arr(j)
+    end do
+  end do
+  ! Push the modified host arrays with an iterator in the 'to' clause.
+  !$omp target update to(iterator(i=1:ubound): x(i)%arr(:))
+
+  !$omp target map(from: sum)
+    sum = 0
+    do i = 1, ubound
+      do j = 1, DIM2
+	sum = sum + x(i)%arr(j)
+      end do
+    end do
+  !$omp end target
+
+  if (sum .ne. expected) stop 2  ! target must see the updated values
+contains
+  integer function mkarray (x, bound)  ! i and j are host-associated
+    type (array_ptr), intent(inout) :: x(DIM1)
+    integer, intent(out) :: bound
+    integer :: exp = 0  ! initializer implies SAVE; the function is called once
+
+    do i = 1, DIM1
+      allocate (x(i)%arr(DIM2))
+      do j = 1, DIM2
+	x(i)%arr(j) = i * j
+	exp = exp + x(i)%arr(j)
+      end do
+    end do
+
+    bound = DIM1  ! returned through a variable instead of using DIM1 directly
+    mkarray = exp
+  end function
+end program
diff --git a/libgomp/testsuite/libgomp.fortran/uses_allocators_2.f90 b/libgomp/testsuite/libgomp.fortran/uses_allocators_2.f90
index 0732796..bb98403 100644
--- a/libgomp/testsuite/libgomp.fortran/uses_allocators_2.f90
+++ b/libgomp/testsuite/libgomp.fortran/uses_allocators_2.f90
@@ -3,8 +3,6 @@
 ! Minimal test for valid code:
 ! - predefined allocators do not need any special treatment in uses_allocators
 !   (as 'requires dynamic_allocators' is the default).
-!
-! - Non-predefined allocators are currently rejected ('sorry)'
 
 subroutine test
   use omp_lib
@@ -35,22 +33,22 @@ subroutine non_predef
 
   integer(kind=omp_allocator_handle_kind) :: a1, a2, a3
 
-  !$omp target uses_allocators(omp_default_mem_alloc, a1(trait), a2(trait2))  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp target uses_allocators(omp_default_mem_alloc, a1(trait), a2(trait2))
   block; end block
 
-  !$omp target parallel uses_allocators(omp_default_mem_alloc, a1(trait), a2(trait2))  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp target parallel uses_allocators(omp_default_mem_alloc, a1(trait), a2(trait2))
   block; end block
 
 
   !$omp target uses_allocators(traits(trait):a1) &
-  !$omp&        uses_allocators ( memspace ( omp_low_lat_mem_space ) , traits ( trait2 ) : a2 , a3)  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp&        uses_allocators ( memspace ( omp_low_lat_mem_space ) , traits ( trait2 ) : a2 , a3)
   block; end block
 
   !$omp target parallel uses_allocators(traits(trait):a1) &
-  !$omp&        uses_allocators ( memspace ( omp_low_lat_mem_space ) , traits ( trait2 ) : a2 , a3)  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp&        uses_allocators ( memspace ( omp_low_lat_mem_space ) , traits ( trait2 ) : a2 , a3)
   block; end block
 
-  !$omp target uses_allocators ( traits(trait2) , memspace ( omp_low_lat_mem_space ) : a2 , a3)  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp target uses_allocators ( traits(trait2) , memspace ( omp_low_lat_mem_space ) : a2 , a3)
   block; end block
 end subroutine
 
@@ -62,7 +60,7 @@ subroutine trait_present
   integer(kind=omp_allocator_handle_kind) :: a1
 
   ! Invalid in OpenMP 5.0 / 5.1, but valid since 5.2 the same as omp_default_mem_space + emptry traits array
-  !$omp target uses_allocators ( a1 )  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp target uses_allocators ( a1 )
   block; end block
 end
 
@@ -76,13 +74,13 @@ subroutine odd_names
   integer(kind=omp_allocator_handle_kind) :: traits
   integer(kind=omp_allocator_handle_kind) :: memspace
 
-  !$omp target uses_allocators ( traits(trait1), memspace(trait1) )  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp target uses_allocators ( traits(trait1), memspace(trait1) )
   block; end block
 
-  !$omp target uses_allocators ( traits(trait1), memspace(omp_low_lat_mem_space)  : traits)  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp target uses_allocators ( traits(trait1), memspace(omp_low_lat_mem_space)  : traits)
   block; end block
 
-  !$omp target uses_allocators ( memspace(omp_low_lat_mem_space), traits(trait1) : memspace)  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp target uses_allocators ( memspace(omp_low_lat_mem_space), traits(trait1) : memspace)
   block; end block
 end
 
@@ -94,6 +92,6 @@ subroutine more_checks
   integer(kind=omp_allocator_handle_kind) :: a1, a2(4)
   integer(kind=1) :: a3
 
-  !$omp target uses_allocators(memspace (omp_low_lat_mem_space) : a1 )  ! { dg-message "sorry, unimplemented: 'uses_allocators' clause with traits and memory spaces" }
+  !$omp target uses_allocators(memspace (omp_low_lat_mem_space) : a1 )
   block; end block
 end
diff --git a/libgomp/testsuite/libgomp.fortran/uses_allocators_3.f90 b/libgomp/testsuite/libgomp.fortran/uses_allocators_3.f90
new file mode 100644
index 0000000..8acdd42
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/uses_allocators_3.f90
@@ -0,0 +1,62 @@
+! { dg-do compile }
+! { dg-additional-options "-fdump-tree-original -fdump-tree-gimple" }
+! Check how 'uses_allocators' clauses show up in the original and gimple dumps.
+program main
+  use omp_lib
+  implicit none
+  integer, allocatable :: arr(:)
+  integer (omp_allocator_handle_kind) :: bar, foo
+
+  type (omp_alloctrait), parameter :: traits_array(*) = &
+       [omp_alloctrait(omp_atk_pinned,omp_atv_true),&
+       omp_alloctrait(omp_atk_partition,omp_atv_nearest)]
+  ! Allocator variable without modifiers, also named in an 'allocate' clause.
+  !$omp target allocate(bar : arr) uses_allocators(bar)
+  block
+    allocate(arr(100))
+  end block
+  ! Predefined allocator only.
+  !$omp target uses_allocators(omp_default_mem_alloc)
+  block
+  end block
+  ! Two allocators using the 'name(traits)' form.
+  !$omp target uses_allocators(bar(traits_array), foo (traits_array))
+  block
+    if (foo == 0) stop 1
+  end block
+  ! 'traits' modifier form.
+  !$omp target uses_allocators(traits(traits_array) : bar)
+  block
+  end block
+  ! 'memspace' modifier only, on a combined construct.
+  !$omp target parallel uses_allocators(memspace (omp_low_lat_mem_space) : bar)
+  block
+  end block
+  ! Both modifiers; the allocator is used with omp_alloc and omp_free inside.
+  !$omp target parallel uses_allocators(memspace (omp_high_bw_mem_space), traits(traits_array) : bar)
+  block
+    use iso_c_binding
+    type(c_ptr) :: ptr
+    integer(c_size_t) :: sz = 32
+    ptr = omp_alloc (sz, bar)
+    call omp_free (ptr, bar)
+  end block
+
+end program main
+! One pattern per target construct above, as printed in the original dump.
+! { dg-final { scan-tree-dump "pragma omp target allocate\\(allocator\\(bar\\):arr\\) uses_allocators\\(bar: memspace\\(\\), traits\\(\\)\\)" "original" } }
+! { dg-final { scan-tree-dump "pragma omp target" "original" } }
+! { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(\\), traits\\(traits_array\\)\\) uses_allocators\\(foo: memspace\\(\\), traits\\(traits_array\\)\\)" "original" } }
+! { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(\\), traits\\(traits_array\\)\\)" "original" } }
+! { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(omp_low_lat_mem_space\\), traits\\(\\)\\)" "original" } }
+! { dg-final { scan-tree-dump "pragma omp target uses_allocators\\(bar: memspace\\(omp_high_bw_mem_space\\), traits\\(traits_array\\)\\)" "original" } }
+! The same constructs as printed in the gimple dump, with the added clauses.
+! { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) allocate\\(allocator\\(bar\\):arr\\) uses_allocators\\(bar: memspace\\(\\), traits\\(\\)\\) private\\(bar\\)" "gimple" } }
+! { dg-final { scan-tree-dump "pragma omp target" "gimple" } }
+! { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(\\), traits\\(traits_array\\)\\) uses_allocators\\(foo: memspace\\(\\), traits\\(traits_array\\)\\) private\\(foo\\) private\\(bar\\)" "gimple" } }
+! { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(\\), traits\\(traits_array\\)\\) private\\(bar\\)" "gimple" } }
+! { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(omp_low_lat_mem_space\\), traits\\(\\)\\) firstprivate\\(omp_low_lat_mem_space\\) private\\(bar\\)" "gimple" } }
+! { dg-final { scan-tree-dump "pragma omp target num_teams\\(-2\\) thread_limit\\(0\\) uses_allocators\\(bar: memspace\\(omp_high_bw_mem_space\\), traits\\(traits_array\\)\\) firstprivate\\(omp_high_bw_mem_space\\) private\\(bar\\)" "gimple" } }
+! Six init/destroy pairs: presumably one per allocator variable (bar x5, foo x1).
+! { dg-final { scan-tree-dump-times "__builtin_omp_init_allocator" 6 "gimple" } }
+! { dg-final { scan-tree-dump-times "__builtin_omp_destroy_allocator" 6 "gimple" } }
diff --git a/libgomp/testsuite/libgomp.fortran/uses_allocators_4.f90 b/libgomp/testsuite/libgomp.fortran/uses_allocators_4.f90
new file mode 100644
index 0000000..00f1dcb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/uses_allocators_4.f90
@@ -0,0 +1,54 @@
+! { dg-do compile }
+! Check the diagnostics issued for invalid 'uses_allocators' clauses.
+program main
+  use omp_lib
+  implicit none
+  integer (omp_allocator_handle_kind) :: bar, foo
+
+  type (omp_alloctrait), parameter :: traits_array(*) = &
+       [omp_alloctrait(omp_atk_pinned,omp_atv_true),&
+       omp_alloctrait(omp_atk_partition,omp_atv_nearest)]
+  ! Undeclared allocator name.
+  !$omp target uses_allocators(omp_non_existant_alloc) ! { dg-error "Allocator 'omp_non_existant_alloc' at .1. in USES_ALLOCATORS must be a scalar integer of kind 'omp_allocator_handle_kind'" }
+  block  ! { dg-error "Symbol 'omp_non_existant_alloc' at .1. has no IMPLICIT type; did you mean 'omp_const_mem_alloc'\?" "" { target *-*-* } .-1 }
+  end block
+  ! Trailing comma in the allocator list.
+  !$omp target uses_allocators(bar(traits_array), foo (traits_array), ) ! { dg-error "Invalid character in name" }
+  block
+  end block
+  ! Undeclared traits array.
+  !$omp target uses_allocators(traits(xyz) : bar) ! { dg-error "Symbol 'xyz' at .1. has no IMPLICIT type" }
+  block  ! { dg-error "Traits array 'xyz' in USES_ALLOCATORS .1. must be a one-dimensional named constant array of type 'omp_alloctrait'" "" { target *-*-* } .-1 }
+  end block
+  ! Undeclared memory space.
+  !$omp target uses_allocators(memspace(omp_non_existant_mem_space) : foo) ! { dg-error "Symbol 'omp_non_existant_mem_space' at .1. has no IMPLICIT type; did you mean 'omp_const_mem_space'\?" }
+  ! { dg-error "Memspace 'omp_non_existant_mem_space' at .1. in USES_ALLOCATORS must be a predefined memory space" "" { target *-*-* } .-1 }
+
+  block
+  end block
+  ! Repeated 'traits' modifier.
+  !$omp target uses_allocators(traits(traits_array), traits(traits_array) : bar) ! { dg-error "Duplicate TRAITS modifier at .1. in USES_ALLOCATORS clause" }
+  block
+  end block
+  ! Repeated 'memspace' modifier.
+  !$omp target uses_allocators(memspace(omp_default_mem_space), memspace(omp_default_mem_space) : foo) ! { dg-error "Duplicate MEMSPACE modifier at .1. in USES_ALLOCATORS clause" }
+  block
+  end block
+  ! Repeated 'traits' modifier after a 'memspace' modifier.
+  !$omp target uses_allocators(memspace(omp_default_mem_space), traits(traits_array), traits(traits_array) : foo) ! { dg-error "Duplicate TRAITS modifier at .1. in USES_ALLOCATORS clause" }
+  block
+  end block
+  ! omp_null_allocator is rejected as an allocator.
+  !$omp target uses_allocators (omp_null_allocator) ! { dg-error "Allocator 'omp_null_allocator' at .1. in USES_ALLOCATORS must either a variable or a predefined allocator" }
+  block
+  end block
+  ! Valid: one 'memspace' modifier for two allocators; no diagnostic expected.
+  !$omp target uses_allocators (memspace(omp_high_bw_mem_space) : foo, bar)
+  block
+  end block
+  ! The 'name(traits)' form cannot be combined with the modifier syntax.
+  !$omp target uses_allocators (memspace(omp_high_bw_mem_space) : foo(foo_traits)) ! { dg-error "70:Unexpected '\\(' at .1." }
+  block
+  end block
+
+end program main
diff --git a/libgomp/testsuite/libgomp.fortran/uses_allocators_5.f90 b/libgomp/testsuite/libgomp.fortran/uses_allocators_5.f90
new file mode 100644
index 0000000..00f8710
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/uses_allocators_5.f90
@@ -0,0 +1,14 @@
+! { dg-do compile }
+! An allocator in 'allocate' on 'target' must also be in 'uses_allocators'.
+program main
+  use omp_lib
+  implicit none
+  integer, allocatable :: arr(:)
+  integer (omp_allocator_handle_kind) :: bar
+  ! 'bar' is used by the 'allocate' clause but no 'uses_allocators' names it.
+  !$omp target allocate(bar : arr) ! { dg-error "allocator 'bar' requires 'uses_allocators.bar.' clause in target region" }
+  block
+    allocate(arr(100))
+  end block
+
+end program main
diff --git a/libgomp/testsuite/libgomp.fortran/uses_allocators_6.f90 b/libgomp/testsuite/libgomp.fortran/uses_allocators_6.f90
new file mode 100644
index 0000000..993435fd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/uses_allocators_6.f90
@@ -0,0 +1,50 @@
+! { dg-additional-options "-fdump-tree-gimple" }
+! Check the alignment of variables allocated via 'uses_allocators' allocators.
+program main
+  use iso_c_binding
+  use omp_lib
+  implicit none (type, external)
+  integer :: x, xbuf(10)
+  integer(c_intptr_t) :: iptr  ! only used as the MOLD for TRANSFER of addresses
+  integer(omp_allocator_handle_kind) :: my_alloc
+  type(omp_alloctrait), parameter :: trait(*) = [omp_alloctrait(omp_atk_alignment, 128)]
+  ! Predefined allocator; the 128-byte alignment comes from the 'align' modifier.
+  !$omp target uses_allocators(omp_low_lat_mem_alloc) map(tofrom: x, xbuf) defaultmap(none)
+    !$omp parallel allocate(allocator(omp_low_lat_mem_alloc), align(128): x, xbuf) if(.false.) firstprivate(x, xbuf)
+      if (mod (TRANSFER (loc(x), iptr), 128) /= 0) &
+        stop 1
+      if (mod (TRANSFER (loc(xbuf), iptr), 128) /= 0) &
+        stop 2
+    !$omp end parallel
+  !$omp end target
+  ! Sentinel handle value; 'stop 5' and 'stop 8' check the host copy keeps it.
+  my_alloc = transfer(int(z'ABCD', omp_allocator_handle_kind), my_alloc)
+  ! Here the 128-byte alignment comes from the allocator's traits array.
+  !$omp target uses_allocators(traits(trait): my_alloc) defaultmap(none) map(tofrom: x, xbuf)
+    !$omp parallel allocate(allocator(my_alloc): x, xbuf) if(.false.) firstprivate(x, xbuf)
+      if (mod (TRANSFER (loc(x), iptr), 128) /= 0) &
+        stop 3
+      if (mod (TRANSFER (loc(xbuf), iptr), 128) /= 0) &
+        stop 4
+    !$omp end parallel
+  !$omp end target
+
+  if (transfer(my_alloc, 0_omp_allocator_handle_kind) /= int(z'ABCD', omp_allocator_handle_kind)) &
+    stop 5
+
+  ! The following creates an allocator with empty traits + default mem space.
+  !$omp target uses_allocators(my_alloc) map(tofrom: x, xbuf) defaultmap(none)
+    !$omp parallel allocate(allocator(my_alloc), align(128): x, xbuf) if(.false.) firstprivate(x, xbuf)
+      if (mod (TRANSFER (loc(x), iptr), 128) /= 0) &
+        stop 6
+      if (mod (TRANSFER (loc(xbuf), iptr), 128) /= 0) &
+        stop 7
+    !$omp end parallel
+  !$omp end target
+
+  if (transfer(my_alloc, 0_omp_allocator_handle_kind) /= int(z'ABCD', omp_allocator_handle_kind)) &
+    stop 8
+end
+! Each my_alloc construct must appear once in the gimple dump with private(my_alloc).
+! { dg-final { scan-tree-dump-times "#pragma omp target .*private\\(my_alloc\\).*uses_allocators\\(my_alloc: memspace\\(\\), traits\\(trait\\)\\)" 1 "gimple" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target .*private\\(my_alloc\\).*uses_allocators\\(my_alloc: memspace\\(\\), traits\\(\\)\\)" 1 "gimple" } }
diff --git a/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-1.C b/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-1.C
index 0545601..6957a6c 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-1.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-1.C
@@ -52,3 +52,6 @@ int main()
    PR119692.
 
    { dg-shouldfail {'std::bad_cast' exception} } */
+/* There are configurations where we 'WARNING: program timed out.' while in
+   'dynamic_cast', see <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119692#c6>.
+   { dg-timeout 10 } ... to make sure that happens quickly.  */
diff --git a/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-2.C b/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-2.C
index 24399ef..0f84cf2 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-2.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-2.C
@@ -58,3 +58,6 @@ int main()
 
    For GCN, nvptx offload execution, there is no 'catch'ing; any exception is fatal.
    { dg-shouldfail {'std::bad_cast' exception} { ! openacc_host_selected } } */
+/* There are configurations where we 'WARNING: program timed out.' while in
+   'dynamic_cast', see <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119692#c6>.
+   { dg-timeout 10 } ... to make sure that happens quickly.  */
diff --git a/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-3.C b/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-3.C
index 4fa419f..e9372fa 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-3.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/exceptions-bad_cast-3.C
@@ -44,6 +44,6 @@ int main()
   }
 }
 
-/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial map\(tofrom:_ZTI2C2 \[len: [0-9]+\]\) map\(tofrom:_ZTI2C1 \[len: [0-9]+\]\) map\(tofrom:_ZTV2C1 \[len: [0-9]+\]\)$} gimple { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial map\(tofrom:_ZTI2C2 \[len: [0-9]+\] \[runtime_implicit\]\) map\(tofrom:_ZTI2C1 \[len: [0-9]+\] \[runtime_implicit\]\) map\(tofrom:_ZTV2C1 \[len: [0-9]+\] \[runtime_implicit\]\)$} gimple { xfail *-*-* } } } */
 
 /* { dg-final { scan-tree-dump-times {gimple_call <__cxa_bad_cast, } 1 optimized } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-1.C b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-1.C
index f2ef751..08c5766 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-1.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-1.C
@@ -4,9 +4,6 @@
    { dg-additional-options -fexceptions } */
 /* { dg-additional-options -fdump-tree-optimized-raw }
    { dg-additional-options -foffload-options=-fdump-tree-optimized-raw } */
-/* { dg-bogus {Size expression must be absolute\.} PR119737 { target { openacc_radeon_accel_selected && __OPTIMIZE__ } xfail *-*-* } 0 }
-   { dg-ice PR119737 { openacc_radeon_accel_selected && __OPTIMIZE__ } }
-   { dg-excess-errors {'mkoffload' failure etc.} { xfail { openacc_radeon_accel_selected && __OPTIMIZE__ } } } */
 
 /* See also '../libgomp.c++/target-exceptions-throw-1.C'.  */
 
diff --git a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-2.C b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-2.C
index f6dc970..a7408cd 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-2.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-2.C
@@ -6,9 +6,6 @@
    { dg-additional-options -foffload-options=-fdump-tree-optimized-raw } */
 /* { dg-bogus {undefined symbol: typeinfo name for MyException} PR119806 { target { openacc_radeon_accel_selected && { ! __OPTIMIZE__ } } xfail *-*-* } 0 }
    { dg-excess-errors {'mkoffload' failure etc.} { xfail { openacc_radeon_accel_selected && { ! __OPTIMIZE__ } } } } */
-/* { dg-bogus {Size expression must be absolute\.} PR119737 { target { openacc_radeon_accel_selected && __OPTIMIZE__ } xfail *-*-* } 0 }
-   { dg-ice PR119737 { openacc_radeon_accel_selected && __OPTIMIZE__ } }
-   { dg-excess-errors {'mkoffload' failures etc.} { xfail { openacc_radeon_accel_selected && __OPTIMIZE__ } } } */
 /* { dg-bogus {Initial value type mismatch} PR119806 { target { openacc_nvidia_accel_selected && { ! __OPTIMIZE__ } } xfail *-*-* } 0 }
    { dg-excess-errors {'mkoffload' failure etc.} { xfail { openacc_nvidia_accel_selected && { ! __OPTIMIZE__ } } } } */
 
diff --git a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-3.C b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-3.C
index 74a62b3..6664f80 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-3.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/exceptions-throw-3.C
@@ -37,7 +37,7 @@ int main()
   }
 }
 
-/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial map\(tofrom:_ZTI11MyException \[len: [0-9]+\]\)$} gimple { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial map\(tofrom:_ZTI11MyException \[len: [0-9]+\] \[runtime_implicit\]\)$} gimple { xfail *-*-* } } } */
 
 /* { dg-final { scan-tree-dump-times {gimple_call <__cxa_allocate_exception, } 1 optimized } }
    { dg-final { scan-tree-dump-times {gimple_call <__cxa_throw, } 1 optimized } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c++/firstprivate-int.C b/libgomp/testsuite/libgomp.oacc-c++/firstprivate-int.C
new file mode 100644
index 0000000..86b8722
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/firstprivate-int.C
@@ -0,0 +1,83 @@
+/* Verify the GOMP_MAP_FIRSTPRIVATE_INT optimization on various types.
+   This test is similar to the test in libgomp.oacc-c-c++-common, but
+   it focuses on reference types. */
+
+#include <assert.h>
+#include <stdint.h>
+#include <complex.h>
+
+void test_ref (int8_t &i8i, int8_t &i8o, int16_t &i16i, int16_t &i16o,
+	       int32_t &i32i, int32_t &i32o, int64_t &i64i, int64_t &i64o,
+	       uint8_t &u8i, uint8_t &u8o, uint16_t &u16i, uint16_t &u16o,
+	       uint32_t &u32i, uint32_t &u32o, uint64_t &u64i, uint64_t &u64o,
+	       float &r32i, float &r32o, double &r64i, double &r64o,
+	       int _Complex &cii, int _Complex &cio,
+	       float _Complex &cfi, float _Complex &cfo,
+	       double _Complex &cdi, double _Complex &cdo)
+{
+#pragma acc parallel firstprivate (i8i,i16i,i32i,i64i,u8i,u16i,u32i,u64i) \
+  firstprivate(r32i,r64i,cii,cfi,cdi) copyout(i8o,i16o,i32o,i64o) \
+  copyout(u8o,u16o,u32o,u64o,r32o,r64o,cio,cfo,cdo) num_gangs(1)
+  {
+    i8o = i8i;
+    i16o = i16i;
+    i32o = i32i;
+    i64o = i64i;
+
+    u8o = u8i;
+    u16o = u16i;
+    u32o = u32i;
+    u64o = u64i;
+
+    r32o = r32i;
+    r64o = r64i;
+
+    cio = cii;
+    cfo = cfi;
+    cdo = cdi;
+  }
+}
+
+int
+main ()
+{
+  int8_t  i8i  = -1, i8o;
+  int16_t i16i = -2, i16o;
+  int32_t i32i = -3, i32o;
+  int64_t i64i = -4, i64o;
+
+  uint8_t  u8i  = 1,  u8o;
+  uint16_t u16i = 2, u16o;
+  uint32_t u32i = 3, u32o;
+  uint64_t u64i = 4, u64o;
+
+  float  r32i = .5, r32o;
+  double r64i = .25, r64o;
+
+  int _Complex    cii = 2, cio;
+  float _Complex  cfi = 4, cfo;
+  double _Complex cdi = 8, cdo;
+
+  test_ref (i8i, i8o, i16i, i16o, i32i, i32o, i64i, i64o, u8i, u8o, u16i,
+	    u16o, u32i, u32o, u64i, u64o, r32i, r32o, r64i, r64o, cii, cio,
+	    cfi, cfo, cdi, cdo);
+
+  assert (i8o == i8i);
+  assert (i16o == i16i);
+  assert (i32o == i32i);
+  assert (i64o == i64i);
+
+  assert (u8o == u8i);
+  assert (u16o == u16i);
+  assert (u32o == u32i);
+  assert (u64o == u64i);
+
+  assert (r32o == r32i);
+  assert (r64o == r64i);
+
+  assert (cio == cii);
+  assert (cfo == cfi);
+  assert (cdo == cdi);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-1.C b/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-1.C
index 5c3e037..4a876f7 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-1.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-1.C
@@ -39,4 +39,4 @@ int main()
   }
 }
 
-/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial map\(tofrom:_ZTI2C2 \[len: [0-9]+\]\) map\(tofrom:_ZTI2C1 \[len: [0-9]+\]\) map\(tofrom:_ZTV2C1 \[len: [0-9]+\]\)$} gimple { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial map\(tofrom:_ZTI2C2 \[len: [0-9]+\] \[runtime_implicit\]\) map\(tofrom:_ZTI2C1 \[len: [0-9]+\] \[runtime_implicit\]\) map\(tofrom:_ZTV2C1 \[len: [0-9]+\] \[runtime_implicit\]\)$} gimple { xfail *-*-* } } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-2.C b/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-2.C
index 207b183..052e423 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-2.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-2.C
@@ -9,4 +9,4 @@
 
 /* { dg-bogus {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected xfail *-*-* } 0 } */
 
-/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial default\(none\) map\(tofrom:_ZTI2C2 \[len: [0-9]+\]\) map\(tofrom:_ZTI2C1 \[len: [0-9]+\]\) map\(tofrom:_ZTV2C1 \[len: [0-9]+\]\)$} gimple { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial default\(none\) map\(tofrom:_ZTI2C2 \[len: [0-9]+\] \[runtime_implicit\]\) map\(tofrom:_ZTI2C1 \[len: [0-9]+\] \[runtime_implicit\]\) map\(tofrom:_ZTV2C1 \[len: [0-9]+\] \[runtime_implicit\]\)$} gimple { xfail *-*-* } } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-3.C b/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-3.C
index e9b44de..fd1844b 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-3.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/pr119692-1-3.C
@@ -9,4 +9,4 @@
 
 /* { dg-bogus {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected xfail *-*-* } 0 } */
 
-/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial default\(present\) map\(force_present:_ZTI2C2 \[len: [0-9]+\]\) map\(force_present:_ZTI2C1 \[len: [0-9]+\]\) map\(force_present:_ZTV2C1 \[len: [0-9]+\]\)$} gimple { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-not {(?n)#pragma omp target oacc_serial default\(present\) map\(force_present:_ZTI2C2 \[len: [0-9]+\] \[runtime_implicit\]\) map\(force_present:_ZTI2C1 \[len: [0-9]+\] \[runtime_implicit\]\) map\(force_present:_ZTV2C1 \[len: [0-9]+\] \[runtime_implicit\]\)$} gimple { xfail *-*-* } } } */
diff --git a/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C
index 11e1cef..5c70260 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C
@@ -47,7 +47,7 @@ void gangs (void)
     int tmpvar;
     int &tmpref = tmpvar;
 #pragma acc loop collapse(2) gang private(tmpref) /* { dg-line l_loop[incr c_loop] } */
-    /* { dg-note {variable 'tmpref' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
+    /* { dg-note {variable 'tmpref' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } l_loop$c_loop } */
     /* { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
     /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
     for (i = 0; i < 256; i++)
@@ -96,7 +96,7 @@ void workers (void)
     for (i = 0; i < 256; i++)
       {
 #pragma acc loop worker private(tmpref) /* { dg-line l_loop[incr c_loop] } */
-	/* { dg-note {variable 'tmpref' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
+	/* { dg-note {variable 'tmpref' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } l_loop$c_loop } */
 	/* { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
 	for (j = 0; j < 256; j++)
 	  {
@@ -142,7 +142,7 @@ void vectors (void)
     for (i = 0; i < 256; i++)
       {
 #pragma acc loop vector private(tmpref) /* { dg-line l_loop[incr c_loop] } */
-	/* { dg-note {variable 'tmpref' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
+	/* { dg-note {variable 'tmpref' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } l_loop$c_loop } */
 	/* { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
 	for (j = 0; j < 256; j++)
 	  {
@@ -184,7 +184,7 @@ void gangs_workers_vectors (void)
     int tmpvar;
     int &tmpref = tmpvar;
 #pragma acc loop collapse(2) gang worker vector private(tmpref) /* { dg-line l_loop[incr c_loop] } */
-    /* { dg-note {variable 'tmpref' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
+    /* { dg-note {variable 'tmpref' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } l_loop$c_loop } */
     /* { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
     /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop } */
     for (i = 0; i < 256; i++)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/abi-struct-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/abi-struct-1.c
new file mode 100644
index 0000000..4b54171
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/abi-struct-1.c
@@ -0,0 +1,125 @@
+/* Inspired by 'gcc.target/nvptx/abi-struct-arg.c', 'gcc.target/nvptx/abi-struct-ret.c'.  */
+
+/* See also '../libgomp.c-c++-common/target-abi-struct-1.c'.  */
+
+/* To exercise PR119835 (if optimizations enabled): disable inlining, so that
+   GIMPLE passes still see the functions that return aggregate types.  */
+#pragma GCC optimize "-fno-inline"
+
+typedef struct {} empty;  /* See 'gcc/doc/extend.texi', "Empty Structures".  */
+typedef struct {char a;} schar;
+typedef struct {short a;} sshort;
+typedef struct {int a;} sint;
+typedef struct {long long a;} slonglong;
+typedef struct {int a, b[12];} sint_13;
+
+#pragma omp declare target
+
+#define M(T) ({T t; t.a = sizeof t; t;})
+
+static __SIZE_TYPE__ empty_a;
+#pragma acc declare create(empty_a)
+#pragma acc routine
+static empty rempty(void)
+{
+  return ({empty t; empty_a = sizeof t; t;});
+}
+
+#pragma acc routine
+static schar rschar(void)
+{
+  return M(schar);
+}
+
+#pragma acc routine
+static sshort rsshort(void)
+{
+  return M(sshort);
+}
+
+#pragma acc routine
+static sint rsint(void)
+{
+  return M(sint);
+}
+
+#pragma acc routine
+static slonglong rslonglong(void)
+{
+  return M(slonglong);
+}
+
+#pragma acc routine
+static sint_13 rsint_13(void)
+{
+  return M(sint_13);
+}
+
+#pragma acc routine
+static void aempty(empty empty)
+{
+  (void) empty;
+
+  __SIZE_TYPE__ empty_a_exp;
+#ifndef __cplusplus
+  empty_a_exp = 0;
+#else
+  empty_a_exp = sizeof (char);
+#endif
+  if (empty_a != empty_a_exp)
+    __builtin_abort();
+}
+
+#pragma acc routine
+static void aschar(schar schar)
+{
+  if (schar.a != sizeof (char))
+    __builtin_abort();
+}
+
+#pragma acc routine
+static void asshort(sshort sshort)
+{
+  if (sshort.a != sizeof (short))
+    __builtin_abort();
+}
+
+#pragma acc routine
+static void asint(sint sint)
+{
+  if (sint.a != sizeof (int))
+    __builtin_abort();
+}
+
+#pragma acc routine
+static void aslonglong(slonglong slonglong)
+{
+  if (slonglong.a != sizeof (long long))
+    __builtin_abort();
+}
+
+#pragma acc routine
+static void asint_13(sint_13 sint_13)
+{
+  if (sint_13.a != (sizeof (int) * 13))
+    __builtin_abort();
+}
+
+#pragma omp end declare target
+
+int main()
+{
+#pragma omp target
+#pragma acc serial
+  /* { dg-bogus {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected xfail *-*-* } .-1 } */
+  {
+    aempty(rempty());
+    aschar(rschar());
+    asshort(rsshort());
+    asint(rsint());
+    aslonglong(rslonglong());
+    asint_13(rsint_13());
+  }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c
new file mode 100644
index 0000000..eda651d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c
@@ -0,0 +1,96 @@
+/* { dg-prune-output "using .vector_length \\(32\\)" } */
+
+/* PR libgomp/93226  */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <openacc.h>
+
+enum { N = 1024 };
+
+static int D[N];
+#pragma acc declare device_resident(D)
+
+#pragma acc routine
+intptr_t init_d()
+{
+  for (int i = 0; i < N; i++)
+    D[i] = 27*i;
+  return (intptr_t) &D[0];
+}
+
+int
+main ()
+{
+  int *a, *b, *e;
+  void *d_a, *d_b, *d_c, *d_d, *d_e, *d_f;
+  intptr_t intptr;
+  bool fail = false;
+
+  a = (int *) malloc (N*sizeof (int));
+  b = (int *) malloc (N*sizeof (int));
+  e = (int *) malloc (N*sizeof (int));
+  d_c = acc_malloc (N*sizeof (int));
+  d_f = acc_malloc (N*sizeof (int));
+
+  memset (e, 0xff, N*sizeof (int));
+  d_e = acc_copyin (e, N*sizeof (int));
+
+  #pragma acc serial copyout(intptr)
+    intptr = init_d ();
+  d_d = (void*) intptr;
+  acc_memcpy_device (d_c, d_d, N*sizeof (int));
+
+  #pragma acc serial copy(fail) deviceptr(d_c) firstprivate(intptr)
+  {
+    int *cc = (int *) d_c;
+    int *dd = (int *) intptr;
+    for (int i = 0; i < N; i++)
+      if (dd[i] != 27*i || cc[i] != 27*i)
+	{
+	  fail = true;
+	  __builtin_abort ();
+	}
+  }
+  if (fail) __builtin_abort ();
+
+  for (int i = 0; i < N; i++)
+    a[i] = 11*i;
+  for (int i = 0; i < N; i++)
+    b[i] = 31*i;
+
+  d_a = acc_copyin (a, N*sizeof (int));
+  acc_copyin_async (b, N*sizeof (int), acc_async_noval);
+
+  #pragma acc parallel deviceptr(d_c) async
+  {
+    int *cc = (int *) d_c;
+    #pragma acc loop
+    for (int i = 0; i < N; i++)
+      cc[i] = -17*i;
+  }
+
+  acc_memcpy_device_async (d_d, d_a, N*sizeof (int), acc_async_noval);
+  acc_memcpy_device_async (d_f, d_c, N*sizeof (int), acc_async_noval);
+  acc_wait (acc_async_noval);
+  d_b = acc_deviceptr (b);
+  acc_memcpy_device_async (d_e, d_b, N*sizeof (int), acc_async_noval);
+  acc_wait (acc_async_noval);
+
+  #pragma acc serial deviceptr(d_d, d_e, d_f) copy(fail)
+  {
+    int *dd = (int *) d_d;
+    int *ee = (int *) d_e;
+    int *ff = (int *) d_f;
+    for (int i = 0; i < N; i++)
+      if (dd[i] != 11*i
+	  || ee[i] != 31*i
+	  || ff[i] != -17*i)
+	{
+	  fail = true;
+	  __builtin_abort ();
+	}
+  }
+  if (fail) __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
index d929bfd..a9a8c74 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
@@ -114,8 +114,6 @@ void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_look
 
 int main()
 {
-  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
-
   STATE_OP (state, = 0);
   reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
   reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
index b5e7715..91b3732 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
@@ -270,8 +270,6 @@ static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *
 
 int main()
 {
-  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
-
   STATE_OP (state, = 0);
   reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
   reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
index 2c85397..2cd2c98 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
@@ -59,6 +59,7 @@ static int state = -1;
 static acc_device_t acc_device_type;
 static int acc_device_num;
 static int num_gangs, num_workers, vector_length;
+static int async;
 
 
 static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
@@ -76,7 +77,7 @@ static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *e
   assert (prof_info->device_type == acc_device_type);
   assert (prof_info->device_number == acc_device_num);
   assert (prof_info->thread_id == -1);
-  assert (prof_info->async == acc_async_noval);
+  assert (prof_info->async == async);
   assert (prof_info->async_queue == prof_info->async);
   assert (prof_info->src_file == NULL);
   assert (prof_info->func_name == NULL);
@@ -166,8 +167,6 @@ void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_look
 
 int main()
 {
-  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
-
   STATE_OP (state, = 0);
   reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
   assert (state == 0);
@@ -176,8 +175,10 @@ int main()
   acc_device_num = acc_get_device_num (acc_device_type);
   assert (state == 0);
 
-  /* Parallelism dimensions: compiler/runtime decides.  */
   STATE_OP (state, = 0);
+  /* Implicit async.  */
+  async = acc_async_noval;
+  /* Parallelism dimensions: compiler/runtime decides.  */
   num_gangs = num_workers = vector_length = 0;
   {
 #define N 100
@@ -203,8 +204,10 @@ int main()
 #undef N
   }
 
-  /* Parallelism dimensions: literal.  */
   STATE_OP (state, = 0);
+  /* Explicit async: without argument.  */
+  async = acc_async_noval;
+  /* Parallelism dimensions: literal.  */
   num_gangs = 30;
   num_workers = 3;
   vector_length = 5;
@@ -212,6 +215,7 @@ int main()
 #define N 100
     int x[N];
 #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \
+  async \
   num_gangs (30) num_workers (3) vector_length (5)
     /* { dg-note {OpenACC 'kernels' decomposition: variable 'i' declared in block requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
        { dg-note {variable 'i' made addressable} {} { target *-*-* } l_compute$c_compute } */
@@ -234,8 +238,10 @@ int main()
 #undef N
   }
 
-  /* Parallelism dimensions: variable.  */
   STATE_OP (state, = 0);
+  /* Explicit async: variable.  */
+  async = 123;
+  /* Parallelism dimensions: variable.  */
   num_gangs = 22;
   num_workers = 5;
   vector_length = 7;
@@ -243,6 +249,7 @@ int main()
 #define N 100
     int x[N];
 #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \
+  async (async) \
   num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
     /* { dg-note {OpenACC 'kernels' decomposition: variable 'i' declared in block requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
        { dg-note {variable 'i' made addressable} {} { target *-*-* } l_compute$c_compute } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
index 9b4493d..27f86d3 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -830,8 +830,6 @@ static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *eve
 
 int main()
 {
-  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
-
   STATE_OP (state, = 0);
   reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
   reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
index 5b58c51..a723ad9 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
@@ -143,8 +143,6 @@ typedef struct E
 
 int main()
 {
-  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
-
   A A1;
   DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
   assert (VALID_BYTES_A <= sizeof A1);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
index f537868..5c05ee3 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
@@ -16,7 +16,7 @@ static void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info,
 {
   DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
 
-  assert (prof_info->version == 201711);
+  assert (prof_info->version == 201811);
 
   ++ev_count;
 }
@@ -56,8 +56,6 @@ void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_look
 
 int main()
 {
-  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
-
   ev_count = 0;
 
   /* Trigger tests done in 'cb_*' functions.  */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-firstprivate-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-firstprivate-1.c
index 8900a4e..4b88c53 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-firstprivate-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-firstprivate-1.c
@@ -1,6 +1,12 @@
 /* Test behavior of 'firstprivate' lexically vs. dynamically nested inside a
    'data' region.  */
 
+/* The firstprivate_int optimization changes the semantics of firstprivate
+   in dynamically_nested_compute_2 to copy-by-value when not using shared
+   memory, leading to the behaviour suggested in PR92036 for this case.  */
+
+/* { dg-xfail-run-if "firstprivate_int" { *-*-* } { "-DACC_MEM_SHARED=0" } } */
+
 #include <stdlib.h>
 
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-int.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-int.c
new file mode 100644
index 0000000..6d14599
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-int.c
@@ -0,0 +1,67 @@
+/* Verify the GOMP_MAP_FIRSTPRIVATE_INT optimization on various types.  */
+
+#include <assert.h>
+#include <stdint.h>
+#include <complex.h>
+
+int
+main ()
+{
+  int8_t  i8i  = -1, i8o;
+  int16_t i16i = -2, i16o;
+  int32_t i32i = -3, i32o;
+  int64_t i64i = -4, i64o;
+
+  uint8_t  u8i  = 1,  u8o;
+  uint16_t u16i = 2, u16o;
+  uint32_t u32i = 3, u32o;
+  uint64_t u64i = 4, u64o;
+
+  float  r32i = .5, r32o;
+  double r64i = .25, r64o;
+
+  int _Complex    cii = 2, cio;
+  float _Complex  cfi = 4, cfo;
+  double _Complex cdi = 8, cdo;
+
+#pragma acc parallel firstprivate (i8i,i16i,i32i,i64i,u8i,u16i,u32i,u64i) \
+  firstprivate(r32i,r64i,cii,cfi,cdi) copyout(i8o,i16o,i32o,i64o) \
+  copyout(u8o,u16o,u32o,u64o,r32o,r64o,cio,cfo,cdo) num_gangs(1)
+  {
+    i8o = i8i;
+    i16o = i16i;
+    i32o = i32i;
+    i64o = i64i;
+
+    u8o = u8i;
+    u16o = u16i;
+    u32o = u32i;
+    u64o = u64i;
+
+    r32o = r32i;
+    r64o = r64i;
+
+    cio = cii;
+    cfo = cfi;
+    cdo = cdi;
+  }
+
+  assert (i8o == i8i);
+  assert (i16o == i16i);
+  assert (i32o == i32i);
+  assert (i64o == i64i);
+
+  assert (u8o == u8i);
+  assert (u16o == u16i);
+  assert (u32o == u32i);
+  assert (u64o == u64i);
+
+  assert (r32o == r32i);
+  assert (r64o == r64i);
+
+  assert (cio == cii);
+  assert (cfo == cfi);
+  assert (cdo == cdi);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/implicit-mapping-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/implicit-mapping-1.c
new file mode 100644
index 0000000..ed0ab94
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/implicit-mapping-1.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+
+#include <string.h>
+#include <assert.h>
+
+int main(void)
+{
+  int arr[100];
+
+  memset (arr, 0, sizeof (int) * 100);
+
+#pragma acc enter data copyin(arr[30:10])
+
+#pragma acc serial
+/* { dg-warning {using .vector_length \(32\)., ignoring 1} "" { target openacc_nvidia_accel_selected } .-1 } */
+  {
+    arr[33] = 66;
+  }
+
+#pragma acc exit data copyout(arr[30:10])
+
+  assert (arr[33] == 66);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
index 00e0ca8..0c46f95 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
@@ -10,46 +10,14 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuModuleLoad (&module, "./subr.ptx");
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuModuleLoad failed: %d\n", r);
@@ -63,20 +31,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -91,7 +45,7 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -119,11 +73,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
index a2918c0..b28d115 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
@@ -2,6 +2,7 @@
 /* { dg-additional-options "-lcuda" } */
 /* { dg-require-effective-target openacc_cuda } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -11,47 +12,17 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 3;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t diff;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,20 +37,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -97,9 +54,29 @@ main (int argc, char **argv)
 	  abort ();
     }
 
+  gettimeofday (&tv1, NULL);
+
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[0], NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
+
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxLaunch failed: %d\n", r);
+      abort ();
+    }
+
+  gettimeofday (&tv2, NULL);
+
+  diff = tv2.tv_sec - tv1.tv_sec;
+
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -113,7 +90,7 @@ main (int argc, char **argv)
 	}
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  sleep ((diff + 1) * N);
 
   for (i = 0; i < N; i++)
     {
@@ -124,10 +101,6 @@ main (int argc, char **argv)
 	}
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
index 99b62f1..025cd8a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
@@ -11,45 +11,13 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -64,20 +32,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
     {
@@ -88,7 +42,7 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
     
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -101,7 +55,12 @@ main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize () failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -109,11 +68,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
index 5b4b3fd..21e0f8c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
@@ -2,6 +2,7 @@
 /* { dg-additional-options "-lcuda" } */
 /* { dg-require-effective-target openacc_cuda } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -11,47 +12,15 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 6;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,20 +35,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -99,13 +54,12 @@ main (int argc, char **argv)
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
     }
 
   if (acc_async_test_all () != 0)
@@ -114,7 +68,12 @@ main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -122,11 +81,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
index 939f255..13953df 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
@@ -6,77 +6,53 @@
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  gettimeofday (&tv2, NULL);
 
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -92,11 +68,9 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -104,38 +78,33 @@ main (int argc, char **argv)
     }
 
   acc_wait (0);
-  /* Test unseen async-argument.  */
-  acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (atime < dtime)
+  /* t1 and t2 are integer microsecond counts; form the percentage in
+     floating point so that deviations below 100% are not truncated to 0.  */
+  if ((labs (t2 - t1) * 100.0 / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait (0);
-  /* Test unseen async-argument.  */
-  acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (0.010 < atime)
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
index 804ee39..96c3675 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
@@ -7,78 +7,55 @@
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -94,16 +71,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -113,27 +85,20 @@ main (int argc, char **argv)
       acc_wait (0);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  /* t1 and t2 are integer microsecond counts; form the percentage in
+     floating point so that deviations below 100% are not truncated to 0.  */
+  if ((labs (t2 - t1) * 100.0 / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
index f904526..0ec97dd 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
@@ -7,78 +7,55 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   streams = (CUstream *) malloc (N * sizeof (void *));
 
@@ -99,16 +76,11 @@ main (int argc, char **argv)
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -118,27 +90,21 @@ main (int argc, char **argv)
       acc_wait (i);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  /* t1 and t2 are integer microsecond counts; form the percentage in
+     floating point so that deviations below 100% are not truncated to 0.  */
+  if ((labs (t2 - t1) * 100.0 / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
index d8cba4d..fb191c6 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
@@ -7,77 +7,53 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  gettimeofday (&tv2, NULL);
 
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -93,11 +69,9 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -106,33 +80,30 @@ main (int argc, char **argv)
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (atime < dtime)
+  if (t2 > (t1 + (t1 * 0.10)))
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (0.010 < atime)
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
index b805d5f..af8aa11 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
@@ -7,80 +7,55 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
-
-  devnum = 2;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
@@ -106,16 +81,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -157,7 +127,7 @@ main (int argc, char **argv)
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
   if (acc_async_test (0) != 1)
     abort ();
@@ -165,25 +135,18 @@ main (int argc, char **argv)
   if (acc_async_test (1) != 1)
     abort ();
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  /* t1 and t2 are integer microsecond counts; form the percentage in
+     floating point so that deviations below 100% are not truncated to 0.  */
+  if ((labs (t2 - t1) * 100.0 / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
index 958672c..902d257 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
@@ -7,78 +7,55 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams, stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
-      abort ();
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+	abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
-      abort ();
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+	abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 500.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
+  gettimeofday (&tv2, NULL);
 
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   streams = (CUstream *) malloc (N * sizeof (void *));
 
@@ -99,11 +76,6 @@ main (int argc, char **argv)
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (N);
   if (stream != NULL)
     abort ();
@@ -118,11 +90,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (N, stream))
     abort ();
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -130,6 +102,10 @@ main (int argc, char **argv)
 	}
     }
 
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
   acc_wait_all_async (N);
 
   for (i = 0; i <= N; i++)
@@ -146,15 +122,13 @@ main (int argc, char **argv)
 	abort ();
     }
 
-  atime = stop_timer (0);
-
-  if (atime < dtime)
+  if ((t1 * N) < t2)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   stream = (CUstream) acc_get_cuda_stream (N + 1);
   if (stream != NULL)
@@ -174,35 +148,33 @@ main (int argc, char **argv)
 
   acc_wait (N + 1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (0.10 < atime)
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all_async (N);
 
   acc_wait (N);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.10 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 3\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
index a36f8e6..054ffbf 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
@@ -11,46 +11,18 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay2;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 32;
   int i;
   CUstream *streams;
-  unsigned long **a, **d_a, *tid, ticks;
+  unsigned long **a, **d_a, *tid;
   int nbytes;
-  void *kargs[3];
-  int clkrate;
-  int devnum, nprocs;
+  void *kargs[2];
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -67,10 +39,6 @@ main (int argc, char **argv)
 
   nbytes = sizeof (int);
 
-  ticks = (unsigned long) (200.0 * clkrate);
-
-  N = nprocs;
-
   streams = (CUstream *) malloc (N * sizeof (void *));
 
   a = (unsigned long **) malloc (N * sizeof (unsigned long *));
@@ -104,8 +72,7 @@ main (int argc, char **argv)
   for (i = 0; i < N; i++)
     {
       kargs[0] = (void *) &d_a[i];
-      kargs[1] = (void *) &ticks;
-      kargs[2] = (void *) &tid[i];
+      kargs[1] = (void *) &tid[i];
 
       r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
       if (r != CUDA_SUCCESS)
@@ -113,8 +80,6 @@ main (int argc, char **argv)
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
-      ticks = (unsigned long) (50.0 * clkrate);
     }
 
   acc_wait_all_async (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c
new file mode 100644
index 0000000..b18155d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c
@@ -0,0 +1,19 @@
+/* { dg-do run { target { ! openacc_nvidia_accel_present } } } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+int
+main (void)
+{
+  fprintf (stderr, "CheCKpOInT\n");	/* Marker the first dg-output matches.  */
+  acc_init (acc_device_nvidia);		/* Presumably fails here (dg-shouldfail).  */
+
+  acc_shutdown (acc_device_nvidia);
+
+  return 0;
+}
+
+/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
+/* { dg-output "device type nvidia not supported" } */
+/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
index c13cab7..4182755 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
@@ -107,7 +107,7 @@ int vector_1 (int *ary, int size)
   {
 #pragma acc loop gang
     for (int jx = 0; jx < 1; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int ix = 0; ix < size; ix++)
 	ary[ix] = place ();
   }
@@ -123,7 +123,7 @@ int vector_2 (int *ary, int size)
   {
 #pragma acc loop worker
     for (int jx = 0; jx < size  / 64; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int ix = 0; ix < 64; ix++)
 	ary[ix + jx * 64] = place ();
   }
@@ -139,7 +139,7 @@ int worker_1 (int *ary, int size)
   {
 #pragma acc loop gang
     for (int kx = 0; kx < 1; kx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int jx = 0; jx <  size  / 64; jx++)
 #pragma acc loop vector
 	for (int ix = 0; ix < 64; ix++)
@@ -156,7 +156,7 @@ int gang_1 (int *ary, int size)
 #pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
   /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-1 } */
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int jx = 0; jx <  size  / 64; jx++)
 #pragma acc loop worker
       for (int ix = 0; ix < 64; ix++)
@@ -172,11 +172,11 @@ int gang_2 (int *ary, int size)
   
 #pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int kx = 0; kx < size / (32 * 32); kx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int jx = 0; jx <  32; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
 	for (int ix = 0; ix < 32; ix++)
 	  ary[ix + jx * 32 + kx * 32 * 32] = place ();
   }
@@ -190,9 +190,9 @@ int gang_3 (int *ary, int size)
   
 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int jx = 0; jx <  size  / 64; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int ix = 0; ix < 64; ix++)
 	ary[ix + jx * 64] = place ();
   }
@@ -206,7 +206,7 @@ int gang_4 (int *ary, int size)
   
 #pragma acc parallel vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int jx = 0; jx <  size; jx++)
       ary[jx] = place ();
   }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c
new file mode 100644
index 0000000..6c479e4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c
@@ -0,0 +1,13 @@
+/* { dg-additional-options "-fopenacc-dim=16:16" } */
+/* This code uses nvptx inline assembly guarded with acc_on_device, which is
+   not optimized away at -O0, and then confuses the target assembler.
+   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
+/* { dg-set-target-env-var "GOMP_OPENACC_DIM" "8:8" } */
+
+#include "loop-default.h"
+
+int main ()
+{
+  /* Environment should be ignored.  */
+  return test_1 (16, 16, 32);
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
index d3f6ea2..18d56f6d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@@ -67,12 +67,23 @@ int main ()
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
 	  
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
 	  int g = ix / (chunk_size * workersize * vectorsize);
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
index 4099d60..e29e89d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@@ -64,12 +64,23 @@ int main ()
       int val = ix;
       if (ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
-	  
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
+
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
 	  int g = ix / (chunk_size * vectorsize * workersize);
 	  int w = ix / vectorsize % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
index fadb262..616cf50 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@@ -63,8 +63,24 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
index 7732606..560b748 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@@ -65,8 +65,24 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-1.c
new file mode 100644
index 0000000..a70375c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-1.c
@@ -0,0 +1,103 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+#include <assert.h>
+
+#define n 100
+#define m 100
+
+int b[n][m];
+
+void
+test1 (void)
+{
+  int i, j, *a[100];
+
+  /* Array of pointers form test.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = (int *)malloc (sizeof (int) * m);
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    {
+      for (j = 0; j < m; j++)
+	assert (a[i][j] == b[i][j]);
+      /* Clean up.  */
+      free (a[i]);
+    }
+}
+
+void
+test2 (void)
+{
+  int i, j, **a = (int **) malloc (sizeof (int *) * n);
+
+  /* Separately allocated blocks.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = (int *)malloc (sizeof (int) * m);
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    {
+      for (j = 0; j < m; j++)
+	assert (a[i][j] == b[i][j]);
+      /* Clean up.  */
+      free (a[i]);
+    }
+  free (a);
+}
+
+void
+test3 (void)
+{
+  int i, j, **a = (int **) malloc (sizeof (int *) * n);
+  a[0] = (int *) malloc (sizeof (int) * n * m);
+
+  /* Rows allocated in one contiguous block.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = *a + i * m;
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    for (j = 0; j < m; j++)
+      assert (a[i][j] == b[i][j]);
+
+  free (a[0]);
+  free (a);
+}
+
+int
+main (void)
+{
+  test1 ();
+  test2 ();
+  test3 ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-2.c
new file mode 100644
index 0000000..b85c637
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-2.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+
+#include <assert.h>
+#include "noncontig_array-utils.h"
+
+int
+main (void)
+{
+  int n = 10;
+  int ***a = (int ***) create_ncarray (sizeof (int), n, 3);
+  int ***b = (int ***) create_ncarray (sizeof (int), n, 3);
+  int ***c = (int ***) create_ncarray (sizeof (int), n, 3);
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	{
+	  a[i][j][k] = i + j * k + k;
+	  b[i][j][k] = j + k * i + i * j;
+	  c[i][j][k] = a[i][j][k];
+	}
+
+  #pragma acc parallel copy (a[0:n][0:n][0:n]) copyin (b[0:n][0:n][0:n])
+  {
+    for (int i = 0; i < n; i++)
+      for (int j = 0; j < n; j++)
+	for (int k = 0; k < n; k++)
+	  a[i][j][k] += b[k][j][i] + i + j + k;
+  }
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	assert (a[i][j][k] == c[i][j][k] + b[k][j][i] + i + j + k);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-3.c
new file mode 100644
index 0000000..99db207
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-3.c
@@ -0,0 +1,45 @@
+/* { dg-do run } */
+
+#include <assert.h>
+#include "noncontig_array-utils.h"
+
+int main (void)
+{
+  int n = 20, x = 5, y = 12;
+  int *****a = (int *****) create_ncarray (sizeof (int), n, 5);
+
+  int sum1 = 0, sum2 = 0, sum3 = 0;
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	for (int l = 0; l < n; l++)
+	  for (int m = 0; m < n; m++)
+	    {
+	      a[i][j][k][l][m] = 1;
+	      sum1++;
+	    }
+
+  #pragma acc parallel copy (a[x:y][x:y][x:y][x:y][x:y]) copy(sum2)
+  {
+    for (int i = x; i < x + y; i++)
+      for (int j = x; j < x + y; j++)
+	for (int k = x; k < x + y; k++)
+	  for (int l = x; l < x + y; l++)
+	    for (int m = x; m < x + y; m++)
+	      {
+		a[i][j][k][l][m] = 0;
+		sum2++;
+	      }
+  }
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	for (int l = 0; l < n; l++)
+	  for (int m = 0; m < n; m++)
+	    sum3 += a[i][j][k][l][m];
+
+  assert (sum1 == sum2 + sum3);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-4.c
new file mode 100644
index 0000000..6cfaf98
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-4.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+
+#include <assert.h>
+#include "noncontig_array-utils.h"
+
+int main (void)
+{
+  int n = 128;
+  double ***a = (double ***) create_ncarray (sizeof (double), n, 3);
+  double ***b = (double ***) create_ncarray (sizeof (double), n, 3);
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	a[i][j][k] = i + j + k + i * j * k;
+
+  /* This test exercises async copyout of non-contiguous array rows.  */
+  #pragma acc parallel copyin(a[0:n][0:n][0:n]) copyout(b[0:n][0:n][0:n]) async(5)
+  {
+    #pragma acc loop gang
+    for (int i = 0; i < n; i++)
+      #pragma acc loop vector
+      for (int j = 0; j < n; j++)
+	for (int k = 0; k < n; k++)
+	  b[i][j][k] = a[i][j][k] * 2.0;
+  }
+
+  #pragma acc wait (5)
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	assert (b[i][j][k] == a[i][j][k] * 2.0);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-utils.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-utils.h
new file mode 100644
index 0000000..6900d1f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/noncontig_array-utils.h
@@ -0,0 +1,44 @@
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+
+/* Allocate and create a pointer based NDIMS-dimensional array,
+   each dimension DIMLEN long, with ELSIZE sized data elements.  The
+   result is one zero-filled block; aborts if allocation fails.  */
+void *
+create_ncarray (size_t elsize, int dimlen, int ndims)
+{
+  size_t blk_size = 0;
+  size_t n = 1;
+
+  for (int i = 0; i < ndims - 1; i++)
+    {
+      n *= dimlen;
+      blk_size += sizeof (void *) * n;
+    }
+  size_t data_rows_num = n;
+  blk_size += elsize * n * dimlen;
+
+  void *blk = calloc (1, blk_size);
+  if (blk == NULL)
+    abort ();
+  void **curr_dim = (void **) blk;
+  n = 1;
+
+  for (int d = 0; d < ndims - 1; d++)
+    {
+      uintptr_t next_dim = (uintptr_t) (curr_dim + n * dimlen);
+      size_t next_dimlen = dimlen * (d < ndims - 2 ? sizeof (void *) : elsize);
+
+      for (size_t b = 0; b < n; b++)
+	for (int i = 0; i < dimlen; i++)
+	  curr_dim[b * dimlen + i]
+	    = (void *) (next_dim + b * dimlen * next_dimlen + i * next_dimlen);
+
+      n *= dimlen;
+      curr_dim = (void **) next_dim;
+    }
+  assert (n == data_rows_num);
+  return blk;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c
new file mode 100644
index 0000000..856ef0e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c
@@ -0,0 +1,29 @@
+/* Check a parallel reduction which is explicitly initialized by
+   the user.  */
+
+#include <assert.h>
+
+int
+main ()
+{
+  int n = 10;
+  float accel = 1.0, host = 1.0;
+  int i;
+
+#pragma acc parallel copyin(n) reduction(*:accel)
+  {
+    accel = 1.0;
+#pragma acc loop gang reduction(*:accel)
+    for( i = 1; i <= n; i++)
+      {
+	accel *= 2.0;
+      }
+  }
+
+  for (i = 1; i <= n; i++)
+    host *= 2.0;
+
+  assert (accel == host);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828-2.c
new file mode 100644
index 0000000..357114c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828-2.c
@@ -0,0 +1,34 @@
+/* Subarray declared on data construct, accessed through pointer.  */
+
+#include <assert.h>
+
+void
+s1 (int *arr, int c)
+{
+#pragma acc data copy(arr[5:c-10])
+  {
+#pragma acc parallel loop
+    for (int i = 5; i < c - 5; i++)
+      arr[i] = i;
+  }
+}
+
+int
+main (int argc, char* argv[])
+{
+  const int c = 100;
+  int arr[c];
+
+  for (int i = 0; i < c; i++)
+    arr[i] = 0;
+
+  s1 (arr, c);
+
+  for (int i = 0; i < c; i++)
+    if (i >= 5 && i < c - 5)
+      assert (arr[i] == i);
+    else
+      assert (arr[i] == 0);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828.c
new file mode 100644
index 0000000..4b6dbd7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828.c
@@ -0,0 +1,27 @@
+/* Subarray declared on enclosing data construct.  */
+
+#include <assert.h>
+
+int
+main ()
+{
+  int a[100], i;
+
+  for (i = 0; i < 100; i++)
+    a[i] = 0;
+
+#pragma acc data copy(a[10:80])
+  {
+    #pragma acc parallel loop
+    for (i = 10; i < 90; i++)
+      a[i] = i;
+  }
+
+  for (i = 0; i < 100; i++)
+    if (i >= 10 && i < 90)
+      assert (a[i] == i);
+    else
+      assert (a[i] == 0);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c
new file mode 100644
index 0000000..206e66f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c
@@ -0,0 +1,41 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+#define N 100
+  int n = N;
+  int i, j, tmp;
+  int input[N*N], output[N], houtput[N];
+
+  for (i = 0; i < n * n; i++)
+    input[i] = i;
+
+  for (i = 0; i < n; i++)
+    {
+      tmp = 0;
+      for (j = 0; j < n; j++)
+	tmp += input[i * n + j];
+      houtput[i] = tmp;
+    }
+
+  #pragma acc parallel loop gang
+  for (i = 0; i < n; i++)
+    {
+      tmp = 0;
+
+      #pragma acc loop worker reduction(+:tmp)
+      for (j = 0; j < n; j++)
+	tmp += input[i * n + j];
+
+      output[i] = tmp;
+    }
+
+  /* Test if every worker-level reduction had correct private result.  */
+  for (i = 0; i < n; i++)
+    if (houtput[i] != output[i])
+      abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c
new file mode 100644
index 0000000..0c317dc
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c
@@ -0,0 +1,23 @@
+#include <assert.h>
+
+int
+main ()
+{
+  const int n = 1000;
+  int i, j, temp, a[n];
+
+#pragma acc parallel loop
+  for (i = 0; i < n; i++)
+    {
+      temp = i;
+#pragma acc loop reduction (+:temp)
+      for (j = 0; j < n; j++)
+	temp ++;
+      a[i] = temp;
+    }
+
+  for (i = 0; i < n; i++)
+    assert (a[i] == i+n);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-1.c
new file mode 100644
index 0000000..6f1b86a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-1.c
@@ -0,0 +1,69 @@
+/* { dg-do run } */
+
+/* Array reductions.  */
+
+#include <stdlib.h>
+#include "reduction.h"
+
+#define ng 8
+#define nw 4
+#define vl 32
+
+#define N 10
+
+#define check_reduction_array_op_all(type, opr, init, b)	\
+  check_reduction_xxx_xx_all(array, op, type, opr, init, b)
+#define check_reduction_arraysec_op_all(type, opr, init, b)	\
+  check_reduction_xxx_xx_all(arraysec, op, type, opr, init, b)
+#define check_reduction_array_macro_all(type, opr, init, b)	\
+  check_reduction_xxx_xx_all(array, macro, type, opr, init, b)
+#define check_reduction_arraysec_macro_all(type, opr, init, b)	\
+  check_reduction_xxx_xx_all(arraysec, macro, type, opr, init, b)
+    
+int
+main (void)
+{
+  const int n = 100;
+  int ints[n];
+  float flts[n];
+  double dbls[n];
+  int cmp_val = 5;
+
+  for (int i = 0; i < n; i++)
+    {
+      ints[i] = i + 1;
+      flts[i] = i + 1;
+      dbls[i] = i + 1;
+    }
+
+  check_reduction_array_op_all (int, +, 0, ints[i]);
+  check_reduction_array_op_all (int, *, 1, ints[i]);
+  check_reduction_array_op_all (int, &, -1, ints[i]);
+  check_reduction_array_op_all (int, |, 0, ints[i]);
+  check_reduction_array_op_all (int, ^, 0, ints[i]);
+  check_reduction_array_op_all (int, &&, 1, (cmp_val > ints[i]));
+  check_reduction_array_op_all (int, ||, 0, (cmp_val > ints[i]));
+  check_reduction_array_macro_all (int, min, n + 1, ints[i]);
+  check_reduction_array_macro_all (int, max, -1, ints[i]);
+
+  check_reduction_array_op_all (float, +, 0, flts[i]);
+  check_reduction_array_op_all (float, *, 1, flts[i]);
+  check_reduction_array_macro_all (float, min, n + 1, flts[i]);
+  check_reduction_array_macro_all (float, max, -1, flts[i]);
+
+  check_reduction_arraysec_op_all (int, +, 0, ints[i]);
+  check_reduction_arraysec_op_all (float, *, 1, flts[i]);
+  check_reduction_arraysec_macro_all (double, min, n + 1, dbls[i]);
+  check_reduction_arraysec_macro_all (double, max, -1, dbls[i]);
+
+  check_reduction_array_op_all (double, +, 0, dbls[i]);
+#if 0
+  /* Currently fails due to an unclear issue, presumably unrelated to reduction
+     mechanics.  Avoiding for now.  */
+  check_reduction_array_op_all (double, *, 1.0, dbls[i]);
+#endif
+  check_reduction_array_macro_all (double, min, n + 1, dbls[i]);
+  check_reduction_array_macro_all (double, max, -1, dbls[i]);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c
new file mode 100644
index 0000000..db8b374
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c
@@ -0,0 +1,115 @@
+/* { dg-do run } */
+
+/* More array reduction tests, different combinations of parallel/loop
+   construct, implied/explicit copy clauses, and subarrays. */
+
+#define ARRAY_BODY(ARRAY, MIN, LEN)		\
+  for (int i = 0; i < 10; i++)			\
+    for (int j = MIN; j < MIN + LEN; j++)	\
+      ARRAY[j] += 1;
+
+int main (void)
+{
+  int o[6] = { 5, 1, 1, 5, 9, 9 };
+  int a[6];
+
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    a[i] = o[i];
+
+  #pragma acc parallel
+  #pragma acc loop reduction(+:a[1:2])
+  ARRAY_BODY (a, 1, 2)
+  ARRAY_BODY (o, 1, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel
+  #pragma acc loop gang reduction(+:a[1:2])
+  ARRAY_BODY (a, 1, 2)
+  ARRAY_BODY (o, 1, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a[3:2])
+  #pragma acc loop reduction(+:a[3:2])
+  ARRAY_BODY (a, 3, 2)
+  ARRAY_BODY (o, 3, 2)
+  for (int i = 0; i < 6; i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a[3:2])
+  #pragma acc loop worker reduction(+:a[3:2])
+  ARRAY_BODY (a, 3, 2)
+  ARRAY_BODY (o, 3, 2)
+  for (int i = 0; i < 6; i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop reduction(+:a[0:5])
+  ARRAY_BODY (a, 0, 5)
+  ARRAY_BODY (o, 0, 5)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop vector reduction(+:a[0:5])
+  ARRAY_BODY (a, 0, 5)
+  ARRAY_BODY (o, 0, 5)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel
+  #pragma acc loop reduction(+:a)
+  ARRAY_BODY (a, 4, 1)
+  ARRAY_BODY (o, 4, 1)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop reduction(+:a)
+  ARRAY_BODY (a, 3, 3)
+  ARRAY_BODY (o, 3, 3)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+#if !defined(ACC_DEVICE_TYPE_host)
+
+  #pragma acc parallel loop reduction(+:a)
+  ARRAY_BODY (a, 1, 3)
+  ARRAY_BODY (o, 1, 3)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel loop reduction(+:a[2:3])
+  ARRAY_BODY (a, 2, 3)
+  ARRAY_BODY (o, 2, 3)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel reduction(+:a)
+  ARRAY_BODY (a, 3, 2)
+  ARRAY_BODY (o, 3, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel reduction(+:a[1:2])
+  ARRAY_BODY (a, 1, 2)
+  ARRAY_BODY (o, 1, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+#endif
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c
new file mode 100644
index 0000000..0f023b7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c
@@ -0,0 +1,114 @@
+/* { dg-do run } */
+
+/* Same as reduction-arrays-2.c test, but with non-constant subarray
+   base indexes.  */
+
+#define ARRAY_BODY(ARRAY, MIN, LEN)		\
+  for (int i = 0; i < 10; i++)			\
+    for (int j = MIN; j < MIN + LEN; j++)	\
+      ARRAY[j] += 1;
+
+int zero = 0;
+int one = 1;
+int two = 2;
+int three = 3;
+int four = 4;
+
+int main (void)
+{
+  int o[6] = { 5, 1, 1, 5, 9, 9 };
+  int a[6];
+
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    a[i] = o[i];
+
+  #pragma acc parallel
+  #pragma acc loop reduction(+:a[one:2])
+  ARRAY_BODY (a, one, 2)
+  ARRAY_BODY (o, one, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel
+  #pragma acc loop gang reduction(+:a[one:2])
+  ARRAY_BODY (a, one, 2)
+  ARRAY_BODY (o, one, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a[three:2])
+  #pragma acc loop reduction(+:a[three:2])
+  ARRAY_BODY (a, three, 2)
+  ARRAY_BODY (o, three, 2)
+  for (int i = 0; i < 6; i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a[three:2])
+  #pragma acc loop worker reduction(+:a[three:2])
+  ARRAY_BODY (a, three, 2)
+  ARRAY_BODY (o, three, 2)
+  for (int i = 0; i < 6; i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop reduction(+:a[zero:5])
+  ARRAY_BODY (a, zero, 5)
+  ARRAY_BODY (o, zero, 5)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop vector reduction(+:a[zero:5])
+  ARRAY_BODY (a, zero, 5)
+  ARRAY_BODY (o, zero, 5)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel
+  #pragma acc loop reduction(+:a)
+  ARRAY_BODY (a, four, 1)
+  ARRAY_BODY (o, four, 1)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop reduction(+:a)
+  ARRAY_BODY (a, three, 3)
+  ARRAY_BODY (o, three, 3)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+#if !defined(ACC_DEVICE_TYPE_host)
+
+  #pragma acc parallel loop reduction(+:a)
+  ARRAY_BODY (a, one, 3)
+  ARRAY_BODY (o, one, 3)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel loop reduction(+:a[two:3])
+  ARRAY_BODY (a, two, 3)
+  ARRAY_BODY (o, two, 3)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel reduction(+:a[one:2])
+  ARRAY_BODY (a, one, 2)
+  ARRAY_BODY (o, one, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+#endif
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c
new file mode 100644
index 0000000..94dd4c4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c
@@ -0,0 +1,115 @@
+/* { dg-do run } */
+
+/* Same as reduction-arrays-3.c test, but additionally with
+   non-constant subarray lengths.  */
+
+#define ARRAY_BODY(ARRAY, MIN, LEN)		\
+  for (int i = 0; i < 10; i++)			\
+    for (int j = MIN; j < MIN + LEN; j++)	\
+      ARRAY[j] += 1;
+
+int zero = 0;
+int one = 1;
+int two = 2;
+int three = 3;
+int four = 4;
+int five = 5;
+
+int main (void)
+{
+  int o[6] = { 5, 1, 1, 5, 9, 9 };
+  int a[6];
+
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    a[i] = o[i];
+
+  #pragma acc parallel
+  #pragma acc loop reduction(+:a[one:two])
+  ARRAY_BODY (a, one, two)
+  ARRAY_BODY (o, one, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel
+  #pragma acc loop gang reduction(+:a[one:two])
+  ARRAY_BODY (a, one, two)
+  ARRAY_BODY (o, one, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a[three:two])
+  #pragma acc loop reduction(+:a[three:two])
+  ARRAY_BODY (a, three, two)
+  ARRAY_BODY (o, three, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a[three:two])
+  #pragma acc loop worker reduction(+:a[three:two])
+  ARRAY_BODY (a, three, two)
+  ARRAY_BODY (o, three, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop reduction(+:a[zero:five])
+  ARRAY_BODY (a, zero, five)
+  ARRAY_BODY (o, zero, five)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop vector reduction(+:a[zero:five])
+  ARRAY_BODY (a, zero, five)
+  ARRAY_BODY (o, zero, five)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel
+  #pragma acc loop reduction(+:a)
+  ARRAY_BODY (a, four, one)
+  ARRAY_BODY (o, four, one)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop reduction(+:a)
+  ARRAY_BODY (a, three, three)
+  ARRAY_BODY (o, three, three)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+#if !defined(ACC_DEVICE_TYPE_host)
+
+  #pragma acc parallel loop reduction(+:a)
+  ARRAY_BODY (a, one, three)
+  ARRAY_BODY (o, one, three)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel loop reduction(+:a[two:three])
+  ARRAY_BODY (a, two, three)
+  ARRAY_BODY (o, two, three)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel reduction(+:a[one:two])
+  ARRAY_BODY (a, one, two)
+  ARRAY_BODY (o, one, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+#endif
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c
new file mode 100644
index 0000000..56ae020
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c
@@ -0,0 +1,113 @@
+/* { dg-do run } */
+
+/* Same as reduction-arrays-4.c test, but reduced arrays are VLAs.  */
+/* ARRAY_BODY: ten outer passes, each adding 1 to ARRAY[MIN] .. ARRAY[MIN + LEN - 1].  */
+#define ARRAY_BODY(ARRAY, MIN, LEN)		\
+  for (int i = 0; i < 10; i++)			\
+    for (int j = MIN; j < MIN + LEN; j++)	\
+      ARRAY[j] += 1;
+
+int zero = 0;  /* NOTE(review): section bounds are mutable globals, presumably so they are not compile-time constants -- confirm.  */
+int one = 1;
+int two = 2;
+int three = 3;
+int four = 4;
+int five = 5;
+int six = 6;  /* Also the length of the VLAs 'a' and 'o' in main.  */
+
+int main (void)
+{
+  int init[6] = { 5, 1, 1, 5, 9, 9 };
+  int o[six];  /* Reference copy; only ever updated by loops outside the OpenACC constructs.  */
+  int a[six];  /* Copy updated by the OpenACC reductions below.  */
+
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    a[i] = o[i] = init[i];
+
+  #pragma acc parallel
+  #pragma acc loop reduction(+:a[one:two])
+  ARRAY_BODY (a, one, two)  /* The two pragmas above apply to this loop nest only.  */
+  ARRAY_BODY (o, one, two)  /* Same update, outside any OpenACC construct.  */
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();  /* Reduction result differs from the reference.  */
+
+  #pragma acc parallel
+  #pragma acc loop gang reduction(+:a[one:two])
+  ARRAY_BODY (a, one, two)
+  ARRAY_BODY (o, one, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a[three:two])
+  #pragma acc loop reduction(+:a[three:two])
+  ARRAY_BODY (a, three, two)
+  ARRAY_BODY (o, three, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a[three:two])
+  #pragma acc loop worker reduction(+:a[three:two])
+  ARRAY_BODY (a, three, two)
+  ARRAY_BODY (o, three, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop reduction(+:a[zero:five])
+  ARRAY_BODY (a, zero, five)
+  ARRAY_BODY (o, zero, five)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop vector reduction(+:a[zero:five])
+  ARRAY_BODY (a, zero, five)
+  ARRAY_BODY (o, zero, five)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel
+  #pragma acc loop reduction(+:a)
+  ARRAY_BODY (a, four, one)  /* Reduction names the whole array; only a[four] is touched.  */
+  ARRAY_BODY (o, four, one)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel copy(a)
+  #pragma acc loop reduction(+:a)
+  ARRAY_BODY (a, three, three)
+  ARRAY_BODY (o, three, three)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+  /* NOTE(review): reduction-arrays-4.c wraps its last three checks in #if !defined(ACC_DEVICE_TYPE_host); no such guard is used here -- confirm.  */
+  #pragma acc parallel loop reduction(+:a)
+  ARRAY_BODY (a, one, three)
+  ARRAY_BODY (o, one, three)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel loop reduction(+:a[two:three])
+  ARRAY_BODY (a, two, three)
+  ARRAY_BODY (o, two, three)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  #pragma acc parallel reduction(+:a[one:two])
+  ARRAY_BODY (a, one, two)  /* Reduction clause sits on the parallel construct itself; there is no loop directive.  */
+  ARRAY_BODY (o, one, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c
new file mode 100644
index 0000000..350174a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c
@@ -0,0 +1,32 @@
+#include <complex.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef float _Complex Type;
+
+#define N 32
+
+int
+main (void)
+{
+  Type ary[N];
+
+  for (int ix = 0; ix < N;  ix++)
+    ary[ix] = 1.0 + 1.0j;  /* Every element is 1 + i.  */
+
+  Type tprod = 1.0;  /* Multiplicative identity as the starting value.  */
+
+#pragma acc parallel vector_length(32)
+  {
+#pragma acc loop vector reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
+      tprod *= ary[ix];
+  }
+
+  Type expected = 65536.0;  /* (1 + i)^2 = 2i, hence (1 + i)^32 = (2i)^16 = 65536.  */
+
+  if (tprod != expected)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-structs-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-structs-1.c
new file mode 100644
index 0000000..22216ff
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-structs-1.c
@@ -0,0 +1,121 @@
+/* { dg-do run } */
+
+/* Struct reductions.  */
+
+#include <stdlib.h>
+#include "reduction.h"
+
+#define ng 8
+#define nw 4
+#define vl 32
+
+#define N 10
+
+typedef struct { int x, y; } int_pair;
+typedef struct { float m, n; } flt_pair;
+typedef struct  /* Type of the reduction variable: scalars, an array and two nested structs.  */
+{
+  int i;
+  double d;
+  float f;
+  int a[N];
+  int_pair ip;
+  flt_pair fp;
+} rectype;
+
+static void
+init_struct (rectype *rec, int val)
+{
+  rec->i = val;  /* Every member below receives VAL, converted to the member's type.  */
+  rec->d = (double) val;
+  rec->f = (float) val;
+  for (int i = 0; i < N; i++)
+    rec->a[i] = val;
+  rec->ip.x = val;
+  rec->ip.y = val;
+  rec->fp.m = (float) val;
+  rec->fp.n = (float) val;
+}
+
+static int
+struct_eq (rectype *a, rectype *b)
+{
+  if (a->i != b->i || a->d != b->d  /* Scalars and nested pairs are compared exactly, floats included.  */
+      || a->f != b->f
+      || a->ip.x != b->ip.x
+      || a->ip.y != b->ip.y
+      || a->fp.m != b->fp.m
+      || a->fp.n != b->fp.n)
+    return 0;
+
+  for (int i = 0; i < N; i++)
+    if (a->a[i] != b->a[i])
+      return 0;
+  return 1;  /* All members and all N array elements matched.  */
+}
+/* check_reduction_struct_xx: fold B into every member of RES under the given OpenACC clauses, then into VRES without them; uses 'n' from the expansion site.  */
+#define check_reduction_struct_xx(type, op, init, b, gwv_par, gwv_loop, apply) \
+  {									\
+    type res, vres;							\
+    init_struct (&res, init);						\
+    DO_PRAGMA (acc parallel gwv_par copy(res))				\
+    DO_PRAGMA (acc loop gwv_loop reduction (op:res))			\
+    for (int i = 0; i < n; i++)						\
+      {									\
+	res.i = apply (op, res.i, b);					\
+	res.d = apply (op, res.d, b);					\
+	res.f = apply (op, res.f, b);					\
+	for (int j = 0; j < N; j++)					\
+	  res.a[j] = apply (op, res.a[j], b);				\
+	res.ip.x = apply (op, res.ip.x, b);				\
+	res.ip.y = apply (op, res.ip.y, b);				\
+	res.fp.m = apply (op, res.fp.m, b);				\
+	res.fp.n = apply (op, res.fp.n, b);				\
+      }									\
+    /* Reference: the same updates applied to VRES with no OpenACC pragma.  */ \
+    init_struct (&vres, init);						\
+    for (int i = 0; i < n; i++)						\
+      {									\
+        vres.i = apply (op, vres.i, b);					\
+	vres.d = apply (op, vres.d, b);					\
+	vres.f = apply (op, vres.f, b);					\
+	for (int j = 0; j < N; j++)					\
+	  vres.a[j] = apply (op, vres.a[j], b);				\
+	vres.ip.x = apply (op, vres.ip.x, b);				\
+	vres.ip.y = apply (op, vres.ip.y, b);				\
+	vres.fp.m = apply (op, vres.fp.m, b);				\
+	vres.fp.n = apply (op, vres.fp.n, b);				\
+      }									\
+    /* Abort unless both results agree member by member.  */		\
+    if (!struct_eq (&res, &vres))					\
+      __builtin_abort ();						\
+  }
+/* Operator form: 'apply' expands to (a OP b).  */
+#define operator_apply(op, a, b) (a op b)
+#define check_reduction_struct_op(type, op, init, b, gwv_par, gwv_loop)	\
+  check_reduction_struct_xx(type, op, init, b, gwv_par, gwv_loop, operator_apply)
+/* Function-like form: 'apply' expands to OP (a, b).  */
+#define function_apply(op, a, b) (op (a, b))
+#define check_reduction_struct_macro(type, op, init, b, gwv_par, gwv_loop) \
+  check_reduction_struct_xx(type, op, init, b, gwv_par, gwv_loop, function_apply)
+/* Both '_all' wrappers hand over to check_reduction_xxx_xx_all with tclass 'struct'.  */
+#define check_reduction_struct_op_all(type, opr, init, b)	\
+  check_reduction_xxx_xx_all (struct, op, type, opr, init, b)
+#define check_reduction_struct_macro_all(type, opr, init, b)		\
+  check_reduction_xxx_xx_all (struct, macro, type, opr, init, b)
+
+int
+main (void)
+{
+  const int n = 10;  /* Trip count read by name inside check_reduction_struct_xx.  */
+  int ints[n];
+
+  for (int i = 0; i < n; i++)
+    ints[i] = i + 1;  /* Operands 1 .. n.  */
+
+  check_reduction_struct_op_all (rectype, +, 0, ints[i]);  /* 'i' in ints[i] binds to the loop index inside the macro expansion.  */
+  check_reduction_struct_op_all (rectype, *, 1, ints[i]);
+  check_reduction_struct_macro_all (rectype, min, n + 1, ints[i]);  /* Start above every operand.  */
+  check_reduction_struct_macro_all (rectype, max, -1, ints[i]);  /* Start below every operand.  */
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction.h
index 1b3f8d4..c928578 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction.h
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction.h
@@ -37,6 +37,58 @@ DO_PRAGMA (acc loop gwv_loop reduction (op:res))			\
       abort ();								\
   }
 
+#define check_reduction_array_xx(type, var, var_in_clause, op, init, b, \
+				 gwv_par, gwv_loop, apply)		\
+  {									\
+   type var[N], var ## _check[N];					\
+   for (int i = 0; i < N; i++)						\
+     var[i] = var ## _check[i] = (init);				\
+   DO_PRAGMA (acc parallel gwv_par copy (var_in_clause))		\
+   DO_PRAGMA (acc loop gwv_loop reduction (op: var_in_clause))		\
+   for (int i = 0; i < n; i++)						\
+     for (int j = 0; j < N; j++)					\
+       var[j] = apply (op, var[j], (b));				\
+   /* Reference: same update on VAR_check with no OpenACC pragma.  */	\
+   for (int i = 0; i < n; i++)						\
+     for (int j = 0; j < N; j++)					\
+       var ## _check[j] = apply (op, var ## _check[j], (b));		\
+   /* Every element must match the reference.  */			\
+   for (int j = 0; j < N; j++)						\
+     if (var[j] != var ## _check[j])					\
+       abort ();							\
+  }
+/* Operator form; the clauses name either the whole array 'v' or the section 'v[:N]'.  */
+#define operator_apply(op, a, b) (a op b)
+#define check_reduction_array_op(type, op, init, b, gwv_par, gwv_loop)	\
+  check_reduction_array_xx (type, v, v, op, init, b, gwv_par, gwv_loop,	\
+			    operator_apply)
+#define check_reduction_arraysec_op(type, op, init, b, gwv_par, gwv_loop) \
+  check_reduction_array_xx (type, v, v[:N], op, init, b, gwv_par, gwv_loop, \
+			    operator_apply)
+
+/* Same two variants for function-like operators, applied as OP (a, b).  */
+#define function_apply(op, a, b) (op (a, b))
+#define check_reduction_array_macro(type, op, init, b, gwv_par, gwv_loop)\
+  check_reduction_array_xx (type, v, v, op, init, b, gwv_par, gwv_loop,	\
+			    function_apply)
+#define check_reduction_arraysec_macro(type, op, init, b, gwv_par, gwv_loop)\
+  check_reduction_array_xx (type, v, v[:N], op, init, b, gwv_par, gwv_loop, \
+			    function_apply)
+/* Expand check_reduction_<tclass>_<form> for the seven gang/worker/vector combinations; ng, nw and vl must be defined where this is expanded.  */
+#define check_reduction_xxx_xx_all(tclass, form, type, op, init, b)	\
+  check_reduction_ ## tclass ## _ ## form (type, op, init, b, num_gangs (ng), gang);	\
+  check_reduction_ ## tclass ## _ ## form (type, op, init, b, num_workers (nw), worker); \
+  check_reduction_ ## tclass ## _ ## form (type, op, init, b, vector_length (vl), vector); \
+  check_reduction_ ## tclass ## _ ## form (type, op, init, b,			\
+					   num_gangs (ng) num_workers (nw), gang worker); \
+  check_reduction_ ## tclass ## _ ## form (type, op, init, b,			\
+					   num_gangs (ng) vector_length (vl), gang vector); \
+  check_reduction_ ## tclass ## _ ## form (type, op, init, b,			\
+					   num_workers (nw) vector_length (vl), worker vector); \
+  check_reduction_ ## tclass ## _ ## form (type, op, init, b, \
+					   num_gangs (ng) num_workers (nw) vector_length (vl), \
+					   gang worker vector);
+
 #define max(a, b) (((a) > (b)) ? (a) : (b))
 #define min(a, b) (((a) < (b)) ? (a) : (b))
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
index 81e0811..59249a0 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@@ -62,12 +62,23 @@ int main ()
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
 	  
-	  int g = ix / (chunk_size * vectorsize * workersize);
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
+	  int g = ix / (chunk_size * workersize * vectorsize);
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
index 647d075..8eada23 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@@ -61,8 +61,24 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
index 9db236c..a99c08d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
@@ -1,46 +1,23 @@
-
-#if ACC_DEVICE_TYPE_nvidia
-
 #pragma acc routine nohost
-static int clock (void)
-{
-  int thetime;
-
-  asm __volatile__ ("mov.u32 %0, %%clock;" : "=r"(thetime));
-
-  return thetime;
-}
-
-#endif
-
 void
-delay (unsigned long *d_o, unsigned long delay)
+delay ()  /* Spin for a fixed number of loop iterations; takes no arguments.  */
 {
-  int start, ticks;
+  int i, sum = 0;  /* Start at 0: the loop reads SUM before ever writing it.  */
+  const int N = 500000;  /* Fixed iteration count of the busy loop.  */
 
-  start = clock ();
-
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
-
-  return;
+  for (i = 0; i < N; i++)  /* SUM is never read after the loop; it exists only to give the loop a body.  */
+    sum = sum + 1;
 }
 
+#pragma acc routine nohost
 void
-delay2 (unsigned long *d_o, unsigned long delay, unsigned long tid)
+delay2 (unsigned long *d_o, unsigned long tid)  /* Spin for a fixed number of iterations, then store TID in d_o[0].  */
 {
-  int start, ticks;
+  int i, sum = 0;  /* Start at 0: the loop reads SUM before ever writing it.  */
+  const int N = 500000;  /* Fixed iteration count of the busy loop.  */
 
-  start = clock ();
-
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
+  for (i = 0; i < N; i++)  /* SUM is never read after the loop; it exists only to give the loop a body.  */
+    sum = sum + 1;
 
   d_o[0] = tid;
-
-  return;
 }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
index 6f748fc..88b63bf 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
@@ -1,148 +1,90 @@
-// BEGIN PREAMBLE
-	.version	3.1
-	.target	sm_30
+	.version 3.1
+	.target sm_30
 	.address_size 64
-// END PREAMBLE
 
-// BEGIN FUNCTION DEF: clock
-.func (.param.u32 %out_retval)clock
-{
-.reg.u32 %retval;
-	.reg.u64 %hr10;
-	.reg.u32 %r22;
-	.reg.u32 %r23;
-	.reg.u32 %r24;
-	.local.align 8 .b8 %frame[8];
-	// #APP 
-// 7 "subr.c" 1
-	mov.u32 %r24, %clock;
-// 0 "" 2
-	// #NO_APP 
-		st.local.u32	[%frame], %r24;
-		ld.local.u32	%r22, [%frame];
-		mov.u32	%r23, %r22;
-		mov.u32	%retval, %r23;
-	st.param.u32	[%out_retval], %retval;
-	ret;
-	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay
-.visible .entry delay(.param.u64 %in_ar1, .param.u64 %in_ar2)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u32 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u64 %r31;
-	.reg.pred %r32;
-	.local.align 8 .b8 %frame[24];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-		mov.u64	%r24, %ar1;
-		st.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
+	.visible .entry delay
 	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r26, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r26;
-		mov.u32	%r27, 0;
-		st.local.u32	[%frame], %r27;
-		bra	$L4;
-$L5:
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r28, [%retval_in];
-}
-		mov.u32	%r23, %r28;
-		ld.local.u32	%r30, [%frame+4];
-		sub.u32	%r29, %r23, %r30;
-		st.local.u32	[%frame], %r29;
-$L4:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r31, [%frame+16];
-		setp.lo.u64 %r32,%r22,%r31;
-	@%r32	bra	$L5;
+	.reg .u64 %hr10;
+	.reg .u32 %r22;
+	.reg .u32 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .pred %r30;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[16];
+	cvta.local.u64 %frame,%farray;
+	mov.u32 %r22,500000;
+	st.u32 [%frame+8],%r22;
+	mov.u32 %r23,0;
+	st.u32 [%frame],%r23;
+	bra $L2;
+	$L3:
+	ld.u32 %r25,[%frame+4];
+	add.u32 %r24,%r25,1;
+	st.u32 [%frame+4],%r24;
+	ld.u32 %r27,[%frame];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame],%r26;
+	$L2:
+	ld.u32 %r28,[%frame];
+	ld.u32 %r29,[%frame+8];
+	setp.lt.s32 %r30,%r28,%r29;
+	@%r30 
+	bra $L3;
 	ret;
 	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay2
-.visible .entry delay2(.param.u64 %in_ar1, .param.u64 %in_ar2, .param.u64 %in_ar3)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %ar3;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u64 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u32 %r31;
-	.reg.u64 %r32;
-	.reg.pred %r33;
-	.reg.u64 %r34;
-	.reg.u64 %r35;
-	.local.align 8 .b8 %frame[32];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-	ld.param.u64 %ar3, [%in_ar3];
-		mov.u64	%r24, %ar1;
-		st.local.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
-		mov.u64	%r26, %ar3;
-		st.local.u64	[%frame+24], %r26;
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r27, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r27;
-		mov.u32	%r28, 0;
-		st.local.u32	[%frame], %r28;
-		bra	$L8;
-$L9:
-	{
-		.param.u32 %retval_in;
+
+	.visible .entry delay2 (.param .u64 %in_ar1, .param .u64 %in_ar2)
 	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r29, [%retval_in];
-}
-		mov.u32	%r23, %r29;
-		ld.local.u32	%r31, [%frame+4];
-		sub.u32	%r30, %r23, %r31;
-		st.local.u32	[%frame], %r30;
-$L8:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r32, [%frame+16];
-		setp.lo.u64 %r33,%r22,%r32;
-	@%r33	bra	$L9;
-		ld.local.u64	%r34, [%frame+8];
-		ld.local.u64	%r35, [%frame+24];
-		st.u64	[%r34], %r35;
+	.reg .u64 %ar1;
+	.reg .u64 %ar2;
+	.reg .u64 %hr10;
+	.reg .u64 %r22;
+	.reg .u64 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .u32 %r30;
+	.reg .u32 %r31;
+	.reg .pred %r32;
+	.reg .u64 %r33;
+	.reg .u64 %r34;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[32];
+	cvta.local.u64 %frame,%farray;
+	ld.param.u64 %ar1,[%in_ar1];
+	ld.param.u64 %ar2,[%in_ar2];
+	mov.u64 %r22,%ar1;
+	st.u64 [%frame+16],%r22;
+	mov.u64 %r23,%ar2;
+	st.u64 [%frame+24],%r23;
+	mov.u32 %r24,500000;
+	st.u32 [%frame+8],%r24;
+	mov.u32 %r25,0;
+	st.u32 [%frame],%r25;
+	bra $L5;
+	$L6:
+	ld.u32 %r27,[%frame+4];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame+4],%r26;
+	ld.u32 %r29,[%frame];
+	add.u32 %r28,%r29,1;
+	st.u32 [%frame],%r28;
+	$L5:
+	ld.u32 %r30,[%frame];
+	ld.u32 %r31,[%frame+8];
+	setp.lt.s32 %r32,%r30,%r31;
+	@%r32 
+	bra $L6;
+	ld.u64 %r33,[%frame+16];
+	ld.u64 %r34,[%frame+24];
+	st.u64 [%r33],%r34;
 	ret;
 	}
-// END FUNCTION DEF
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
index 53749da..e69de29 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
@@ -1,103 +0,0 @@
-
-#include <stdio.h>
-#include <cuda.h>
-
-static int _Tnum_timers;
-static CUevent *_Tstart_events, *_Tstop_events;
-static CUstream _Tstream;
-
-void
-init_timers (int ntimers)
-{
-  int i;
-  CUresult r;
-
-  _Tnum_timers = ntimers;
-
-  _Tstart_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-  _Tstop_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-
-  r = cuStreamCreate (&_Tstream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-      abort ();
-    }
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      r = cuEventCreate (&_Tstart_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-
-      r = cuEventCreate (&_Tstop_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-    }
-}
-
-void
-fini_timers (void)
-{
-  int i;
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      cuEventDestroy (_Tstart_events[i]);
-      cuEventDestroy (_Tstop_events[i]);
-    }
-
-  cuStreamDestroy (_Tstream);
-
-  free (_Tstart_events);
-  free (_Tstop_events);
-}
-
-void
-start_timer (int timer)
-{
-  CUresult r;
-
-  r = cuEventRecord (_Tstart_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-}
-
-float
-stop_timer (int timer)
-{
-  CUresult r;
-  float etime;
-
-  r = cuEventRecord (_Tstop_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventSynchronize (_Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventSynchronize failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventElapsedTime (&etime, _Tstart_events[timer], _Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventElapsedTime failed: %d\n", r);
-      abort ();
-    }
-
-  return etime;
-}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90
new file mode 100644
index 0000000..8f3a8f0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90
@@ -0,0 +1,113 @@
+! { dg-prune-output "using .vector_length \\(32\\)" }
+
+! PR libgomp/93226
+
+module m
+  use iso_c_binding
+  use openacc
+  implicit none (external, type)
+
+  integer, parameter :: N = 1024
+
+  integer :: D(N)  ! Marked device_resident by the directive on the next line.
+  !$acc declare device_resident(D)
+
+contains
+
+  integer(c_intptr_t) function init_d()  ! Fills D(i) with 27*i and returns loc(D).
+    !$acc routine
+    integer :: i
+    do i = 1, N
+      D(i) = 27*i
+    end do
+    init_d = loc(D)
+  end
+end module
+
+program main
+  use m
+  implicit none (external, type)
+
+  integer, allocatable, target :: a(:), b(:), e(:)
+  type(c_ptr) :: d_a, d_b, d_c, d_d, d_e, d_f
+  integer(c_intptr_t) intptr
+  integer :: i
+  logical fail
+
+  fail = .false.
+
+  allocate(a(N), b(N), e(N))
+  d_c = acc_malloc (N*c_sizeof (i))  ! Room for N default integers.
+  d_f = acc_malloc (N*c_sizeof (i))
+
+  e = huge(e)  ! Values that none of the later checks expect.
+  call acc_copyin (e, N*c_sizeof (i));
+  d_e = acc_deviceptr (e);
+
+  !$acc serial copyout(intptr)
+    intptr = init_d ()
+  !$acc end serial
+  d_d = transfer(intptr, d_d)  ! Reinterpret the integer returned by init_d as a c_ptr.
+  call acc_memcpy_device (d_c, d_d, N*c_sizeof (i))  ! Checked just below: cc must then hold 27*i.
+
+  !$acc serial copy(fail) copy(a) deviceptr(d_c, d_d) firstprivate(intptr)
+    block
+      integer, pointer :: cc(:), dd(:)
+      call c_f_pointer (d_c, cc, [N])
+      call c_f_pointer (d_d, dd, [N])
+      a = cc
+      do i = 1, N
+        if (dd(i) /= 27*i .or. cc(i) /= 27*i) then
+          fail = .true.
+          stop 1
+        end if
+      end do
+    end block
+  !$acc end serial
+  if (fail) error stop 1
+
+  do i = 1, N
+    a(i) = 11*i
+    b(i) = 31*i
+  end do
+
+  call acc_copyin (a, N*c_sizeof (i))
+  d_a = acc_deviceptr (a)
+  call acc_copyin_async (b, N*c_sizeof (i), acc_async_noval)
+
+  !$acc parallel deviceptr(d_c) private(i) async
+    block
+      integer, pointer :: cc(:)
+      call c_f_pointer (d_c, cc, [N])
+      !$acc loop
+      do i = 1, N
+        cc(i) = -17*i
+      end do
+    end block
+  !$acc end parallel
+
+  call acc_memcpy_device_async (d_d, d_a, N*c_sizeof (i), acc_async_noval)  ! dd is later expected to hold 11*i.
+  call acc_memcpy_device_async (d_f, d_c, N*c_sizeof (i), acc_async_noval)  ! ff is later expected to hold -17*i.
+  call acc_wait (acc_async_noval)
+  d_b = acc_deviceptr (b)
+  call acc_memcpy_device_async (d_e, d_b, N*c_sizeof (i), acc_async_noval)  ! ee is later expected to hold 31*i.
+  call acc_wait (acc_async_noval)
+
+  !$acc serial deviceptr(d_d, d_e, d_f) private(i) copy(fail)
+    block
+    integer, pointer :: dd(:), ee(:), ff(:)
+    call c_f_pointer (d_d, dd, [N])
+    call c_f_pointer (d_e, ee, [N])
+    call c_f_pointer (d_f, ff, [N])
+    do i = 1, N
+      if (dd(i) /= 11*i        &
+          .or. ee(i) /= 31*i   &
+          .or. ff(i) /= -17*i) then
+        fail = .true.
+        stop 2
+      end if
+    end do
+    end block
+  !$acc end serial
+  if (fail) error stop 2
+end
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90 b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90
new file mode 100644
index 0000000..42b3408
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90
@@ -0,0 +1,33 @@
+! Test non-declared allocatable scalars in OpenACC data clauses.
+
+! { dg-do run }
+
+program main
+  implicit none
+  integer, parameter :: n = 100
+  integer, allocatable :: a, c  ! Allocatable scalars; neither appears in an 'acc declare' directive.
+  integer :: i, b(n)
+
+  allocate (a)
+
+  a = 50
+
+  !$acc parallel loop
+  do i = 1, n;
+     b(i) = a  ! 'a' has no explicit data clause on the construct.
+  end do
+
+  do i = 1, n
+     if (b(i) /= a) stop 1
+  end do
+
+  allocate (c)  ! Left unset here; assigned inside the region below and copied out.
+
+  !$acc parallel copyout(c) num_gangs(1)
+  c = a
+  !$acc end parallel
+
+  if (c /= a) stop 2
+
+  deallocate (a, c)
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
index 19eb4bd..b5586be 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
@@ -55,7 +55,8 @@ program asyncwait
   c(:) = 0.0
   d(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N))
 
   !$acc parallel async (1)
   do i = 1, N
@@ -76,7 +77,8 @@ program asyncwait
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N))
 
   do i = 1, N
      if (a(i) .ne. 3.0) STOP 5
@@ -91,7 +93,8 @@ program asyncwait
   d(:) = 0.0
   e(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N)) copyin (e(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N)) copyin (e(1:N))
 
   !$acc parallel async (1)
   do i = 1, N
@@ -118,7 +121,8 @@ program asyncwait
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N)) copyout (e(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N)) copyout (e(1:N))
   !$acc exit data delete (N)
 
   do i = 1, N
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1-directive.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1-directive.f90
index 759873b..6e53dc5 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1-directive.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1-directive.f90
@@ -2,11 +2,10 @@
 
 ! { dg-do run }
 
-!TODO-OpenACC-declare-allocate
-! Missing support for OpenACC "Changes from Version 2.0 to 2.5":
+! We've got support for OpenACC "Changes from Version 2.0 to 2.5":
 ! "The 'declare create' directive with a Fortran 'allocatable' has new behavior".
-! Thus, after 'allocate'/before 'deallocate', do
-! '!$acc enter data create'/'!$acc exit data delete' manually.
+! Yet, after 'allocate'/before 'deallocate', do
+! '!$acc enter data create'/'!$acc exit data delete' manually, too.
 
 !TODO { dg-additional-options -fno-inline } for stable results regarding OpenACC 'routine'.
 
@@ -213,9 +212,9 @@ program test
   !$acc exit data delete (b)
   deallocate (b)
 end program test ! { dg-line l[incr c] }
-! { dg-bogus {note: variable 'overflow\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {TODO n/a} { xfail *-*-* } l$c }
-! { dg-bogus {note: variable 'not_prev_allocated\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {TODO n/a} { xfail *-*-* } l$c }
-! { dg-bogus {note: variable 'parm\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} {TODO n/a} { xfail *-*-* } l$c }
+! { dg-bogus {note: variable 'overflow\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l$c }
+! { dg-bogus {note: variable 'not_prev_allocated\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l$c }
+! { dg-bogus {note: variable 'parm\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} {} { target *-*-* } l$c }
 
 ! Set each element in array 'b' at index i to i*2.
 
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1-runtime.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1-runtime.f90
index e4cb9c3..0072827 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1-runtime.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1-runtime.f90
@@ -2,11 +2,10 @@
 
 ! { dg-do run }
 
-!TODO-OpenACC-declare-allocate
-! Missing support for OpenACC "Changes from Version 2.0 to 2.5":
+! We've got support for OpenACC "Changes from Version 2.0 to 2.5":
 ! "The 'declare create' directive with a Fortran 'allocatable' has new behavior".
-! Thus, after 'allocate'/before 'deallocate', call 'acc_create'/'acc_delete'
-! manually.
+! Yet, after 'allocate'/before 'deallocate', call 'acc_create'/'acc_delete'
+! manually, too.
 
 !TODO { dg-additional-options -fno-inline } for stable results regarding OpenACC 'routine'.
 
@@ -213,9 +212,9 @@ program test
   call acc_delete (b)
   deallocate (b)
 end program test ! { dg-line l[incr c] }
-! { dg-bogus {note: variable 'overflow\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {TODO n/a} { xfail *-*-* } l$c }
-! { dg-bogus {note: variable 'not_prev_allocated\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {TODO n/a} { xfail *-*-* } l$c }
-! { dg-bogus {note: variable 'parm\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} {TODO n/a} { xfail *-*-* } l$c }
+! { dg-bogus {note: variable 'overflow\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l$c }
+! { dg-bogus {note: variable 'not_prev_allocated\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l$c }
+! { dg-bogus {note: variable 'parm\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} {} { target *-*-* } l$c }
 
 ! Set each element in array 'b' at index i to i*2.
 
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90
index 1c8ccd9..ab6ff75 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90
@@ -1,12 +1,10 @@
 ! Test OpenACC 'declare create' with allocatable arrays.
 
 ! { dg-do run }
+! { dg-additional-options "-Wopenacc-parallelism" }
 
-!TODO-OpenACC-declare-allocate
-! Not currently implementing correct '-DACC_MEM_SHARED=0' behavior:
-! Missing support for OpenACC "Changes from Version 2.0 to 2.5":
+! We've got support for OpenACC "Changes from Version 2.0 to 2.5":
 ! "The 'declare create' directive with a Fortran 'allocatable' has new behavior".
-! { dg-xfail-run-if TODO { *-*-* } { -DACC_MEM_SHARED=0 } }
 
 !TODO { dg-additional-options -fno-inline } for stable results regarding OpenACC 'routine'.
 
@@ -204,9 +202,9 @@ program test
 
   deallocate (b)
 end program test ! { dg-line l[incr c] }
-! { dg-bogus {note: variable 'overflow\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {TODO n/a} { xfail *-*-* } l$c }
-! { dg-bogus {note: variable 'not_prev_allocated\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {TODO n/a} { xfail *-*-* } l$c }
-! { dg-bogus {note: variable 'parm\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} {TODO n/a} { xfail *-*-* } l$c }
+! { dg-bogus {note: variable 'overflow\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l$c }
+! { dg-bogus {note: variable 'not_prev_allocated\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l$c }
+! { dg-bogus {note: variable 'parm\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} {} { target *-*-* } l$c }
 
 ! Set each element in array 'b' at index i to i*2.
 
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-2.f90
new file mode 100644
index 0000000..df5ab26
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-2.f90
@@ -0,0 +1,48 @@
+! Test declare create with allocatable scalars.
+
+! { dg-do run }
+
+program main
+  use openacc
+  implicit none
+  integer, parameter :: n = 100
+  integer, allocatable :: a, c  ! 'a' is used via 'firstprivate'; 'c' via 'declare create'
+  integer :: i, b(n)
+  !$acc declare create (c)
+
+  allocate (a)
+
+  a = 50
+
+  !$acc parallel loop firstprivate(a)
+  do i = 1, n
+     b(i) = a
+  end do
+
+  do i = 1, n
+     if (b(i) /= a) stop 1  ! every element must have received the firstprivate value
+  end do
+
+  allocate (c)  ! expected to make 'c' present on the device (checked by 'stop 2')
+  a = 100
+
+  if (.not.acc_is_present(c)) stop 2
+
+  !$acc parallel num_gangs(1) present(c)
+  c = a
+  !$acc end parallel
+
+  !$acc update host(c)
+  if (c /= a) stop 3  ! the device-side assignment must be visible after 'update host'
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = c
+  end do
+
+  do i = 1, n
+     if (b(i) /= a) stop 4  ! 'c' as read on the device must equal the host 'a'
+  end do
+
+  deallocate (a, c)
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-3.f90
new file mode 100644
index 0000000..c64d4bb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-3.f90
@@ -0,0 +1,219 @@
+! Test declare create with allocatable arrays and scalars.
+
+! { dg-do run }
+! { dg-additional-options "-Wopenacc-parallelism" }
+
+module vars
+  implicit none
+  integer, parameter :: n = 100
+  real*8, allocatable :: a, b(:)
+ !$acc declare create (a, b)
+end module vars
+
+program test
+  use vars
+  use openacc
+  implicit none
+  integer :: i
+
+  interface
+     subroutine sub1
+       !$acc routine gang
+     end subroutine sub1
+
+     subroutine sub2
+     end subroutine sub2
+
+     real*8 function fun1 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun1
+
+     real*8 function fun2 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun2
+  end interface
+
+  if (allocated (a)) stop 1
+  if (allocated (b)) stop 2
+
+  ! Test local usage of an allocated declared array.
+
+  allocate (a)
+
+  if (.not.allocated (a)) stop 3
+  if (acc_is_present (a) .neqv. .true.) stop 4
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 5
+  if (acc_is_present (b) .neqv. .true.) stop 6
+
+  a = 2.0
+  !$acc update device(a)
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = i * a
+  end do
+
+  if (.not.acc_is_present (b)) stop 7
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= i*a) stop 8
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside an acc
+  ! routine subroutine.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 9
+  if (acc_is_present (b) .neqv. .true.) stop 10
+
+  !$acc parallel
+  call sub1
+  !$acc end parallel
+
+  if (.not.acc_is_present (b)) stop 11
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= a+i*2) stop 12
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside a host
+  ! subroutine.
+
+  call sub2
+
+  if (.not.acc_is_present (b)) stop 13
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= 1.0) stop 14
+  end do
+
+  deallocate (b)
+
+  if (allocated (b)) stop 15
+
+  ! Test the usage of an allocated declared array inside an acc
+  ! routine function.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 16
+  if (acc_is_present (b) .neqv. .true.) stop 17
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = fun1 (i)
+  end do
+
+  if (.not.acc_is_present (b)) stop 18
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= i) stop 19
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside a host
+  ! function.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 20
+  if (acc_is_present (b) .neqv. .true.) stop 21
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+
+  !$acc update host(b)
+
+  do i = 1, n
+     b(i) = fun2 (i)
+  end do
+
+  if (.not.acc_is_present (b)) stop 22
+
+  do i = 1, n
+     if (b(i) /= i*a) stop 23
+  end do
+
+  deallocate (a)
+  deallocate (b)
+end program test
+
+! Set each element in array 'b' at index i to a+i*2.
+
+subroutine sub1 ! { dg-warning "region is worker partitioned" }
+  use vars
+  implicit none
+  integer i
+  !$acc routine gang
+
+  !$acc loop
+  do i = 1, n
+     b(i) = a+i*2
+  end do
+end subroutine sub1
+
+! Allocate array 'b', and set it to all 1.0.
+
+subroutine sub2
+  use vars
+  use openacc
+  implicit none
+  integer i
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 24
+  if (acc_is_present (b) .neqv. .true.) stop 25
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+end subroutine sub2
+
+! Return b(i) * i;
+
+real*8 function fun1 (i)
+  use vars
+  implicit none
+  integer i
+  !$acc routine seq
+
+  fun1 = b(i) * i
+end function fun1
+
+! Return b(i) * i * a;
+
+real*8 function fun2 (i)
+  use vars
+  implicit none
+  integer i
+
+  fun2 = b(i) * i * a
+end function fun2
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-4.f90
new file mode 100644
index 0000000..afbe52f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-4.f90
@@ -0,0 +1,66 @@
+! Test declare create with allocatable arrays and scalars.  The unused
+! declared array 'b' caused an ICE in the past.
+
+! { dg-do run }
+
+module vars
+  implicit none
+  integer, parameter :: n = 100
+  real*8, allocatable :: a, b(:)
+ !$acc declare create (a, b)
+end module vars
+
+program test
+  use vars
+  implicit none
+  integer :: i
+
+  interface
+     subroutine sub1
+     end subroutine sub1
+
+     subroutine sub2
+     end subroutine sub2
+
+     real*8 function fun1 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun1
+
+     real*8 function fun2 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun2
+  end interface
+
+  if (allocated (a)) stop 1
+  if (allocated (b)) stop 2
+
+  ! Test updating an allocated declared scalar from within a host
+  ! subroutine; the declared array 'b' is allocated but never used.
+
+  allocate (a)
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 3
+
+  call sub1
+
+  !$acc update self(a)
+  if (a /= 50) stop 4
+
+  deallocate (a)
+  deallocate (b)
+
+end program test
+
+! Set 'a' to 50 on the host, then update the device copy.
+
+subroutine sub1
+  use vars
+  implicit none
+  integer i
+
+  a = 50
+  !$acc update device(a)
+end subroutine sub1
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-directive.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-directive.f90
index 6604f72..0f4d21a 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-directive.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-directive.f90
@@ -7,11 +7,10 @@
 ! host/device array descriptors.
 ! { dg-skip-if n/a { *-*-* } { -DACC_MEM_SHARED=1 } }
 
-!TODO-OpenACC-declare-allocate
-! Missing support for OpenACC "Changes from Version 2.0 to 2.5":
+! We've got support for OpenACC "Changes from Version 2.0 to 2.5":
 ! "The 'declare create' directive with a Fortran 'allocatable' has new behavior".
-! Thus, after 'allocate'/before 'deallocate', do
-! '!$acc enter data create'/'!$acc exit data delete' manually.
+! Yet, after 'allocate'/before 'deallocate', do
+! '!$acc enter data create'/'!$acc exit data delete' manually, too.
 
 
 !TODO { dg-additional-options -fno-inline } for stable results regarding OpenACC 'routine'.
@@ -101,8 +100,6 @@ program test
 
   allocate (b(n1_lb:n1_ub))
   call verify_n1_allocated
-  if (acc_is_present (b)) error stop
-  !$acc enter data create (b)
   ! This is now OpenACC "present":
   if (.not.acc_is_present (b)) error stop
   ! ..., and got the actual array descriptor installed:
@@ -110,15 +107,16 @@ program test
   call verify_n1_allocated
   !$acc end serial
 
+  !$acc enter data create (b)
+  if (.not.acc_is_present (b)) error stop
+  !$acc serial
+  call verify_n1_allocated
+  !$acc end serial
+
   do i = n1_lb, n1_ub
      b(i) = i - 1
   end do
 
-  ! In 'declare-allocatable-array_descriptor-1-runtime.f90', this does "verify
-  ! that host-to-device copy doesn't touch the device-side (still initial)
-  ! array descriptor (but it does copy the array data").  This is here not
-  ! applicable anymore, as we've already gotten the actual array descriptor
-  ! installed.  Thus now verify that it does copy the array data.
   call acc_update_device (b)
   !$acc serial
   call verify_n1_allocated
@@ -143,12 +141,6 @@ program test
   !TODO 'GOMP_MAP_TO_PSET':
   ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_parallel map\(tofrom:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(from:id1_2 \[len: [0-9]+\]\)$} 1 gimple } }
 
-  ! In 'declare-allocatable-array_descriptor-1-runtime.f90', this does "verify
-  ! that device-to-host copy doesn't touch the host-side array descriptor,
-  ! doesn't copy out the device-side (still initial) array descriptor (but it
-  ! does copy the array data)".  This is here not applicable anymore, as we've
-  ! already gotten the actual array descriptor installed.  Thus now verify that
-  ! it does copy the array data.
   call acc_update_self (b)
   call verify_n1_allocated
 
@@ -223,14 +215,13 @@ program test
 
   !$acc exit data delete (b)
   if (.not.allocated (b)) error stop
-  if (acc_is_present (b)) error stop
-  ! The device-side array descriptor doesn't get updated, so 'b' still appears
-  ! as "allocated":
+  if (.not.acc_is_present (b)) error stop
   !$acc serial
   call verify_n1_allocated
   !$acc end serial
 
   deallocate (b)
+  !if (acc_is_present (b)) error stop
   call verify_n1_deallocated (.false.)
   ! The device-side array descriptor doesn't get updated, so 'b' still appears
   ! as "allocated":
@@ -260,10 +251,13 @@ program test
 
   allocate (b(n2_lb:n2_ub))
   call verify_n2_allocated
-  if (acc_is_present (b)) error stop
+  if (.not.acc_is_present (b)) error stop
+  !$acc serial
+  call verify_n2_allocated
+  !$acc end serial
+
   !$acc enter data create (b)
   if (.not.acc_is_present (b)) error stop
-  ! ..., and got the actual array descriptor installed:
   !$acc serial
   call verify_n2_allocated
   !$acc end serial
@@ -337,12 +331,13 @@ program test
 
   !$acc exit data delete (b)
   if (.not.allocated (b)) error stop
-  if (acc_is_present (b)) error stop
+  if (.not.acc_is_present (b)) error stop
   !$acc serial
   call verify_n2_allocated
   !$acc end serial
 
   deallocate (b)
+  !if (acc_is_present (b)) error stop
   call verify_n2_deallocated (.false.)
   !$acc serial
   call verify_n2_allocated
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-runtime.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-runtime.f90
index b27f312..0682256 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-runtime.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1-runtime.f90
@@ -7,11 +7,10 @@
 ! host/device array descriptors.
 ! { dg-skip-if n/a { *-*-* } { -DACC_MEM_SHARED=1 } }
 
-!TODO-OpenACC-declare-allocate
-! Missing support for OpenACC "Changes from Version 2.0 to 2.5":
+! We've got support for OpenACC "Changes from Version 2.0 to 2.5":
 ! "The 'declare create' directive with a Fortran 'allocatable' has new behavior".
-! Thus, after 'allocate'/before 'deallocate', call 'acc_create'/'acc_delete'
-! manually.
+! Yet, after 'allocate'/before 'deallocate', call 'acc_create'/'acc_delete'
+! manually, too.
 
 
 !TODO { dg-additional-options -fno-inline } for stable results regarding OpenACC 'routine'.
@@ -101,31 +100,47 @@ program test
 
   allocate (b(n1_lb:n1_ub))
   call verify_n1_allocated
-  if (acc_is_present (b)) error stop
-  call acc_create (b)
   ! This is now OpenACC "present":
   if (.not.acc_is_present (b)) error stop
-  ! This still has the initial array descriptor:
+  ! ..., and got the actual array descriptor installed:
   !$acc serial
-  call verify_initial
+  call verify_n1_allocated
+  !$acc end serial
+
+  call acc_create (b)
+  if (.not.acc_is_present (b)) error stop
+  !$acc serial
+  call verify_n1_allocated
   !$acc end serial
 
   do i = n1_lb, n1_ub
      b(i) = i - 1
   end do
 
-  ! Verify that host-to-device copy doesn't touch the device-side (still
-  ! initial) array descriptor (but it does copy the array data).
   call acc_update_device (b)
   !$acc serial
-  call verify_initial
+  call verify_n1_allocated
   !$acc end serial
 
   b = 40
 
-  ! Verify that device-to-host copy doesn't touch the host-side array
-  ! descriptor, doesn't copy out the device-side (still initial) array
-  ! descriptor (but it does copy the array data).
+  !$acc parallel copyout (id1_1) ! No data clause for 'b' (explicit or implicit): no 'GOMP_MAP_TO_PSET'.
+  call verify_n1_values (-1)
+  id1_1 = 0
+  !$acc end parallel
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc parallel map\(from:id1_1\)$} 1 original } }
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_parallel map\(from:id1_1 \[len: [0-9]+\]\)$} 1 gimple } }
+
+  !$acc parallel copy (b) copyout (id1_2)
+  ! As already present, 'copy (b)' doesn't copy; addend is still '-1'.
+  call verify_n1_values (-1)
+  id1_2 = 0
+  !$acc end parallel
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc parallel map\(tofrom:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(from:id1_2\)$} 1 original } }
+  !TODO ..., but without an actual use of 'b', the gimplifier removes the
+  !TODO 'GOMP_MAP_TO_PSET':
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_parallel map\(tofrom:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(from:id1_2 \[len: [0-9]+\]\)$} 1 gimple } }
+
   call acc_update_self (b)
   call verify_n1_allocated
 
@@ -142,11 +157,19 @@ program test
   ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_update map\(force_to:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(force_from:id1_1 \[len: [0-9]+\]\)$} 1 gimple } }
   ! ..., but it's silently skipped in 'GOACC_update'.
   !$acc serial
-  call verify_initial
+  call verify_n1_allocated
   !$acc end serial
 
   b = 41
 
+  !$acc parallel
+  call verify_n1_values (1)
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+  call verify_n1_values (1)
+  !$acc end parallel
+
   !$acc update self (b) self (id1_2)
   ! We do have 'GOMP_MAP_TO_PSET' here:
   ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc update map\(force_from:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(force_from:id1_2\);$} 1 original } }
@@ -159,20 +182,9 @@ program test
      b(i) = b(i) + 2
   end do
 
-  ! Now install the actual array descriptor, via a data clause for 'b'
-  ! (explicit or implicit): must get a 'GOMP_MAP_TO_PSET', which then in
-  ! 'gomp_map_vars_internal' is handled as 'declare target', and because of
-  ! '*(void **) hostaddrs[i] != NULL', we've got 'has_always_ptrset == true',
-  ! 'always_to_cnt == 1', and therefore 'gomp_map_vars_existing' does update
-  ! the 'GOMP_MAP_TO_PSET'.
-  !$acc serial present (b) copyin (id1_1)
-  call verify_initial
-  id1_1 = 0
-  !$acc end serial
-  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc serial map\(force_present:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(to:id1_1\)$} 1 original } }
-  !TODO ..., but without an actual use of 'b', the gimplifier removes the
-  !TODO 'GOMP_MAP_TO_PSET':
-  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_serial map\(force_present:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(to:id1_1 \[len: [0-9]+\]\)$} 1 gimple } }
+  ! Now test that (potentially re-)installing the actual array descriptor is a
+  ! no-op, via a data clause for 'b' (explicit or implicit): must get a
+  ! 'GOMP_MAP_TO_PSET'.
   !$acc serial present (b) copyin (id1_2)
   call verify_n1_allocated
   !TODO Use of 'b':
@@ -203,14 +215,13 @@ program test
 
   call acc_delete (b)
   if (.not.allocated (b)) error stop
-  if (acc_is_present (b)) error stop
-  ! The device-side array descriptor doesn't get updated, so 'b' still appears
-  ! as "allocated":
+  if (.not.acc_is_present (b)) error stop
   !$acc serial
   call verify_n1_allocated
   !$acc end serial
 
   deallocate (b)
+  !if (acc_is_present (b)) error stop
   call verify_n1_deallocated (.false.)
   ! The device-side array descriptor doesn't get updated, so 'b' still appears
   ! as "allocated":
@@ -240,12 +251,15 @@ program test
 
   allocate (b(n2_lb:n2_ub))
   call verify_n2_allocated
-  if (acc_is_present (b)) error stop
+  if (.not.acc_is_present (b)) error stop
+  !$acc serial
+  call verify_n2_allocated
+  !$acc end serial
+
   call acc_create (b)
   if (.not.acc_is_present (b)) error stop
-  ! This still has the previous (n1) array descriptor:
   !$acc serial
-  call verify_n1_deallocated (.true.)
+  call verify_n2_allocated
   !$acc end serial
 
   do i = n2_lb, n2_ub
@@ -254,11 +268,19 @@ program test
 
   call acc_update_device (b)
   !$acc serial
-  call verify_n1_deallocated (.true.)
+  call verify_n2_allocated
   !$acc end serial
 
   b = -40
 
+  !$acc parallel
+  call verify_n2_values (20)
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+  call verify_n2_values (20)
+  !$acc end parallel
+
   call acc_update_self (b)
   call verify_n2_allocated
 
@@ -269,11 +291,19 @@ program test
 
   !$acc update device (b)
   !$acc serial
-  call verify_n1_deallocated (.true.)
+  call verify_n2_allocated
   !$acc end serial
 
   b = -41
 
+  !$acc parallel
+  call verify_n2_values (-20)
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+  call verify_n2_values (-20)
+  !$acc end parallel
+
   !$acc update self (b)
   call verify_n2_allocated
 
@@ -301,12 +331,13 @@ program test
 
   call acc_delete (b)
   if (.not.allocated (b)) error stop
-  if (acc_is_present (b)) error stop
+  if (.not.acc_is_present (b)) error stop
   !$acc serial
   call verify_n2_allocated
   !$acc end serial
 
   deallocate (b)
+  !if (acc_is_present (b)) error stop
   call verify_n2_deallocated (.false.)
   !$acc serial
   call verify_n2_allocated
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1.f90
new file mode 100644
index 0000000..1105a57
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-array_descriptor-1.f90
@@ -0,0 +1,405 @@
+! Test OpenACC 'declare create' with allocatable arrays.
+
+! { dg-do run }
+
+! Note that we're not testing OpenACC semantics here, but rather documenting
+! current GCC behavior, specifically, behavior concerning updating of
+! host/device array descriptors.
+! { dg-skip-if n/a { *-*-* } { -DACC_MEM_SHARED=1 } }
+
+! We've got support for OpenACC "Changes from Version 2.0 to 2.5":
+! "The 'declare create' directive with a Fortran 'allocatable' has new behavior".
+
+
+!TODO { dg-additional-options -fno-inline } for stable results regarding OpenACC 'routine'.
+
+
+!TODO OpenACC 'serial' vs. GCC/nvptx:
+!TODO { dg-prune-output {using 'vector_length \(32\)', ignoring 1} }
+
+
+! { dg-additional-options -fdump-tree-original }
+! { dg-additional-options -fdump-tree-gimple }
+
+
+module vars
+  implicit none
+  integer, parameter :: n1_lb = -3
+  integer, parameter :: n1_ub = 6
+  integer, parameter :: n2_lb = -9999
+  integer, parameter :: n2_ub = 22222
+
+  integer, allocatable :: b(:)
+  !$acc declare create (b)
+
+end module vars
+
+program test
+  use vars
+  use openacc
+  implicit none
+  integer :: i
+
+  ! Identifiers for purposes of reliable '-fdump-tree-[...]' scanning.
+  integer :: id1_1, id1_2
+
+  interface
+
+     subroutine verify_initial
+       implicit none
+       !$acc routine seq
+     end subroutine verify_initial
+
+     subroutine verify_n1_allocated
+       implicit none
+       !$acc routine seq
+     end subroutine verify_n1_allocated
+
+     subroutine verify_n1_values (addend)
+       implicit none
+       !$acc routine gang
+       integer, value :: addend
+     end subroutine verify_n1_values
+
+     subroutine verify_n1_deallocated (expect_allocated)
+       implicit none
+       !$acc routine seq
+       logical, value :: expect_allocated
+     end subroutine verify_n1_deallocated
+
+     subroutine verify_n2_allocated
+       implicit none
+       !$acc routine seq
+     end subroutine verify_n2_allocated
+
+     subroutine verify_n2_values (addend)
+       implicit none
+       !$acc routine gang
+       integer, value :: addend
+     end subroutine verify_n2_values
+
+     subroutine verify_n2_deallocated (expect_allocated)
+       implicit none
+       !$acc routine seq
+       logical, value :: expect_allocated
+     end subroutine verify_n2_deallocated
+
+  end interface
+
+  call acc_create (id1_1)
+  call acc_create (id1_2)
+
+  call verify_initial
+  ! It is important here (and similarly, following) that there is no data
+  ! clause for 'b' (explicit or implicit): no 'GOMP_MAP_TO_PSET'.
+  !$acc serial
+  call verify_initial
+  !$acc end serial
+
+  allocate (b(n1_lb:n1_ub))
+  call verify_n1_allocated
+  ! This is now OpenACC "present":
+  if (.not.acc_is_present (b)) error stop
+  ! ..., and got the actual array descriptor installed:
+  !$acc serial
+  call verify_n1_allocated
+  !$acc end serial
+
+  do i = n1_lb, n1_ub
+     b(i) = i - 1
+  end do
+
+  call acc_update_device (b)
+  !$acc serial
+  call verify_n1_allocated
+  !$acc end serial
+
+  b = 40
+
+  !$acc parallel copyout (id1_1) ! No data clause for 'b' (explicit or implicit): no 'GOMP_MAP_TO_PSET'.
+  call verify_n1_values (-1)
+  id1_1 = 0
+  !$acc end parallel
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc parallel map\(from:id1_1\)$} 1 original } }
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_parallel map\(from:id1_1 \[len: [0-9]+\]\)$} 1 gimple } }
+
+  !$acc parallel copy (b) copyout (id1_2)
+  ! As already present, 'copy (b)' doesn't copy; addend is still '-1'.
+  call verify_n1_values (-1)
+  id1_2 = 0
+  !$acc end parallel
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc parallel map\(tofrom:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(from:id1_2\)$} 1 original } }
+  !TODO ..., but without an actual use of 'b', the gimplifier removes the
+  !TODO 'GOMP_MAP_TO_PSET':
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_parallel map\(tofrom:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(from:id1_2 \[len: [0-9]+\]\)$} 1 gimple } }
+
+  call acc_update_self (b)
+  call verify_n1_allocated
+
+  do i = n1_lb, n1_ub
+     if (b(i) /= i - 1) error stop
+     b(i) = b(i) + 2
+  end do
+
+  ! The same using the OpenACC 'update' directive.
+
+  !$acc update device (b) self (id1_1)
+  ! We do have 'GOMP_MAP_TO_PSET' here:
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc update map\(force_to:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(force_from:id1_1\);$} 1 original } }
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_update map\(force_to:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(force_from:id1_1 \[len: [0-9]+\]\)$} 1 gimple } }
+  ! ..., but it's silently skipped in 'GOACC_update'.
+  !$acc serial
+  call verify_n1_allocated
+  !$acc end serial
+
+  b = 41
+
+  !$acc parallel
+  call verify_n1_values (1)
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+  call verify_n1_values (1)
+  !$acc end parallel
+
+  !$acc update self (b) self (id1_2)
+  ! We do have 'GOMP_MAP_TO_PSET' here:
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc update map\(force_from:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(force_from:id1_2\);$} 1 original } }
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_update map\(force_from:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(force_from:id1_2 \[len: [0-9]+\]\)$} 1 gimple } }
+  ! ..., but it's silently skipped in 'GOACC_update'.
+  call verify_n1_allocated
+
+  do i = n1_lb, n1_ub
+     if (b(i) /= i + 1) error stop
+     b(i) = b(i) + 2
+  end do
+
+  ! Now test that (potentially re-)installing the actual array descriptor is a
+  ! no-op, via a data clause for 'b' (explicit or implicit): must get a
+  ! 'GOMP_MAP_TO_PSET'.
+  !$acc serial present (b) copyin (id1_2)
+  call verify_n1_allocated
+  !TODO Use of 'b':
+  id1_2 = ubound (b, 1)
+  !$acc end serial
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc serial map\(force_present:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(to:id1_2\)$} 1 original } }
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_serial map\(force_present:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(to:id1_2 \[len: [0-9]+\]\)$} 1 gimple } }
+
+  !$acc parallel copyin (id1_1) ! No data clause for 'b' (explicit or implicit): no 'GOMP_MAP_TO_PSET'.
+  call verify_n1_values (1)
+  id1_1 = 0
+  !$acc end parallel
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc parallel map\(to:id1_1\)$} 1 original } }
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_parallel map\(to:id1_1 \[len: [0-9]+\]\)$} 1 gimple } }
+
+  !$acc parallel copy (b) copyin (id1_2)
+  ! As already present, 'copy (b)' doesn't copy; addend is still '1'.
+  call verify_n1_values (1)
+  id1_2 = 0
+  !$acc end parallel
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc parallel map\(tofrom:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(to:id1_2\)$} 1 original } }
+  !TODO ..., but without an actual use of 'b', the gimplifier removes the
+  !TODO 'GOMP_MAP_TO_PSET':
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_parallel map\(tofrom:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(to:id1_2 \[len: [0-9]+\]\)$} 1 gimple } }
+
+  call verify_n1_allocated
+  if (.not.acc_is_present (b)) error stop
+
+  deallocate (b)
+  !if (acc_is_present (b)) error stop
+  call verify_n1_deallocated (.false.)
+  ! The device-side array descriptor doesn't get updated, so 'b' still appears
+  ! as "allocated":
+  !$acc serial
+  call verify_n1_allocated
+  !$acc end serial
+
+  ! Now try to install the actual array descriptor, via a data clause for 'b'
+  ! (explicit or implicit): must get a 'GOMP_MAP_TO_PSET', which then in
+  ! 'gomp_map_vars_internal' is handled as 'declare target', but because of
+  ! '*(void **) hostaddrs[i] == NULL', we've got 'has_always_ptrset == false',
+  ! 'always_to_cnt == 0', and therefore 'gomp_map_vars_existing' doesn't update
+  ! the 'GOMP_MAP_TO_PSET'.
+  ! The device-side array descriptor doesn't get updated, so 'b' still appears
+  ! as "allocated":
+  !TODO Why does 'present (b)' still work here?
+  !$acc serial present (b) copyout (id1_2)
+  call verify_n1_deallocated (.true.)
+  !TODO Use of 'b'.
+  id1_2 = ubound (b, 1)
+  !$acc end serial
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma acc serial map\(force_present:\*\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:\(integer\(kind=[0-9]+\)\[0:\] \* restrict\) b\.data \[pointer assign, bias: 0\]\) map\(from:id1_2\)$} 1 original } }
+  ! { dg-final { scan-tree-dump-times {(?n)^ *#pragma omp target oacc_serial map\(force_present:MEM <integer\(kind=[0-9]+\)\[0:\]> \[\(integer\(kind=[0-9]+\)\[0:\] \*\)[^\]]+\] \[len: [^\]]+\]\) map\(to:b \[pointer set, len: [0-9]+\]\) map\(alloc:b\.data \[pointer assign, bias: 0\]\) map\(from:id1_2 \[len: [0-9]+\]\)$} 1 gimple } }
+
+
+  ! Restart the procedure, with different array dimensions.
+
+  allocate (b(n2_lb:n2_ub))
+  call verify_n2_allocated
+  if (.not.acc_is_present (b)) error stop
+  !$acc serial
+  call verify_n2_allocated
+  !$acc end serial
+
+  do i = n2_lb, n2_ub
+     b(i) = i + 20
+  end do
+
+  call acc_update_device (b)
+  !$acc serial
+  call verify_n2_allocated
+  !$acc end serial
+
+  b = -40
+
+  !$acc parallel
+  call verify_n2_values (20)
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+  call verify_n2_values (20)
+  !$acc end parallel
+
+  call acc_update_self (b)
+  call verify_n2_allocated
+
+  do i = n2_lb, n2_ub
+     if (b(i) /= i + 20) error stop
+     b(i) = b(i) - 40
+  end do
+
+  !$acc update device (b)
+  !$acc serial
+  call verify_n2_allocated
+  !$acc end serial
+
+  b = -41
+
+  !$acc parallel
+  call verify_n2_values (-20)
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+  call verify_n2_values (-20)
+  !$acc end parallel
+
+  !$acc update self (b)
+  call verify_n2_allocated
+
+  do i = n2_lb, n2_ub
+     if (b(i) /= i - 20) error stop
+     b(i) = b(i) + 10
+  end do
+
+  !$acc serial present (b) copy (id1_2)
+  call verify_n2_allocated
+  !TODO Use of 'b':
+  id1_2 = ubound (b, 1)
+  !$acc end serial
+
+  !$acc parallel
+  call verify_n2_values (-20)
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+  call verify_n2_values (-20)
+  !$acc end parallel
+
+  call verify_n2_allocated
+  if (.not.acc_is_present (b)) error stop
+
+  deallocate (b)
+  !if (acc_is_present (b)) error stop
+  call verify_n2_deallocated (.false.)
+  !$acc serial
+  call verify_n2_allocated
+  !$acc end serial
+
+  !$acc serial present (b) copy (id1_2)
+  call verify_n2_deallocated (.true.)
+  !TODO Use of 'b':
+  id1_2 = ubound (b, 1)
+  !$acc end serial
+
+end program test
+
+
+subroutine verify_initial
+  use vars
+  implicit none
+  !$acc routine seq
+
+  if (allocated (b)) error stop "verify_initial allocated"
+  if (any (lbound (b) /= [0])) error stop "verify_initial lbound"
+  if (any (ubound (b) /= [0])) error stop "verify_initial ubound"
+end subroutine verify_initial
+
+subroutine verify_n1_allocated
+  use vars
+  implicit none
+  !$acc routine seq
+
+  if (.not.allocated (b)) error stop "verify_n1_allocated allocated"
+  if (any (lbound (b) /= [n1_lb])) error stop "verify_n1_allocated lbound"
+  if (any (ubound (b) /= [n1_ub])) error stop "verify_n1_allocated ubound"
+end subroutine verify_n1_allocated
+
+subroutine verify_n1_values (addend)
+  use vars
+  implicit none
+  !$acc routine gang
+  integer, value :: addend
+  integer :: i
+
+  !$acc loop
+  do i = n1_lb, n1_ub
+     if (b(i) /= i + addend) error stop
+  end do
+end subroutine verify_n1_values
+
+subroutine verify_n1_deallocated (expect_allocated)
+  use vars
+  implicit none
+  !$acc routine seq
+  logical, value :: expect_allocated
+
+  if (allocated(b) .neqv. expect_allocated) error stop "verify_n1_deallocated allocated"
+  ! Apparently 'deallocate'ing doesn't unset the bounds.
+  if (any (lbound (b) /= [n1_lb])) error stop "verify_n1_deallocated lbound"
+  if (any (ubound (b) /= [n1_ub])) error stop "verify_n1_deallocated ubound"
+end subroutine verify_n1_deallocated
+
+subroutine verify_n2_allocated
+  use vars
+  implicit none
+  !$acc routine seq
+
+  if (.not.allocated(b)) error stop "verify_n2_allocated allocated"
+  if (any (lbound (b) /= [n2_lb])) error stop "verify_n2_allocated lbound"
+  if (any (ubound (b) /= [n2_ub])) error stop "verify_n2_allocated ubound"
+end subroutine verify_n2_allocated
+
+subroutine verify_n2_values (addend)
+  use vars
+  implicit none
+  !$acc routine gang
+  integer, value :: addend
+  integer :: i
+
+  !$acc loop
+  do i = n2_lb, n2_ub
+     if (b(i) /= i + addend) error stop
+  end do
+end subroutine verify_n2_values
+
+subroutine verify_n2_deallocated (expect_allocated)
+  use vars
+  implicit none
+  !$acc routine seq
+  logical, value :: expect_allocated
+
+  if (allocated(b) .neqv. expect_allocated) error stop "verify_n2_deallocated allocated"
+  ! Apparently 'deallocate'ing doesn't unset the bounds.
+  if (any (lbound (b) /= [n2_lb])) error stop "verify_n2_deallocated lbound"
+  if (any (ubound (b) /= [n2_ub])) error stop "verify_n2_deallocated ubound"
+end subroutine verify_n2_deallocated
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-create-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-create-1.f90
new file mode 100644
index 0000000..057b5eb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-create-1.f90
@@ -0,0 +1,22 @@
+! { dg-do run }
+
+module m
+integer :: mint
+!$acc declare create (mint)
+end module m
+
+program p
+use m
+
+mint = 0
+
+!$acc serial
+! { dg-warning {using .vector_length \(32\)., ignoring 1} "" { target openacc_nvidia_accel_selected } .-1 }
+mint = 5
+!$acc end serial
+
+!$acc update host(mint)
+
+if (mint.ne.5) stop 1
+
+end program p
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-create-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-create-2.f90
new file mode 100644
index 0000000..dd7c979
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-create-2.f90
@@ -0,0 +1,26 @@
+! { dg-do run }
+
+module m
+integer, allocatable :: mint
+!$acc declare create (mint)
+end module m
+
+program p
+use m
+
+allocate(mint)
+
+mint = 0
+
+!$acc serial
+! { dg-warning {using .vector_length \(32\)., ignoring 1} "" { target openacc_nvidia_accel_selected } .-1 }
+mint = 5
+!$acc end serial
+
+!$acc update host(mint)
+
+if (mint.ne.5) stop 1
+
+deallocate(mint)
+
+end program p
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-create-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-create-3.f90
new file mode 100644
index 0000000..7cceaa5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-create-3.f90
@@ -0,0 +1,26 @@
+! { dg-do run }
+
+module m
+integer, allocatable :: mint(:)
+!$acc declare create (mint)
+end module m
+
+program p
+use m
+
+allocate(mint(1:20))
+
+mint = 0
+
+!$acc serial
+! { dg-warning {using .vector_length \(32\)., ignoring 1} "" { target openacc_nvidia_accel_selected } .-1 }
+mint = 5
+!$acc end serial
+
+!$acc update host(mint)
+
+if (any(mint.ne.5)) stop 1
+
+deallocate(mint)
+
+end program p
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90
new file mode 100644
index 0000000..276a172
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90
@@ -0,0 +1,197 @@
+! { dg-do run }
+
+! Test the deviceptr clause with various directives
+! and in combination with other directives where
+! the deviceptr variable is implied.
+
+subroutine subr1 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc data deviceptr (a)
+
+  !$acc parallel copy (b)
+    do i = 1, N
+      a(i) = i * 2
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  !$acc end data
+
+end subroutine
+
+subroutine subr2 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  !$acc declare deviceptr (a)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc parallel copy (b)
+    do i = 1, N
+      a(i) = i * 4
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+end subroutine
+
+subroutine subr3 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  !$acc declare deviceptr (a)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc kernels copy (b)
+    do i = 1, N
+      a(i) = i * 8
+      b(i) = a(i)
+    end do
+  !$acc end kernels
+
+end subroutine
+
+subroutine subr4 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc parallel deviceptr (a) copy (b)
+    do i = 1, N
+      a(i) = i * 16
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+end subroutine
+
+subroutine subr5 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc kernels deviceptr (a) copy (b)
+    do i = 1, N
+      a(i) = i * 32
+      b(i) = a(i)
+    end do
+  !$acc end kernels
+
+end subroutine
+
+subroutine subr6 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc parallel deviceptr (a) copy (b)
+    do i = 1, N
+      b(i) = i
+    end do
+  !$acc end parallel
+
+end subroutine
+
+subroutine subr7 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc data deviceptr (a)
+
+  !$acc parallel copy (b)
+    do i = 1, N
+      a(i) = i * 2
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+    do i = 1, N
+      a(i) = b(i) * 2
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  !$acc end data
+
+end subroutine
+
+program main  ! Driver: runs subr1..subr7 on acc_malloc'ed memory, checks 'b'.
+  use iso_c_binding, only: c_ptr, c_f_pointer
+  implicit none
+  type (c_ptr) :: cp
+  integer, parameter :: N = 8
+  integer, pointer :: fp(:)
+  integer :: i = 0
+  integer :: b(N)
+
+  interface
+    function acc_malloc (s) bind (C)
+      use iso_c_binding, only: c_ptr, c_size_t
+      integer (c_size_t), value :: s
+      type (c_ptr) :: acc_malloc
+    end function
+  end interface
+
+  cp = acc_malloc (N * sizeof (fp(N)))  ! room for N integers
+  call c_f_pointer (cp, fp, [N])  ! view the allocation as fp(1:N)
+  ! Each check below uses its own STOP code so a failure names its subroutine.
+  call subr1 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 2) stop 1
+  end do
+
+  call subr2 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 4) stop 2
+  end do
+
+  call subr3 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 8) stop 3
+  end do
+
+  call subr4 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 16) stop 4
+  end do
+
+  call subr5 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 32) stop 5
+  end do
+
+  call subr6 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i) stop 6
+  end do
+
+  call subr7 (fp, b)  ! doubles twice, hence the expected i * 4
+
+  do i = 1, N
+    if (b(i) .ne. i * 4) stop 7
+  end do
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/firstprivate-int.f90 b/libgomp/testsuite/libgomp.oacc-fortran/firstprivate-int.f90
new file mode 100644
index 0000000..abc175f3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/firstprivate-int.f90
@@ -0,0 +1,209 @@
+! Verify the GOMP_MAP_FIRSTPRIVATE_INT optimization on various types.
+
+! { dg-do run }
+
+program test
+  use iso_fortran_env, only: integer_kinds
+  implicit none
+
+  integer (kind=1)  :: i1i, i1o
+  integer (kind=2)  :: i2i, i2o
+  integer (kind=4)  :: i4i, i4o
+  integer (kind=8)  :: i8i, i8o
+! Use the largest available integer kind, which might be smaller than '16';
+! this assumes that integer_kinds == logical_kinds.
+  integer (kind=maxval(integer_kinds)) :: i16i, i16o
+
+  logical (kind=1)  :: l1i, l1o
+  logical (kind=2)  :: l2i, l2o
+  logical (kind=4)  :: l4i, l4o
+  logical (kind=8)  :: l8i, l8o
+  logical (kind=maxval(integer_kinds)) :: l16i, l16o
+
+  real (kind=4)  :: r4i, r4o
+  real (kind=8)  :: r8i, r8o
+
+  complex (kind=4)  :: c4i, c4o
+  complex (kind=8)  :: c8i, c8o
+
+  character (kind=1) :: ch1i, ch1o
+  character (kind=4) :: ch4i, ch4o
+
+  i1i = 1
+  i2i = 2
+  i4i = 3
+  i8i = 4
+  i16i = 5
+
+  l1i = .true.
+  l2i = .false.
+  l4i = .true.
+  l8i = .true.
+  l16i = .false.
+
+  r4i = .5
+  r8i = .25
+
+  c4i = (2, -2)
+  c8i = (4, -4)
+
+  ch1i = "a"
+  ch4i = "b"
+
+  !$acc parallel firstprivate(i1i, i2i, i4i, i8i, i16i) &
+  !$acc copyout(i1o, i2o, i4o, i8o, i16o) &
+  !$acc firstprivate(l1i, l2i, l4i, l8i, l16i) &
+  !$acc copyout(l1o, l2o, l4o, l8o, l16o) &
+  !$acc firstprivate(r4i, r8i) copyout(r4o, r8o) &
+  !$acc firstprivate(c4i, c8i) copyout(c4o, c8o) &
+  !$acc firstprivate(ch1i, ch4i) &
+  !$acc copyout(ch1o, ch4o)
+  i1o = i1i
+  i2o = i2i
+  i4o = i4i
+  i8o = i8i
+  i16o = i16i
+
+  l1o = l1i
+  l2o = l2i
+  l4o = l4i
+  l8o = l8i
+  l16o = l16i
+
+  r4o = r4i
+  r8o = r8i
+
+  c4o = c4i
+  c8o = c8i
+
+  ch1o = ch1i
+  ch4o = ch4i
+  !$acc end parallel
+
+  if (i1i /= i1o) stop 1
+  if (i2i /= i2o) stop 2
+  if (i4i /= i4o) stop 3
+  if (i8i /= i8o) stop 4
+  if (i16i /= i16o) stop 5
+
+  if (l1i .neqv. l1o) stop 6
+  if (l2i .neqv. l2o) stop 7
+  if (l4i .neqv. l4o) stop 8
+  if (l8i .neqv. l8o) stop 9
+  if (l16i .neqv. l16o) stop 10
+
+  if (r4i /= r4o) stop 11
+  if (r8i /= r8o) stop 12
+
+  if (c4i /= c4o) stop 13
+  if (c8i /= c8o) stop 14
+
+  if (ch1i /= ch1o) stop 15
+  if (ch4i /= ch4o) stop 16
+
+  call subtest(i1i, i2i, i4i, i8i, i16i, i1o, i2o, i4o, i8o, i16o, &
+               l1i, l2i, l4i, l8i, l16i, l1o, l2o, l4o, l8o, l16o, &
+               r4i, r8i, r4o, r8o, c4i, c8i, c4o, c8o, &
+               ch1i, ch4i, ch1o, ch4o)
+end program test
+
+subroutine subtest(i1i, i2i, i4i, i8i, i16i, i1o, i2o, i4o, i8o, i16o, &
+                   l1i, l2i, l4i, l8i, l16i, l1o, l2o, l4o, l8o, l16o, &
+                   r4i, r8i, r4o, r8o, c4i, c8i, c4o, c8o, &
+                   ch1i, ch4i, ch1o, ch4o)
+  use iso_fortran_env, only: integer_kinds
+  implicit none
+
+  integer (kind=1)  :: i1i, i1o
+  integer (kind=2)  :: i2i, i2o
+  integer (kind=4)  :: i4i, i4o
+  integer (kind=8)  :: i8i, i8o
+  integer (kind=maxval(integer_kinds)) :: i16i, i16o
+
+  logical (kind=1)  :: l1i, l1o
+  logical (kind=2)  :: l2i, l2o
+  logical (kind=4)  :: l4i, l4o
+  logical (kind=8)  :: l8i, l8o
+  logical (kind=maxval(integer_kinds)) :: l16i, l16o
+
+  real (kind=4)  :: r4i, r4o
+  real (kind=8)  :: r8i, r8o
+
+  complex (kind=4)  :: c4i, c4o
+  complex (kind=8)  :: c8i, c8o
+
+  character (kind=1) :: ch1i, ch1o
+  character (kind=4) :: ch4i, ch4o
+
+  i1i = -i1i
+  i2i = -i2i
+  i4i = -i4i
+  i8i = -i8i
+  i16i = -i16i
+
+  l1i = .not. l1i
+  l2i = .not. l2i
+  l4i = .not. l4i
+  l8i = .not. l8i
+  l16i = .not. l16i
+
+  r4i = -r4i
+  r8i = -r8i
+
+  c4i = -c4i
+  c8i = -c8i
+
+  ch1i = "z"
+  ch4i = "y"
+
+  !$acc parallel firstprivate(i1i, i2i, i4i, i8i, i16i) &
+  !$acc copyout(i1o, i2o, i4o, i8o, i16o) &
+  !$acc firstprivate(l1i, l2i, l4i, l8i, l16i) &
+  !$acc copyout(l1o, l2o, l4o, l8o, l16o) &
+  !$acc firstprivate(r4i, r8i) copyout(r4o, r8o) &
+  !$acc firstprivate(c4i, c8i) copyout(c4o, c8o) &
+  !$acc firstprivate(ch1i, ch4i) &
+  !$acc copyout(ch1o, ch4o)
+  i1o = i1i
+  i2o = i2i
+  i4o = i4i
+  i8o = i8i
+  i16o = i16i
+
+  l1o = l1i
+  l2o = l2i
+  l4o = l4i
+  l8o = l8i
+  l16o = l16i
+
+  r4o = r4i
+  r8o = r8i
+
+  c4o = c4i
+  c8o = c8i
+
+  ch1o = ch1i
+  ch4o = ch4i
+  !$acc end parallel
+
+  if (i1i /= i1o) stop 17
+  if (i2i /= i2o) stop 18
+  if (i4i /= i4o) stop 19
+  if (i8i /= i8o) stop 20
+  if (i16i /= i16o) stop 21
+
+  if (l1i .neqv. l1o) stop 22
+  if (l2i .neqv. l2o) stop 23
+  if (l4i .neqv. l4o) stop 24
+  if (l8i .neqv. l8o) stop 25
+  if (l16i .neqv. l16o) stop 26
+
+  if (r4i /= r4o) stop 27
+  if (r8i /= r8o) stop 28
+
+  if (c4i /= c4o) stop 29
+  if (c8i /= c8o) stop 30
+
+  if (ch1i /= ch1o) stop 31
+  if (ch4i /= ch4o) stop 32
+end subroutine subtest
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/nonlexical-assumed-size-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/nonlexical-assumed-size-1.f90
new file mode 100644
index 0000000..8b173c7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/nonlexical-assumed-size-1.f90
@@ -0,0 +1,29 @@
+! { dg-do run }
+
+program p
+implicit none
+integer :: myarr(10)
+
+myarr = 0
+
+call subr(myarr)
+
+if (myarr(5).ne.5) stop 1
+
+contains
+
+subroutine subr(arr)
+implicit none
+integer :: arr(*)
+
+!$acc enter data copyin(arr(1:10))
+
+!$acc serial
+! { dg-warning {using .vector_length \(32\)., ignoring 1} "" { target openacc_nvidia_accel_selected } .-1 }
+arr(5) = 5
+!$acc end serial
+
+!$acc exit data copyout(arr(1:10))
+
+end subroutine subr
+end program p
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/nonlexical-assumed-size-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/nonlexical-assumed-size-2.f90
new file mode 100644
index 0000000..659fe8e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/nonlexical-assumed-size-2.f90
@@ -0,0 +1,41 @@
+! { dg-do run }
+
+program p
+implicit none
+integer :: myarr(10)
+
+myarr = 0
+
+call subr(myarr)
+
+if (myarr(5).ne.5) stop 1
+
+contains
+
+subroutine subr(arr)
+implicit none
+integer :: arr(*)
+
+! At first glance, it might not be obvious how this works.  The "enter data"
+! and "exit data" operations expand to a pair of mapping nodes for OpenACC,
+! GOMP_MAP_{TO/FROM} and GOMP_MAP_POINTER.  The former maps the array data,
+! and the latter creates a separate mapping on the target for the pointer
+! itself with a bias so it represents the "zeroth" element.
+
+!$acc enter data copyin(arr(2:8))
+
+! ...then this implicit mapping creates a zero-length array section
+! (GOMP_MAP_ZERO_LEN_ARRAY_SECTION) followed by another GOMP_MAP_POINTER for
+! 'arr'.  But now that pointer is already "present" on the target, so is not
+! overwritten.
+
+!$acc serial
+! { dg-warning {using .vector_length \(32\)., ignoring 1} "" { target openacc_nvidia_accel_selected } .-1 }
+! This access is then done via the on-target pointer.
+arr(5) = 5
+!$acc end serial
+
+!$acc exit data copyout(arr(2:8))
+
+end subroutine subr
+end program p
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-1.f b/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-1.f
index 36e9844..8d4e3f3 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-1.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-1.f
@@ -4,6 +4,6 @@
       implicit none
       include "openacc_lib.h"
 
-      if (openacc_version .ne. 201711) STOP 1
+      if (openacc_version .ne. 201811) STOP 1
 
       end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-2.f90
index e815bc1..c9946c2 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-2.f90
@@ -4,6 +4,6 @@ program main
   use openacc
   implicit none
 
-  if (openacc_version .ne. 201711) STOP 1
+  if (openacc_version .ne. 201811) STOP 1
 
 end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-private.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-private.f90
index df69362..30a55bc 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/optional-private.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-private.f90
@@ -44,7 +44,7 @@ contains
     ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 }
     !$acc loop gang private(x)
     ! { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-1 }
-    ! { dg-note {variable 'x' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 }
+    ! { dg-note {variable 'x' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } .-2 }
     do i = 1, 32
        x = i * 2;
        arr(i) = arr(i) + x
@@ -72,7 +72,7 @@ contains
     ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 }
     !$acc loop gang private(pt)
     ! { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-1 }
-    ! { dg-note {variable 'pt' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-2 }
+    ! { dg-note {variable 'pt' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } .-2 }
     do i = 0, 31
        pt%x = i
        pt%y = i * 2
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90
index 0bb05b9..91564b2 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90
@@ -34,8 +34,7 @@ contains
 
     !$acc parallel num_gangs(ng) copy(rg)
     !$acc loop reduction(+:rg) gang
-    ! { dg-bogus {'rg\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-    !   { dg-note {'rg\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+    ! { dg-bogus {'rg\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
     do i = 1, n
        rg = rg + array(i)
     end do
@@ -43,8 +42,7 @@ contains
 
     !$acc parallel num_workers(nw) copy(rw)
     !$acc loop reduction(+:rw) worker
-    ! { dg-bogus {'rw\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-    !   { dg-note {'rw\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+    ! { dg-bogus {'rw\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
     do i = 1, n
        rw = rw + array(i)
     end do
@@ -52,8 +50,7 @@ contains
 
     !$acc parallel vector_length(vl) copy(rv)
     !$acc loop reduction(+:rv) vector
-    ! { dg-bogus {'rv\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-    !   { dg-note {'rv\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+    ! { dg-bogus {'rv\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
     do i = 1, n
        rv = rv + array(i)
     end do
@@ -61,8 +58,7 @@ contains
 
     !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
     !$acc loop reduction(+:rc) gang worker vector
-    ! { dg-bogus {'rc\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-    !   { dg-note {'rc\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+    ! { dg-bogus {'rc\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
     do i = 1, n
        rc = rc + array(i)
     end do
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/parallel-reduction.f90 b/libgomp/testsuite/libgomp.oacc-fortran/parallel-reduction.f90
index a7b7ade..2b289c2 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/parallel-reduction.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/parallel-reduction.f90
@@ -46,11 +46,9 @@ subroutine redsub(s1, s2, n)
   integer :: s1, s2, n
 
   !$acc parallel reduction(+:s1,s2) num_gangs (10)  copy(s1)
-  ! { dg-bogus {'s1\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'s1\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
-  ! { dg-bogus {'s2\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-3 }
-  !   { dg-note {'s2\.[0-9]+' was declared here} {} { target *-*-* } .-4 }
-  ! { dg-bogus "\[Ww\]arning: region is gang partitioned but does not contain gang partitioned code" "TODO 'reduction'" { xfail *-*-* } .-5 }
+  ! { dg-bogus {'s1\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
+  ! { dg-bogus {'s2\.[0-9]+' is used uninitialized} "" { target *-*-* } .-2 }
+  ! { dg-bogus "\[Ww\]arning: region is gang partitioned but does not contain gang partitioned code" "TODO 'reduction'" { xfail *-*-* } .-3 }
   s1 = s1 + 1
   s2 = s2 + 1
   !$acc end parallel
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70643.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70643.f90
index 5082e36..a9f00ab 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/pr70643.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70643.f90
@@ -18,8 +18,7 @@ SUBROUTINE reduction_kernel(x_min,x_max,y_min,y_max,arr,sum)
 
 !$ACC DATA PRESENT(arr) COPY(sum)
 !$ACC PARALLEL LOOP REDUCTION(+ : sum)
-  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'sum\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
   DO k=y_min,y_max
     DO j=x_min,x_max
       sum=sum+arr(j,k)
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-2.f90
new file mode 100644
index 0000000..22a9566
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-2.f90
@@ -0,0 +1,31 @@
+! Subarrays declared on data construct: explicit-shape dummy array ('arr(n)').
+
+subroutine s1(n, arr)
+  integer :: n
+  integer :: arr(n)
+
+  !$acc data copy(arr(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     arr(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+end subroutine s1
+
+program test  ! Checks that s1 wrote only elements 10..n-10 of 'data'.
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  call s1(n, data)
+  ! Elements outside 10..n-10 must still be 0; the others must equal i.
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) stop 1
+     else if (data(i) .ne. i) then
+        stop 2
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-3.f90
new file mode 100644
index 0000000..ff17d10
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-3.f90
@@ -0,0 +1,34 @@
+! Subarrays declared on data construct: deferred-shape array.
+
+subroutine s1(n, arr)
+  integer :: n
+  integer :: arr(n)
+
+  !$acc data copy(arr(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     arr(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+end subroutine s1
+
+program test  ! Checks that s1 wrote only elements 10..n-10 of 'data'.
+  integer, parameter :: n = 100
+  integer i
+  integer, allocatable :: data(:)
+
+  allocate (data(1:n))
+
+  data(:) = 0
+
+  call s1(n, data)
+  ! Elements outside 10..n-10 must still be 0; the others must equal i.
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) stop 1
+     else if (data(i) .ne. i) then
+        stop 2
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-4.f90
new file mode 100644
index 0000000..01da999
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-4.f90
@@ -0,0 +1,31 @@
+! Subarrays declared on data construct: assumed-size array.
+
+subroutine s1(n, arr)
+  integer :: n
+  integer :: arr(*)
+
+  !$acc data copy(arr(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     arr(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+end subroutine s1
+
+program test  ! Checks that s1 wrote only elements 10..n-10 of 'data'.
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  call s1(n, data)
+  ! Elements outside 10..n-10 must still be 0; the others must equal i.
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) stop 1
+     else if (data(i) .ne. i) then
+        stop 2
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-5.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-5.f90
new file mode 100644
index 0000000..8a16e3d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-5.f90
@@ -0,0 +1,29 @@
+! Subarrays on parallel construct (no data construct): assumed-size array.
+
+subroutine s1(n, arr)
+  integer :: n
+  integer :: arr(*)
+
+  !$acc parallel loop copy(arr(5:n-10))
+  do i = 10, n - 10
+     arr(i) = i
+  end do
+  !$acc end parallel loop
+end subroutine s1
+
+program test  ! Checks that s1 wrote only elements 10..n-10 of 'data'.
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  call s1(n, data)
+  ! Elements outside 10..n-10 must still be 0; the others must equal i.
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) stop 1
+     else if (data(i) .ne. i) then
+        stop 2
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-6.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-6.f90
new file mode 100644
index 0000000..e99c364
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-6.f90
@@ -0,0 +1,28 @@
+! Subarrays declared on data construct: allocatable array (with array
+! descriptor).
+
+program test  ! Self-check: only elements 10..n-10 of 'data' may change.
+  integer, parameter :: n = 100
+  integer i
+  integer, allocatable :: data(:)
+
+  allocate (data(1:n))
+
+  data(:) = 0
+
+  !$acc data copy(data(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     data(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+  ! Elements outside 10..n-10 must still be 0; the others must equal i.
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) stop 1
+     else if (data(i) .ne. i) then
+        stop 2
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828.f90
new file mode 100644
index 0000000..f87d232
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828.f90
@@ -0,0 +1,24 @@
+! Subarrays on data construct: explicit-shape array.
+
+program test  ! Self-check: only elements 10..n-10 of 'data' may change.
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  !$acc data copy(data(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     data(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+  ! Elements outside 10..n-10 must still be 0; the others must equal i.
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) stop 1
+     else if (data(i) .ne. i) then
+        stop 2
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95 b/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95
index b027d14..1b3367d 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95
@@ -78,7 +78,7 @@ contains
     !$acc loop collapse(2) gang private(t1) ! { dg-line l_loop[incr c_loop] }
     ! { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
     ! { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
-    ! { dg-note {variable 't1' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
+    ! { dg-note {variable 't1' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } l_loop$c_loop }
     do i=0,255
       do j=1,256
         t1 = (i * 256 + j) * 97
@@ -103,7 +103,7 @@ contains
     do i=0,255
       !$acc loop worker private(t1) ! { dg-line l_loop[incr c_loop] }
       ! { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
-      ! { dg-note {variable 't1' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
+      ! { dg-note {variable 't1' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } l_loop$c_loop }
       do j=1,256
         t1 = (i * 256 + j) * 99
         res(i * 256 + j) = t1
@@ -127,7 +127,7 @@ contains
     do i=0,255
       !$acc loop vector private(t1) ! { dg-line l_loop[incr c_loop] }
       ! { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
-      ! { dg-note {variable 't1' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
+      ! { dg-note {variable 't1' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } l_loop$c_loop }
       do j=1,256
         t1 = (i * 256 + j) * 101
         res(i * 256 + j) = t1
@@ -149,7 +149,7 @@ contains
     !$acc loop collapse(2) gang worker vector private(t1) ! { dg-line l_loop[incr c_loop] }
     ! { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
     ! { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
-    ! { dg-note {variable 't1' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } l_loop$c_loop }
+    ! { dg-note {variable 't1' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} "" { xfail *-*-* } l_loop$c_loop }
     do i=0,255
       do j=1,256
         t1 = (i * 256 + j) * 103
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-10.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-10.f90
new file mode 100644
index 0000000..f766524
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-10.f90
@@ -0,0 +1,598 @@
+! { dg-do run }
+
+! Integer array reductions at gang, worker, vector and combined levels.
+
+program main
+  implicit none
+
+  integer, parameter     :: n = 10, ng = 8, nw = 4, vl = 32  ! size, gangs, workers, vector length
+  integer                :: i, j
+  integer, dimension (n) :: vresult, rg, rw, rv, rc  ! reference; gang, worker, vector, combined
+  logical, dimension (n) :: lrg, lrw, lrv, lrc, lvresult  ! same, for the logical operators
+  integer, dimension (n) :: array  ! reduction operands, set to 1..n below
+
+  do i = 1, n
+     array(i) = i
+  end do
+
+  !
+  ! '+' reductions: start at 0, so each element should end up as sum(array).
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(+:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = rg(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(+:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = rw(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(+:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = rv(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(+:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = rc(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = vresult(j) + array(i)
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 1
+  if (count (rw .ne. vresult) .ne. 0) STOP 2
+  if (count (rv .ne. vresult) .ne. 0) STOP 3
+  if (count (rc .ne. vresult) .ne. 0) STOP 4
+
+  !
+  ! '*' reductions: start at 1, so each element should end up as product(array).
+  !
+
+  rg = 1
+  rw = 1
+  rv = 1
+  rc = 1
+  vresult = 1
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(*:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = rg(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(*:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = rw(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(*:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = rv(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(*:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = rc(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = vresult(j) * array(i)
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 5
+  if (count (rw .ne. vresult) .ne. 0) STOP 6
+  if (count (rv .ne. vresult) .ne. 0) STOP 7
+  if (count (rc .ne. vresult) .ne. 0) STOP 8
+
+  !
+  ! 'max' reductions: start at 0, below all of array; expect maxval(array).
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(max:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = max (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(max:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = max (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(max:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = max (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(max:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = max (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = max (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 9
+  if (count (rw .ne. vresult) .ne. 0) STOP 10
+  if (count (rv .ne. vresult) .ne. 0) STOP 11
+  if (count (rc .ne. vresult) .ne. 0) STOP 12
+
+  !
+  ! 'min' reductions: start at n + 1, above all of array; expect minval(array).
+  !
+
+  rg = n + 1
+  rw = n + 1
+  rv = n + 1
+  rc = n + 1
+  vresult = n + 1
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(min:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = min (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(min:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = min (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(min:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = min (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(min:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = min (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = min (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 13
+  if (count (rw .ne. vresult) .ne. 0) STOP 14
+  if (count (rv .ne. vresult) .ne. 0) STOP 15
+  if (count (rc .ne. vresult) .ne. 0) STOP 16
+
+  !
+  ! 'iand' reductions: start at 1; iand (1, array(2)) is 0, so expect 0.
+  !
+
+  rg = 1
+  rw = 1
+  rv = 1
+  rc = 1
+  vresult = 1
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(iand:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = iand (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(iand:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = iand (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(iand:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = iand (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(iand:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = iand (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = iand (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 17
+  if (count (rw .ne. vresult) .ne. 0) STOP 18
+  if (count (rv .ne. vresult) .ne. 0) STOP 19
+  if (count (rc .ne. vresult) .ne. 0) STOP 20
+
+  !
+  ! 'ior' reductions: start at 0, so expect the bitwise OR of all of array.
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(ior:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = ior (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(ior:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = ior (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(ior:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = ior (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(ior:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = ior (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = ior (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 21
+  if (count (rw .ne. vresult) .ne. 0) STOP 22
+  if (count (rv .ne. vresult) .ne. 0) STOP 23
+  if (count (rc .ne. vresult) .ne. 0) STOP 24
+
+  !
+  ! 'ieor' reductions: start at 0, so expect the bitwise XOR of all of array.
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(ieor:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = ieor (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(ieor:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = ieor (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(ieor:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = ieor (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(ieor:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = ieor (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = ieor (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 25
+  if (count (rw .ne. vresult) .ne. 0) STOP 26
+  if (count (rv .ne. vresult) .ne. 0) STOP 27
+  if (count (rc .ne. vresult) .ne. 0) STOP 28
+
+  !
+  ! '.and.' reductions: start at .true.; array(1) is below 5, so expect .false.
+  !
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.and.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.and.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.and.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.and.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 29
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 30
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 31
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 32
+
+  !
+  ! '.or.' reductions.  NOTE(review): everything starts at .true., so the
+  ! expected value is .true. whatever the loops contribute; confirm intended.
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.or.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.or.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.or.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.or.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 33
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 34
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 35
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 36
+
+  !
+  ! '.eqv.' reductions: start at .true.; array(1:4) is below 5, so the value
+  ! is flipped four times and .true. is expected.
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.eqv.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.eqv.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.eqv.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.eqv.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 37
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 38
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 39
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 40
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-11.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-11.f90
new file mode 100644
index 0000000..220871a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-11.f90
@@ -0,0 +1,424 @@
+! { dg-do run }
+
+! Real array reductions at gang, worker, vector and combined levels.
+
+program main
+  implicit none
+
+  integer, parameter     :: n = 10, ng = 8, nw = 4, vl = 32  ! size, gangs, workers, vector length
+  integer                :: i, j
+  real, dimension (n) :: vresult, rg, rw, rv, rc  ! reference; gang, worker, vector, combined
+  logical, dimension (n) :: lrg, lrw, lrv, lrc, lvresult  ! same, for the logical operators
+  real, dimension (n) :: array  ! reduction operands, set to 1..n below
+
+  do i = 1, n
+     array(i) = i
+  end do
+
+  !
+  ! '+' reductions: start at 0, so each element should end up as sum(array).
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(+:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = rg(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(+:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = rw(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(+:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = rv(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(+:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = rc(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = vresult(j) + array(i)
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 1
+  if (count (rw .ne. vresult) .ne. 0) STOP 2
+  if (count (rv .ne. vresult) .ne. 0) STOP 3
+  if (count (rc .ne. vresult) .ne. 0) STOP 4
+
+  !
+  ! '*' reductions: start at 1, so each element should end up as product(array).
+  !
+
+  rg = 1
+  rw = 1
+  rv = 1
+  rc = 1
+  vresult = 1
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(*:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = rg(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(*:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = rw(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(*:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = rv(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(*:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = rc(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = vresult(j) * array(i)
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 5
+  if (count (rw .ne. vresult) .ne. 0) STOP 6
+  if (count (rv .ne. vresult) .ne. 0) STOP 7
+  if (count (rc .ne. vresult) .ne. 0) STOP 8
+
+  !
+  ! 'max' reductions: start at 0, below all of array; expect maxval(array).
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(max:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = max (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(max:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = max (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(max:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = max (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(max:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = max (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = max (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 9
+  if (count (rw .ne. vresult) .ne. 0) STOP 10
+  if (count (rv .ne. vresult) .ne. 0) STOP 11
+  if (count (rc .ne. vresult) .ne. 0) STOP 12
+
+  !
+  ! 'min' reductions: start at n + 1, above all of array; expect minval(array).
+  !
+
+  rg = n + 1
+  rw = n + 1
+  rv = n + 1
+  rc = n + 1
+  vresult = n + 1
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(min:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = min (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(min:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = min (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(min:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = min (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(min:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = min (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = min (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 13
+  if (count (rw .ne. vresult) .ne. 0) STOP 14
+  if (count (rv .ne. vresult) .ne. 0) STOP 15
+  if (count (rc .ne. vresult) .ne. 0) STOP 16
+
+  !
+  ! '.and.' reductions: start at .true.; array(1) is below 5, so expect .false.
+  !
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.and.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.and.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.and.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.and.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 17
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 18
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 19
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 20
+
+  !
+  ! '.or.' reductions.  NOTE(review): everything starts at .true., so the
+  ! expected value is .true. whatever the loops contribute; confirm intended.
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.or.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.or.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.or.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.or.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 21
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 22
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 23
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 24
+
+  !
+  ! '.eqv.' reductions: start at .true.; array(1:4) is below 5, so the value
+  ! is flipped four times and .true. is expected.
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.eqv.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.eqv.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.eqv.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.eqv.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 25
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 26
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 27
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 28
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-12.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-12.f90
new file mode 100644
index 0000000..d89d8ed
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-12.f90
@@ -0,0 +1,424 @@
+! { dg-do run }
+
+! Double precision array reductions at gang, worker, vector and combined levels.
+
+program main
+  implicit none
+
+  integer, parameter     :: n = 10, ng = 8, nw = 4, vl = 32  ! size, gangs, workers, vector length
+  integer                :: i, j
+  double precision, dimension (n) :: vresult, rg, rw, rv, rc  ! reference; gang, worker, vector, combined
+  logical, dimension (n) :: lrg, lrw, lrv, lrc, lvresult  ! same, for the logical operators
+  double precision, dimension (n) :: array  ! reduction operands, set to 1..n below
+
+  do i = 1, n
+     array(i) = i
+  end do
+
+  !
+  ! '+' reductions: start at 0, so each element should end up as sum(array).
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(+:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = rg(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(+:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = rw(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(+:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = rv(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(+:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = rc(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = vresult(j) + array(i)
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 1
+  if (count (rw .ne. vresult) .ne. 0) STOP 2
+  if (count (rv .ne. vresult) .ne. 0) STOP 3
+  if (count (rc .ne. vresult) .ne. 0) STOP 4
+
+  !
+  ! '*' reductions: start at 1, so each element should end up as product(array).
+  !
+
+  rg = 1
+  rw = 1
+  rv = 1
+  rc = 1
+  vresult = 1
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(*:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = rg(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(*:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = rw(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(*:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = rv(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(*:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = rc(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = vresult(j) * array(i)
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 5
+  if (count (rw .ne. vresult) .ne. 0) STOP 6
+  if (count (rv .ne. vresult) .ne. 0) STOP 7
+  if (count (rc .ne. vresult) .ne. 0) STOP 8
+
+  !
+  ! 'max' reductions: start at 0, below all of array; expect maxval(array).
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(max:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = max (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(max:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = max (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(max:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = max (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(max:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = max (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = max (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 9
+  if (count (rw .ne. vresult) .ne. 0) STOP 10
+  if (count (rv .ne. vresult) .ne. 0) STOP 11
+  if (count (rc .ne. vresult) .ne. 0) STOP 12
+
+  !
+  ! 'min' reductions: start at n + 1, above all of array; expect minval(array).
+  !
+
+  rg = n + 1
+  rw = n + 1
+  rv = n + 1
+  rc = n + 1
+  vresult = n + 1
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(min:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = min (rg(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(min:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = min (rw(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(min:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = min (rv(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(min:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = min (rc(j), array(i))
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = min (vresult(j), array(i))
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 13
+  if (count (rw .ne. vresult) .ne. 0) STOP 14
+  if (count (rv .ne. vresult) .ne. 0) STOP 15
+  if (count (rc .ne. vresult) .ne. 0) STOP 16
+
+  !
+  ! '.and.' reductions: start at .true.; array(1) is below 5, so expect .false.
+  !
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.and.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.and.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.and.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.and.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .and. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 17
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 18
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 19
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 20
+
+  !
+  ! '.or.' reductions.  NOTE(review): everything starts at .true., so the
+  ! expected value is .true. whatever the loops contribute; confirm intended.
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.or.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.or.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.or.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.or.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .or. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 21
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 22
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 23
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 24
+
+  !
+  ! '.eqv.' reductions: start at .true.; array(1:4) is below 5, so the value
+  ! is flipped four times and .true. is expected.
+
+  lrg = .true.
+  lrw = .true.
+  lrv = .true.
+  lrc = .true.
+  lvresult = .true.
+
+  !$acc parallel num_gangs(ng) copy(lrg)
+  !$acc loop reduction(.eqv.:lrg) gang
+  do i = 1, n
+    do j = 1, n
+      lrg(j) = lrg(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(lrw)
+  !$acc loop reduction(.eqv.:lrw) worker
+  do i = 1, n
+    do j = 1, n
+      lrw(j) = lrw(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(lrv)
+  !$acc loop reduction(.eqv.:lrv) vector
+  do i = 1, n
+    do j = 1, n
+      lrv(j) = lrv(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(lrc)
+  !$acc loop reduction(.eqv.:lrc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      lrc(j) = lrc(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results against the same loop nest run without OpenACC.
+  do i = 1, n
+    do j = 1, n
+      lvresult(j) = lvresult(j) .eqv. (array(i) .ge. 5)
+    end do
+  end do
+
+  if (count (lrg .neqv. lvresult) .ne. 0) STOP 25
+  if (count (lrw .neqv. lvresult) .ne. 0) STOP 26
+  if (count (lrv .neqv. lvresult) .ne. 0) STOP 27
+  if (count (lrc .neqv. lvresult) .ne. 0) STOP 28
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-13.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-13.f90
new file mode 100644
index 0000000..701cbb9
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-13.f90
@@ -0,0 +1,134 @@
+! { dg-do run }
+
+! complex array reductions
+
+program main
+  implicit none
+
+  integer, parameter     :: n = 10, ng = 8, nw = 4, vl = 32
+  integer                :: i, j
+  complex, dimension (n) :: vresult, rg, rw, rv, rc
+  complex, dimension (n) :: array
+
+  ! Small whole numbers keep every sum and product exact in default real.
+  do i = 1, n
+     array(i) = i
+  end do
+
+  !
+  ! '+' reductions
+  !
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+  vresult = 0
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(+:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = rg(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(+:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = rw(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(+:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = rv(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(+:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = rc(j) + array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = vresult(j) + array(i)
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 1
+  if (count (rw .ne. vresult) .ne. 0) STOP 2
+  if (count (rv .ne. vresult) .ne. 0) STOP 3
+  if (count (rc .ne. vresult) .ne. 0) STOP 4
+
+  !
+  ! '*' reductions
+  !
+
+  rg = 1
+  rw = 1
+  rv = 1
+  rc = 1
+  vresult = 1
+
+  !$acc parallel num_gangs(ng) copy(rg)
+  !$acc loop reduction(*:rg) gang
+  do i = 1, n
+    do j = 1, n
+      rg(j) = rg(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_workers(nw) copy(rw)
+  !$acc loop reduction(*:rw) worker
+  do i = 1, n
+    do j = 1, n
+      rw(j) = rw(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel vector_length(vl) copy(rv)
+  !$acc loop reduction(*:rv) vector
+  do i = 1, n
+    do j = 1, n
+      rv(j) = rv(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+  !$acc loop reduction(*:rc) gang worker vector
+  do i = 1, n
+    do j = 1, n
+      rc(j) = rc(j) * array(i)
+    end do
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+    do j = 1, n
+      vresult(j) = vresult(j) * array(i)
+    end do
+  end do
+
+  if (count (rg .ne. vresult) .ne. 0) STOP 5
+  if (count (rw .ne. vresult) .ne. 0) STOP 6
+  if (count (rv .ne. vresult) .ne. 0) STOP 7
+  if (count (rc .ne. vresult) .ne. 0) STOP 8
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-14.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-14.f90
new file mode 100644
index 0000000..95e56c9
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-14.f90
@@ -0,0 +1,68 @@
+! { dg-do run }
+
+! record type reductions
+
+program main
+  implicit none
+
+  type t1
+     integer :: i
+     real :: r
+  end type t1
+
+  type t2
+     real :: r
+     integer :: i
+     double precision :: d
+  end type t2
+
+  double precision, parameter :: e = 0.001
+  integer, parameter :: n = 10, ng = 8, nw = 4, vl = 32
+  integer :: i
+  type(t1) :: v1, a1
+  type (t2) :: v2, a2
+  ! Sum reduction over a record with two components.
+  v1%i = 0
+  v1%r = 0
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(v1)
+  !$acc loop reduction (+:v1)
+  do i = 1, n
+     v1%i = v1%i + 1
+     v1%r = v1%r + 2
+  end do
+  !$acc end parallel
+  a1%i = 0
+  a1%r = 0
+  do i = 1, n
+     a1%i = a1%i + 1
+     a1%r = a1%r + 2
+  end do
+  if (v1%i .ne. a1%i) STOP 1
+  if (v1%r .ne. a1%r) STOP 2
+  ! Product reduction over a record with three components.
+  v2%i = 1
+  v2%r = 1
+  v2%d = 1
+  !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(v2)
+  !$acc loop reduction (*:v2)
+  do i = 1, n
+     v2%i = v2%i * 2
+     v2%r = v2%r * 1.1
+     v2%d = v2%d * 1.3
+  end do
+  !$acc end parallel
+  a2%i = 1
+  a2%r = 1
+  a2%d = 1
+  do i = 1, n
+     a2%i = a2%i * 2
+     a2%r = a2%r * 1.1
+     a2%d = a2%d * 1.3
+  end do
+  ! Real products may be combined in a different order: allow a tolerance.
+  if (v2%i .ne. a2%i) STOP 3
+  if (abs (v2%r - a2%r) .ge. e) STOP 4
+  if (abs (v2%d - a2%d) .ge. e) STOP 5
+
+end program main
+
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-15.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-15.f90
new file mode 100644
index 0000000..7a36fb2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-15.f90
@@ -0,0 +1,98 @@
+! { dg-do run }
+! { dg-additional-options "-cpp" }
+! Add 1 to ARRAY(MIN:MAX) ten times, using i and j from the expanding scope.
+#define ARRAY_BODY(ARRAY, MIN, MAX)	\
+  do i = 1, 10;				\
+     do j = MIN, MAX;			\
+        ARRAY(j) = ARRAY(j) + 1;	\
+     end do;				\
+  end do
+
+program main
+  implicit none
+  integer :: i, j, max = 6
+  integer :: a(6) = (/ 5, 1, 1, 5, 9, 9 /)
+  integer :: o(6)
+  o = a
+  ! Reductions on loop directives, over array sections and whole arrays.
+  !$acc parallel
+  !$acc loop reduction(+:a(2:3))
+  ARRAY_BODY (a, 2, 3)
+  !$acc end parallel
+  ARRAY_BODY (o, 2, 3)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 1
+  end do
+
+  !$acc parallel copy(a(4:6))
+  !$acc loop reduction(+:a(4:6))
+  ARRAY_BODY (a, 4, 6)
+  !$acc end parallel
+  ARRAY_BODY (o, 4, 6)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 2
+  end do
+
+  !$acc parallel copy(a)
+  !$acc loop reduction(+:a(1:6))
+  ARRAY_BODY (a, 1, 6)
+  !$acc end parallel
+  ARRAY_BODY (o, 1, 6)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 3
+  end do
+
+  !$acc parallel
+  !$acc loop reduction(+:a)
+  ARRAY_BODY (a, 4, 4)
+  !$acc end parallel
+  ARRAY_BODY (o, 4, 4)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 4
+  end do
+
+  !$acc parallel copy(a)
+  !$acc loop reduction(+:a)
+  ARRAY_BODY (a, 4, 6)
+  !$acc end parallel
+  ARRAY_BODY (o, 4, 6)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 5
+  end do
+
+#if !defined(ACC_DEVICE_TYPE_host)
+  ! Reductions on combined and plain parallel constructs.
+  !$acc parallel loop reduction(+:a)
+  ARRAY_BODY (a, 2, 4)
+  !$acc end parallel loop
+  ARRAY_BODY (o, 2, 4)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 6
+  end do
+
+  !$acc parallel loop reduction(+:a(2:4))
+  ARRAY_BODY (a, 2, 4)
+  !$acc end parallel loop
+  ARRAY_BODY (o, 2, 4)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 7
+  end do
+
+  !$acc parallel reduction(+:a)
+  ARRAY_BODY (a, 3, 4)
+  !$acc end parallel
+  ARRAY_BODY (o, 3, 4)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 8
+  end do
+
+  !$acc parallel reduction(+:a(2:3))
+  ARRAY_BODY (a, 2, 3)
+  !$acc end parallel
+  ARRAY_BODY (o, 2, 3)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 9
+  end do
+#endif
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-16.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-16.f90
new file mode 100644
index 0000000..c524f2a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-16.f90
@@ -0,0 +1,99 @@
+! { dg-do run }
+! { dg-additional-options "-cpp" }
+! Add 1 to ARRAY(MIN:MAX) ten times, using i and j from the expanding scope.
+#define ARRAY_BODY(ARRAY, MIN, MAX)	\
+  do i = 1, 10;				\
+     do j = MIN, MAX;			\
+        ARRAY(j) = ARRAY(j) + 1;	\
+     end do;				\
+  end do
+
+program main
+  implicit none
+  integer :: i, j, max = 6, one = 1, two = 2, three = 3, four = 4, six = 6
+  integer :: a(6) = (/ 5, 1, 1, 5, 9, 9 /)
+  integer :: o(6)
+  o = a
+  ! Reductions on loop directives; section bounds are non-constant variables.
+  !$acc parallel
+  !$acc loop reduction(+:a(two:three))
+  ARRAY_BODY (a, two, three)
+  !$acc end parallel
+
+  ARRAY_BODY (o, two, three)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 1
+  end do
+
+  !$acc parallel copy(a(four:six))
+  !$acc loop reduction(+:a(four:six))
+  ARRAY_BODY (a, four, six)
+  !$acc end parallel
+  ARRAY_BODY (o, four, six)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 2
+  end do
+
+  !$acc parallel copy(a)
+  !$acc loop reduction(+:a(one:six))
+  ARRAY_BODY (a, one, six)
+  !$acc end parallel
+  ARRAY_BODY (o, one, six)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 3
+  end do
+
+  !$acc parallel
+  !$acc loop reduction(+:a)
+  ARRAY_BODY (a, four, four)
+  !$acc end parallel
+  ARRAY_BODY (o, four, four)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 4
+  end do
+
+  !$acc parallel copy(a)
+  !$acc loop reduction(+:a)
+  ARRAY_BODY (a, four, six)
+  !$acc end parallel
+  ARRAY_BODY (o, four, six)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 5
+  end do
+
+#if !defined(ACC_DEVICE_TYPE_host)
+  ! Reductions on combined and plain parallel constructs.
+  !$acc parallel loop reduction(+:a)
+  ARRAY_BODY (a, two, four)
+  !$acc end parallel loop
+  ARRAY_BODY (o, two, four)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 6
+  end do
+
+  !$acc parallel loop reduction(+:a(two:four))
+  ARRAY_BODY (a, two, four)
+  !$acc end parallel loop
+  ARRAY_BODY (o, two, four)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 7
+  end do
+
+  !$acc parallel reduction(+:a)
+  ARRAY_BODY (a, three, four)
+  !$acc end parallel
+  ARRAY_BODY (o, three, four)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 8
+  end do
+
+  !$acc parallel reduction(+:a(two:three))
+  ARRAY_BODY (a, two, three)
+  !$acc end parallel
+  ARRAY_BODY (o, two, three)
+  do i = 1, max
+     if (a(i) .ne. o(i)) STOP 9
+  end do
+#endif
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-5.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-5.f90
index 88a691f..30fb30a 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/reduction-5.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-5.f90
@@ -38,8 +38,7 @@ subroutine redsub_gang(sum, n, c)
 
   !$acc parallel copyin (n, c) num_gangs(n) copy(sum)
   !$acc loop reduction(+:sum) gang
-  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'sum\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
   do i = 1, n
      sum = sum + c
   end do
@@ -54,8 +53,7 @@ subroutine redsub_worker(sum, n, c)
   !$acc parallel copyin (n, c) num_workers(4) vector_length (32) copy(sum)
   ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-1 }
   !$acc loop reduction(+:sum) worker
-  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'sum\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
   do i = 1, n
      sum = sum + c
   end do
@@ -69,8 +67,7 @@ subroutine redsub_vector(sum, n, c)
 
   !$acc parallel copyin (n, c) vector_length(32) copy(sum)
   !$acc loop reduction(+:sum) vector
-  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'sum\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
   do i = 1, n
      sum = sum + c
   end do
@@ -84,8 +81,7 @@ subroutine redsub_combined(sum, n, c)
 
   !$acc parallel num_gangs (8) num_workers (4) vector_length(32) copy(sum)
   !$acc loop reduction(+:sum) gang worker vector
-  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'sum\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
   do i = 1, n
      sum = sum + c
   end do
@@ -102,12 +98,10 @@ subroutine redsub_nested(sum, n, c)
 
   !$acc parallel num_gangs (8) copy(sum)
   !$acc loop reduction(+:sum) gang
-  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'sum\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
   do i = 1, ii
      !$acc loop reduction(+:sum) vector
-     ! { dg-bogus {'sum\.[0-9]+' may be used uninitialized} TODO { xfail { ! __OPTIMIZE__ } } .-1 }
-     !   { dg-note {'sum\.[0-9]+' was declared here} {} { target { ! __OPTIMIZE__ } } .-2 }
+     ! { dg-bogus {'sum\.[0-9]+' may be used uninitialized} "" { target { ! __OPTIMIZE__ } } .-1 }
      do j = 1, jj
         sum = sum + c
      end do
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90
index 38148f5..03a58a5 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90
@@ -64,8 +64,7 @@ subroutine redsub_bogus(sum, n)
 
   !$acc parallel firstprivate(sum)
   !$acc loop gang worker vector reduction (+:sum)
-  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'sum\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+  ! { dg-bogus {'sum\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
   do i = 1, n
      sum = sum + 1
   end do
@@ -84,8 +83,7 @@ subroutine redsub_combined(sum, n, arr)
      sum = i;
 
      !$acc loop reduction(+:sum)
-     ! { dg-bogus {'sum\.[0-9]+' may be used uninitialized} TODO { xfail { ! __OPTIMIZE__ } } .-1 }
-     !   { dg-note {'sum\.[0-9]+' was declared here} {} { target { ! __OPTIMIZE__ } } .-2 }
+     ! { dg-bogus {'sum\.[0-9]+' may be used uninitialized} "" { target { ! __OPTIMIZE__ } } .-1 }
      do j = 1, n
         sum = sum + 1
      end do
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-9.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-9.f90
new file mode 100644
index 0000000..fd64d88
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-9.f90
@@ -0,0 +1,54 @@
+! Test gang, worker, vector and combined reductions on dummy variables.
+
+! { dg-do run }
+
+program main
+  implicit none
+
+  integer g, w, v, c
+
+  g = 0
+  w = 0
+  v = 0
+  c = 0
+  ! Each counter is incremented once per loop iteration in the subroutine.
+  call reduction (g, w, v, c)
+
+  if (g /= 10) STOP 1
+  if (w /= 10) STOP 2
+  if (v /= 10) STOP 3
+  if (c /= 100) STOP 4
+end program main
+
+subroutine reduction (g, w, v, c)
+  implicit none
+
+  integer g, w, v, c, i
+  ! Every reduction variable below is a dummy argument of this subroutine.
+  !$acc parallel
+  !$acc loop reduction(+:g) gang
+  do i = 1, 10
+     g = g + 1
+  end do
+  !$acc end parallel
+
+  !$acc parallel
+  !$acc loop reduction(+:w) worker
+  do i = 1, 10
+     w = w + 1
+  end do
+  !$acc end parallel
+
+  !$acc parallel
+  !$acc loop reduction(+:v) vector
+  do i = 1, 10
+     v = v + 1
+  end do
+  !$acc end parallel
+  ! Combined construct: one reduction across gang, worker and vector.
+  !$acc parallel loop reduction(+:c) gang worker vector
+  do i = 1, 100
+     c = c + 1
+  end do
+  !$acc end parallel loop
+end subroutine reduction
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reference-reductions.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reference-reductions.f90
index 055d225..635b1b0 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/reference-reductions.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reference-reductions.f90
@@ -16,12 +16,10 @@ subroutine param_reduction(var)
 
 !$acc parallel copy(var)
 !$acc loop reduction(+ : var) gang
-  ! { dg-bogus {'var\.[0-9]+' is used uninitialized} TODO { xfail *-*-* } .-1 }
-  !   { dg-note {'var\.[0-9]+' was declared here} {} { target *-*-* } .-2 }
+  ! { dg-bogus {'var\.[0-9]+' is used uninitialized} "" { target *-*-* } .-1 }
  do k=1,10
 !$acc loop vector reduction(+ : var)
-    ! { dg-bogus {'var\.[0-9]+' may be used uninitialized} TODO { xfail { ! __OPTIMIZE__ } } .-1 }
-    !   { dg-note {'var\.[0-9]+' was declared here} {} { target { ! __OPTIMIZE__ } } .-2 }
+    ! { dg-bogus {'var\.[0-9]+' may be used uninitialized} "" { target { ! __OPTIMIZE__ } } .-1 }
     do j=1,100
      var = var + 1.0
     enddo
diff --git a/libgomp/usmpin-allocator.c b/libgomp/usmpin-allocator.c
new file mode 100644
index 0000000..311bda5
--- /dev/null
+++ b/libgomp/usmpin-allocator.c
@@ -0,0 +1,319 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a simple "malloc" implementation intended for use with Unified
+   Shared Memory and Pinned Memory.  It allocates memory from a pool allocated
+   and configured by the device plugin (for USM), or the OS-specific allocator
+   (for pinned).
+
+   This implementation keeps the allocated/free chain in a side-table (splay
+   tree) to ensure that the allocation routine does not migrate all the USM
+   pages back into host memory.  Keeping the meta-data elsewhere is also useful
+   for pinned memory, which is typically an extremely limited resource.  */
+
+#include <string.h>
+#include "libgomp.h"
+
+/* Use a splay tree to track allocations.  Each key describes one address
+   range: BASE is where it starts and SIZE is its length in bytes.  */
+typedef struct usmpin_splay_tree_node_s *usmpin_splay_tree_node;
+typedef struct usmpin_splay_tree_s *usmpin_splay_tree;
+typedef struct usmpin_splay_tree_key_s *usmpin_splay_tree_key;
+
+struct usmpin_splay_tree_key_s {
+  void *base;
+  size_t size;
+};
+
+static inline int
+usmpin_splay_compare (usmpin_splay_tree_key x, usmpin_splay_tree_key y)
+{
+  /* Keys are ordered by base address alone; SIZE takes no part.  Each
+     comparison yields 0 or 1, so the difference is -1, 0 or 1.  */
+  return (x->base > y->base) - (x->base < y->base);
+}
+
+#define splay_tree_prefix usmpin
+#include "splay-tree.h"
+
+/* Round VAR up to a multiple of 128 bytes, i.e. GPU cache-line granularity.  */
+#define ALIGN(VAR) (((VAR) + 127) & ~127)
+
+/* The context data prevents the need for global state.  */
+struct usmpin_context {
+  int lock;  /* Spin lock: set to 1 while held, 0 when released.  */
+  struct usmpin_splay_tree_s allocations;  /* Ranges handed out.  */
+  struct usmpin_splay_tree_s free_space;  /* Ranges still available.  */
+};
+/* Allocate a zero-initialized context; the result is NULL if calloc fails.  */
+usmpin_ctx_p
+usmpin_init_context (void)
+{
+  return calloc (1, sizeof (struct usmpin_context));
+}
+
+/* Coalesce contiguous free space into one entry.  This considers the entries
+   either side of the root node only, so it should be called each time a new
+   entry is inserted into the root.  The free_space tree must not be empty.  */
+
+static void
+usmpin_coalesce_free_space (usmpin_ctx_p ctx)
+{
+  usmpin_splay_tree_node prev, next, node = ctx->free_space.root;
+  /* Find the in-order neighbours of the root: PREV below, NEXT above.  */
+  for (prev = node->left; prev && prev->right; prev = prev->right)
+    ;
+  for (next = node->right; next && next->left; next = next->left)
+    ;
+
+  /* Coalesce adjacent free chunks.  */
+  if (next
+      && node->key.base + node->key.size == next->key.base)
+    {
+      /* Free chunk follows: fold it into NODE and discard its record.  */
+      node->key.size += next->key.size;
+      usmpin_splay_tree_remove (&ctx->free_space, &next->key);
+      free (next);
+    }
+  if (prev
+      && prev->key.base + prev->key.size == node->key.base)
+    {
+      /* Free chunk precedes: fold NODE into it and discard NODE.  */
+      prev->key.size += node->key.size;
+      usmpin_splay_tree_remove (&ctx->free_space, &node->key);
+      free (node);
+    }
+}
+
+/* Add a new memory region into the free chain.  This is how the USM heap is
+   initialized and extended.  If the new region is contiguous with an existing
+   region then any free space will be coalesced.  Out of memory is fatal.  */
+
+void
+usmpin_register_memory (usmpin_ctx_p ctx, char *base, size_t size)
+{
+  if (base == NULL || ctx == NULL)
+    return;
+
+  /* Build the record before taking the lock.  */
+  usmpin_splay_tree_node node = malloc (sizeof (*node));
+  if (node == NULL)
+    GOMP_PLUGIN_fatal ("usmpin: out of memory");
+  node->key.base = base;
+  node->key.size = size;
+  node->left = node->right = NULL;
+
+  while (__atomic_exchange_n (&ctx->lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+  usmpin_splay_tree_insert (&ctx->free_space, node);
+  usmpin_coalesce_free_space (ctx);
+  __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+}
+
+/* This splay_tree_foreach callback selects the first free space large enough
+   to hold the allocation needed.  Since the splay_tree walk may start in the
+   middle the "first" isn't necessarily the "leftmost" entry.  */
+
+struct usmpin_callback_data {
+  size_t size;  /* In: number of bytes the caller needs.  */
+  usmpin_splay_tree_node found;  /* Out: the node chosen, if any.  */
+};
+
+static int
+usmpin_alloc_callback (usmpin_splay_tree_key key, void *data)
+{
+  struct usmpin_callback_data *request = data;
+
+  /* Too small: report 0, i.e. no match for this entry.  */
+  if (key->size < request->size)
+    return 0;
+
+  /* First fit found: remember the node and report 1.  */
+  request->found = (usmpin_splay_tree_node) key;
+  return 1;
+}
+
+/* USM "malloc".  Selects and moves an address range from ctx->free_space to
+   ctx->allocations, leaving any excess in ctx->free_space; NULL on failure.  */
+
+void *
+usmpin_alloc (usmpin_ctx_p ctx, size_t size)
+{
+  if (ctx == NULL)
+    return NULL;
+
+  /* Memory is allocated in N-byte granularity.  Fail if rounding wraps.  */
+  size_t aligned = ALIGN (size);
+  if (aligned < size)
+    return NULL;
+  /* A zero-length record would let the next allocation reuse its base and
+     collide in ctx->allocations, so hand out one granule instead.  */
+  if (aligned == 0)
+    aligned = ALIGN (1);
+
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&ctx->lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  /* Find a suitable free block, if any memory is registered and free.  */
+  struct usmpin_callback_data cbd = {aligned, NULL};
+  if (ctx->free_space.root)
+    usmpin_splay_tree_foreach_lazy (&ctx->free_space, usmpin_alloc_callback,
+				    &cbd);
+  usmpin_splay_tree_node freenode = cbd.found;
+
+  /* The record needs ordinary heap memory; without it the request fails.  */
+  usmpin_splay_tree_node allocnode = NULL;
+  if (freenode)
+    allocnode = malloc (sizeof (*allocnode));
+  void *result = NULL;
+  if (allocnode)
+    {
+      /* Allocation successful.  */
+      result = freenode->key.base;
+      allocnode->key.base = result;
+      allocnode->key.size = aligned;
+      allocnode->left = allocnode->right = NULL;
+      usmpin_splay_tree_insert (&ctx->allocations, allocnode);
+
+      /* Update the free chain.  */
+      if (freenode->key.size > aligned)
+	{
+	  freenode->key.base = freenode->key.base + aligned;
+	  freenode->key.size -= aligned;
+	}
+      else
+	{
+	  usmpin_splay_tree_remove (&ctx->free_space, &freenode->key);
+	  free (freenode);
+	}
+    }
+
+  /* Release the lock.  */
+  __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+  return result;
+}
+
+/* USM "free".  Takes the record for ADDR out of ctx->allocations, files it
+   under ctx->free_space and merges it with any contiguous free memory.  */
+
+void
+usmpin_free (usmpin_ctx_p ctx, void *addr)
+{
+  struct usmpin_splay_tree_key_s wanted = { .base = addr };
+  usmpin_splay_tree_key record;
+
+  if (!ctx)
+    return;
+
+  /* Spin until the lock is ours.  */
+  while (__atomic_exchange_n (&ctx->lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  /* ADDR must be the base of a tracked allocation.  */
+  record = usmpin_splay_tree_lookup (&ctx->allocations, &wanted);
+  if (record == NULL)
+    GOMP_PLUGIN_fatal ("invalid free");
+  /* The cast relies on the key being the first member of its node.  */
+  usmpin_splay_tree_remove (&ctx->allocations, &wanted);
+  usmpin_splay_tree_insert (&ctx->free_space, (usmpin_splay_tree_node) record);
+  usmpin_coalesce_free_space (ctx);
+  __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+}
+
+/* USM "realloc".  Works in-place, if possible; reallocates otherwise.
+   Returns NULL, with ADDR still allocated, if relocation finds no space.  */
+
+void *
+usmpin_realloc (usmpin_ctx_p ctx, void *addr, size_t newsize)
+{
+  if (ctx == NULL)
+    return NULL;
+  newsize = ALIGN (newsize);
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&ctx->lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+  /* Find the record of the existing allocation.  */
+  struct usmpin_splay_tree_key_s key = {addr};
+  usmpin_splay_tree_key found = usmpin_splay_tree_lookup (&ctx->allocations,
+							  &key);
+  if (!found)
+    GOMP_PLUGIN_fatal ("invalid realloc");
+  /* Equal sizes need no work and fall through to the unlock below.  */
+  if (newsize < found->size)
+    {
+      /* We're reducing the allocation size.  */
+      usmpin_splay_tree_node newfree = malloc (sizeof (*newfree));
+      if (!newfree)
+	GOMP_PLUGIN_fatal ("usmpin: out of memory");
+      newfree->key.base = found->base + newsize;
+      newfree->key.size = found->size - newsize;
+      newfree->left = newfree->right = NULL;
+      /* Shrink the record too, else a later free releases the tail twice.  */
+      found->size = newsize;
+      usmpin_splay_tree_insert (&ctx->free_space, newfree);
+      usmpin_coalesce_free_space (ctx);
+    }
+  else if (newsize > found->size)
+    {
+      /* We're extending the allocation.  */
+      size_t extra = newsize - found->size;
+      struct usmpin_splay_tree_key_s freekey = {addr + found->size};
+      usmpin_splay_tree_key foundfree;
+      foundfree = usmpin_splay_tree_lookup (&ctx->free_space, &freekey);
+      if (foundfree && foundfree->size >= extra)
+	{
+	  /* Allocation can be expanded in place.  */
+	  found->size = newsize;
+	  foundfree->size -= extra;
+	  if (foundfree->size > 0)
+	    foundfree->base += extra;
+	  else
+	    {
+	      /* Chunk used up; base is untouched, so FREEKEY still matches.  */
+	      usmpin_splay_tree_remove (&ctx->free_space, &freekey);
+	      free (foundfree);
+	    }
+	}
+      else
+	{
+	  /* Must relocate: release the lock and use alloc/free.  */
+	  __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+	  void *newaddr = usmpin_alloc (ctx, newsize);
+	  if (!newaddr)
+	    return NULL;
+	  memcpy (newaddr, addr, found->size);
+	  usmpin_free (ctx, addr);
+	  return newaddr;
+	}
+    }
+  /* Release the lock.  */
+  __atomic_store_n (&ctx->lock, 0, MEMMODEL_RELEASE);
+  return addr;
+}
+
+/* Include the splay tree code inline, with the prefixes added.  */
+#define splay_tree_prefix usmpin
+#define splay_tree_c
+#include "splay-tree.h"