140 files changed, 7670 insertions, 522 deletions
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 2e5d923..111143a 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -136,6 +136,12 @@ struct omp_lock_t {
   void *Lock;
 };
 
+// See the definition in openmp/runtime kmp.h.
+typedef enum omp_severity_t {
+  severity_warning = 1,
+  severity_fatal = 2
+} omp_severity_t;
+
 using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num);
 using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id,
                                    int16_t lane_offset, int16_t shortCircuit);
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a..8c2828b 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -30,7 +30,8 @@ enum OMPTgtExecModeFlags : unsigned char {
   OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
   OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
   OMP_TGT_EXEC_MODE_GENERIC_SPMD =
-      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
+      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+  OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2 | OMP_TGT_EXEC_MODE_SPMD
 };
 
 static void
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 08ce616..aa5e740 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -45,7 +45,36 @@ using namespace ompx;
 
 namespace {
 
-uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
+/// Report a violated `strict` num_threads request.
+///
+/// Prints \p nt_message when it is non-null, otherwise a default diagnostic
+/// built from \p requested and \p actual, and traps when \p nt_severity is
+/// severity_fatal. For any other severity the function returns after printing.
+///
+/// \param nt_strict Not read here; the callers only invoke this function when
+/// the strict modifier is in effect.
+/// \param nt_severity Trap if this equals severity_fatal.
+/// \param nt_message Message printed instead of the default text; may be null.
+/// \param requested Thread count requested by the num_threads clause.
+/// \param actual Thread count that was actually computed.
+void numThreadsStrictError(int32_t nt_strict, int32_t nt_severity,
+                           const char *nt_message, int32_t requested,
+                           int32_t actual) {
+  if (nt_message)
+    printf("%s\n", nt_message);
+  else
+    printf("The computed number of threads (%d) does not match the requested "
+           "number of threads (%d). Consider that it might not be supported "
+           "to select exactly %d threads on this target device.\n",
+           actual, requested, requested);
+  if (nt_severity == severity_fatal)
+    __builtin_trap();
+}
+
+uint32_t determineNumberOfThreads(int32_t NumThreadsClause,
+                                  int32_t nt_strict = false,
+                                  int32_t nt_severity = severity_fatal,
+                                  const char *nt_message = nullptr) {
   uint32_t NThreadsICV =
       NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
   uint32_t NumThreads = mapping::getMaxTeamThreads();
@@ -55,13 +72,17 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
 
   // SPMD mode allows any number of threads, for generic mode we round down to a
   // multiple of WARPSIZE since it is legal to do so in OpenMP.
-  if (mapping::isSPMDMode())
-    return NumThreads;
+  if (!mapping::isSPMDMode()) {
+    if (NumThreads < mapping::getWarpSize())
+      NumThreads = 1;
+    else
+      NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  }
 
-  if (NumThreads < mapping::getWarpSize())
-    NumThreads = 1;
-  else
-    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  if (NumThreadsClause != -1 && nt_strict &&
+      NumThreads != static_cast<uint32_t>(NumThreadsClause))
+    numThreadsStrictError(nt_strict, nt_severity, nt_message, NumThreadsClause,
+                          NumThreads);
 
   return NumThreads;
 }
@@ -82,12 +103,14 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
 
 extern "C" {
 
-[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
-                                                   int32_t num_threads,
-                                                   void *fn, void **args,
-                                                   const int64_t nargs) {
+[[clang::always_inline]] void
+__kmpc_parallel_spmd(IdentTy *ident, int32_t num_threads, void *fn, void **args,
+                     const int64_t nargs, int32_t nt_strict = false,
+                     int32_t nt_severity = severity_fatal,
+                     const char *nt_message = nullptr) {
   uint32_t TId = mapping::getThreadIdInBlock();
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  uint32_t NumThreads =
+      determineNumberOfThreads(num_threads, nt_strict, nt_severity, nt_message);
   uint32_t PTeamSize =
       NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
   // Avoid the race between the read of the `icv::Level` above and the write
@@ -140,10 +163,11 @@ extern "C" {
   return;
 }
 
-[[clang::always_inline]] void
-__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
-                   int32_t num_threads, int proc_bind, void *fn,
-                   void *wrapper_fn, void **args, int64_t nargs) {
+[[clang::always_inline]] void __kmpc_parallel_51(
+    IdentTy *ident, int32_t, int32_t if_expr, int32_t num_threads,
+    int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs,
+    int32_t nt_strict = false, int32_t nt_severity = severity_fatal,
+    const char *nt_message = nullptr) {
   uint32_t TId = mapping::getThreadIdInBlock();
 
   // Assert the parallelism level is zero if disabled by the user.
@@ -156,6 +180,11 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // 3) nested parallel regions
   if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                    (config::mayUseNestedParallelism() && icv::Level))) {
+    // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have
+    // effect when parallel execution is disabled by a corresponding if clause
+    // attached to the parallel directive.
+    if (nt_strict && num_threads > 1)
+      numThreadsStrictError(nt_strict, nt_severity, nt_message, num_threads, 1);
     state::DateEnvironmentRAII DERAII(ident);
     ++icv::Level;
     invokeMicrotask(TId, 0, fn, args, nargs);
@@ -169,12 +198,14 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
     // This was moved to its own routine so it could be called directly
     // in certain situations to avoid resource consumption of unused
     // logic in parallel_51.
-    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
+    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs, nt_strict,
+                         nt_severity, nt_message);
 
     return;
   }
 
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  uint32_t NumThreads =
+      determineNumberOfThreads(num_threads, nt_strict, nt_severity, nt_message);
   uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
   uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
 
@@ -277,6 +308,16 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
     __kmpc_end_sharing_variables();
 }
 
+[[clang::always_inline]] void __kmpc_parallel_60(
+    IdentTy *ident, int32_t id, int32_t if_expr, int32_t num_threads,
+    int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs,
+    int32_t nt_strict = false, int32_t nt_severity = severity_fatal,
+    const char *nt_message = nullptr) {
+  return __kmpc_parallel_51(ident, id, if_expr, num_threads, proc_bind, fn,
+                            wrapper_fn, args, nargs, nt_strict, nt_severity,
+                            nt_message);
+}
+
 [[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
   // Work function and arguments for L1 parallel region.
   *WorkFn = state::ParallelRegionFn;
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index a875930..59a2cc3 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -698,7 +698,7 @@ template <typename Ty> class StaticLoopChunker {
   static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
                                         Ty NumBlocks, Ty BId, Ty NumThreads,
                                         Ty TId, Ty NumIters,
-                                        bool OneIterationPerThread) {
+                                        uint8_t OneIterationPerThread) {
     Ty KernelIteration = NumBlocks * NumThreads;
 
     // Start index in the normalized space.
@@ -729,7 +729,7 @@ template <typename Ty> class StaticLoopChunker {
                                         Ty BlockChunk, Ty NumBlocks, Ty BId,
                                         Ty ThreadChunk, Ty NumThreads, Ty TId,
                                         Ty NumIters,
-                                        bool OneIterationPerThread) {
+                                        uint8_t OneIterationPerThread) {
     Ty KernelIteration = NumBlocks * BlockChunk;
 
     // Start index in the chunked space.
@@ -767,8 +767,18 @@ template <typename Ty> class StaticLoopChunker {
 
 public:
   /// Worksharing `for`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] NumThreads Number of GPU threads
+  /// \param[in] ThreadChunk Size of thread chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
+  /// only one loop iteration or one thread chunk. This avoids an outer loop
+  /// over all loop iterations/chunks.
   static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+                  Ty NumIters, Ty NumThreads, Ty ThreadChunk,
+                  uint8_t OneIterationPerThread) {
     ASSERT(NumIters >= 0, "Bad iteration count");
     ASSERT(ThreadChunk >= 0, "Bad thread count");
 
@@ -790,12 +800,13 @@ public:
 
     // If we know we have more threads than iterations we can indicate that to
     // avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeThreadsOversubscription()) {
-      ASSERT(NumThreads >= NumIters, "Broken assumption");
       OneIterationPerThread = true;
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumThreads >= NumIters, "Broken assumption");
+
     if (ThreadChunk != 1)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
@@ -806,8 +817,17 @@ public:
   }
 
   /// Worksharing `distribute`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] BlockChunk Size of block chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, the outer loop over all
+  /// loop iterations/chunks is avoided. For `distribute` this requires at
+  /// least as many blocks as loop iterations (NumBlocks >= NumIters).
   static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                         Ty NumIters, Ty BlockChunk) {
+                         Ty NumIters, Ty BlockChunk,
+                         uint8_t OneIterationPerThread) {
     ASSERT(icv::Level == 0, "Bad distribute");
     ASSERT(icv::ActiveLevel == 0, "Bad distribute");
     ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -831,12 +851,13 @@ public:
 
     // If we know we have more blocks than iterations we can indicate that to
     // avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeTeamsOversubscription()) {
-      ASSERT(NumBlocks >= NumIters, "Broken assumption");
       OneIterationPerThread = true;
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumBlocks >= NumIters, "Broken assumption");
+
     if (BlockChunk != NumThreads)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
@@ -852,9 +873,20 @@ public:
   }
 
   /// Worksharing `distribute parallel for`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] NumThreads Number of GPU threads
+  /// \param[in] BlockChunk Size of block chunk
+  /// \param[in] ThreadChunk Size of thread chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
+  /// only one loop iteration or one thread chunk. This avoids an outer loop
+  /// over all loop iterations/chunks.
   static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                             void *Arg, Ty NumIters, Ty NumThreads,
-                            Ty BlockChunk, Ty ThreadChunk) {
+                            Ty BlockChunk, Ty ThreadChunk,
+                            uint8_t OneIterationPerThread) {
     ASSERT(icv::Level == 1, "Bad distribute");
     ASSERT(icv::ActiveLevel == 1, "Bad distribute");
     ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -882,13 +914,14 @@ public:
 
     // If we know we have more threads (across all blocks) than iterations we
     // can indicate that to avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeTeamsOversubscription() &
         config::getAssumeThreadsOversubscription()) {
       OneIterationPerThread = true;
-      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+
     if (BlockChunk != NumThreads || ThreadChunk != 1)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
@@ -907,24 +940,26 @@ public:
 
 #define OMP_LOOP_ENTRY(BW, TY)                                                 \
   [[gnu::flatten, clang::always_inline]] void                                  \
-      __kmpc_distribute_for_static_loop##BW(                                   \
-          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,       \
-          TY num_threads, TY block_chunk, TY thread_chunk) {                   \
+  __kmpc_distribute_for_static_loop##BW(                                       \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY block_chunk, TY thread_chunk,                         \
+      uint8_t one_iteration_per_thread) {                                      \
     ompx::StaticLoopChunker<TY>::DistributeFor(                                \
-        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk);      \
+        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk,       \
+        one_iteration_per_thread);                                             \
   }                                                                            \
   [[gnu::flatten, clang::always_inline]] void                                  \
-      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),  \
-                                        void *arg, TY num_iters,               \
-                                        TY block_chunk) {                      \
-    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters,           \
-                                            block_chunk);                      \
+  __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),      \
+                                    void *arg, TY num_iters, TY block_chunk,   \
+                                    uint8_t one_iteration_per_thread) {        \
+    ompx::StaticLoopChunker<TY>::Distribute(                                   \
+        loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread);       \
   }                                                                            \
   [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(      \
       IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
-      TY num_threads, TY thread_chunk) {                                       \
+      TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) {     \
     ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads,     \
-                                     thread_chunk);                            \
+                                     thread_chunk, one_iteration_per_thread);  \
   }
 
 extern "C" {
diff --git a/offload/cmake/OpenMPTesting.cmake b/offload/cmake/OpenMPTesting.cmake
index 8e955ff3..ef8cf34 100644
--- a/offload/cmake/OpenMPTesting.cmake
+++ b/offload/cmake/OpenMPTesting.cmake
@@ -57,7 +57,7 @@ if (${OPENMP_STANDALONE_BUILD})
   if (MSVC OR XCODE)
     set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --no-progress-bar")
   endif()
-  if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+  if ("${CMAKE_SYSTEM_NAME}" MATCHES "AIX")
     set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --time-tests --timeout=1800")
   endif()
   set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.")
diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h
index b9f5c16..93c1e569 100644
--- a/offload/include/OpenMP/Mapping.h
+++ b/offload/include/OpenMP/Mapping.h
@@ -417,12 +417,42 @@ struct MapperComponentsTy {
 typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t,
                                 void *);
 
+/// Structure to store information about a single ATTACH map entry.
+struct AttachMapInfo {
+  void *PointerBase;
+  void *PointeeBegin;
+  int64_t PointerSize;
+  int64_t MapType;
+  map_var_info_t Pointername;
+
+  AttachMapInfo(void *PointerBase, void *PointeeBegin, int64_t Size,
+                int64_t Type, map_var_info_t Name)
+      : PointerBase(PointerBase), PointeeBegin(PointeeBegin), PointerSize(Size),
+        MapType(Type), Pointername(Name) {}
+};
+
+/// Structure to track ATTACH entries and new allocations across recursive calls
+/// (for handling mappers) to targetDataBegin for a given construct.
+struct AttachInfoTy {
+  /// ATTACH map entries for deferred processing.
+  llvm::SmallVector<AttachMapInfo> AttachEntries;
+
+  /// Key: host pointer, Value: allocation size.
+  llvm::DenseMap<void *, int64_t> NewAllocations;
+
+  AttachInfoTy() = default;
+
+  // Delete copy constructor and copy assignment operator to prevent copying
+  AttachInfoTy(const AttachInfoTy &) = delete;
+  AttachInfoTy &operator=(const AttachInfoTy &) = delete;
+};
+
 // Function pointer type for targetData* functions (targetDataBegin,
 // targetDataEnd and targetDataUpdate).
 typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
                                    void **, int64_t *, int64_t *,
                                    map_var_info_t *, void **, AsyncInfoTy &,
-                                   bool);
+                                   AttachInfoTy *, bool);
 
 void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
                                bool toStdOut = false);
@@ -431,20 +461,26 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
+                    AttachInfoTy *AttachInfo = nullptr,
                     bool FromMapper = false);
 
 int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
                   void **ArgMappers, AsyncInfoTy &AsyncInfo,
-                  bool FromMapper = false);
+                  AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false);
 
 int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                      void **ArgsBase, void **Args, int64_t *ArgSizes,
                      int64_t *ArgTypes, map_var_info_t *ArgNames,
                      void **ArgMappers, AsyncInfoTy &AsyncInfo,
+                     AttachInfoTy *AttachInfo = nullptr,
                      bool FromMapper = false);
 
+// Process deferred ATTACH map entries collected during targetDataBegin.
+int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
+                         AsyncInfoTy &AsyncInfo);
+
 struct MappingInfoTy {
   MappingInfoTy(DeviceTy &Device) : Device(Device) {}
 
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10ab..1e85bb1 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -98,6 +98,10 @@ struct DeviceTy {
   int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                        int64_t Size, AsyncInfoTy &AsyncInfo);
 
+  // Insert a data fence between previous data operations and the following
+  // operations if necessary for the device.
+  int32_t dataFence(AsyncInfoTy &AsyncInfo);
+
   /// Notify the plugin about a new mapping starting at the host address
   /// \p HstPtr and \p Size bytes.
   int32_t notifyDataMapped(void *HstPtr, int64_t Size);
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6971780..8fd722b 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -33,9 +33,6 @@
 
 #define OFFLOAD_DEVICE_DEFAULT -1
 
-// Don't format out enums and structs.
-// clang-format off
-
 /// return flags of __tgt_target_XXX public APIs
 enum __tgt_target_return_t : int {
   /// successful offload executed on a target device
@@ -51,39 +48,42 @@ enum __tgt_target_return_t : int {
 /// Data attributes for each data reference used in an OpenMP target region.
 enum tgt_map_type {
   // No flags
-  OMP_TGT_MAPTYPE_NONE            = 0x000,
+  OMP_TGT_MAPTYPE_NONE = 0x000,
   // copy data from host to device
-  OMP_TGT_MAPTYPE_TO              = 0x001,
+  OMP_TGT_MAPTYPE_TO = 0x001,
   // copy data from device to host
-  OMP_TGT_MAPTYPE_FROM            = 0x002,
+  OMP_TGT_MAPTYPE_FROM = 0x002,
   // copy regardless of the reference count
-  OMP_TGT_MAPTYPE_ALWAYS          = 0x004,
+  OMP_TGT_MAPTYPE_ALWAYS = 0x004,
   // force unmapping of data
-  OMP_TGT_MAPTYPE_DELETE          = 0x008,
+  OMP_TGT_MAPTYPE_DELETE = 0x008,
   // map the pointer as well as the pointee
-  OMP_TGT_MAPTYPE_PTR_AND_OBJ     = 0x010,
+  OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010,
   // pass device base address to kernel
-  OMP_TGT_MAPTYPE_TARGET_PARAM    = 0x020,
+  OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020,
   // return base device address of mapped data
-  OMP_TGT_MAPTYPE_RETURN_PARAM    = 0x040,
+  OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040,
   // private variable - not mapped
-  OMP_TGT_MAPTYPE_PRIVATE         = 0x080,
+  OMP_TGT_MAPTYPE_PRIVATE = 0x080,
   // copy by value - not mapped
-  OMP_TGT_MAPTYPE_LITERAL         = 0x100,
+  OMP_TGT_MAPTYPE_LITERAL = 0x100,
   // mapping is implicit
-  OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
+  OMP_TGT_MAPTYPE_IMPLICIT = 0x200,
   // copy data to device
-  OMP_TGT_MAPTYPE_CLOSE           = 0x400,
+  OMP_TGT_MAPTYPE_CLOSE = 0x400,
   // runtime error if not already allocated
-  OMP_TGT_MAPTYPE_PRESENT         = 0x1000,
+  OMP_TGT_MAPTYPE_PRESENT = 0x1000,
   // use a separate reference counter so that the data cannot be unmapped within
   // the structured region
   // This is an OpenMP extension for the sake of OpenACC support.
-  OMP_TGT_MAPTYPE_OMPX_HOLD       = 0x2000,
+  OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000,
+  // Attach pointer and pointee, after processing all other maps.
+  // Applicable to map-entering directives. Does not change ref-count.
+  OMP_TGT_MAPTYPE_ATTACH = 0x4000,
   // descriptor for non-contiguous target-update
-  OMP_TGT_MAPTYPE_NON_CONTIG      = 0x100000000000,
+  OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000,
   // member of struct, member given by [16 MSBs] - 1
-  OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
+  OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000
 };
 
 /// Flags for offload entries.
@@ -105,9 +105,9 @@ enum TargetAllocTy : int32_t {
   TARGET_ALLOC_DEVICE_NON_BLOCKING,
 };
 
-inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr,   nullptr,
-	     nullptr, nullptr, nullptr,   nullptr,
-	     0,      {0,0,0},       {1, 0, 0}, {1, 0, 0}, 0};
+inline KernelArgsTy CTorDTorKernelArgs = {
+    1,       0, nullptr,   nullptr,   nullptr,   nullptr, nullptr,
+    nullptr, 0, {0, 0, 0}, {1, 0, 0}, {1, 0, 0}, 0};
 
 struct DeviceTy;
 
diff --git a/offload/liboffload/API/APIDefs.td b/offload/liboffload/API/APIDefs.td
index 640932d..ea3896f 100644
--- a/offload/liboffload/API/APIDefs.td
+++ b/offload/liboffload/API/APIDefs.td
@@ -31,6 +31,13 @@ class IsHandleType<string Type> {
                 !ne(!find(Type, "_handle_t", !sub(!size(Type), 9)), -1));
 }
 
+// Does the type end with '_cb_t'?
+class IsCallbackType<string Type> {
+  // size("_cb_t") == 5
+  bit ret = !if(!lt(!size(Type), 5), 0,
+                !ne(!find(Type, "_cb_t", !sub(!size(Type), 5)), -1));
+}
+
 // Does the type end with '*'?
 class IsPointerType<string Type> {
   bit ret = !ne(!find(Type, "*", !sub(!size(Type), 1)), -1);
@@ -58,6 +65,7 @@ class Param<string Type, string Name, string Desc, bits<3> Flags = 0> {
   TypeInfo type_info = TypeInfo<"", "">;
   bit IsHandle = IsHandleType<type>.ret;
   bit IsPointer = IsPointerType<type>.ret;
+  bit IsCallback = IsCallbackType<type>.ret;
 }
 
 // A parameter whose range is described by other parameters in the function.
@@ -81,7 +89,7 @@ class ShouldCheckHandle<Param P> {
 }
 
 class ShouldCheckPointer<Param P> {
-  bit ret = !and(P.IsPointer, !eq(!and(PARAM_OPTIONAL, P.flags), 0));
+  bit ret = !and(!or(P.IsPointer, P.IsCallback), !eq(!and(PARAM_OPTIONAL, P.flags), 0));
 }
 
 // For a list of returns that contains a specific return code, find and append
@@ -137,7 +145,6 @@ defvar DefaultReturns = [Return<PREFIX#"_RESULT_SUCCESS">,
                          Return<PREFIX#"_ERRC_DEVICE_LOST">];
 
 class APIObject {
-  string name;
   string desc;
 }
 
@@ -168,6 +175,10 @@ class Enum : APIObject {
   // all Etor values must be TaggedEtor records
   bit is_typed = 0;
 
+  // Whether the enumerators name the bits of a bit field, in which case
+  // consecutive values are bit-shifted rather than incremented.
+  bit is_bit_field = 0;
+
   list<Etor> etors = [];
 }
 
diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 6eaf604..ac27d85 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -10,77 +10,64 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Macro {
-  let name = "OL_VERSION_MAJOR";
+def OL_VERSION_MAJOR : Macro {
   let desc = "Major version of the Offload API";
   let value = "0";
 }
 
-def : Macro {
-  let name = "OL_VERSION_MINOR";
+def OL_VERSION_MINOR : Macro {
   let desc = "Minor version of the Offload API";
   let value = "0";
 }
 
-def : Macro {
-  let name = "OL_VERSION_PATCH";
+def OL_VERSION_PATCH : Macro {
   let desc = "Patch version of the Offload API";
   let value = "1";
 }
 
-def : Macro {
-  let name = "OL_APICALL";
+def OL_APICALL : Macro {
   let desc = "Calling convention for all API functions";
   let condition = "defined(_WIN32)";
   let value = "__cdecl";
   let alt_value = "";
 }
 
-def : Macro {
-  let name = "OL_APIEXPORT";
+def OL_APIEXPORT : Macro {
   let desc = "Microsoft-specific dllexport storage-class attribute";
   let condition = "defined(_WIN32)";
   let value = "__declspec(dllexport)";
   let alt_value = "";
 }
 
-def : Handle {
-  let name = "ol_platform_handle_t";
+def ol_platform_handle_t : Handle {
   let desc = "Handle of a platform instance";
 }
 
-def : Handle {
-  let name = "ol_device_handle_t";
+def ol_device_handle_t : Handle {
   let desc = "Handle of platform's device object";
 }
 
-def : Handle {
-  let name = "ol_context_handle_t";
+def ol_context_handle_t : Handle {
   let desc = "Handle of context object";
 }
 
-def : Handle {
-  let name = "ol_queue_handle_t";
+def ol_queue_handle_t : Handle {
   let desc = "Handle of queue object";
 }
 
-def : Handle {
-  let name = "ol_event_handle_t";
+def ol_event_handle_t : Handle {
   let desc = "Handle of event object";
 }
 
-def : Handle {
-  let name = "ol_program_handle_t";
+def ol_program_handle_t : Handle {
   let desc = "Handle of program object";
 }
 
-def : Handle {
-  let name = "ol_symbol_handle_t";
+def ol_symbol_handle_t : Handle {
   let desc = "Handle of an object in a device's memory for a specific program";
 }
 
-def ErrorCode : Enum {
-  let name = "ol_errc_t";
+def ol_errc_t : Enum {
   let desc = "Defines Return/Error codes";
   let etors =[
     Etor<"SUCCESS", "success">,
@@ -115,8 +102,7 @@ def ErrorCode : Enum {
   ];
 }
 
-def : Struct {
-  let name = "ol_error_struct_t";
+def ol_error_struct_t : Struct {
   let desc = "Details of the error condition returned by an API call";
   let members = [
     StructMember<"ol_errc_t", "Code", "The error code">,
@@ -124,20 +110,17 @@ def : Struct {
   ];
 }
 
-def : Typedef {
-  let name = "ol_result_t";
+def ol_result_t : Typedef {
   let desc = "Result type returned by all entry points.";
-  let value = "const ol_error_struct_t*";
+  let value = "const struct ol_error_struct_t*";
 }
 
-def : Macro {
-  let name = "OL_SUCCESS";
+def OL_SUCCESS : Macro {
   let desc = "Success condition";
   let value = "NULL";
 }
 
-def : Struct {
-  let name = "ol_code_location_t";
+def ol_code_location_t : Struct {
   let desc = "Code location information that can optionally be associated with an API call";
   let members = [
     StructMember<"const char*", "FunctionName", "Function name">,
@@ -147,8 +130,7 @@ def : Struct {
   ];
 }
 
-def : Struct {
-  let name = "ol_dimensions_t";
+def ol_dimensions_t : Struct {
   let desc = "A three element vector";
   let members = [
     StructMember<"uint32_t", "x", "X">,
@@ -157,8 +139,7 @@ def : Struct {
   ];
 }
 
-def : Function {
-  let name = "olInit";
+def olInit : Function {
   let desc = "Perform initialization of the Offload library and plugins";
   let details = [
     "This must be the first API call made by a user of the Offload library",
@@ -168,8 +149,7 @@ def : Function {
   let returns = [];
 }
 
-def : Function {
-  let name = "olShutDown";
+def olShutDown : Function {
   let desc = "Release the resources in use by Offload";
   let details = [
     "This decrements an internal reference count. When this reaches 0, all resources will be released",
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index 857c596..5b54c79 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Enum {
-  let name = "ol_device_type_t";
+def ol_device_type_t : Enum {
   let desc = "Supported device types.";
   let etors =[
     Etor<"DEFAULT", "The default device type as preferred by the runtime">,
@@ -22,23 +21,54 @@ def : Enum {
   ];
 }
 
-def DeviceInfo : Enum {
-  let name = "ol_device_info_t";
+def ol_device_info_t : Enum {
   let desc = "Supported device info.";
   let is_typed = 1;
-  let etors =[
+  list<TaggedEtor> basic_etors =[
     TaggedEtor<"TYPE", "ol_device_type_t", "type of the device">,
     TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">,
     TaggedEtor<"NAME", "char[]", "Device name">,
+    TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">,
     TaggedEtor<"VENDOR", "char[]", "Device vendor">,
     TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">,
     TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">,
     TaggedEtor<"MAX_WORK_GROUP_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work group size in each dimension">,
+    TaggedEtor<"MAX_WORK_SIZE", "uint32_t", "Maximum total work items">,
+    TaggedEtor<"MAX_WORK_SIZE_PER_DIMENSION", "ol_dimensions_t", "Maximum work items in each dimension">,
+    TaggedEtor<"VENDOR_ID", "uint32_t", "A unique vendor device identifier assigned by PCI-SIG">,
+    TaggedEtor<"NUM_COMPUTE_UNITS", "uint32_t", "The number of parallel compute units available to the device">,
+    TaggedEtor<"MAX_CLOCK_FREQUENCY", "uint32_t", "The maximum configured clock frequency of this device in MHz">,
+    TaggedEtor<"MEMORY_CLOCK_RATE", "uint32_t", "Memory clock frequency in MHz">,
+    TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">,
+    TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">,
+    TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">,
   ];
+  list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">);
+  list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>);
+  let etors = !listconcat(basic_etors, fp_configs, native_vec_widths);
+}
+
+def ol_device_fp_capability_flag_t : Enum {
+  let desc = "Device floating-point capability flags";
+  let is_bit_field = 1;
+  let etors =[
+    Etor<"CORRECTLY_ROUNDED_DIVIDE_SQRT", "Support correctly rounded divide and sqrt">,
+    Etor<"ROUND_TO_NEAREST", "Support round to nearest">,
+    Etor<"ROUND_TO_ZERO", "Support round to zero">,
+    Etor<"ROUND_TO_INF", "Support round to infinity">,
+    Etor<"INF_NAN", "Support INF and NaN values">,
+    Etor<"DENORM", "Support denormalized numbers">,
+    Etor<"FMA", "Support fused multiply-add">,
+    Etor<"SOFT_FLOAT", "Basic floating point operations implemented in software">,
+  ];
+}
+
+def ol_device_fp_capability_flags_t : Typedef {
+  let desc = "Device floating-point capability flags";
+  let value = "uint32_t";
 }
 
-def : FptrTypedef {
-  let name = "ol_device_iterate_cb_t";
+def ol_device_iterate_cb_t : FptrTypedef {
   let desc = "User-provided function to be used with `olIterateDevices`";
   let params = [
     Param<"ol_device_handle_t", "Device", "the device handle of the current iteration", PARAM_IN>,
@@ -47,8 +77,7 @@ def : FptrTypedef {
   let return = "bool";
 }
 
-def : Function {
-  let name = "olIterateDevices";
+def olIterateDevices : Function {
   let desc = "Iterates over all available devices, calling the callback for each device.";
   let details = [
     "If the user-provided callback returns `false`, the iteration is stopped."
@@ -62,8 +91,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetDeviceInfo";
+def olGetDeviceInfo : Function {
   let desc = "Queries the given property of the device.";
   let details = [];
   let params = [
@@ -86,8 +114,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetDeviceInfoSize";
+def olGetDeviceInfoSize : Function {
   let desc = "Returns the storage size of the given device query.";
   let details = [];
   let params = [
diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
index 9d217ae..075bf5b 100644
--- a/offload/liboffload/API/Event.td
+++ b/offload/liboffload/API/Event.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Function {
-    let name = "olCreateEvent";
+def olCreateEvent : Function {
     let desc = "Enqueue an event to `Queue` and return it.";
     let details = [
       "This event can be used with `olSyncEvent` and `olWaitEvents` and will be complete once all enqueued work prior to the `olCreateEvent` call is complete.",
@@ -23,8 +22,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olDestroyEvent";
+def olDestroyEvent : Function {
     let desc = "Destroy the event and free all underlying resources.";
     let details = [];
     let params = [
@@ -33,8 +31,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olSyncEvent";
+def olSyncEvent : Function {
     let desc = "Block the calling thread until the event is complete.";
     let details = [];
     let params = [
@@ -43,17 +40,16 @@ def : Function {
     let returns = [];
 }
 
-def : Enum {
-  let name = "ol_event_info_t";
+def ol_event_info_t : Enum {
   let desc = "Supported event info.";
   let is_typed = 1;
   let etors = [
-    TaggedEtor<"QUEUE", "ol_queue_handle_t", "The handle of the queue associated with the device.">
+    TaggedEtor<"QUEUE", "ol_queue_handle_t", "The handle of the queue associated with the event.">,
+    TaggedEtor<"IS_COMPLETE", "bool", "True if and only if the event is complete.">,
   ];
 }
 
-def : Function {
-  let name = "olGetEventInfo";
+def olGetEventInfo : Function {
   let desc = "Queries the given property of the event.";
   let details = [
     "`olGetEventInfoSize` can be used to query the storage size "
@@ -77,8 +73,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetEventInfoSize";
+def olGetEventInfoSize : Function {
   let desc = "Returns the storage size of the given event query.";
   let details = [];
   let params = [
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 502fb36..2f5692a 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains Offload API definitions related to launching kernels
+// This file contains Offload API definitions related to kernels
 //
 //===----------------------------------------------------------------------===//
 
-def : Struct {
-    let name = "ol_kernel_launch_size_args_t";
+def ol_kernel_launch_size_args_t : Struct {
     let desc = "Size-related arguments for a kernel launch.";
     let members = [
         StructMember<"size_t", "Dimensions", "Number of work dimensions">,
@@ -21,8 +20,7 @@ def : Struct {
     ];
 }
 
-def : Function {
-    let name = "olLaunchKernel";
+def olLaunchKernel : Function {
     let desc = "Enqueue a kernel launch with the specified size and parameters.";
     let details = [
         "If a queue is not specified, kernel execution happens synchronously",
@@ -42,3 +40,20 @@ def : Function {
         Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
     ];
 }
+
+def olCalculateOptimalOccupancy : Function {
+    let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy.";
+    let details = [
+        "For most devices, this will be the largest workgroup size that will result in all work items fitting on the device at once.",
+    ];
+    let params = [
+        Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+        Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>,
+        Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT>
+    ];
+    let returns = [
+        Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
+        Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
+    ];
+}
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index 5f71585..cc98b67 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Enum {
-  let name = "ol_alloc_type_t";
+def ol_alloc_type_t : Enum {
   let desc = "Represents the type of allocation made with olMemAlloc.";
   let etors = [
     Etor<"HOST", "Host allocation">,
@@ -20,8 +19,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-  let name = "olMemAlloc";
+def olMemAlloc : Function {
   let desc = "Creates a memory allocation on the specified device.";
   let params = [
     Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
@@ -36,8 +34,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olMemFree";
+def olMemFree : Function {
   let desc = "Frees a memory allocation previously made by olMemAlloc.";
   let params = [
     Param<"void*", "Address", "address of the allocation to free", PARAM_IN>,
@@ -45,8 +42,7 @@ def : Function {
   let returns = [];
 }
 
-def : Function {
-    let name = "olMemcpy";
+def olMemcpy : Function {
     let desc = "Enqueue a memcpy operation.";
     let details = [
         "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.",
@@ -63,3 +59,22 @@ def : Function {
     ];
     let returns = [];
 }
+
+def olMemFill : Function {
+  let desc = "Fill memory with copies of the given pattern";
+  let details = [
+    "Filling with patterns larger than 4 bytes may be less performant",
+    "The destination pointer and queue must be associated with the same device",
+    "The fill size must be a multiple of the pattern size",
+  ];
+  let params = [
+      Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
+      Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
+      Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
+      Param<"const void*", "PatternPtr", "pointer to the pattern to fill with", PARAM_IN>,
+      Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
+  ];
+  let returns = [
+    Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]>
+  ];
+}
diff --git a/offload/liboffload/API/Platform.td b/offload/liboffload/API/Platform.td
index 97c2cc2..906f899 100644
--- a/offload/liboffload/API/Platform.td
+++ b/offload/liboffload/API/Platform.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Enum {
-  let name = "ol_platform_info_t";
+def ol_platform_info_t : Enum {
   let desc = "Supported platform info.";
   let is_typed = 1;
   let etors = [
@@ -22,8 +21,7 @@ def : Enum {
   ];
 }
 
-def : Enum {
-  let name = "ol_platform_backend_t";
+def ol_platform_backend_t : Enum {
   let desc = "Identifies the native backend of the platform.";
   let etors =[
     Etor<"UNKNOWN", "The backend is not recognized">,
@@ -33,8 +31,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-  let name = "olGetPlatformInfo";
+def olGetPlatformInfo : Function {
   let desc = "Queries the given property of the platform.";
   let details = [
     "`olGetPlatformInfoSize` can be used to query the storage size "
@@ -61,8 +58,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetPlatformInfoSize";
+def olGetPlatformInfoSize : Function {
   let desc = "Returns the storage size of the given platform query.";
   let details = [];
   let params = [
diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td
index 0476fa1..1f48f65 100644
--- a/offload/liboffload/API/Program.td
+++ b/offload/liboffload/API/Program.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Function {
-    let name = "olCreateProgram";
+def olCreateProgram : Function {
     let desc = "Create a program for the device from the binary image pointed to by `ProgData`.";
     let details = [
         "The provided `ProgData` will be copied and need not outlive the returned handle",
@@ -25,8 +24,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olDestroyProgram";
+def olDestroyProgram : Function {
     let desc = "Destroy the program and free all underlying resources.";
     let details = [];
     let params = [
diff --git a/offload/liboffload/API/Queue.td b/offload/liboffload/API/Queue.td
index 1d9f6f2..ededa9c 100644
--- a/offload/liboffload/API/Queue.td
+++ b/offload/liboffload/API/Queue.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Function {
-    let name = "olCreateQueue";
+def olCreateQueue : Function {
     let desc = "Create a queue for the given device.";
     let details = [];
     let params = [
@@ -21,8 +20,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olDestroyQueue";
+def olDestroyQueue : Function {
     let desc = "Destroy the queue and free all underlying resources.";
     let details = [
       "Any work previously enqueued to the queue is still performed and any events generated for this queue remain valid."
@@ -33,8 +31,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olSyncQueue";
+def olSyncQueue : Function {
     let desc = "Block the calling thread until the enqueued work on a queue is complete.";
     let details = [];
     let params = [
@@ -43,8 +40,7 @@ def : Function {
     let returns = [];
 }
 
-def : Function {
-    let name = "olWaitEvents";
+def olWaitEvents : Function {
     let desc = "Make any future work submitted to this queue wait until the provided events are complete.";
     let details = [
       "All events in `Events` must complete before the queue is unblocked.",
@@ -60,8 +56,7 @@ def : Function {
     ];
 }
 
-def : Enum {
-  let name = "ol_queue_info_t";
+def ol_queue_info_t : Enum {
   let desc = "Supported queue info.";
   let is_typed = 1;
   let etors = [
@@ -70,8 +65,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-  let name = "olGetQueueInfo";
+def olGetQueueInfo : Function {
   let desc = "Queries the given property of the queue.";
   let details = [
     "`olGetQueueInfoSize` can be used to query the storage size "
@@ -95,8 +89,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetQueueInfoSize";
+def olGetQueueInfoSize : Function {
   let desc = "Returns the storage size of the given queue query.";
   let details = [];
   let params = [
@@ -108,3 +101,27 @@ def : Function {
     Return<"OL_ERRC_INVALID_QUEUE">
   ];
 }
+
+def ol_host_function_cb_t : FptrTypedef {
+  let desc = "Host function for use by `olLaunchHostFunction`.";
+  let params = [
+    Param<"void *", "UserData", "user specified data passed into `olLaunchHostFunction`.", PARAM_IN>,
+  ];
+  let return = "void";
+}
+
+def olLaunchHostFunction : Function {
+  let desc = "Enqueue a callback function on the host.";
+  let details = [
+    "The provided function will be called from the same process as the one that called `olLaunchHostFunction`.",
+    "The callback will not run until all previous work submitted to the queue has completed.",
+    "The callback must return before any work submitted to the queue after it is started.",
+    "The callback must not call any liboffload API functions or any backend specific functions (such as Cuda or HSA library functions).",
+  ];
+  let params = [
+    Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+    Param<"ol_host_function_cb_t", "Callback", "the callback function to call on the host", PARAM_IN>,
+    Param<"void *", "UserData", "a pointer that will be passed verbatim to the callback function", PARAM_IN_OPTIONAL>,
+  ];
+  let returns = [];
+}
diff --git a/offload/liboffload/API/Symbol.td b/offload/liboffload/API/Symbol.td
index 2e94d70..c57a2e1 100644
--- a/offload/liboffload/API/Symbol.td
+++ b/offload/liboffload/API/Symbol.td
@@ -10,8 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-def : Enum {
-  let name = "ol_symbol_kind_t";
+def ol_symbol_kind_t : Enum {
   let desc = "The kind of a symbol";
   let etors =[
     Etor<"KERNEL", "a kernel object">,
@@ -19,8 +18,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-    let name = "olGetSymbol";
+def olGetSymbol : Function {
     let desc = "Get a symbol (kernel or global variable) identified by `Name` in the given program.";
     let details = [
         "Symbol handles are owned by the program and do not need to be manually destroyed."
@@ -34,8 +32,7 @@ def : Function {
     let returns = [];
 }
 
-def : Enum {
-  let name = "ol_symbol_info_t";
+def ol_symbol_info_t : Enum {
   let desc = "Supported symbol info.";
   let is_typed = 1;
   let etors = [
@@ -45,8 +42,7 @@ def : Enum {
   ];
 }
 
-def : Function {
-  let name = "olGetSymbolInfo";
+def olGetSymbolInfo : Function {
   let desc = "Queries the given property of the symbol.";
   let details = [
     "`olGetSymbolInfoSize` can be used to query the storage size "
@@ -73,8 +69,7 @@ def : Function {
   ];
 }
 
-def : Function {
-  let name = "olGetSymbolInfoSize";
+def olGetSymbolInfoSize : Function {
   let desc = "Returns the storage size of the given symbol query.";
   let details = [];
   let params = [
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index f5365ca..7e8e297 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -47,10 +47,59 @@ struct ol_device_impl_t {
                    ol_platform_handle_t Platform, InfoTreeNode &&DevInfo)
       : DeviceNum(DeviceNum), Device(Device), Platform(Platform),
         Info(std::forward<InfoTreeNode>(DevInfo)) {}
+
+  ~ol_device_impl_t() {
+    assert(!OutstandingQueues.size() &&
+           "Device object dropped with outstanding queues");
+  }
+
   int DeviceNum;
   GenericDeviceTy *Device;
   ol_platform_handle_t Platform;
   InfoTreeNode Info;
+
+  llvm::SmallVector<__tgt_async_info *> OutstandingQueues;
+  std::mutex OutstandingQueuesMutex;
+
+  /// If the device has any outstanding queues that are now complete, remove it
+  /// from the list and return it.
+  ///
+  /// Queues may be added to the outstanding queue list by olDestroyQueue if
+  /// they are destroyed but not completed.
+  __tgt_async_info *getOutstandingQueue() {
+    // Not locking the `size()` access is fine here - In the worst case we
+    // either miss a queue that exists or loop through an empty array after
+    // taking the lock. Both are sub-optimal but not that bad.
+    if (OutstandingQueues.size()) {
+      std::lock_guard<std::mutex> Lock(OutstandingQueuesMutex);
+
+      // As queues are pulled and popped from this list, longer running queues
+      // naturally bubble to the start of the array. Hence looping backwards.
+      for (auto Q = OutstandingQueues.rbegin(); Q != OutstandingQueues.rend();
+           Q++) {
+        if (!Device->hasPendingWork(*Q)) {
+          auto OutstandingQueue = *Q;
+          *Q = OutstandingQueues.back();
+          OutstandingQueues.pop_back();
+          return OutstandingQueue;
+        }
+      }
+    }
+    return nullptr;
+  }
+
+  /// Complete all pending work for this device and perform any needed cleanup.
+  ///
+  /// After calling this function, no liboffload functions should be called with
+  /// this device handle.
+  llvm::Error destroy() {
+    llvm::Error Result = Plugin::success();
+    for (auto Q : OutstandingQueues)
+      if (auto Err = Device->synchronize(Q, /*Release=*/true))
+        Result = llvm::joinErrors(std::move(Result), std::move(Err));
+    OutstandingQueues.clear();
+    return Result;
+  }
 };
 
 struct ol_platform_impl_t {
@@ -58,23 +107,51 @@ struct ol_platform_impl_t {
                      ol_platform_backend_t BackendType)
       : Plugin(std::move(Plugin)), BackendType(BackendType) {}
   std::unique_ptr<GenericPluginTy> Plugin;
-  std::vector<ol_device_impl_t> Devices;
+  llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices;
   ol_platform_backend_t BackendType;
+
+  /// Complete all pending work for this platform and perform any needed
+  /// cleanup.
+  ///
+  /// After calling this function, no liboffload functions should be called with
+  /// this platform handle.
+  llvm::Error destroy() {
+    llvm::Error Result = Plugin::success();
+    for (auto &D : Devices)
+      if (auto Err = D->destroy())
+        Result = llvm::joinErrors(std::move(Result), std::move(Err));
+
+    if (auto Res = Plugin->deinit())
+      Result = llvm::joinErrors(std::move(Result), std::move(Res));
+
+    return Result;
+  }
 };
 
 struct ol_queue_impl_t {
   ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device)
-      : AsyncInfo(AsyncInfo), Device(Device) {}
+      : AsyncInfo(AsyncInfo), Device(Device), Id(IdCounter++) {}
   __tgt_async_info *AsyncInfo;
   ol_device_handle_t Device;
+  // A unique identifier for the queue
+  size_t Id;
+  static std::atomic<size_t> IdCounter;
 };
+std::atomic<size_t> ol_queue_impl_t::IdCounter(0);
 
 struct ol_event_impl_t {
-  ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue)
-      : EventInfo(EventInfo), Queue(Queue) {}
+  ol_event_impl_t(void *EventInfo, ol_device_handle_t Device,
+                  ol_queue_handle_t Queue)
+      : EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) {
+  }
   // EventInfo may be null, in which case the event should be considered always
   // complete
   void *EventInfo;
+  ol_device_handle_t Device;
+  size_t QueueId;
+  // Events may outlive the queue - don't assume this is always valid.
+  // It is provided only to implement OL_EVENT_INFO_QUEUE. Use QueueId to check
+  // for queue equality instead.
   ol_queue_handle_t Queue;
 };
 
@@ -125,12 +202,13 @@ struct OffloadContext {
   bool TracingEnabled = false;
   bool ValidationEnabled = true;
   DenseMap<void *, AllocInfo> AllocInfoMap{};
+  std::mutex AllocInfoMapMutex{};
   SmallVector<ol_platform_impl_t, 4> Platforms{};
   size_t RefCount;
 
   ol_device_handle_t HostDevice() {
     // The host platform is always inserted last
-    return &Platforms.back().Devices[0];
+    return Platforms.back().Devices[0].get();
   }
 
   static OffloadContext &get() {
@@ -189,8 +267,8 @@ Error initPlugins(OffloadContext &Context) {
         auto Info = Device->obtainInfoImpl();
         if (auto Err = Info.takeError())
           return Err;
-        Platform.Devices.emplace_back(DevNum, Device, &Platform,
-                                      std::move(*Info));
+        Platform.Devices.emplace_back(std::make_unique<ol_device_impl_t>(
+            DevNum, Device, &Platform, std::move(*Info)));
       }
     }
   }
@@ -198,7 +276,8 @@ Error initPlugins(OffloadContext &Context) {
   // Add the special host device
   auto &HostPlatform = Context.Platforms.emplace_back(
       ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST});
-  HostPlatform.Devices.emplace_back(-1, nullptr, nullptr, InfoTreeNode{});
+  HostPlatform.Devices.emplace_back(
+      std::make_unique<ol_device_impl_t>(-1, nullptr, nullptr, InfoTreeNode{}));
   Context.HostDevice()->Platform = &HostPlatform;
 
   Context.TracingEnabled = std::getenv("OFFLOAD_TRACE");
@@ -239,7 +318,7 @@ Error olShutDown_impl() {
     if (!P.Plugin || !P.Plugin->is_initialized())
       continue;
 
-    if (auto Res = P.Plugin->deinit())
+    if (auto Res = P.destroy())
       Result = llvm::joinErrors(std::move(Result), std::move(Res));
   }
 
@@ -302,10 +381,57 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
   };
 
   // These are not implemented by the plugin interface
-  if (PropName == OL_DEVICE_INFO_PLATFORM)
+  switch (PropName) {
+  case OL_DEVICE_INFO_PLATFORM:
     return Info.write<void *>(Device->Platform);
-  if (PropName == OL_DEVICE_INFO_TYPE)
+
+  case OL_DEVICE_INFO_TYPE:
     return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_GPU);
+
+  case OL_DEVICE_INFO_SINGLE_FP_CONFIG:
+  case OL_DEVICE_INFO_DOUBLE_FP_CONFIG: {
+    ol_device_fp_capability_flags_t flags{0};
+    flags |= OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT |
+             OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
+             OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
+             OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
+             OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
+             OL_DEVICE_FP_CAPABILITY_FLAG_DENORM |
+             OL_DEVICE_FP_CAPABILITY_FLAG_FMA;
+    return Info.write(flags);
+  }
+
+  case OL_DEVICE_INFO_HALF_FP_CONFIG:
+    return Info.write<ol_device_fp_capability_flags_t>(0);
+
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE:
+    return Info.write<uint32_t>(1);
+
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF:
+    return Info.write<uint32_t>(0);
+
+  // None of the existing plugins specify a limit on a single allocation,
+  // so return the global memory size instead
+  case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
+    [[fallthrough]];
+  // AMD doesn't provide the global memory size (trivially) with the device info
+  // struct, so use the plugin interface
+  case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: {
+    uint64_t Mem;
+    if (auto Err = Device->Device->getDeviceMemorySize(Mem))
+      return Err;
+    return Info.write<uint64_t>(Mem);
+  } break;
+
+  default:
+    break;
+  }
+
   if (PropName >= OL_DEVICE_INFO_LAST)
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,
                               "getDeviceInfo enum '%i' is invalid", PropName);
@@ -316,8 +442,10 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
                      "plugin did not provide a response for this information");
   auto Entry = *EntryOpt;
 
+  // Retrieve properties from the plugin interface
   switch (PropName) {
   case OL_DEVICE_INFO_NAME:
+  case OL_DEVICE_INFO_PRODUCT_NAME:
   case OL_DEVICE_INFO_VENDOR:
   case OL_DEVICE_INFO_DRIVER_VERSION: {
     // String values
@@ -327,7 +455,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
     return Info.writeString(std::get<std::string>(Entry->Value).c_str());
   }
 
-  case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE: {
+  case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
+  case OL_DEVICE_INFO_MAX_WORK_SIZE:
+  case OL_DEVICE_INFO_VENDOR_ID:
+  case OL_DEVICE_INFO_NUM_COMPUTE_UNITS:
+  case OL_DEVICE_INFO_ADDRESS_BITS:
+  case OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY:
+  case OL_DEVICE_INFO_MEMORY_CLOCK_RATE: {
     // Uint32 values
     if (!std::holds_alternative<uint64_t>(Entry->Value))
       return makeError(ErrorCode::BACKEND_FAILURE,
@@ -339,6 +473,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
     return Info.write(static_cast<uint32_t>(Value));
   }
 
+  case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
   case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: {
     // {x, y, z} triples
     ol_dimensions_t Out{0, 0, 0};
@@ -377,6 +512,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
   assert(Device == OffloadContext::get().HostDevice());
   InfoWriter Info(PropSize, PropValue, PropSizeRet);
 
+  constexpr auto uint32_max = std::numeric_limits<uint32_t>::max();
+
   switch (PropName) {
   case OL_DEVICE_INFO_PLATFORM:
     return Info.write<void *>(Device->Platform);
@@ -384,14 +521,52 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
     return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_HOST);
   case OL_DEVICE_INFO_NAME:
     return Info.writeString("Virtual Host Device");
+  case OL_DEVICE_INFO_PRODUCT_NAME:
+    return Info.writeString("Virtual Host Device");
   case OL_DEVICE_INFO_VENDOR:
     return Info.writeString("Liboffload");
   case OL_DEVICE_INFO_DRIVER_VERSION:
     return Info.writeString(LLVM_VERSION_STRING);
   case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
-    return Info.write<uint64_t>(1);
+    return Info.write<uint32_t>(1);
   case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION:
     return Info.write<ol_dimensions_t>(ol_dimensions_t{1, 1, 1});
+  case OL_DEVICE_INFO_MAX_WORK_SIZE:
+    return Info.write<uint32_t>(uint32_max);
+  case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
+    return Info.write<ol_dimensions_t>(
+        ol_dimensions_t{uint32_max, uint32_max, uint32_max});
+  case OL_DEVICE_INFO_VENDOR_ID:
+  case OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY:
+  case OL_DEVICE_INFO_MEMORY_CLOCK_RATE:
+    return Info.write<uint32_t>(0); // not reported for the host device
+  case OL_DEVICE_INFO_NUM_COMPUTE_UNITS:
+    return Info.write<uint32_t>(1);
+  case OL_DEVICE_INFO_SINGLE_FP_CONFIG:
+  case OL_DEVICE_INFO_DOUBLE_FP_CONFIG:
+    return Info.write<ol_device_fp_capability_flags_t>(
+        OL_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT |
+        OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
+        OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
+        OL_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
+        OL_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
+        OL_DEVICE_FP_CAPABILITY_FLAG_DENORM | OL_DEVICE_FP_CAPABILITY_FLAG_FMA);
+  case OL_DEVICE_INFO_HALF_FP_CONFIG:
+    return Info.write<ol_device_fp_capability_flags_t>(0);
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT:
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE:
+    return Info.write<uint32_t>(1);
+  case OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF:
+    return Info.write<uint32_t>(0);
+  case OL_DEVICE_INFO_ADDRESS_BITS:
+    return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits);
+  case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
+  case OL_DEVICE_INFO_GLOBAL_MEM_SIZE:
+    return Info.write<uint64_t>(0);
   default:
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,
                               "getDeviceInfo enum '%i' is invalid", PropName);
@@ -420,7 +595,7 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device,
 Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) {
   for (auto &Platform : OffloadContext::get().Platforms) {
     for (auto &Device : Platform.Devices) {
-      if (!Callback(&Device, UserData)) {
-        break;
+      if (!Callback(Device.get(), UserData)) {
+        return Error::success();
       }
     }
@@ -449,39 +624,78 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
     return Alloc.takeError();
 
   *AllocationOut = *Alloc;
-  OffloadContext::get().AllocInfoMap.insert_or_assign(*Alloc,
-                                                      AllocInfo{Device, Type});
+  {
+    std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
+    OffloadContext::get().AllocInfoMap.insert_or_assign(
+        *Alloc, AllocInfo{Device, Type});
+  }
   return Error::success();
 }
 
 Error olMemFree_impl(void *Address) {
-  if (!OffloadContext::get().AllocInfoMap.contains(Address))
-    return createOffloadError(ErrorCode::INVALID_ARGUMENT,
-                              "address is not a known allocation");
-
-  auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address);
-  auto Device = AllocInfo.Device;
-  auto Type = AllocInfo.Type;
+  ol_device_handle_t Device;
+  ol_alloc_type_t Type;
+  {
+    std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
+    auto &AllocInfoMap = OffloadContext::get().AllocInfoMap;
+    auto It = AllocInfoMap.find(Address); // single lookup, reused below
+    if (It == AllocInfoMap.end())
+      return createOffloadError(ErrorCode::INVALID_ARGUMENT,
+                                "address is not a known allocation");
+    // Copy the record out before erasing; the lock is dropped before the free.
+    Device = It->second.Device;
+    Type = It->second.Type;
+    AllocInfoMap.erase(It);
+  }
 
   if (auto Res =
           Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type)))
     return Res;
 
-  OffloadContext::get().AllocInfoMap.erase(Address);
-
   return Error::success();
 }
 
 Error olCreateQueue_impl(ol_device_handle_t Device, ol_queue_handle_t *Queue) {
   auto CreatedQueue = std::make_unique<ol_queue_impl_t>(nullptr, Device);
-  if (auto Err = Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo)))
+
+  // Adopt a queue returned by getOutstandingQueue() when there is one.
+  if (auto OutstandingQueue = Device->getOutstandingQueue()) {
+    // Although the queue is empty, it must still be synced so that temporary
+    // memory allocations are released and any other cleanup runs.
+    if (auto Err =
+            Device->Device->synchronize(OutstandingQueue, /*Release=*/false))
+      return Err;
+    CreatedQueue->AsyncInfo = OutstandingQueue;
+  } else if (auto Err =
+                 Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo))) {
     return Err;
+  }
 
   *Queue = CreatedQueue.release();
   return Error::success();
 }
 
-Error olDestroyQueue_impl(ol_queue_handle_t Queue) { return olDestroy(Queue); }
+Error olDestroyQueue_impl(ol_queue_handle_t Queue) {
+  auto *Device = Queue->Device;
+  // A single query is race-free here: once olDestroyQueue has been called no
+  // further work can be added to the queue, so a queue that is finished now
+  // stays finished forever.
+  auto Pending = Device->Device->hasPendingWork(Queue->AsyncInfo);
+  if (!Pending)
+    return Pending.takeError();
+
+  if (*Pending) {
+    // Work is still outstanding. Store the queue so we can check it later.
+    std::lock_guard<std::mutex> Lock(Device->OutstandingQueuesMutex);
+    Device->OutstandingQueues.push_back(Queue->AsyncInfo);
+  } else {
+    // The queue is complete, so sync it and throw it back into the pool.
+    if (auto Err = Device->Device->synchronize(Queue->AsyncInfo,
+                                               /*Release=*/true))
+      return Err;
+  }
+
+  return olDestroy(Queue);
+}
 
 Error olSyncQueue_impl(ol_queue_handle_t Queue) {
   // Host plugin doesn't have a queue set so it's not safe to call synchronize
@@ -509,7 +723,7 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events,
                            "olWaitEvents asked to wait on a NULL event");
 
     // Do nothing if the event is for this queue or the event is always complete
-    if (Event->Queue == Queue || !Event->EventInfo)
+    if (Event->QueueId == Queue->Id || !Event->EventInfo)
       continue;
 
     if (auto Err = Device->waitEvent(Event->EventInfo, Queue->AsyncInfo))
@@ -553,11 +767,11 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName,
 }
 
 Error olSyncEvent_impl(ol_event_handle_t Event) {
+  // No event info means that this event was complete on creation
   if (!Event->EventInfo)
-    // Event always complete
     return Plugin::success();
 
-  if (auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo))
+  if (auto Res = Event->Device->Device->syncEvent(Event->EventInfo))
     return Res;
 
   return Error::success();
@@ -565,7 +779,7 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
 
 Error olDestroyEvent_impl(ol_event_handle_t Event) {
   if (Event->EventInfo)
-    if (auto Res = Event->Queue->Device->Device->destroyEvent(Event->EventInfo))
+    if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo))
       return Res;
 
   return olDestroy(Event);
@@ -575,10 +789,22 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event,
                                ol_event_info_t PropName, size_t PropSize,
                                void *PropValue, size_t *PropSizeRet) {
   InfoWriter Info(PropSize, PropValue, PropSizeRet);
+  auto Queue = Event->Queue;
 
   switch (PropName) {
   case OL_EVENT_INFO_QUEUE:
-    return Info.write<ol_queue_handle_t>(Event->Queue);
+    return Info.write<ol_queue_handle_t>(Queue);
+  case OL_EVENT_INFO_IS_COMPLETE: {
+    // No event info means that this event was complete on creation
+    if (!Event->EventInfo)
+      return Info.write<bool>(true);
+
+    auto Res = Queue->Device->Device->isEventComplete(Event->EventInfo,
+                                                      Queue->AsyncInfo);
+    if (auto Err = Res.takeError())
+      return Err;
+    return Info.write<bool>(*Res);
+  }
   default:
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,
                               "olGetEventInfo enum '%i' is invalid", PropName);
@@ -604,7 +830,7 @@ Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) {
   if (auto Err = Pending.takeError())
     return Err;
 
-  *EventOut = new ol_event_impl_t(nullptr, Queue);
+  *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue);
   if (!*Pending)
     // Queue is empty, don't record an event and consider the event always
     // complete
@@ -656,6 +882,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
   return Error::success();
 }
 
+Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
+                     const void *PatternPtr, size_t FillSize) {
+  auto &Dev = *Queue->Device->Device; // note: dataFill takes pattern ptr first
+  return Dev.dataFill(Ptr, PatternPtr, PatternSize, FillSize, Queue->AsyncInfo);
+}
+
 Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
                            size_t ProgDataSize, ol_program_handle_t *Program) {
   // Make a copy of the program binary in case it is released by the caller.
@@ -696,6 +928,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
   return olDestroy(Program);
 }
 
+Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
+                                       ol_symbol_handle_t Kernel,
+                                       size_t DynamicMemSize,
+                                       size_t *GroupSize) {
+  if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
+    return createOffloadError(ErrorCode::SYMBOL_KIND,
+                              "provided symbol is not a kernel");
+
+  auto &PluginKernel = *std::get<GenericKernelTy *>(Kernel->PluginImpl);
+  auto MaxSize = PluginKernel.maxGroupSize(*Device->Device, DynamicMemSize);
+  if (auto Err = MaxSize.takeError())
+    return Err;
+
+  // Hand the computed group size back through the out-parameter.
+  *GroupSize = *MaxSize;
+  return Error::success();
+}
+
 Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           ol_symbol_handle_t Kernel, const void *ArgumentsData,
                           size_t ArgumentsSize,
@@ -765,7 +1015,7 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name,
     return Error::success();
   }
   case OL_SYMBOL_KIND_GLOBAL_VARIABLE: {
-    auto &Global = Program->KernelSymbols[Name];
+    auto &Global = Program->GlobalSymbols[Name];
     if (!Global) {
       GlobalTy GlobalObj{Name};
       if (auto Res =
@@ -833,5 +1083,12 @@ Error olGetSymbolInfoSize_impl(ol_symbol_handle_t Symbol,
   return olGetSymbolInfoImplDetail(Symbol, PropName, 0, nullptr, PropSizeRet);
 }
 
+Error olLaunchHostFunction_impl(ol_queue_handle_t Queue,
+                                ol_host_function_cb_t Callback,
+                                void *UserData) {
+  auto &Dev = *Queue->Device->Device; // plugin device backing Queue
+  return Dev.enqueueHostCall(Callback, UserData, Queue->AsyncInfo);
+}
+
 } // namespace offload
 } // namespace llvm
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30a..6585286 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -191,6 +191,10 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                                   DstPtr, Size, AsyncInfo);
 }
 
+int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) {
+  return RTL->data_fence(RTLDeviceID, AsyncInfo); // delegate; result as-is
+}
+
 int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
   DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n",
      DPxPTR(HstPtr), Size);
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index e9b148d..fe18289 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -30,6 +30,7 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#include <memory>
 
 #ifdef OMPT_SUPPORT
 using namespace llvm::omp::target::ompt;
@@ -165,12 +166,24 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                                              OMPT_GET_RETURN_ADDRESS);)
 
   int Rc = OFFLOAD_SUCCESS;
+
+  // Only allocate AttachInfo for targetDataBegin
+  std::unique_ptr<AttachInfoTy> AttachInfo;
+  if (TargetDataFunction == targetDataBegin)
+    AttachInfo = std::make_unique<AttachInfoTy>();
+
   Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
                           ArgTypes, ArgNames, ArgMappers, AsyncInfo,
-                          false /*FromMapper=*/);
+                          AttachInfo.get(), /*FromMapper=*/false);
 
-  if (Rc == OFFLOAD_SUCCESS)
-    Rc = AsyncInfo.synchronize();
+  if (Rc == OFFLOAD_SUCCESS) {
+    // Process deferred ATTACH entries BEFORE synchronization
+    if (AttachInfo && !AttachInfo->AttachEntries.empty())
+      Rc = processAttachEntries(*DeviceOrErr, *AttachInfo, AsyncInfo);
+
+    if (Rc == OFFLOAD_SUCCESS)
+      Rc = AsyncInfo.synchronize();
+  }
 
   handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
 }
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 5b25d95..4c8eba1 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -293,7 +293,8 @@ void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
 int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                      int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                      void *ArgMapper, AsyncInfoTy &AsyncInfo,
-                     TargetDataFuncPtrTy TargetDataFunction) {
+                     TargetDataFuncPtrTy TargetDataFunction,
+                     AttachInfoTy *AttachInfo = nullptr) {
   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));
 
   // The mapper function fills up Components.
@@ -324,17 +325,184 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                               MapperArgsBase.data(), MapperArgs.data(),
                               MapperArgSizes.data(), MapperArgTypes.data(),
                               MapperArgNames.data(), /*arg_mappers*/ nullptr,
-                              AsyncInfo, /*FromMapper=*/true);
+                              AsyncInfo, AttachInfo, /*FromMapper=*/true);
 
   return Rc;
 }
 
+/// Utility function to perform a pointer attachment operation.
+///
+/// For something like:
+/// ```cpp
+///  int *p;
+///  ...
+///  #pragma omp target enter data map(to:p[10:10])
+/// ```
+///
+/// for which the attachment operation gets represented using:
+/// ```
+///   &p, &p[10], sizeof(p), ATTACH
+/// ```
+///
+/// (Hst|Tgt)PtrAddr   represents &p
+/// (Hst|Tgt)PteeBase  represents &p[0]
+/// (Hst|Tgt)PteeBegin represents &p[10]
+///
+/// This function first computes the expected TgtPteeBase using:
+///   `<Select>TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)`
+///
+/// and then attaches TgtPteeBase to TgtPtrAddr.
+///
+/// \p HstPtrSize represents the size of the pointer p. For C/C++, this
+/// should be the same as "sizeof(void*)" (say 8).
+///
+/// However, for Fortran, pointers/allocatables, which are also eligible for
+/// "pointer-attachment", may be implemented using descriptors that contain the
+/// address of the pointee in the first 8 bytes, but also contain other
+/// information such as lower-bound/upper-bound etc in their subsequent fields.
+///
+/// For example, for the following:
+/// ```fortran
+///   integer, allocatable :: x(:)
+///   integer, pointer :: p(:)
+///   ...
+///   p => x(10: 19)
+///   ...
+///   !$omp target enter data map(to:p(:))
+/// ```
+///
+/// The map should trigger a pointer-attachment (assuming the pointer-attachment
+/// conditions as noted on processAttachEntries are met) between the descriptor
+/// for p, and its pointee data.
+///
+/// Since only the first 8 bytes of the descriptor contain the address of the
+/// pointee, an attachment operation on device descriptors involves:
+/// * Setting the first 8 bytes of the device descriptor to point to the device
+/// address of the pointee.
+/// * Copying the remaining information about bounds/offset etc. from the host
+/// descriptor to the device descriptor.
+///
+/// The function also handles pointer-attachment portion of PTR_AND_OBJ maps,
+/// like:
+/// ```
+///   &p, &p[10], 10 * sizeof(p[10]), PTR_AND_OBJ
+/// ```
+/// by using `sizeof(void*)` as \p HstPtrSize.
+static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
+                                    void **HstPtrAddr, void *HstPteeBase,
+                                    void *HstPteeBegin, void **TgtPtrAddr,
+                                    void *TgtPteeBegin, int64_t HstPtrSize,
+                                    TargetPointerResultTy &PtrTPR) {
+  assert(PtrTPR.getEntry() &&
+         "Need a valid pointer entry to perform pointer-attachment");
+
+  int64_t VoidPtrSize = sizeof(void *);
+  assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small");
+
+  uint64_t Delta = reinterpret_cast<uint64_t>(HstPteeBegin) -
+                   reinterpret_cast<uint64_t>(HstPteeBase);
+  void *TgtPteeBase = reinterpret_cast<void *>(
+      reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta);
+  DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD
+     ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n",
+     DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta);
+  DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD
+     "\n",
+     DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin));
+
+  // Add shadow pointer tracking; a false result means "already attached".
+  // TODO: Support shadow-tracking of larger than VoidPtrSize pointers,
+  // to support restoration of Fortran descriptors. Currently, this check
+  // would return false, even if the host Fortran descriptor had been
+  // updated since its previous map, and we should have updated its
+  // device counterpart. e.g.
+  //
+  //   !$omp target enter data map(x(1:100)) !             (1)
+  //   p => x(10: 19)
+  //   !$omp target enter data map(p, p(:)) !              (2)
+  //   p => x(5: 9)
+  //   !$omp target enter data map(attach(always): p(:)) ! (3)
+  //
+  // While PtrAddr(&desc_p) and PteeBase(&p(1)) are the same for (2) and (3),
+  // the pointer attachment for (3) needs to update the bounds information
+  // in the descriptor of p on device.
+  if (!PtrTPR.getEntry()->addShadowPointer(
+          ShadowPtrInfoTy{HstPtrAddr, HstPteeBase, TgtPtrAddr, TgtPteeBase})) {
+    DP("Pointer " DPxMOD " is already attached to " DPxMOD "\n",
+       DPxPTR(TgtPtrAddr), DPxPTR(TgtPteeBase));
+    return OFFLOAD_SUCCESS;
+  }
+
+  DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(TgtPtrAddr),
+     DPxPTR(TgtPteeBase));
+
+  // Lambda to check the submitData result and, on success, add the event.
+  auto HandleSubmitResult = [&](int SubmitResult) -> int {
+    if (SubmitResult != OFFLOAD_SUCCESS) {
+      REPORT("Failed to update pointer on device.\n");
+      return OFFLOAD_FAIL;
+    }
+
+    if (PtrTPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
+        OFFLOAD_SUCCESS)
+      return OFFLOAD_FAIL;
+
+    return OFFLOAD_SUCCESS;
+  };
+
+  bool IsPtrAFortranDescriptor = HstPtrSize > VoidPtrSize;
+  if (!IsPtrAFortranDescriptor) {
+    // For "regular" pointers, we can use the VoidPtrLocation from AsyncInfo as
+    // the buffer space for the submission.
+    void *&BufferElement = AsyncInfo.getVoidPtrLocation();
+    BufferElement = TgtPteeBase;
+
+    // Submit the updated pointer value to device
+    return HandleSubmitResult(Device.submitData(
+        TgtPtrAddr, &BufferElement, VoidPtrSize, AsyncInfo, PtrTPR.getEntry()));
+  }
+
+  // For larger "pointers" (like Fortran's descriptors), we create a dynamic
+  // buffer, which will eventually be destroyed by AsyncInfo's post-processing
+  // callback.
+  char *DataBuffer = new char[HstPtrSize];
+
+  // The first VoidPtrSize bytes of the buffer receive the pointee's device
+  // address.
+  std::memcpy(DataBuffer, &TgtPteeBase, sizeof(void *));
+
+  // The remaining bytes are copied verbatim from the host descriptor, i.e.
+  // everything after its initial VoidPtrSize bytes.
+  uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
+  void *HstDescriptorFieldsAddr =
+      reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize;
+  std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr,
+              HstDescriptorFieldsSize);
+
+  DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD ") (pointer + %" PRId64
+     " additional bytes from host descriptor " DPxMOD ")\n",
+     HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize,
+     DPxPTR(HstDescriptorFieldsAddr));
+
+  // Submit the entire buffer to the device.
+  int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize,
+                                       AsyncInfo, PtrTPR.getEntry());
+
+  AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int {
+    delete[] DataBuffer;
+    return OFFLOAD_SUCCESS;
+  });
+  return HandleSubmitResult(SubmitResult);
+}
+
 /// Internal function to do the mapping and transfer the data to the device
 int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
-                    bool FromMapper) {
+                    AttachInfoTy *AttachInfo, bool FromMapper) {
+  assert(AttachInfo && "AttachInfo must be available for targetDataBegin for "
+                       "handling ATTACH map-types.");
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     // Ignore private variables and arrays - there is no mapping for them.
@@ -352,7 +520,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
       int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
-                                targetDataBegin);
+                                targetDataBegin, AttachInfo);
 
       if (Rc != OFFLOAD_SUCCESS) {
         REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
@@ -369,6 +537,18 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     int64_t DataSize = ArgSizes[I];
     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
 
+    // ATTACH map-types are supposed to be handled after all mapping for the
+    // construct is done. Defer their processing.
+    if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) {
+      AttachInfo->AttachEntries.emplace_back(
+          /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin,
+          /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I],
+          /*PointeeName=*/HstPtrName);
+
+      DP("Deferring ATTACH map-type processing for argument %d\n", I);
+      continue;
+    }
+
     // Adjust for proper alignment if this is a combined entry (for structs).
     // Look at the next argument - if that is MEMBER_OF this one, then this one
     // is a combined entry.
@@ -434,13 +614,18 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                                   : "device failure or illegal mapping");
         return OFFLOAD_FAIL;
       }
+
+      // Track new allocation, for eventual use in attachment decision-making.
+      if (PointerTpr.Flags.IsNewEntry && !IsHostPtr)
+        AttachInfo->NewAllocations[HstPtrBase] = sizeof(void *);
+
       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
          "\n",
          sizeof(void *), DPxPTR(PointerTgtPtrBegin),
          (PointerTpr.Flags.IsNewEntry ? "" : " not"));
       PointerHstPtrBegin = HstPtrBase;
       // modify current entry.
-      HstPtrBase = *(void **)HstPtrBase;
+      HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
       // No need to update pointee ref count for the first element of the
       // subelement that comes from mapper.
       UpdateRef =
@@ -464,6 +649,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                                 : "device failure or illegal mapping");
       return OFFLOAD_FAIL;
     }
+
+    // Track new allocation, for eventual use in attachment decision-making.
+    if (TPR.Flags.IsNewEntry && !IsHostPtr && TgtPtrBegin)
+      AttachInfo->NewAllocations[HstPtrBegin] = DataSize;
+
     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
        " - is%s new\n",
        DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
@@ -476,30 +666,13 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     }
 
     if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
-
-      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
-      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
-
-      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
-              (void **)PointerHstPtrBegin, HstPtrBase,
-              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
-        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
-           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
-
-        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
-        TgtPtrBase = ExpectedTgtPtrBase;
-
-        int Ret =
-            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
-                              AsyncInfo, PointerTpr.getEntry());
-        if (Ret != OFFLOAD_SUCCESS) {
-          REPORT("Copying data to device failed.\n");
-          return OFFLOAD_FAIL;
-        }
-        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
-            OFFLOAD_SUCCESS)
-          return OFFLOAD_FAIL;
-      }
+      int Ret = performPointerAttachment(
+          Device, AsyncInfo, reinterpret_cast<void **>(PointerHstPtrBegin),
+          HstPtrBase, HstPtrBegin,
+          reinterpret_cast<void **>(PointerTgtPtrBegin), TgtPtrBegin,
+          sizeof(void *), PointerTpr);
+      if (Ret != OFFLOAD_SUCCESS)
+        return OFFLOAD_FAIL;
     }
 
     // Check if variable can be used on the device:
@@ -515,6 +688,189 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
   return OFFLOAD_SUCCESS;
 }
 
+/// Process deferred ATTACH map entries collected during targetDataBegin.
+///
+/// From OpenMP's perspective, when mapping something that has a base pointer,
+/// such as:
+/// ```cpp
+///   int *p;
+///   #pragma omp target enter data map(to: p[10:20])
+/// ```
+///
+/// a pointer-attachment between p and &p[10] should occur if both p and
+/// p[10] are present on the device after doing all allocations for all maps
+/// on the construct, and one of the following is true:
+///
+/// * The pointer p was newly allocated while handling the construct
+/// * The pointee p[10:20] was newly allocated while handling the construct
+/// * attach(always) map-type modifier was specified (OpenMP 6.1)
+///
+/// That's why we collect all attach entries and new memory allocations during
+/// targetDataBegin, and use that information to make the decision of whether
+/// to perform a pointer-attachment or not here, after maps have been handled.
+///
+/// Additionally, once we decide that a pointer-attachment should be performed,
+/// we need to make sure that it happens after any previously submitted data
+/// transfers have completed, to avoid the possibility of the pending transfers
+/// clobbering the attachment. For example:
+///
+/// ```cpp
+///   int *p = ...;
+///   int **pp = &p;
+///   map(to: pp[0], p[0])
+/// ```
+///
+/// Which would be represented by:
+/// ```
+/// &pp[0], &pp[0], sizeof(pp[0]), TO (1)
+/// &p[0], &p[0], sizeof(p[0]), TO    (2)
+///
+/// &pp, &pp[0], sizeof(pp), ATTACH   (3)
+/// &p, &p[0], sizeof(p), ATTACH      (4)
+/// ```
+///
+/// (4) and (1) are both trying to modify the device memory corresponding to
+/// `&p`. So, if we decide that (4) should do an attachment, we also need to
+/// ensure that (4) happens after (1) is complete.
+///
+/// For this purpose, we insert a data_fence before the first
+/// pointer-attachment, (3), to ensure that all pending transfers finish first.
+int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
+                         AsyncInfoTy &AsyncInfo) {
+  // Debug-report every new allocation tracked while handling the construct.
+  if (!AttachInfo.NewAllocations.empty()) {
+    DP("Tracked %u total new allocations:\n",
+       (unsigned)AttachInfo.NewAllocations.size());
+    for (const auto &Alloc : AttachInfo.NewAllocations) {
+      DP("  Host ptr: " DPxMOD ", Size: %" PRId64 " bytes\n",
+         DPxPTR(Alloc.first), Alloc.second);
+    }
+  }
+
+  if (AttachInfo.AttachEntries.empty())
+    return OFFLOAD_SUCCESS;
+
+  DP("Processing %zu deferred ATTACH map entries\n",
+     AttachInfo.AttachEntries.size());
+
+  int Ret = OFFLOAD_SUCCESS;
+  bool IsFirstPointerAttachment = true;
+  for (size_t EntryIdx = 0; EntryIdx < AttachInfo.AttachEntries.size();
+       ++EntryIdx) {
+    const auto &AttachEntry = AttachInfo.AttachEntries[EntryIdx];
+
+    void **HstPtr = reinterpret_cast<void **>(AttachEntry.PointerBase);
+
+    void *HstPteeBase = *HstPtr;
+    void *HstPteeBegin = AttachEntry.PointeeBegin;
+
+    int64_t PtrSize = AttachEntry.PointerSize;
+    int64_t MapType = AttachEntry.MapType;
+
+    DP("Processing ATTACH entry %zu: HstPtr=" DPxMOD ", HstPteeBegin=" DPxMOD
+       ", Size=%" PRId64 ", Type=0x%" PRIx64 "\n",
+       EntryIdx, DPxPTR(HstPtr), DPxPTR(HstPteeBegin), PtrSize, MapType);
+
+    const bool IsAttachAlways = MapType & OMP_TGT_MAPTYPE_ALWAYS;
+
+    // Lambda to check if Ptr lies inside any newly allocated host range.
+    auto WasNewlyAllocated = [&](void *Ptr, const char *PtrName) {
+      bool IsNewlyAllocated =
+          llvm::any_of(AttachInfo.NewAllocations, [&](const auto &Alloc) {
+            void *AllocPtr = Alloc.first;
+            int64_t AllocSize = Alloc.second;
+            return Ptr >= AllocPtr &&
+                   Ptr < reinterpret_cast<void *>(
+                             reinterpret_cast<char *>(AllocPtr) + AllocSize);
+          });
+      DP("Attach %s " DPxMOD " was newly allocated: %s\n", PtrName, DPxPTR(Ptr),
+         IsNewlyAllocated ? "yes" : "no");
+      return IsNewlyAllocated;
+    };
+
+    // Only process ATTACH if either the pointee or the pointer was newly
+    // allocated, or the ALWAYS flag is set.
+    if (!IsAttachAlways && !WasNewlyAllocated(HstPteeBegin, "pointee") &&
+        !WasNewlyAllocated(HstPtr, "pointer")) {
+      DP("Skipping ATTACH entry %zu: neither pointer nor pointee was newly "
+         "allocated and no ALWAYS flag\n",
+         EntryIdx);
+      continue;
+    }
+
+    // Lambda to look up a target pointer; yields nullopt if it is unusable.
+    auto LookupTargetPointer =
+        [&](void *Ptr, int64_t Size,
+            const char *PtrType) -> std::optional<TargetPointerResultTy> {
+      // ATTACH map-type does not change ref-count, or do any allocation.
+      // We just need to do a lookup for the pointer/pointee.
+      TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
+          Ptr, Size, /*UpdateRefCount=*/false,
+          /*UseHoldRefCount=*/false, /*MustContain=*/true);
+
+      DP("Attach %s lookup - IsPresent=%s, IsHostPtr=%s\n", PtrType,
+         TPR.isPresent() ? "yes" : "no",
+         TPR.Flags.IsHostPointer ? "yes" : "no");
+
+      if (!TPR.isPresent()) {
+        DP("Skipping ATTACH entry %zu: %s not present on device\n", EntryIdx,
+           PtrType);
+        return std::nullopt;
+      }
+      if (TPR.Flags.IsHostPointer) {
+        DP("Skipping ATTACH entry %zu: device version of the %s is a host "
+           "pointer.\n",
+           EntryIdx, PtrType);
+        return std::nullopt;
+      }
+
+      return TPR;
+    };
+
+    // Get device version of the pointee (e.g., &p[10]) first, as we can
+    // release its TPR after extracting the pointer value.
+    void *TgtPteeBegin = [&]() -> void * {
+      if (auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee"))
+        return PteeTPROpt->TargetPointer;
+      return nullptr;
+    }();
+
+    if (!TgtPteeBegin)
+      continue;
+
+    // Get device version of the pointer (e.g., &p) next. We need to keep its
+    // TPR for use in shadow-pointer handling during pointer-attachment.
+    auto PtrTPROpt = LookupTargetPointer(HstPtr, PtrSize, "pointer");
+    if (!PtrTPROpt)
+      continue;
+    TargetPointerResultTy &PtrTPR = *PtrTPROpt;
+    void **TgtPtrBase = reinterpret_cast<void **>(PtrTPR.TargetPointer);
+
+    // Insert a data-fence once, before the first pointer-attachment.
+    if (IsFirstPointerAttachment) {
+      IsFirstPointerAttachment = false;
+      DP("Inserting a data fence before the first pointer attachment.\n");
+      Ret = Device.dataFence(AsyncInfo);
+      if (Ret != OFFLOAD_SUCCESS) {
+        REPORT("Failed to insert data fence.\n");
+        return OFFLOAD_FAIL;
+      }
+    }
+
+    // Do the pointer-attachment, i.e. update the device pointer to point to
+    // the device pointee.
+    Ret = performPointerAttachment(Device, AsyncInfo, HstPtr, HstPteeBase,
+                                   HstPteeBegin, TgtPtrBase, TgtPteeBegin,
+                                   PtrSize, PtrTPR);
+    if (Ret != OFFLOAD_SUCCESS)
+      return OFFLOAD_FAIL;
+
+    DP("ATTACH entry %zu processed successfully\n", EntryIdx);
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
 namespace {
 /// This structure contains information to deallocate a target pointer, aka.
 /// used to fix up the shadow map and potentially delete the entry from the
@@ -624,7 +980,8 @@ postProcessingTargetDataEnd(DeviceTy *Device,
 int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
-                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
+                  void **ArgMappers, AsyncInfoTy &AsyncInfo,
+                  AttachInfoTy *AttachInfo, bool FromMapper) {
   int Ret = OFFLOAD_SUCCESS;
   auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
   // process each input.
@@ -635,6 +992,14 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
 
+    // Ignore ATTACH entries - they should only be honored on map-entering
+    // directives. They may be encountered here while handling the "end" part of
+    // "#pragma omp target".
+    if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) {
+      DP("Ignoring ATTACH entry %d in targetDataEnd\n", I);
+      continue;
+    }
+
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataEnd, call the
       // targetDataMapper variant which will call targetDataEnd again
@@ -900,7 +1265,8 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
 int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                      void **ArgsBase, void **Args, int64_t *ArgSizes,
                      int64_t *ArgTypes, map_var_info_t *ArgNames,
-                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
+                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
+                     AttachInfoTy *AttachInfo, bool FromMapper) {
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
@@ -1213,13 +1579,27 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
 
+  // Create AttachInfo for tracking any ATTACH entries, or new allocations,
+  // when handling the "begin" mapping for a target construct.
+  AttachInfoTy AttachInfo;
+
   int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
-                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
+                            ArgTypes, ArgNames, ArgMappers, AsyncInfo,
+                            &AttachInfo, false /*FromMapper=*/);
   if (Ret != OFFLOAD_SUCCESS) {
     REPORT("Call to targetDataBegin failed, abort target.\n");
     return OFFLOAD_FAIL;
   }
 
+  // Process collected ATTACH entries
+  if (!AttachInfo.AttachEntries.empty()) {
+    Ret = processAttachEntries(*DeviceOrErr, AttachInfo, AsyncInfo);
+    if (Ret != OFFLOAD_SUCCESS) {
+      REPORT("Failed to process ATTACH entries.\n");
+      return OFFLOAD_FAIL;
+    }
+  }
+
   // List of (first-)private arrays allocated for this target region
   SmallVector<int> TgtArgsPositions(ArgNum, -1);
 
diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h
index 0b3d545..90e5e17 100644
--- a/offload/libomptarget/private.h
+++ b/offload/libomptarget/private.h
@@ -55,7 +55,14 @@ printKernelArguments(const ident_t *Loc, const int64_t DeviceId,
     const char *Type = nullptr;
     const char *Implicit =
         (ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT) ? "(implicit)" : "";
-    if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && ArgTypes[I] & OMP_TGT_MAPTYPE_FROM)
+
+    if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH &&
+        ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS)
+      Type = "attach:always";
+    else if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH)
+      Type = "attach";
+    else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO &&
+             ArgTypes[I] & OMP_TGT_MAPTYPE_FROM)
       Type = "tofrom";
     else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO)
       Type = "to";
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 61f680b..ad135f7 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -71,9 +71,15 @@ typedef enum {
 } hsa_isa_info_t;
 
 typedef enum {
+  HSA_MACHINE_MODEL_SMALL = 0,
+  HSA_MACHINE_MODEL_LARGE = 1
+} hsa_machine_model_t;
+
+typedef enum {
   HSA_AGENT_INFO_NAME = 0,
   HSA_AGENT_INFO_VENDOR_NAME = 1,
   HSA_AGENT_INFO_FEATURE = 2,
+  HSA_AGENT_INFO_MACHINE_MODEL = 3,
   HSA_AGENT_INFO_PROFILE = 4,
   HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
   HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 3117763..29cfe78 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -67,6 +67,7 @@ typedef enum hsa_amd_agent_info_s {
   HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
   HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
   HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
+  HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008,
   HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009,
   HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A,
   HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 7961820..c26cfe9 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -570,6 +570,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return the maximum block size for maximum occupancy.
+  ///
+  /// TODO: Not implemented for AMDGPU yet; this always returns UNSUPPORTED.
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations for AMDGPU are not yet implemented");
+  }
+
   /// Print more elaborate kernel launch info for AMDGPU
   Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
@@ -914,6 +924,7 @@ private:
     void *Dst;
     const void *Src;
     size_t Size;
+    size_t NumTimes;
   };
 
   /// Utility struct holding arguments for freeing buffers to memory managers.
@@ -964,9 +975,14 @@ private:
     StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
 
     /// Schedule a host memory copy action on the slot.
-    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
+    ///
+    /// The copy is repeated \p NumTimes times, sequentially in the destination
+    /// buffer.
+    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size,
+                              size_t NumTimes = 1) {
       Callbacks.emplace_back(memcpyAction);
-      ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
+      ActionArgs.emplace_back().MemcpyArgs =
+          MemcpyArgsTy{Dst, Src, Size, NumTimes};
       return Plugin::success();
     }
 
@@ -1063,6 +1079,20 @@ private:
   /// Indicate to spread data transfers across all available SDMAs
   bool UseMultipleSdmaEngines;
 
+  /// Thread body for host callbacks: wait on input, call back, signal output.
+  static void CallbackWrapper(AMDGPUSignalTy *InputSignal,
+                              AMDGPUSignalTy *OutputSignal,
+                              void (*Callback)(void *), void *UserData) {
+    // A failing wait is not expected here; treat it as a fatal internal error.
+    if (InputSignal) {
+      if (auto WaitErr = InputSignal->wait())
+        reportFatalInternalError(std::move(WaitErr));
+    }
+
+    Callback(UserData);
+    OutputSignal->signal();
+  }
+
   /// Return the current number of asynchronous operations on the stream.
   uint32_t size() const { return NextSlot; }
 
@@ -1192,7 +1222,11 @@ private:
     assert(Args->Dst && "Invalid destination buffer");
     assert(Args->Src && "Invalid source buffer");
 
-    std::memcpy(Args->Dst, Args->Src, Args->Size);
+    auto BasePtr = Args->Dst;
+    for (size_t I = 0; I < Args->NumTimes; I++) {
+      std::memcpy(BasePtr, Args->Src, Args->Size);
+      BasePtr = reinterpret_cast<uint8_t *>(BasePtr) + Args->Size;
+    }
 
     return Plugin::success();
   }
@@ -1397,7 +1431,8 @@ public:
   /// manager once the operation completes.
   Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
                                uint64_t CopySize,
-                               AMDGPUMemoryManagerTy &MemoryManager) {
+                               AMDGPUMemoryManagerTy &MemoryManager,
+                               size_t NumTimes = 1) {
     // Retrieve available signals for the operation's outputs.
     AMDGPUSignalTy *OutputSignals[2] = {};
     if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
@@ -1419,7 +1454,8 @@ public:
       // The std::memcpy is done asynchronously using an async handler. We store
       // the function's information in the action but it is not actually a
       // post action.
-      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
+      if (auto Err =
+              Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes))
         return Err;
 
       // Make changes on this slot visible to the async handler's thread.
@@ -1440,7 +1476,11 @@ public:
       std::tie(Curr, InputSignal) = consume(OutputSignal);
     } else {
       // All preceding operations completed, copy the memory synchronously.
-      std::memcpy(Inter, Src, CopySize);
+      auto *InterPtr = Inter;
+      for (size_t I = 0; I < NumTimes; I++) {
+        std::memcpy(InterPtr, Src, CopySize);
+        InterPtr = reinterpret_cast<uint8_t *>(InterPtr) + CopySize;
+      }
 
       // Return the second signal because it will not be used.
       OutputSignals[1]->decreaseUseCount();
@@ -1457,11 +1497,11 @@ public:
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
       return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                     Agent, CopySize, 1, &InputSignalRaw,
-                                     OutputSignal->get());
+                                     Agent, CopySize * NumTimes, 1,
+                                     &InputSignalRaw, OutputSignal->get());
     }
     return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                   Agent, CopySize, 0, nullptr,
+                                   Agent, CopySize * NumTimes, 0, nullptr,
                                    OutputSignal->get());
   }
 
@@ -1495,6 +1535,31 @@ public:
                                    OutputSignal->get());
   }
 
+  /// Run \p Callback(\p UserData) on a detached thread via CallbackWrapper.
+  Error pushHostCallback(void (*Callback)(void *), void *UserData) {
+    // Retrieve an available signal for the operation's output.
+    AMDGPUSignalTy *OutputSignal = nullptr;
+    if (auto Err = SignalManager.getResource(OutputSignal))
+      return Err;
+    OutputSignal->reset();
+    OutputSignal->increaseUseCount();
+
+    // Consume stream slot and compute dependencies while holding the lock.
+    AMDGPUSignalTy *InputSignal = nullptr;
+    {
+      std::lock_guard<std::mutex> Guard(Mutex);
+      InputSignal = consume(OutputSignal).second;
+    }
+
+    // "Leaking" the thread here is consistent with other work added to the
+    // queue. The input and output signals will remain valid until the output is
+    // signaled.
+    std::thread(CallbackWrapper, InputSignal, OutputSignal, Callback, UserData)
+        .detach();
+
+    return Plugin::success();
+  }
+
   /// Synchronize with the stream. The current thread waits until all operations
   /// are finalized and it performs the pending post actions (i.e., releasing
   /// intermediate buffers).
@@ -1519,6 +1584,9 @@ public:
   /// actions for that and prior events.
   Error synchronizeOn(AMDGPUEventTy &Event);
 
+  /// Return true if the event from this queue is complete
+  Expected<bool> isEventComplete(const AMDGPUEventTy &Event);
+
   /// Query the stream and complete pending post actions if operations finished.
   /// Return whether all the operations completed. This operation does not block
   /// the calling thread.
@@ -1683,6 +1751,18 @@ Error AMDGPUStreamTy::synchronizeOn(AMDGPUEventTy &Event) {
   return completeUntil(Event.RecordedSlot);
 }
 
+Expected<bool> AMDGPUStreamTy::isEventComplete(const AMDGPUEventTy &Event) {
+  std::lock_guard<std::mutex> Lock(Mutex);
+  assert(Event.RecordedStream == this && "event is for a different stream");
+
+  // Events recorded in an earlier sync cycle are reported as complete.
+  if (Event.RecordedSyncCycle < SyncCycle)
+    return true;
+  assert(Event.RecordedSyncCycle == SyncCycle && "event is from the future?");
+
+  return !Slots[Event.RecordedSlot].Signal->load();
+}
+
 struct AMDGPUStreamManagerTy final
     : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
   using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
@@ -2537,6 +2617,85 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           getAgent(), (uint64_t)Size);
   }
 
+  /// Insert a data fence between previous data operations and the following
+  /// operations. This is a no-op for AMDGPU devices as operations inserted into
+  /// a queue are in-order.
+  Error dataFence(__tgt_async_info *Async) override {
+    return Plugin::success();
+  }
+
+  /// Fill \p Size bytes at \p TgtPtr with the \p PatternSize byte pattern at
+  /// \p PatternPtr, via hsa_amd_memory_fill or a staged pinned-memory copy.
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    // Fast case, where we can use the 4 byte hsa_amd_memory_fill
+    if (Size % 4 == 0 &&
+        (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) {
+      uint32_t Pattern;
+      if (PatternSize == 1) {
+        auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr);
+        Pattern = uint32_t{*Byte} * 0x01010101u;
+      } else if (PatternSize == 2) {
+        auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr);
+        Pattern = uint32_t{*Word} * 0x00010001u;
+      } else {
+        assert(PatternSize == 4 && "Invalid pattern size");
+        Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr);
+      }
+
+      // If there is no pending work, do the fill synchronously
+      // NOTE(review): confirm this call yields bool, not Expected<bool>.
+      if (!hasPendingWorkImpl(AsyncInfoWrapper)) {
+        auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4);
+        return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+      }
+
+      AMDGPUStreamTy *Stream = nullptr;
+      if (auto Err = getStream(AsyncInfoWrapper, Stream))
+        return Err;
+
+      struct MemFillArgsTy {
+        void *Dst;
+        uint32_t Pattern;
+        int64_t Size;
+      };
+      auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4};
+      auto Fill = [](void *Data) {
+        auto *Args = static_cast<MemFillArgsTy *>(Data);
+        assert(Args && "Invalid arguments");
+        auto Status = hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size);
+        delete Args;
+        if (auto Err =
+                Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
+          FATAL_MESSAGE(1, "error performing async fill: %s",
+                        toString(std::move(Err)).data());
+      };
+
+      // hsa_amd_memory_fill doesn't signal completion using a signal, so use
+      // the host callback logic. Only a queued callback frees Args.
+      auto Err = Stream->pushHostCallback(Fill, Args);
+      if (Err)
+        delete Args;
+      return Err;
+    }
+
+    // Slow case; stage the pattern in pinned memory and enqueue a copy. Get the
+    // stream first so that failing to get one cannot leak the pinned buffer.
+    AMDGPUStreamTy *Stream = nullptr;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
+    void *PinnedPtr = nullptr;
+    auto &PinnedMemoryManager = HostDevice.getPinnedMemoryManager();
+    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
+      return Err;
+
+    return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
+                                          PatternSize, PinnedMemoryManager,
+                                          Size / PatternSize);
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     // TODO: Implement this function.
@@ -2553,6 +2712,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
 
+  /// Enqueue a host call on the stream obtained from \p AsyncInfo.
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    AMDGPUStreamTy *Stream = nullptr;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+    return Stream->pushHostCallback(Callback, UserData);
+  }
+
   /// Create an event.
   Error createEventImpl(void **EventPtrStorage) override {
     AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
@@ -2601,6 +2769,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Query.takeError();
   }
 
+  Expected<bool> isEventCompleteImpl(void *EventPtr,
+                                     AsyncInfoWrapperTy &AsyncInfo) override {
+    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
+    auto *Stream = AsyncInfo.getQueueAs<AMDGPUStreamTy *>();
+    return Stream ? Stream->isEventComplete(*Event) : Expected<bool>(false);
+  }
+
   /// Synchronize the current thread with the event.
   Error syncEventImpl(void *EventPtr) override {
     AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
@@ -2632,7 +2807,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
     if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Product Name", TmpChar);
+      Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME);
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
     if (Status == HSA_STATUS_SUCCESS)
@@ -2642,6 +2817,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR);
 
+    Info.add("Vendor ID", uint64_t{4130}, "", DeviceInfo::VENDOR_ID);
+
+    hsa_machine_model_t MachineModel;
+    Status = getDeviceAttrRaw(HSA_AGENT_INFO_MACHINE_MODEL, MachineModel);
+    if (Status == HSA_STATUS_SUCCESS)
+      Info.add("Memory Address Size",
+               uint64_t{MachineModel == HSA_MACHINE_MODEL_SMALL ? 32u : 64u},
+               "bits", DeviceInfo::ADDRESS_BITS);
+
     hsa_device_type_t DevType;
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
     if (Status == HSA_STATUS_SUCCESS) {
@@ -2692,11 +2876,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Max Clock Freq", TmpUInt, "MHz");
+      Info.add("Max Clock Freq", TmpUInt, "MHz",
+               DeviceInfo::MAX_CLOCK_FREQUENCY);
+
+    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY, TmpUInt);
+    if (Status == HSA_STATUS_SUCCESS)
+      Info.add("Max Memory Clock Freq", TmpUInt, "MHz",
+               DeviceInfo::MEMORY_CLOCK_RATE);
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Compute Units", TmpUInt);
+      Info.add("Compute Units", TmpUInt, "", DeviceInfo::NUM_COMPUTE_UNITS);
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
@@ -2734,11 +2924,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Grid Max Size", TmpUInt);
+      Info.add("Grid Max Size", TmpUInt, "", DeviceInfo::MAX_WORK_SIZE);
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
     if (Status == HSA_STATUS_SUCCESS) {
-      auto &MaxDim = *Info.add("Grid Max Size per Dimension");
+      auto &MaxDim = *Info.add("Grid Max Size per Dimension", std::monostate{},
+                               "", DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION);
       MaxDim.add("x", GridMaxDim.x);
       MaxDim.add("y", GridMaxDim.y);
       MaxDim.add("z", GridMaxDim.z);
@@ -2778,7 +2969,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
       if (Status == HSA_STATUS_SUCCESS)
-        PoolNode.add("Size", TmpSt, "bytes");
+        PoolNode.add(
+            "Size", TmpSt, "bytes",
+            (Pool->isGlobal() && Pool->isCoarseGrained())
+                ? std::optional<DeviceInfo>{DeviceInfo::GLOBAL_MEM_SIZE}
+                : std::nullopt);
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
                                 TmpBool);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index c9ab34b..2c01ed2 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -388,6 +388,9 @@ struct GenericKernelTy {
                            KernelLaunchParamsTy LaunchParams,
                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
 
+  virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                          uint64_t DynamicMemSize) const = 0;
+
   /// Get the kernel name.
   const char *getName() const { return Name.c_str(); }
 
@@ -431,6 +434,8 @@ protected:
       return "Generic";
     case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
       return "Generic-SPMD";
+    case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
+      return "SPMD-No-Loop";
     }
     llvm_unreachable("Unknown execution mode!");
   }
@@ -468,7 +473,8 @@ private:
                         uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
                         uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
 
-  /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+  /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop
+  /// or SPMD mode.
   bool isGenericSPMDMode() const {
     return KernelEnvironment.Configuration.ExecMode ==
            OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -483,6 +489,10 @@ private:
   bool isBareMode() const {
     return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
   }
+  bool isNoLoopMode() const {
+    return KernelEnvironment.Configuration.ExecMode ==
+           OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+  }
 
   /// The kernel name.
   std::string Name;
@@ -944,6 +954,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Insert a data fence between previous data operations and the following
+  /// operations, if necessary for the device.
+  virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
+
   /// Exchange data between devices (device to device transfer). Calling this
   /// function is only valid if GenericPlugin::isDataExchangable() passing the
   /// two devices returns true.
@@ -953,6 +967,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                  void *DstPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Fill data on the device with a pattern from the host
+  Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                 int64_t Size, __tgt_async_info *AsyncInfo);
+  virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
+                             int64_t PatternSize, int64_t Size,
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Run the kernel associated with \p EntryPtr
   Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
                      KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
@@ -965,6 +986,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error initDeviceInfo(__tgt_device_info *DeviceInfo);
   virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
 
+  /// Enqueue a host call to AsyncInfo
+  Error enqueueHostCall(void (*Callback)(void *), void *UserData,
+                        __tgt_async_info *AsyncInfo);
+  virtual Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                                    AsyncInfoWrapperTy &AsyncInfo) = 0;
+
   /// Create an event.
   Error createEvent(void **EventPtrStorage);
   virtual Error createEventImpl(void **EventPtrStorage) = 0;
@@ -984,6 +1011,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error waitEventImpl(void *EventPtr,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Check if the event enqueued to AsyncInfo is complete
+  Expected<bool> isEventComplete(void *Event, __tgt_async_info *AsyncInfo);
+  virtual Expected<bool>
+  isEventCompleteImpl(void *EventPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Synchronize the current thread with the event.
   Error syncEvent(void *EventPtr);
   virtual Error syncEventImpl(void *EventPtr) = 0;
@@ -1448,6 +1480,10 @@ public:
                               int DstDeviceId, void *DstPtr, int64_t Size,
                               __tgt_async_info *AsyncInfo);
 
+  /// Places a fence between previous data movements and following data
+  /// movements if necessary on the device
+  int32_t data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo);
+
   /// Begin executing a kernel on the given device.
   int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 083d416..e5a313d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -662,6 +662,10 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
     return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
   }
 
+  // Return the number of teams required to cover the loop iterations.
+  if (isNoLoopMode())
+    return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1;
+
   uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
   uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
   if (LoopTripCount > 0) {
@@ -1337,16 +1341,19 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
                                    bool ReleaseQueue) {
+  if (!AsyncInfo)
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "invalid async info queue");
+
   SmallVector<void *> AllocsToDelete{};
   {
     std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
 
-    if (!AsyncInfo || !AsyncInfo->Queue)
-      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                           "invalid async info queue");
-
-    if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
-      return Err;
+    // This can be false when no work has been added to the AsyncInfo. In which
+    // case, the device has nothing to synchronize.
+    if (AsyncInfo->Queue)
+      if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
+        return Err;
 
     std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
   }
@@ -1540,6 +1547,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
   return Err;
 }
 
+Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
+                                int64_t PatternSize, int64_t Size,
+                                __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Err =
+      dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                     ptrdiff_t *ArgOffsets,
                                     KernelArgsTy &KernelArgs,
@@ -1589,6 +1606,15 @@ Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
   return Err;
 }
 
+Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData,
+                                       __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+  auto Err = enqueueHostCallImpl(Callback, UserData, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
   assert(DeviceInfo && "Invalid device info");
 
@@ -1648,6 +1674,22 @@ Expected<bool> GenericDeviceTy::hasPendingWork(__tgt_async_info *AsyncInfo) {
   return Res;
 }
 
+Expected<bool> GenericDeviceTy::isEventComplete(void *Event,
+                                                __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Res = isEventCompleteImpl(Event, AsyncInfoWrapper);
+  if (auto Err = Res.takeError()) {
+    AsyncInfoWrapper.finalize(Err);
+    return Err;
+  }
+  // The query succeeded; still finalize, and report an error if that sets one.
+  auto Err = Plugin::success();
+  AsyncInfoWrapper.finalize(Err);
+  if (Err)
+    return Err;
+  return Res;
+}
+
 Error GenericDeviceTy::syncEvent(void *EventPtr) {
   return syncEventImpl(EventPtr);
 }
@@ -2324,3 +2366,15 @@ int32_t GenericPluginTy::async_barrier(omp_interop_val_t *Interop) {
   }
   return OFFLOAD_SUCCESS;
 }
+
+int32_t GenericPluginTy::data_fence(int32_t DeviceId,
+                                    __tgt_async_info *AsyncInfo) {
+  // Forward to the device; translate an llvm::Error into the C return code.
+  if (auto FenceErr = getDevice(DeviceId).dataFence(AsyncInfo)) {
+    REPORT("failure to place data fence on device %d: %s\n", DeviceId,
+           toString(std::move(FenceErr)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 361a781..f5b2d07 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
 DLWRAP(cuMemcpyHtoD, 3)
 DLWRAP(cuMemcpyHtoDAsync, 4)
 
+DLWRAP(cuMemsetD8Async, 4)
+DLWRAP(cuMemsetD16Async, 4)
+DLWRAP(cuMemsetD32Async, 4)
+DLWRAP(cuMemsetD2D8Async, 6)
+DLWRAP(cuMemsetD2D16Async, 6)
+DLWRAP(cuMemsetD2D32Async, 6)
+
 DLWRAP(cuMemFree, 1)
 DLWRAP(cuMemFreeHost, 1)
 DLWRAP(cuMemFreeAsync, 2)
@@ -72,6 +79,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
 DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
@@ -82,6 +90,7 @@ DLWRAP(cuCtxSetLimit, 2)
 
 DLWRAP(cuEventCreate, 2)
 DLWRAP(cuEventRecord, 2)
+DLWRAP(cuEventQuery, 1)
 DLWRAP(cuStreamWaitEvent, 3)
 DLWRAP(cuEventSynchronize, 1)
 DLWRAP(cuEventDestroy, 1)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c..dec4e33 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
 typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
 
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
@@ -321,6 +322,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
 CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
 
+CUresult cuMemsetD8Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD16Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                           CUstream);
+CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                            CUstream);
+CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                            CUstream);
+
 CUresult cuMemFree(CUdeviceptr);
 CUresult cuMemFreeHost(void *);
 CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
@@ -352,6 +363,7 @@ CUresult cuCtxSetLimit(CUlimit, size_t);
 
 CUresult cuEventCreate(CUevent *, unsigned int);
 CUresult cuEventRecord(CUevent, CUstream);
+CUresult cuEventQuery(CUevent);
 CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
 CUresult cuEventSynchronize(CUevent);
 CUresult cuEventDestroy(CUevent);
@@ -372,5 +384,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
 CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        const CUmemAllocationProp *prop,
                                        CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 
 #endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index f3f3783..af3c746 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Ask the CUDA occupancy API for the block size that maximizes occupancy.
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
+                                  uint64_t DynamicMemSize) const override {
+    int MinGridSize = 0;
+    int MaxBlockSize = 0;
+    CUresult Res = cuOccupancyMaxPotentialBlockSize(
+        &MinGridSize, &MaxBlockSize, Func, nullptr, DynamicMemSize, INT_MAX);
+    if (auto Err = Plugin::check(
+            Res, "error in cuOccupancyMaxPotentialBlockSize: %s"))
+      return Err;
+
+    return MaxBlockSize;
+  }
+
 private:
   /// The CUDA kernel function to execute.
   CUfunction Func;
@@ -844,6 +858,64 @@ struct CUDADeviceTy : public GenericDeviceTy {
                          void *DstPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
+  /// Fill \p Size bytes at \p TgtPtr with the repeated \p PatternPtr pattern.
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
+    CUresult Res = CUDA_SUCCESS;
+    size_t N = Size / PatternSize;
+    if (PatternSize == 1) {
+      Res = cuMemsetD8Async((CUdeviceptr)TgtPtr,
+                            *(static_cast<const uint8_t *>(PatternPtr)), N,
+                            Stream);
+    } else if (PatternSize == 2) {
+      Res = cuMemsetD16Async((CUdeviceptr)TgtPtr,
+                             *(static_cast<const uint16_t *>(PatternPtr)), N,
+                             Stream);
+    } else if (PatternSize == 4) {
+      Res = cuMemsetD32Async((CUdeviceptr)TgtPtr,
+                             *(static_cast<const uint32_t *>(PatternPtr)), N,
+                             Stream);
+    } else {
+      // Larger patterns: one strided 2D fill per MemsetSize-wide pattern chunk.
+      // Stop at the first failing fill so a later success cannot mask it.
+      int64_t MemsetSize = PatternSize % 4u == 0u   ? 4u
+                           : PatternSize % 2u == 0u ? 2u
+                                                    : 1u;
+      int64_t NumberOfSteps = PatternSize / MemsetSize;
+      int64_t Pitch = NumberOfSteps * MemsetSize;
+
+      for (int64_t Step = 0; Step < NumberOfSteps && Res == CUDA_SUCCESS;
+           ++Step) {
+        if (MemsetSize == 4) {
+          Res = cuMemsetD2D32Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint32_t *>(PatternPtr) + Step), 1u, N,
+              Stream);
+        } else if (MemsetSize == 2) {
+          Res = cuMemsetD2D16Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint16_t *>(PatternPtr) + Step), 1u, N,
+              Stream);
+        } else {
+          Res = cuMemsetD2D8Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint8_t *>(PatternPtr) + Step), 1u, N,
+              Stream);
+        }
+      }
+    }
+
+    return Plugin::check(Res, "error in cuMemset: %s");
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     if (auto Err = setContext())
@@ -856,6 +928,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
+  /// Insert a data fence between previous data operations and the following
+  /// operations. This is a no-op for CUDA devices as operations inserted into
+  /// a queue are in-order; \p Async is not used.
+  Error dataFence(__tgt_async_info *Async) override {
+    return Plugin::success();
+  }
+
   /// Initialize the device info for interoperability purposes.
   Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
     assert(Context && "Context is null");
@@ -873,6 +952,19 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
+  /// Enqueue \p Callback(\p UserData) as a host function on the async stream.
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+    CUresult Res = cuLaunchHostFunc(Stream, Callback, UserData);
+    return Plugin::check(Res, "error in cuLaunchHostFunc: %s");
+  }
+
   /// Create an event.
   Error createEventImpl(void **EventPtrStorage) override {
     CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
@@ -914,9 +1006,33 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "error in cuStreamWaitEvent: %s");
   }
 
-  // TODO: This should be implementable on CUDA
   Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
-    return true;
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+    // CUDA_SUCCESS means no pending work; CUDA_ERROR_NOT_READY means some is.
+    switch (CUresult Ret = cuStreamQuery(Stream)) {
+    case CUDA_SUCCESS:
+      return false;
+    case CUDA_ERROR_NOT_READY:
+      return true;
+    default:
+      return Plugin::check(Ret, "error in cuStreamQuery: %s");
+    }
+  }
+
+  /// Query whether the event behind \p EventPtr has completed.
+  Expected<bool> isEventCompleteImpl(void *EventPtr,
+                                     AsyncInfoWrapperTy &) override {
+    CUresult Ret = cuEventQuery(reinterpret_cast<CUevent>(EventPtr));
+    switch (Ret) {
+    case CUDA_SUCCESS:
+      return true;
+    case CUDA_ERROR_NOT_READY:
+      return false;
+    default:
+      return Plugin::check(Ret, "error in cuEventQuery: %s");
+    }
   }
 
   /// Synchronize the current thread with the event.
@@ -944,18 +1060,27 @@ struct CUDADeviceTy : public GenericDeviceTy {
     Info.add("CUDA OpenMP Device Number", DeviceId);
 
     Res = cuDeviceGetName(TmpChar, 1000, Device);
-    if (Res == CUDA_SUCCESS)
+    if (Res == CUDA_SUCCESS) {
       Info.add("Device Name", TmpChar, "", DeviceInfo::NAME);
+      Info.add("Product Name", TmpChar, "", DeviceInfo::PRODUCT_NAME);
+    }
 
     Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR);
 
+    Info.add("Vendor ID", uint64_t{4318}, "", DeviceInfo::VENDOR_ID);
+
+    Info.add("Memory Address Size", std::numeric_limits<CUdeviceptr>::digits,
+             "bits", DeviceInfo::ADDRESS_BITS);
+
     Res = cuDeviceTotalMem(&TmpSt, Device);
     if (Res == CUDA_SUCCESS)
-      Info.add("Global Memory Size", TmpSt, "bytes");
+      Info.add("Global Memory Size", TmpSt, "bytes",
+               DeviceInfo::GLOBAL_MEM_SIZE);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add("Number of Multiprocessors", TmpInt);
+      Info.add("Number of Multiprocessors", TmpInt, "",
+               DeviceInfo::NUM_COMPUTE_UNITS);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -995,7 +1120,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_SUCCESS)
       MaxBlock.add("z", TmpInt);
 
-    auto &MaxGrid = *Info.add("Maximum Grid Dimensions", "");
+    // TODO: I assume CUDA devices have no limit on the amount of threads,
+    // verify this
+    Info.add("Maximum Grid Size", std::numeric_limits<uint32_t>::max(), "",
+             DeviceInfo::MAX_WORK_SIZE);
+
+    auto &MaxGrid = *Info.add("Maximum Grid Dimensions", std::monostate{}, "",
+                              DeviceInfo::MAX_WORK_SIZE_PER_DIMENSION);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
     if (Res == CUDA_SUCCESS)
       MaxGrid.add("x", TmpInt);
@@ -1016,7 +1147,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add("Clock Rate", TmpInt, "kHz");
+      Info.add("Clock Rate", TmpInt / 1000, "MHz",
+               DeviceInfo::MAX_CLOCK_FREQUENCY);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -1053,7 +1185,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add("Memory Clock Rate", TmpInt, "kHz");
+      Info.add("Memory Clock Rate", TmpInt / 1000, "MHz",
+               DeviceInfo::MEMORY_CLOCK_RATE);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -1317,9 +1450,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
     CUresult AttrResult = cuFuncSetAttribute(
         Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
-    return Plugin::check(
-        AttrResult,
-        "Error in cuLaunchKernel while setting the memory limits: %s");
+    if (auto Err = Plugin::check(
+            AttrResult,
+            "error in cuFuncSetAttribute while setting the memory limits: %s"))
+      return Err;
     MaxDynCGroupMemLimit = MaxDynCGroupMem;
   }
 
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index ed52135..f440eba 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy {
     return Plugin::success();
   }
 
+  /// Not implemented for the host device: always returns an UNSUPPORTED error.
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &Device,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations are not implemented for the host device");
+  }
+
 private:
   /// The kernel function to execute.
   void (*Func)(void);
@@ -295,6 +303,28 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
                          "dataExchangeImpl not supported");
   }
 
+  /// Insert a data fence between previous data operations and the following
+  /// operations. This is a no-op for Host devices as operations inserted into
+  /// a queue are in-order; \p Async is not used.
+  Error dataFence(__tgt_async_info *Async) override {
+    return Plugin::success();
+  }
+
+  /// Fill \p Size bytes at \p TgtPtr with whole copies of the given pattern.
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (PatternSize == 1) {
+      std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size);
+    } else if (PatternSize > 1) {
+      // 64-bit offset (no wrap for fills >= 4 GiB); never write past Size.
+      auto *Dst = static_cast<char *>(TgtPtr);
+      for (int64_t Step = 0; Step + PatternSize <= Size; Step += PatternSize)
+        std::memcpy(Dst + Step, PatternPtr, PatternSize);
+    }
+    return Plugin::success();
+  }
+
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
@@ -320,6 +350,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
                          "initDeviceInfoImpl not supported");
   }
 
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    Callback(UserData); // Invoked right here; \p AsyncInfo is not consulted.
+    return Plugin::success();
+  };
+
   /// This plugin does not support the event API. Do nothing without failing.
   Error createEventImpl(void **EventPtrStorage) override {
     *EventPtrStorage = nullptr;
@@ -337,6 +373,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
     return true;
   }
+  Expected<bool> isEventCompleteImpl(void *Event,
+                                     AsyncInfoWrapperTy &AsyncInfo) override {
+    return true; // Unconditional: every event is reported as complete.
+  }
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
 
   /// Print information about the device.
diff --git a/offload/test/mapping/data_member_ref.cpp b/offload/test/mapping/data_member_ref.cpp
index fdb8abc..7947a62 100644
--- a/offload/test/mapping/data_member_ref.cpp
+++ b/offload/test/mapping/data_member_ref.cpp
@@ -60,7 +60,8 @@ int main() {
   printf("Host %d %d.\n", Bar.VRef.Data, V.Data);
   // CHECK: Host 123456.
   printf("Host %d.\n", *Baz.VRef.Data);
-#pragma omp target map(*Baz.VRef.Data) map(from : D1, D2)
+#pragma omp target map(Baz.VRef.Data) map(*Baz.VRef.Data) map(V1.Data[0 : 0])  \
+    map(from : D1, D2)
   {
     // CHECK: Device 123456.
     D1 = *Baz.VRef.Data;
diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
index c6c5657..45fd042 100644
--- a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
+++ b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
@@ -44,8 +44,8 @@ int main() {
 
   int spp00fa = -1, spp00fca = -1, spp00fb_r = -1;
   __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]);
-#pragma omp target map(tofrom: spp[0][0]) firstprivate(p)                           \
-                   map(from: spp00fa, spp00fca, spp00fb_r)
+#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0]) firstprivate(p) \
+    map(from : spp00fa, spp00fca, spp00fb_r)
   {
     spp00fa = spp[0][0].f.a;
     spp00fca = spp[0][0].f.c.a;
diff --git a/offload/test/mapping/declare_mapper_nested_mappers.cpp b/offload/test/mapping/declare_mapper_nested_mappers.cpp
index a9e3f05..a59ed69 100644
--- a/offload/test/mapping/declare_mapper_nested_mappers.cpp
+++ b/offload/test/mapping/declare_mapper_nested_mappers.cpp
@@ -42,8 +42,8 @@ int main() {
   int spp00fa = -1, spp00fb_r = -1, spp00fg1 = -1, spp00fg_r = -1;
   __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]),
              p1 = reinterpret_cast<__intptr_t>(&y[0]);
-#pragma omp target map(tofrom : spp[0][0]) firstprivate(p, p1)                  \
-                   map(from: spp00fa, spp00fb_r, spp00fg1, spp00fg_r)
+#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0])                 \
+    firstprivate(p, p1) map(from : spp00fa, spp00fb_r, spp00fg1, spp00fg_r)
   {
     spp00fa = spp[0][0].f.a;
     spp00fb_r = spp[0][0].f.b == reinterpret_cast<void *>(p) ? 1 : 0;
diff --git a/offload/test/mapping/map_ptr_and_star_global.c b/offload/test/mapping/map_ptr_and_star_global.c
index c3b0dd2..869fb8c 100644
--- a/offload/test/mapping/map_ptr_and_star_global.c
+++ b/offload/test/mapping/map_ptr_and_star_global.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/map_ptr_and_star_local.c b/offload/test/mapping/map_ptr_and_star_local.c
index f0ca84d..cc826b3 100644
--- a/offload/test/mapping/map_ptr_and_star_local.c
+++ b/offload/test/mapping/map_ptr_and_star_local.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/map_ptr_and_subscript_global.c b/offload/test/mapping/map_ptr_and_subscript_global.c
index a3a10b6..839db06 100644
--- a/offload/test/mapping/map_ptr_and_subscript_global.c
+++ b/offload/test/mapping/map_ptr_and_subscript_global.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/map_ptr_and_subscript_local.c b/offload/test/mapping/map_ptr_and_subscript_local.c
index bb44999..68ac9dc 100644
--- a/offload/test/mapping/map_ptr_and_subscript_local.c
+++ b/offload/test/mapping/map_ptr_and_subscript_local.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/map_structptr_and_member_global.c b/offload/test/mapping/map_structptr_and_member_global.c
index 10e72e0..960eea4 100644
--- a/offload/test/mapping/map_structptr_and_member_global.c
+++ b/offload/test/mapping/map_structptr_and_member_global.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/map_structptr_and_member_local.c b/offload/test/mapping/map_structptr_and_member_local.c
index 9e59551..bd75940 100644
--- a/offload/test/mapping/map_structptr_and_member_local.c
+++ b/offload/test/mapping/map_structptr_and_member_local.c
@@ -1,5 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <omp.h>
 #include <stdio.h>
 
diff --git a/offload/test/mapping/ptr_and_obj_motion.c b/offload/test/mapping/ptr_and_obj_motion.c
index 8fa2c98..a94c07aa 100644
--- a/offload/test/mapping/ptr_and_obj_motion.c
+++ b/offload/test/mapping/ptr_and_obj_motion.c
@@ -17,7 +17,7 @@ void init(double vertexx[]) {
 }
 
 void change(DV *dvptr) {
-#pragma omp target map(dvptr->dataptr[0 : 100])
+#pragma omp target map(dvptr->dataptr[0 : 100]) map(alloc : dvptr -> dataptr)
   {
     printf("In change: %lf, expected 77.0\n", dvptr->dataptr[77]);
     dvptr->dataptr[77] += 1.0;
diff --git a/offload/test/mapping/target_derefence_array_pointrs.cpp b/offload/test/mapping/target_derefence_array_pointrs.cpp
index a6dd4069..d213c87 100644
--- a/offload/test/mapping/target_derefence_array_pointrs.cpp
+++ b/offload/test/mapping/target_derefence_array_pointrs.cpp
@@ -18,23 +18,24 @@ void foo(int **t1d) {
 
   for (j = 0; j < 3; j++)
     (*t1d)[j] = 0;
-#pragma omp target map(tofrom : (*t1d)[0 : 3])
+#pragma omp target map(tofrom : (*t1d)[0 : 3]) map(alloc : *t1d)
   { (*t1d)[1] = 1; }
   // CHECK: 1
   printf("%d\n", (*t1d)[1]);
-#pragma omp target map(tofrom : (**t2d)[0 : 3])
+#pragma omp target map(tofrom : (**t2d)[0 : 3]) map(alloc : **t2d, *t2d)
   { (**t2d)[1] = 2; }
   // CHECK: 2
   printf("%d\n", (**t2d)[1]);
-#pragma omp target map(tofrom : (***t3d)[0 : 3])
+#pragma omp target map(tofrom : (***t3d)[0 : 3])                               \
+    map(alloc : ***t3d, **t3d, *t3d)
   { (***t3d)[1] = 3; }
   // CHECK: 3
   printf("%d\n", (***t3d)[1]);
-#pragma omp target map(tofrom : (**t1d))
+#pragma omp target map(tofrom : (**t1d)) map(alloc : *t1d)
   { (*t1d)[0] = 4; }
   // CHECK: 4
   printf("%d\n", (*t1d)[0]);
-#pragma omp target map(tofrom : (*(*(t1d + a) + b)))
+#pragma omp target map(tofrom : (*(*(t1d + a) + b))) map(to : *(t1d + a))
   { *(*(t1d + a) + b) = 5; }
   // CHECK: 5
   printf("%d\n", *(*(t1d + a) + b));
@@ -49,7 +50,7 @@ void bar() {
   for (int i = 0; i < 3; i++) {
     (**a)[1] = i;
   }
-#pragma omp target map((**a)[ : 3])
+#pragma omp target map((**a)[ : 3]) map(alloc : **a, *a)
   {
     (**a)[1] = 6;
     // CHECK: 6
@@ -73,7 +74,8 @@ void zoo(int **f, SSA *sa) {
   *(f + sa->i + 1) = t;
   *(sa->sa->i + *(f + sa->i + 1)) = 4;
   printf("%d\n", *(sa->sa->i + *(1 + sa->i + f)));
-#pragma omp target map(sa, *(sa->sa->i + *(1 + sa->i + f)))
+#pragma omp target map(*(sa->sa->i + *(1 + sa->i + f))) map(alloc : sa->sa)    \
+    map(to : sa->i) map(to : sa->sa->i) map(to : *(1 + sa->i + f))
   { *(sa->sa->i + *(1 + sa->i + f)) = 7; }
   // CHECK: 7
   printf("%d\n", *(sa->sa->i + *(1 + sa->i + f)));
@@ -87,13 +89,13 @@ void xoo() {
 
 void yoo(int **x) {
   *x = (int *)malloc(2 * sizeof(int));
-#pragma omp target map(**x)
+#pragma omp target map(**x) map(alloc : *x)
   {
     **x = 8;
     // CHECK: 8
     printf("%d\n", **x);
   }
-#pragma omp target map(*(*x + 1))
+#pragma omp target map(*(*x + 1)) map(alloc : *x)
   {
     *(*x + 1) = 9;
     // CHECK: 9
diff --git a/offload/test/mapping/target_has_device_addr.c b/offload/test/mapping/target_has_device_addr.c
index e8bfff8..f238832 100644
--- a/offload/test/mapping/target_has_device_addr.c
+++ b/offload/test/mapping/target_has_device_addr.c
@@ -66,8 +66,9 @@ void zoo() {
   short **xpp = &xp[0];
 
   x[1] = 111;
-#pragma omp target data map(tofrom : xpp[1][1]) use_device_addr(xpp[1][1])
-#pragma omp target has_device_addr(xpp[1][1])
+#pragma omp target data map(tofrom : xpp[1][1]) map(xpp[1])                    \
+    use_device_addr(xpp[1])
+#pragma omp target has_device_addr(xpp[1])
   {
     xpp[1][1] = 222;
     // CHECK: 222
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp
new file mode 100644
index 0000000..3b1a819
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_existing.cpp
@@ -0,0 +1,85 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+  int *paa[10][10];
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    int *original_ph3 = &ph[3];
+    int **original_paa02 = &paa[0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5])
+    int *mapped_ptr_ph3 =
+        (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+    int **mapped_ptr_paa02 =
+        (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device());
+
+    // CHECK-COUNT-4: 1
+    printf("%d\n", mapped_ptr_ph3 != nullptr);
+    printf("%d\n", mapped_ptr_paa02 != nullptr);
+    printf("%d\n", original_ph3 != mapped_ptr_ph3);
+    printf("%d\n", original_paa02 != mapped_ptr_paa02);
+
+// (A) use_device_addr operand within mapped address range.
+// CHECK: A: 1
+#pragma omp target data use_device_addr(ph[3 : 4])
+    printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_addr operand in extended address range, but not
+// mapped address range.
+// CHECK: B: 1
+#pragma omp target data use_device_addr(ph[2])
+    printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) use_device_addr/map: same base-array, different first-location.
+// CHECK: C: 1
+#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1])
+    printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) use_device_addr/map: different base-array/pointers.
+// CHECK: D: 1
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+    printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) use_device_addr operand within mapped range of previous map.
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa[0])
+    printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (F) use_device_addr/map: different operands, same base-array.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2])
+    printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (G) use_device_addr/map: different base-array/pointers.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2])
+    printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5])
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp
new file mode 100644
index 0000000..b9ebde4
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_not_existing.cpp
@@ -0,0 +1,143 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+  int *paa[10][10];
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    int *original_ph3 = &ph[3];
+    int **original_paa02 = &paa[0][2];
+
+// (A) No corresponding map, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+    }
+
+// (B) use_device_addr/map: different operands, same base-pointer.
+// use_device_addr operand within mapped address range.
+// CHECK: B: 1 1 1
+#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1])
+    {
+      int *mapped_ptr_ph4 =
+          (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr,
+             mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4);
+    }
+
+// (C) use_device_addr/map: different base-pointers.
+// No corresponding storage, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+    }
+
+// (D) use_device_addr/map: one of two maps with matching base-pointer.
+// use_device_addr operand within mapped address range of second map,
+// lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (E) No corresponding map, lookup should fail
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == (int **)nullptr + 2);
+    }
+
+// (F) use_device_addr/map: different operands, same base-array.
+// use_device_addr within mapped address range. Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == mapped_ptr_paa02);
+    }
+
+// (G) use_device_addr/map: different operands, same base-array.
+// use_device_addr extends beyond existing mapping. Not spec compliant.
+// But the lookup succeeds because we use the base-address for translation.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[0][4]) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr(
+          original_paa02 + 2, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr,
+             mapped_ptr_paa04 != original_paa02 + 2,
+             &paa[0][4] == mapped_ptr_paa04);
+    }
+
+    int *original_paa020 = &paa[0][2][0];
+    int **original_paa0 = (int **)&paa[0];
+
+// (H) use_device_addr/map: different base-pointers.
+// No corresponding storage for use_device_addr operand, lookup should fail.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa020 =
+          (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device());
+      int **mapped_ptr_paa0 =
+          (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr,
+             mapped_ptr_paa0 == nullptr, &paa[0] == nullptr);
+    }
+
+// (I) use_device_addr/map: one map with different, one with same base-ptr.
+// Lookup should succeed.
+// CHECK: I: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == mapped_ptr_paa02);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp
new file mode 100644
index 0000000..e9a1124
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_existing.cpp
@@ -0,0 +1,98 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section on a reference.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+  int *(&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    int *original_ph3 = &ph[3];
+    int **original_paa02 = &paa[0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[0][2 : 5])
+    int *mapped_ptr_ph3 =
+        (int *)omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+    int **mapped_ptr_paa02 =
+        (int **)omp_get_mapped_ptr(&paa[0][2], omp_get_default_device());
+
+    // CHECK-COUNT-4: 1
+    printf("%d\n", mapped_ptr_ph3 != nullptr);
+    printf("%d\n", mapped_ptr_paa02 != nullptr);
+    printf("%d\n", original_ph3 != mapped_ptr_ph3);
+    printf("%d\n", original_paa02 != mapped_ptr_paa02);
+
+// (A) use_device_addr operand within mapped address range.
+// EXPECTED: A: 1
+// CHECK:    A: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[3 : 4])
+    printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_addr operand in extended address range, but not
+// mapped address range.
+// EXPECTED: B: 1
+// CHECK:    B: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[2])
+    printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) use_device_addr/map: same base-array, different first-location.
+// EXPECTED: C: 1
+// CHECK:    C: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_addr(ph[4 : 1])
+    printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) use_device_addr/map: different base-array/pointers.
+// EXPECTED: D: 1
+// CHECK:    D: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+    printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) use_device_addr operand within mapped range of previous map.
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa[0])
+    printf("E: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (F) use_device_addr/map: different operands, same base-array.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][3]) use_device_addr(paa[0][2])
+    printf("F: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+// (G) use_device_addr/map: different base-array/pointers.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0][2])
+    printf("G: %d\n", mapped_ptr_paa02 == &paa[0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[0][2 : 5])
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp
new file mode 100644
index 0000000..0090cdb
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_ref_not_existing.cpp
@@ -0,0 +1,158 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on an array-section on a reference.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+  int *(&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    int *original_ph3 = &ph[3];
+    int **original_paa02 = &paa[0][2];
+
+// (A) No corresponding map, lookup should fail.
+// EXPECTED: A: 1 1 1
+// CHECK:    A: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+    }
+
+// (B) use_device_addr/map: different operands, same base-pointer.
+// use_device_addr operand within mapped address range.
+// EXPECTED: B: 1 1 1
+// CHECK:    B: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[2 : 3]) use_device_addr(ph[3 : 1])
+    {
+      int *mapped_ptr_ph4 =
+          (int *)omp_get_mapped_ptr(original_ph3 + 1, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_ph4 != nullptr,
+             mapped_ptr_ph4 != original_ph3 + 1, &ph[4] == mapped_ptr_ph4);
+    }
+
+// (C) use_device_addr/map: different base-pointers.
+// No corresponding storage, lookup should fail.
+// EXPECTED: C: 1 1 1
+// CHECK:    C: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == (int *)nullptr + 3);
+    }
+
+// (D) use_device_addr/map: one of two maps with matching base-pointer.
+// use_device_addr operand within mapped address range of second map,
+// lookup should succeed.
+// EXPECTED: D: 1 1 1
+// CHECK:    D: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[2 : 5]) use_device_addr(ph[3 : 4])
+    {
+      int *mapped_ptr_ph3 =
+          (int *)omp_get_mapped_ptr(original_ph3, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (E) No corresponding map, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_paa02 == nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == (int **)nullptr + 2);
+    }
+
+// (F) use_device_addr/map: different operands, same base-array.
+// use_device_addr within mapped address range. Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == mapped_ptr_paa02);
+    }
+
+// (G) use_device_addr/map: different operands, same base-array.
+// use_device_addr extends beyond existing mapping. Not spec compliant.
+// But the lookup succeeds because we use the base-address for translation.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[0][4]) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa04 = (int **)omp_get_mapped_ptr(
+          original_paa02 + 2, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_paa04 != nullptr,
+             mapped_ptr_paa04 != original_paa02 + 2,
+             &paa[0][4] == mapped_ptr_paa04);
+    }
+
+    int *original_paa020 = &paa[0][2][0];
+    int **original_paa0 = (int **)&paa[0];
+
+// (H) use_device_addr/map: different base-pointers.
+// No corresponding storage for use_device_addr operand, lookup should fail.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa[0])
+    {
+      int **mapped_ptr_paa020 =
+          (int **)omp_get_mapped_ptr(original_paa020, omp_get_default_device());
+      int **mapped_ptr_paa0 =
+          (int **)omp_get_mapped_ptr(original_paa0, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_paa020 != nullptr,
+             mapped_ptr_paa0 == nullptr, &paa[0] == nullptr);
+    }
+
+// (I) use_device_addr/map: one map with different, one with same base-ptr.
+// Lookup should succeed.
+// CHECK: I: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa[0]) use_device_addr(paa[0][2])
+    {
+      int **mapped_ptr_paa02 =
+          (int **)omp_get_mapped_ptr(original_paa02, omp_get_default_device());
+      printf("I: %d %d %d\n", mapped_ptr_paa02 != nullptr,
+             mapped_ptr_paa02 != original_paa02,
+             &paa[0][2] == mapped_ptr_paa02);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp
new file mode 100644
index 0000000..883297f
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_existing.cpp
@@ -0,0 +1,93 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a variable (not a section).
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+  int *paa[10][10];
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    void *original_addr_g = &g;
+    void *original_addr_h = &h;
+    void *original_addr_ph = &ph;
+    void *original_addr_paa = &paa;
+
+#pragma omp target enter data map(to : g, h, ph, paa)
+    void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device());
+    void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device());
+    void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device());
+    void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device());
+
+    // CHECK-COUNT-8: 1
+    printf("%d\n", mapped_ptr_g != nullptr);
+    printf("%d\n", mapped_ptr_h != nullptr);
+    printf("%d\n", mapped_ptr_ph != nullptr);
+    printf("%d\n", mapped_ptr_paa != nullptr);
+    printf("%d\n", original_addr_g != mapped_ptr_g);
+    printf("%d\n", original_addr_h != mapped_ptr_h);
+    printf("%d\n", original_addr_ph != mapped_ptr_ph);
+    printf("%d\n", original_addr_paa != mapped_ptr_paa);
+
+// (A)
+// CHECK: A: 1
+#pragma omp target data use_device_addr(g)
+    printf("A: %d\n", mapped_ptr_g == &g);
+
+// (B)
+// CHECK: B: 1
+#pragma omp target data use_device_addr(h)
+    printf("B: %d\n", mapped_ptr_h == &h);
+
+// (C)
+// CHECK: C: 1
+#pragma omp target data use_device_addr(ph)
+    printf("C: %d\n", mapped_ptr_ph == &ph);
+
+// (D) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &ph, not &ph[0/1].
+// CHECK: D: 1
+#pragma omp target data map(ph[1 : 2]) use_device_addr(ph)
+    printf("D: %d\n", mapped_ptr_ph == &ph);
+
+// (E)
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa)
+    printf("E: %d\n", mapped_ptr_paa == &paa);
+
+// (F) use_device_addr/map with same base-array, paa.
+// Address translation should happen for &paa.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][2]) use_device_addr(paa)
+    printf("F: %d\n", mapped_ptr_paa == &paa);
+
+// (G) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &paa.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+    printf("G: %d\n", mapped_ptr_paa == &paa);
+
+#pragma omp target exit data map(release : g, h, ph, paa)
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp
new file mode 100644
index 0000000..79c6f69
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_not_existing.cpp
@@ -0,0 +1,159 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a variable (not a section).
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g, h[10];
+int *ph = &h[0];
+
+struct S {
+  int *paa[10][10];
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    void *original_addr_g = &g;
+    void *original_addr_h = &h;
+    void *original_addr_ph = &ph;
+    void *original_addr_paa = &paa;
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(g)
+    {
+      void *mapped_ptr_g =
+          omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_g == nullptr,
+             mapped_ptr_g != original_addr_g, (void *)&g == nullptr);
+    }
+
+// (B) Lookup should succeed.
+// CHECK: B: 1 1 1
+#pragma omp target data map(g) use_device_addr(g)
+    {
+      void *mapped_ptr_g =
+          omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_g != nullptr,
+             mapped_ptr_g != original_addr_g, &g == mapped_ptr_g);
+    }
+
+// (C) No corresponding item, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data use_device_addr(h)
+    {
+      void *mapped_ptr_h =
+          omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_h == nullptr,
+             mapped_ptr_h != original_addr_h, (void *)&h == nullptr);
+    }
+
+// (D) Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(h) use_device_addr(h)
+    {
+      void *mapped_ptr_h =
+          omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_h != nullptr,
+             mapped_ptr_h != original_addr_h, &h == mapped_ptr_h);
+    }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_ph == nullptr,
+             mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+    }
+
+// (F) Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_ph != nullptr,
+             mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+    }
+
+// (G) Maps pointee only, but use_device_addr operand is pointer.
+// Lookup should fail.
+// CHECK: G: 1 1 1
+#pragma omp target data map(ph[0 : 1]) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_ph == nullptr,
+             mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+    }
+
+// (H) Maps both pointee and pointer. Lookup for pointer should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_ph != nullptr,
+             mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+    }
+
+// (I) No corresponding item, lookup should fail.
+// CHECK: I: 1 1 1
+#pragma omp target data use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("I: %d %d %d\n", mapped_ptr_paa == nullptr,
+             mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+    }
+
+// (J) Maps pointee only, but use_device_addr operand is the array of pointers.
+// Lookup should fail.
+// CHECK: J: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("J: %d %d %d\n", mapped_ptr_paa == nullptr,
+             mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+    }
+
+// (K) Lookup should succeed.
+// CHECK: K: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("K: %d %d %d\n", mapped_ptr_paa != nullptr,
+             mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+    }
+
+// (L) Maps both pointee and array. Lookup for the array should succeed.
+// CHECK: L: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("L: %d %d %d\n", mapped_ptr_paa != nullptr,
+             mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp
new file mode 100644
index 0000000..f018c65
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_existing.cpp
@@ -0,0 +1,100 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a reference variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+  int *(&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    void *original_addr_g = &g;
+    void *original_addr_h = &h;
+    void *original_addr_ph = &ph;
+    void *original_addr_paa = &paa;
+
+#pragma omp target enter data map(to : g, h, ph, paa)
+    void *mapped_ptr_g = omp_get_mapped_ptr(&g, omp_get_default_device());
+    void *mapped_ptr_h = omp_get_mapped_ptr(&h, omp_get_default_device());
+    void *mapped_ptr_ph = omp_get_mapped_ptr(&ph, omp_get_default_device());
+    void *mapped_ptr_paa = omp_get_mapped_ptr(&paa, omp_get_default_device());
+
+    // CHECK-COUNT-8: 1
+    printf("%d\n", mapped_ptr_g != nullptr);
+    printf("%d\n", mapped_ptr_h != nullptr);
+    printf("%d\n", mapped_ptr_ph != nullptr);
+    printf("%d\n", mapped_ptr_paa != nullptr);
+    printf("%d\n", original_addr_g != mapped_ptr_g);
+    printf("%d\n", original_addr_h != mapped_ptr_h);
+    printf("%d\n", original_addr_ph != mapped_ptr_ph);
+    printf("%d\n", original_addr_paa != mapped_ptr_paa);
+
+// (A)
+// CHECK: A: 1
+#pragma omp target data use_device_addr(g)
+    printf("A: %d\n", mapped_ptr_g == &g);
+
+// (B)
+// CHECK: B: 1
+#pragma omp target data use_device_addr(h)
+    printf("B: %d\n", mapped_ptr_h == &h);
+
+// (C)
+// CHECK: C: 1
+#pragma omp target data use_device_addr(ph)
+    printf("C: %d\n", mapped_ptr_ph == &ph);
+
+// (D) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &ph, not &ph[0/1].
+// CHECK: D: 1
+#pragma omp target data map(ph[1 : 2]) use_device_addr(ph)
+    printf("D: %d\n", mapped_ptr_ph == &ph);
+
+// (E)
+// CHECK: E: 1
+#pragma omp target data use_device_addr(paa)
+    printf("E: %d\n", mapped_ptr_paa == &paa);
+
+// (F) use_device_addr/map with same base-array, paa.
+// Address translation should happen for &paa.
+// CHECK: F: 1
+#pragma omp target data map(paa[0][2]) use_device_addr(paa)
+    printf("F: %d\n", mapped_ptr_paa == &paa);
+
+// (G) use_device_addr/map with different base-array/pointer.
+// Address translation should happen for &paa.
+// CHECK: G: 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+    printf("G: %d\n", mapped_ptr_paa == &paa);
+
+#pragma omp target exit data map(release : g, h, ph, paa)
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp
new file mode 100644
index 0000000..9360db4
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_ref_not_existing.cpp
@@ -0,0 +1,166 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_addr on a reference variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int g_ptee;
+int &g = g_ptee;
+
+int h_ptee[10];
+int (&h)[10] = h_ptee;
+
+int *ph_ptee = &h_ptee[0];
+int *&ph = ph_ptee;
+int *paa_ptee[10][10];
+
+struct S {
+  int *(&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa[0][2] = &g;
+
+    void *original_addr_g = &g;
+    void *original_addr_h = &h;
+    void *original_addr_ph = &ph;
+    void *original_addr_paa = &paa;
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_addr(g)
+    {
+      void *mapped_ptr_g =
+          omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_g == nullptr,
+             mapped_ptr_g != original_addr_g, (void *)&g == nullptr);
+    }
+
+// (B) Lookup should succeed.
+// CHECK: B: 1 1 1
+#pragma omp target data map(g) use_device_addr(g)
+    {
+      void *mapped_ptr_g =
+          omp_get_mapped_ptr(original_addr_g, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_g != nullptr,
+             mapped_ptr_g != original_addr_g, &g == mapped_ptr_g);
+    }
+
+// (C) No corresponding item, lookup should fail.
+// CHECK: C: 1 1 1
+#pragma omp target data use_device_addr(h)
+    {
+      void *mapped_ptr_h =
+          omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_h == nullptr,
+             mapped_ptr_h != original_addr_h, (void *)&h == nullptr);
+    }
+
+// (D) Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(h) use_device_addr(h)
+    {
+      void *mapped_ptr_h =
+          omp_get_mapped_ptr(original_addr_h, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_h != nullptr,
+             mapped_ptr_h != original_addr_h, &h == mapped_ptr_h);
+    }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_ph == nullptr,
+             mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+    }
+
+// (F) Lookup should succeed.
+// CHECK: F: 1 1 1
+#pragma omp target data map(ph) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_ph != nullptr,
+             mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+    }
+
+// (G) Maps pointee only, but use_device_addr operand is pointer.
+// Lookup should fail.
+// CHECK: G: 1 1 1
+#pragma omp target data map(ph[0 : 1]) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_ph == nullptr,
+             mapped_ptr_ph != original_addr_ph, (void *)&ph == nullptr);
+    }
+
+// (H) Maps both pointee and pointer. Lookup for pointer should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(ph[0 : 1]) map(ph) use_device_addr(ph)
+    {
+      void *mapped_ptr_ph =
+          omp_get_mapped_ptr(original_addr_ph, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_ph != nullptr,
+             mapped_ptr_ph != original_addr_ph, &ph == mapped_ptr_ph);
+    }
+
+// (I) No corresponding item, lookup should fail.
+// CHECK: I: 1 1 1
+#pragma omp target data use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("I: %d %d %d\n", mapped_ptr_paa == nullptr,
+             mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+    }
+
+// (J) Maps pointee only, but use_device_addr operand is the array of pointers.
+// Lookup should fail.
+// CHECK: J: 1 1 1
+#pragma omp target data map(paa[0][2][0]) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("J: %d %d %d\n", mapped_ptr_paa == nullptr,
+             mapped_ptr_paa != original_addr_paa, (void *)&paa == nullptr);
+    }
+
+// (K) Lookup should succeed.
+// CHECK: K: 1 1 1
+#pragma omp target data map(paa) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("K: %d %d %d\n", mapped_ptr_paa != nullptr,
+             mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+    }
+
+// (L) Maps both pointee and array. Lookup for the array should succeed.
+// CHECK: L: 1 1 1
+#pragma omp target data map(paa[0][2][0]) map(paa) use_device_addr(paa)
+    {
+      void *mapped_ptr_paa =
+          omp_get_mapped_ptr(original_addr_paa, omp_get_default_device());
+      printf("L: %d %d %d\n", mapped_ptr_paa != nullptr,
+             mapped_ptr_paa != original_addr_paa, &paa == mapped_ptr_paa);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/target_use_device_addr.c b/offload/test/mapping/use_device_addr/target_use_device_addr.c
index 5c2bb8a..4a9dbe2 100644
--- a/offload/test/mapping/target_use_device_addr.c
+++ b/offload/test/mapping/use_device_addr/target_use_device_addr.c
@@ -12,7 +12,9 @@ int main() {
   printf("%d, %p\n", xp[1], &xp[1]);
 #pragma omp target data use_device_addr(xp[1 : 3]) map(tofrom : x)
 #pragma omp target is_device_ptr(xp)
-  { xp[1] = 222; }
+  {
+    xp[1] = 222;
+  }
   // CHECK: 222
   printf("%d, %p\n", xp[1], &xp[1]);
 }
diff --git a/offload/test/mapping/target_wrong_use_device_addr.c b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c
index 7a5babd..28ec685 100644
--- a/offload/test/mapping/target_wrong_use_device_addr.c
+++ b/offload/test/mapping/use_device_addr/target_wrong_use_device_addr.c
@@ -14,7 +14,7 @@ int main() {
   // CHECK: host addr=0x[[#%x,HOST_ADDR:]]
   fprintf(stderr, "host addr=%p\n", x);
 
-#pragma omp target data map(to : x [0:10])
+#pragma omp target data map(to : x[0 : 10])
   {
 // CHECK: omptarget device 0 info: variable x does not have a valid device
 // counterpart
@@ -27,4 +27,3 @@ int main() {
 
   return 0;
 }
-
diff --git a/offload/test/mapping/array_section_use_device_ptr.c b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c
index 86e2875..4cfcce2 100644
--- a/offload/test/mapping/array_section_use_device_ptr.c
+++ b/offload/test/mapping/use_device_ptr/array_section_use_device_ptr.c
@@ -20,7 +20,9 @@ int main() {
 
   float *A_dev = NULL;
 #pragma omp target data use_device_ptr(A)
-  { A_dev = A; }
+  {
+    A_dev = A;
+  }
 #pragma omp target exit data map(delete : A[FROM : LENGTH])
 
   // CHECK: Success
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp
new file mode 100644
index 0000000..a7745de
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_existing.cpp
@@ -0,0 +1,100 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int h[10];
+int *ph = &h[0];
+
+struct S {
+  int (*paa)[10][10] = &aa;
+
+  void f1(int i) {
+    paa--;
+    void *original_ph3 = &ph[3];
+    void *original_paa102 = &paa[1][0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5])
+    void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+    void *mapped_ptr_paa102 =
+        omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device());
+
+    // CHECK-COUNT-4: 1
+    printf("%d\n", mapped_ptr_ph3 != nullptr);
+    printf("%d\n", mapped_ptr_paa102 != nullptr);
+    printf("%d\n", original_ph3 != mapped_ptr_ph3);
+    printf("%d\n", original_paa102 != mapped_ptr_paa102);
+
+// (A) Mapped data is within extended address range. Lookup should succeed.
+// CHECK: A: 1
+#pragma omp target data use_device_ptr(ph)
+    printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_ptr/map on pointer, and pointee already exists.
+// Lookup should succeed.
+// CHECK: B: 1
+#pragma omp target data map(ph) use_device_ptr(ph)
+    printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: C: 1
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+    printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: D: 1
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+    printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) Mapped data is within extended address range.
+// Lookup should succeed.
+// CHECK: E: 1
+#pragma omp target data use_device_ptr(paa)
+    printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (F) use_device_ptr/map on pointer, and pointee already exists.
+// &paa[0] should be in extended address-range of the existing paa[1][...]
+// Lookup should succeed.
+// FIXME: However, it currently does not. Might need an RT fix.
+// EXPECTED: F: 1
+// CHECK:    F: 0
+#pragma omp target data map(paa) use_device_ptr(paa)
+    printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+    printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+    printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5])
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp
new file mode 100644
index 0000000..fe3cdb5
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_not_existing.cpp
@@ -0,0 +1,125 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int h[10];
+int *ph = &h[0];
+
+struct S {
+  int (*paa)[10][10] = &aa;
+
+  void f1(int i) {
+    paa--;
+    void *original_addr_ph3 = &ph[3];
+    void *original_addr_paa102 = &paa[1][0][2];
+
+// (A) No corresponding item, lookup should fail.
+// CHECK: A: 1 1 1
+#pragma omp target data use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+    }
+
+// (B) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: B: 1 1 1
+#pragma omp target data map(ph) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+    }
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: C: 1 1 1
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: D: 1 1 1
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+             mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+    }
+
+// (F) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+             mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+    }
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+             mapped_ptr_paa102 != original_addr_paa102,
+             &paa[1][0][2] == mapped_ptr_paa102);
+    }
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+             mapped_ptr_paa102 != original_addr_paa102,
+             &paa[1][0][2] == mapped_ptr_paa102);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp
new file mode 100644
index 0000000..66e65de
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_existing.cpp
@@ -0,0 +1,111 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a reference variable.
+// The corresponding data is mapped on a previous enter_data directive.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int (*paa_ptee)[10][10] = &aa;
+
+int h[10];
+int *ph_ptee = &h[0];
+int *&ph = ph_ptee;
+
+struct S {
+  int (*&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa--;
+    void *original_ph3 = &ph[3];
+    void *original_paa102 = &paa[1][0][2];
+
+#pragma omp target enter data map(to : ph[3 : 4], paa[1][0][2 : 5])
+    void *mapped_ptr_ph3 = omp_get_mapped_ptr(&ph[3], omp_get_default_device());
+    void *mapped_ptr_paa102 =
+        omp_get_mapped_ptr(&paa[1][0][2], omp_get_default_device());
+
+    // CHECK-COUNT-4: 1
+    printf("%d\n", mapped_ptr_ph3 != nullptr);
+    printf("%d\n", mapped_ptr_paa102 != nullptr);
+    printf("%d\n", original_ph3 != mapped_ptr_ph3);
+    printf("%d\n", original_paa102 != mapped_ptr_paa102);
+
+// (A) Mapped data is within extended address range. Lookup should succeed.
+// EXPECTED: A: 1
+// CHECK:    A: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_ptr(ph)
+    printf("A: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (B) use_device_ptr/map on pointer, and pointee already exists.
+// Lookup should succeed.
+// EXPECTED: B: 1
+// CHECK:    B: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_ptr(ph)
+    printf("B: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: C: 1
+// CHECK:    C: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+    printf("C: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: D: 1
+// CHECK:    D: 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+    printf("D: %d\n", mapped_ptr_ph3 == &ph[3]);
+
+// (E) Mapped data is within extended address range.
+// Lookup should succeed.
+// CHECK: E: 1
+#pragma omp target data use_device_ptr(paa)
+    printf("E: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (F) use_device_ptr/map on pointer, and pointee already exists.
+// &paa[0] should be in extended address-range of the existing paa[1][...]
+// Lookup should succeed.
+// FIXME: However, it currently does not. Might need an RT fix.
+// EXPECTED: F: 1
+// CHECK:    F: 0
+#pragma omp target data map(paa) use_device_ptr(paa)
+    printf("F: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+    printf("G: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+    printf("H: %d\n", mapped_ptr_paa102 == &paa[1][0][2]);
+
+#pragma omp target exit data map(release : ph[3 : 4], paa[1][0][2 : 5])
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp
new file mode 100644
index 0000000..419ab3e
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_ref_not_existing.cpp
@@ -0,0 +1,136 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// XFAIL: *
+
+#include <omp.h>
+#include <stdio.h>
+
+// Test for various cases of use_device_ptr on a reference variable.
+// The corresponding data is not previously mapped.
+
+// Note that this tests for the current behavior wherein if a lookup fails,
+// the runtime returns nullptr, instead of the original host-address.
+// That was compatible with OpenMP 5.0, where it was a user error if
+// corresponding storage didn't exist, but with 5.1+, the runtime needs to
+// return the host address, as it needs to assume that the host-address is
+// device-accessible, as the user has guaranteed it.
+// Once the runtime returns the original host-address when the lookup fails, the
+// test will need to be updated.
+
+int aa[10][10];
+int (*paa_ptee)[10][10] = &aa;
+
+int h[10];
+int *ph_ptee = &h[0];
+int *&ph = ph_ptee;
+
+struct S {
+  int (*&paa)[10][10] = paa_ptee;
+
+  void f1(int i) {
+    paa--;
+    void *original_addr_ph3 = &ph[3];
+    void *original_addr_paa102 = &paa[1][0][2];
+
+// (A) No corresponding item, lookup should fail.
+// EXPECTED: A: 1 1 1
+// CHECK:    A: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("A: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+    }
+
+// (B) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// EXPECTED: B: 1 1 1
+// CHECK:    B: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("B: %d %d %d\n", mapped_ptr_ph3 == nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, ph == nullptr);
+    }
+
+// (C) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: C: 1 1 1
+// CHECK:    C: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph[3 : 2]) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("C: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (D) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// EXPECTED: D: 1 1 1
+// CHECK:    D: 1 1 0
+// FIXME: ph is not being privatized in the region.
+#pragma omp target data map(ph) map(ph[3 : 2]) use_device_ptr(ph)
+    {
+      void *mapped_ptr_ph3 =
+          omp_get_mapped_ptr(original_addr_ph3, omp_get_default_device());
+      printf("D: %d %d %d\n", mapped_ptr_ph3 != nullptr,
+             mapped_ptr_ph3 != original_addr_ph3, &ph[3] == mapped_ptr_ph3);
+    }
+
+// (E) No corresponding item, lookup should fail.
+// CHECK: E: 1 1 1
+#pragma omp target data use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("E: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+             mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+    }
+
+// (F) use_device_ptr/map on pointer, and pointee does not exist.
+// Lookup should fail.
+// CHECK: F: 1 1 1
+#pragma omp target data map(paa) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("F: %d %d %d\n", mapped_ptr_paa102 == nullptr,
+             mapped_ptr_paa102 != original_addr_paa102, paa == nullptr);
+    }
+
+// (G) map on pointee: base-pointer of map matches use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: G: 1 1 1
+#pragma omp target data map(paa[1][0][2]) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("G: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+             mapped_ptr_paa102 != original_addr_paa102,
+             &paa[1][0][2] == mapped_ptr_paa102);
+    }
+
+// (H) map on pointer and pointee. Base-pointer of map on pointee matches
+// use_device_ptr operand.
+// Lookup should succeed.
+// CHECK: H: 1 1 1
+#pragma omp target data map(paa) map(paa[1][0][2]) use_device_ptr(paa)
+    {
+      void *mapped_ptr_paa102 =
+          omp_get_mapped_ptr(original_addr_paa102, omp_get_default_device());
+      printf("H: %d %d %d\n", mapped_ptr_paa102 != nullptr,
+             mapped_ptr_paa102 != original_addr_paa102,
+             &paa[1][0][2] == mapped_ptr_paa102);
+    }
+  }
+};
+
+S s1;
+int main() { s1.f1(1); }
diff --git a/offload/test/offloading/fortran/declare-target-automap.f90 b/offload/test/offloading/fortran/declare-target-automap.f90
new file mode 100644
index 0000000..b9c2d34
--- /dev/null
+++ b/offload/test/offloading/fortran/declare-target-automap.f90
@@ -0,0 +1,37 @@
+! Offloading test for the AUTOMAP modifier in declare target enter
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program automap_program
+   use iso_c_binding, only: c_loc
+   use omp_lib, only: omp_get_default_device, omp_target_is_present
+   integer, parameter :: N = 10
+   integer :: i
+   integer, allocatable, target :: automap_array(:)
+   !$omp declare target enter(automap:automap_array)
+
+   ! false since the storage is not present even though the descriptor is present
+   write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device())
+   ! CHECK: 0
+
+   allocate (automap_array(N))
+   ! true since the storage should be allocated and reference count incremented by the allocate
+   write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device())
+   ! CHECK: 1
+
+   ! since storage is present this should not be a runtime error
+   !$omp target teams loop
+   do i = 1, N
+      automap_array(i) = i
+   end do
+
+   !$omp target update from(automap_array)
+   write (*, *) automap_array
+   ! CHECK: 1 2 3 4 5 6 7 8 9 10
+
+   deallocate (automap_array)
+
+   ! automap_array should have its storage unmapped on device here
+   write (*, *) omp_target_is_present(c_loc(automap_array), omp_get_default_device())
+   ! CHECK: 0
+end program
diff --git a/offload/test/offloading/strided_multiple_update.c b/offload/test/offloading/strided_multiple_update.c
new file mode 100644
index 0000000..a3e8d10
--- /dev/null
+++ b/offload/test/offloading/strided_multiple_update.c
@@ -0,0 +1,62 @@
+// This test checks that #pragma omp target update from(data1[0:3:4],
+// data2[0:2:5]) correctly updates disjoint strided sections of multiple arrays
+// from the device to the host.
+
+// RUN: %libomptarget-compile-run-and-check-generic
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int len = 12;
+  double data1[len], data2[len];
+
+// Initial values
+#pragma omp target map(tofrom : data1[0 : len], data2[0 : len])
+  {
+    for (int i = 0; i < len; i++) {
+      data1[i] = i;
+      data2[i] = i * 10;
+    }
+  }
+
+  printf("original host array values:\n");
+  printf("data1: ");
+  for (int i = 0; i < len; i++)
+    printf("%.1f ", data1[i]);
+  printf("\ndata2: ");
+  for (int i = 0; i < len; i++)
+    printf("%.1f ", data2[i]);
+  printf("\n\n");
+
+#pragma omp target data map(to : data1[0 : len], data2[0 : len])
+  {
+// Modify arrays on device
+#pragma omp target
+    {
+      for (int i = 0; i < len; i++)
+        data1[i] += i;
+      for (int i = 0; i < len; i++)
+        data2[i] += 100;
+    }
+
+// data1[0:3:4]  // indices 0,4,8
+// data2[0:2:5]  // indices 0,5
+#pragma omp target update from(data1[0 : 3 : 4], data2[0 : 2 : 5])
+  }
+
+  printf("device array values after update from:\n");
+  printf("data1: ");
+  for (int i = 0; i < len; i++)
+    printf("%.1f ", data1[i]);
+  printf("\ndata2: ");
+  for (int i = 0; i < len; i++)
+    printf("%.1f ", data2[i]);
+  printf("\n\n");
+
+  // CHECK: data1: 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0
+  // CHECK: data2: 0.0 10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0 90.0 100.0 110.0
+
+  // CHECK: data1: 0.0 1.0 2.0 3.0 8.0 5.0 6.0 7.0 16.0 9.0 10.0 11.0
+  // CHECK: data2: 100.0 10.0 20.0 30.0 40.0 150.0 60.0 70.0 80.0 90.0 100.0
+  // CHECK-SAME: 110.0
+}
diff --git a/offload/test/offloading/strided_partial_update.c b/offload/test/offloading/strided_partial_update.c
new file mode 100644
index 0000000..15d477f
--- /dev/null
+++ b/offload/test/offloading/strided_partial_update.c
@@ -0,0 +1,63 @@
+// This test checks that #pragma omp target update from(data[0:4:3]) correctly
+// updates every third element (stride 3) from the device to the host, partially
+// across the array
+
+// RUN: %libomptarget-compile-run-and-check-generic
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int len = 11;
+  double data[len];
+
+#pragma omp target map(tofrom : data[0 : len])
+  {
+    for (int i = 0; i < len; i++)
+      data[i] = i;
+  }
+
+  // Initial values
+  printf("original host array values:\n");
+  for (int i = 0; i < len; i++)
+    printf("%f\n", data[i]);
+  printf("\n");
+
+#pragma omp target data map(to : data[0 : len])
+  {
+// Modify arrays on device
+#pragma omp target
+    for (int i = 0; i < len; i++)
+      data[i] += i;
+
+#pragma omp target update from(data[0 : 4 : 3]) // indices 0,3,6,9
+  }
+
+  printf("device array values after update from:\n");
+  for (int i = 0; i < len; i++)
+    printf("%f\n", data[i]);
+  printf("\n");
+
+  // CHECK: 0.000000
+  // CHECK: 1.000000
+  // CHECK: 2.000000
+  // CHECK: 3.000000
+  // CHECK: 4.000000
+  // CHECK: 5.000000
+  // CHECK: 6.000000
+  // CHECK: 7.000000
+  // CHECK: 8.000000
+  // CHECK: 9.000000
+  // CHECK: 10.000000
+
+  // CHECK: 0.000000
+  // CHECK: 1.000000
+  // CHECK: 2.000000
+  // CHECK: 6.000000
+  // CHECK: 4.000000
+  // CHECK: 5.000000
+  // CHECK: 12.000000
+  // CHECK: 7.000000
+  // CHECK: 8.000000
+  // CHECK: 18.000000
+  // CHECK: 10.000000
+}
diff --git a/offload/test/offloading/strided_update.c b/offload/test/offloading/strided_update.c
new file mode 100644
index 0000000..fe875b7
--- /dev/null
+++ b/offload/test/offloading/strided_update.c
@@ -0,0 +1,54 @@
+// This test checks that "update from" clause in OpenMP is supported when the
+// elements are updated in a non-contiguous manner. This test checks that
+// #pragma omp target update from(data[0:4:2]) correctly updates only every
+// other element (stride 2) from the device to the host
+
+// RUN: %libomptarget-compile-run-and-check-generic
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int len = 8;
+  double data[len];
+#pragma omp target map(tofrom : len, data[0 : len])
+  {
+    for (int i = 0; i < len; i++) {
+      data[i] = i;
+    }
+  }
+  // Initial values
+  printf("original host array values:\n");
+  for (int i = 0; i < len; i++)
+    printf("%f\n", data[i]);
+  printf("\n");
+
+#pragma omp target data map(to : len, data[0 : len])
+  {
+// Modify arrays on device
+#pragma omp target
+    for (int i = 0; i < len; i++) {
+      data[i] += i;
+    }
+
+#pragma omp target update from(data[0 : 4 : 2])
+  }
+  // CHECK: 0.000000
+  // CHECK: 1.000000
+  // CHECK: 4.000000
+  // CHECK: 3.000000
+  // CHECK: 8.000000
+  // CHECK: 5.000000
+  // CHECK: 12.000000
+  // CHECK: 7.000000
+  // CHECK-NOT: 2.000000
+  // CHECK-NOT: 6.000000
+  // CHECK-NOT: 10.000000
+  // CHECK-NOT: 14.000000
+
+  printf("from target array results:\n");
+  for (int i = 0; i < len; i++)
+    printf("%f\n", data[i]);
+  printf("\n");
+
+  return 0;
+}
diff --git a/offload/test/tools/offload-tblgen/default_returns.td b/offload/test/tools/offload-tblgen/default_returns.td
index e919492..41949db 100644
--- a/offload/test/tools/offload-tblgen/default_returns.td
+++ b/offload/test/tools/offload-tblgen/default_returns.td
@@ -6,13 +6,11 @@
 
 include "APIDefs.td"
 
-def : Handle {
-    let name = "ol_foo_handle_t";
+def ol_foo_handle_t : Handle {
     let desc = "Example handle type";
 }
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/entry_points.td b/offload/test/tools/offload-tblgen/entry_points.td
index c66d5b4..94ea820 100644
--- a/offload/test/tools/offload-tblgen/entry_points.td
+++ b/offload/test/tools/offload-tblgen/entry_points.td
@@ -4,8 +4,7 @@
 
 include "APIDefs.td"
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_basic.td b/offload/test/tools/offload-tblgen/functions_basic.td
index dec9357..2802c78 100644
--- a/offload/test/tools/offload-tblgen/functions_basic.td
+++ b/offload/test/tools/offload-tblgen/functions_basic.td
@@ -6,8 +6,7 @@
 
 include "APIDefs.td"
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_code_loc.td b/offload/test/tools/offload-tblgen/functions_code_loc.td
index aec2012..8d7aa00 100644
--- a/offload/test/tools/offload-tblgen/functions_code_loc.td
+++ b/offload/test/tools/offload-tblgen/functions_code_loc.td
@@ -7,8 +7,7 @@
 
 include "APIDefs.td"
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/functions_ranged_param.td b/offload/test/tools/offload-tblgen/functions_ranged_param.td
index d0996b2..1ce8b39 100644
--- a/offload/test/tools/offload-tblgen/functions_ranged_param.td
+++ b/offload/test/tools/offload-tblgen/functions_ranged_param.td
@@ -8,13 +8,11 @@
 
 include "APIDefs.td"
 
-def : Handle {
-    let name = "some_handle_t";
+def some_handle_t : Handle {
     let desc = "An example handle type";
 }
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
   let params = [
diff --git a/offload/test/tools/offload-tblgen/print_enum.td b/offload/test/tools/offload-tblgen/print_enum.td
index 97f8696..c7573a9 100644
--- a/offload/test/tools/offload-tblgen/print_enum.td
+++ b/offload/test/tools/offload-tblgen/print_enum.td
@@ -4,8 +4,7 @@
 
 include "APIDefs.td"
 
-def : Enum {
-  let name = "my_enum_t";
+def my_enum_t : Enum {
   let desc = "An example enum";
   let etors =[
     Etor<"VALUE_ONE", "The first enum value">,
diff --git a/offload/test/tools/offload-tblgen/print_function.td b/offload/test/tools/offload-tblgen/print_function.td
index ce1fe4c..74b39f1 100644
--- a/offload/test/tools/offload-tblgen/print_function.td
+++ b/offload/test/tools/offload-tblgen/print_function.td
@@ -5,13 +5,11 @@
 
 include "APIDefs.td"
 
-def : Handle {
-    let name = "ol_foo_handle_t";
+def ol_foo_handle_t : Handle {
     let desc = "Example handle type";
 }
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
     let params = [
diff --git a/offload/test/tools/offload-tblgen/type_tagged_enum.td b/offload/test/tools/offload-tblgen/type_tagged_enum.td
index 95964e3..b32531a 100644
--- a/offload/test/tools/offload-tblgen/type_tagged_enum.td
+++ b/offload/test/tools/offload-tblgen/type_tagged_enum.td
@@ -9,13 +9,11 @@
 
 include "APIDefs.td"
 
-def : Handle {
-    let name = "some_handle_t";
+def some_handle_t: Handle {
     let desc = "An example handle type";
 }
 
-def : Enum {
-  let name = "my_type_tagged_enum_t";
+def my_type_tagged_enum_t : Enum {
   let desc = "Example type tagged enum";
   let is_typed = 1;
   let etors = [
@@ -34,8 +32,7 @@ def : Enum {
 // CHECK-API-NEXT: [some_handle_t] Value three.
 // CHECK-API-NEXT: MY_TYPE_TAGGED_ENUM_VALUE_THREE = 2,
 
-def : Function {
-    let name = "FunctionA";
+def FunctionA : Function {
     let desc = "Function A description";
     let details = [ "Function A detailed information" ];
   let params = [
diff --git a/offload/tools/offload-tblgen/APIGen.cpp b/offload/tools/offload-tblgen/APIGen.cpp
index 8c61d1f..1e79c00 100644
--- a/offload/tools/offload-tblgen/APIGen.cpp
+++ b/offload/tools/offload-tblgen/APIGen.cpp
@@ -131,7 +131,8 @@ static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) {
   OS << formatv("/// @brief {0}\n", Enum.getDesc());
   OS << formatv("typedef enum {0} {{\n", Enum.getName());
 
-  uint32_t EtorVal = 0;
+  // Bitfields start from 1, other enums from 0
+  uint32_t EtorVal = Enum.isBitField();
   for (const auto &EnumVal : Enum.getValues()) {
     if (Enum.isTyped()) {
       OS << MakeComment(
@@ -141,7 +142,12 @@ static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) {
       OS << MakeComment(EnumVal.getDesc());
     }
     OS << formatv(TAB_1 "{0}_{1} = {2},\n", Enum.getEnumValNamePrefix(),
-                  EnumVal.getName(), EtorVal++);
+                  EnumVal.getName(), EtorVal);
+    if (Enum.isBitField()) {
+      EtorVal <<= 1u;
+    } else {
+      ++EtorVal;
+    }
   }
 
   // Add last_element/force uint32 val
@@ -220,31 +226,23 @@ OL_APIEXPORT ol_result_t OL_APICALL {0}WithCodeLoc(
 void EmitOffloadAPI(const RecordKeeper &Records, raw_ostream &OS) {
   OS << GenericHeader;
   OS << FileHeader;
-  // Generate main API definitions
-  for (auto *R : Records.getAllDerivedDefinitions("APIObject")) {
-    if (R->isSubClassOf("Macro")) {
-      ProcessMacro(MacroRec{R}, OS);
-    } else if (R->isSubClassOf("Typedef")) {
-      ProcessTypedef(TypedefRec{R}, OS);
-    } else if (R->isSubClassOf("Handle")) {
-      ProcessHandle(HandleRec{R}, OS);
-    } else if (R->isSubClassOf("Function")) {
-      ProcessFunction(FunctionRec{R}, OS);
-    } else if (R->isSubClassOf("Enum")) {
-      ProcessEnum(EnumRec{R}, OS);
-    } else if (R->isSubClassOf("Struct")) {
-      ProcessStruct(StructRec{R}, OS);
-    } else if (R->isSubClassOf("FptrTypedef")) {
-      ProcessFptrTypedef(FptrTypedefRec{R}, OS);
-    }
-  }
 
-  // Generate auxiliary definitions (func param structs etc)
+  // Generate main API definitions
+  for (auto *R : Records.getAllDerivedDefinitions("Macro"))
+    ProcessMacro(MacroRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("Handle"))
+    ProcessHandle(HandleRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("Enum"))
+    ProcessEnum(EnumRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("Typedef"))
+    ProcessTypedef(TypedefRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("FptrTypedef"))
+    ProcessFptrTypedef(FptrTypedefRec{R}, OS);
+  for (auto *R : Records.getAllDerivedDefinitions("Struct"))
+    ProcessStruct(StructRec{R}, OS);
   for (auto *R : Records.getAllDerivedDefinitions("Function")) {
     ProcessFuncParamStruct(FunctionRec{R}, OS);
-  }
-
-  for (auto *R : Records.getAllDerivedDefinitions("Function")) {
+    ProcessFunction(FunctionRec{R}, OS);
     ProcessFuncWithCodeLocVariant(FunctionRec{R}, OS);
   }
 
diff --git a/offload/tools/offload-tblgen/MiscGen.cpp b/offload/tools/offload-tblgen/MiscGen.cpp
index b90e5cfd..8a8b9ca 100644
--- a/offload/tools/offload-tblgen/MiscGen.cpp
+++ b/offload/tools/offload-tblgen/MiscGen.cpp
@@ -86,7 +86,7 @@ void EmitOffloadErrcodes(const RecordKeeper &Records, raw_ostream &OS) {
 
 )";
 
-  auto ErrorCodeEnum = EnumRec{Records.getDef("ErrorCode")};
+  auto ErrorCodeEnum = EnumRec{Records.getDef("ol_errc_t")};
   uint32_t EtorVal = 0;
   for (const auto &EnumVal : ErrorCodeEnum.getValues()) {
     OS << formatv(TAB_1 "OFFLOAD_ERRC({0}, \"{1}\", {2})\n", EnumVal.getName(),
@@ -107,10 +107,16 @@ void EmitOffloadInfo(const RecordKeeper &Records, raw_ostream &OS) {
 
 )";
 
-  auto ErrorCodeEnum = EnumRec{Records.getDef("DeviceInfo")};
-  uint32_t EtorVal = 0;
-  for (const auto &EnumVal : ErrorCodeEnum.getValues()) {
+  auto Enum = EnumRec{Records.getDef("ol_device_info_t")};
+  // Bitfields start from 1, other enums from 0
+  uint32_t EtorVal = Enum.isBitField();
+  for (const auto &EnumVal : Enum.getValues()) {
     OS << formatv(TAB_1 "OFFLOAD_DEVINFO({0}, \"{1}\", {2})\n",
-                  EnumVal.getName(), EnumVal.getDesc(), EtorVal++);
+                  EnumVal.getName(), EnumVal.getDesc(), EtorVal);
+    if (Enum.isBitField()) {
+      EtorVal <<= 1u;
+    } else {
+      ++EtorVal;
+    }
   }
 }
diff --git a/offload/tools/offload-tblgen/RecordTypes.hpp b/offload/tools/offload-tblgen/RecordTypes.hpp
index 65c0a4c..2abd9e1 100644
--- a/offload/tools/offload-tblgen/RecordTypes.hpp
+++ b/offload/tools/offload-tblgen/RecordTypes.hpp
@@ -16,25 +16,30 @@ namespace llvm {
 namespace offload {
 namespace tblgen {
 
-class HandleRec {
+class APIObject {
 public:
-  explicit HandleRec(const Record *rec) : rec(rec) {}
-  StringRef getName() const { return rec->getValueAsString("name"); }
+  StringRef getName() const { return rec->getName(); }
   StringRef getDesc() const { return rec->getValueAsString("desc"); }
 
-private:
+protected:
+  APIObject(const Record *rec) : rec(rec) {}
   const Record *rec;
 };
 
-class MacroRec {
+class HandleRec : public APIObject {
 public:
-  explicit MacroRec(const Record *rec) : rec(rec) {
-    auto Name = rec->getValueAsString("name");
+  explicit HandleRec(const Record *rec) : APIObject(rec) {}
+};
+
+class MacroRec : public APIObject {
+public:
+  explicit MacroRec(const Record *rec) : APIObject(rec) {
+    auto Name = rec->getName();
     auto OpenBrace = Name.find_first_of("(");
     nameWithoutArgs = Name.substr(0, OpenBrace);
   }
   StringRef getName() const { return nameWithoutArgs; }
-  StringRef getNameWithArgs() const { return rec->getValueAsString("name"); }
+  StringRef getNameWithArgs() const { return rec->getName(); }
   StringRef getDesc() const { return rec->getValueAsString("desc"); }
 
   std::optional<StringRef> getCondition() const {
@@ -46,19 +51,15 @@ public:
   }
 
 private:
-  const Record *rec;
   std::string nameWithoutArgs;
 };
 
-class TypedefRec {
+class TypedefRec : public APIObject {
 public:
-  explicit TypedefRec(const Record *rec) : rec(rec) {}
-  StringRef getName() const { return rec->getValueAsString("name"); }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
-  StringRef getValue() const { return rec->getValueAsString("value"); }
+  explicit TypedefRec(const Record *rec) : APIObject(rec) {}
 
-private:
-  const Record *rec;
+  // Returns the record's "value" string field.
+  StringRef getValue() const { return rec->getValueAsString("value"); }
 };
 
 class EnumValueRec {
@@ -74,15 +75,13 @@ private:
   const Record *rec;
 };
 
-class EnumRec {
+class EnumRec : public APIObject {
 public:
-  explicit EnumRec(const Record *rec) : rec(rec) {
+  explicit EnumRec(const Record *rec) : APIObject(rec) {
     for (const auto *Val : rec->getValueAsListOfDefs("etors")) {
       vals.emplace_back(EnumValueRec{Val});
     }
   }
-  StringRef getName() const { return rec->getValueAsString("name"); }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
   const std::vector<EnumValueRec> &getValues() const { return vals; }
 
   std::string getEnumValNamePrefix() const {
@@ -92,8 +91,9 @@ public:
 
   bool isTyped() const { return rec->getValueAsBit("is_typed"); }
 
+  bool isBitField() const { return rec->getValueAsBit("is_bit_field"); }
+
 private:
-  const Record *rec;
   std::vector<EnumValueRec> vals;
 };
 
@@ -110,22 +110,19 @@ private:
   const Record *rec;
 };
 
-class StructRec {
+class StructRec : public APIObject {
 public:
-  explicit StructRec(const Record *rec) : rec(rec) {
+  explicit StructRec(const Record *rec) : APIObject(rec) {
     for (auto *Member : rec->getValueAsListOfDefs("all_members")) {
       members.emplace_back(StructMemberRec(Member));
     }
   }
-  StringRef getName() const { return rec->getValueAsString("name"); }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
   std::optional<StringRef> getBaseClass() const {
     return rec->getValueAsOptionalString("base_class");
   }
   const std::vector<StructMemberRec> &getMembers() const { return members; }
 
 private:
-  const Record *rec;
   std::vector<StructMemberRec> members;
 };
 
@@ -205,9 +202,9 @@ private:
   const Record *rec;
 };
 
-class FunctionRec {
+class FunctionRec : public APIObject {
 public:
-  FunctionRec(const Record *rec) : rec(rec) {
+  FunctionRec(const Record *rec) : APIObject(rec) {
     for (auto &Ret : rec->getValueAsListOfDefs("all_returns"))
       rets.emplace_back(Ret);
     for (auto &Param : rec->getValueAsListOfDefs("params"))
@@ -219,11 +216,9 @@ public:
                          llvm::convertToSnakeFromCamelCase(getName()));
   }
 
-  StringRef getName() const { return rec->getValueAsString("name"); }
   StringRef getClass() const { return rec->getValueAsString("api_class"); }
   const std::vector<ReturnRec> &getReturns() const { return rets; }
   const std::vector<ParamRec> &getParams() const { return params; }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
   std::vector<StringRef> getDetails() const {
     return rec->getValueAsListOfStrings("details");
   }
@@ -234,25 +229,19 @@ public:
 private:
   std::vector<ReturnRec> rets;
   std::vector<ParamRec> params;
-
-  const Record *rec;
 };
 
-class FptrTypedefRec {
+class FptrTypedefRec : public APIObject {
 public:
-  explicit FptrTypedefRec(const Record *rec) : rec(rec) {
+  explicit FptrTypedefRec(const Record *rec) : APIObject(rec) {
     for (auto &Param : rec->getValueAsListOfDefs("params"))
       params.emplace_back(Param);
   }
-  StringRef getName() const { return rec->getValueAsString("name"); }
-  StringRef getDesc() const { return rec->getValueAsString("desc"); }
   StringRef getReturn() const { return rec->getValueAsString("return"); }
   const std::vector<ParamRec> &getParams() const { return params; }
 
 private:
   std::vector<ParamRec> params;
-
-  const Record *rec;
 };
 
 } // namespace tblgen
diff --git a/offload/unittests/Conformance/README.md b/offload/unittests/Conformance/README.md
new file mode 100644
index 0000000..0202242
--- /dev/null
+++ b/offload/unittests/Conformance/README.md
@@ -0,0 +1,83 @@
+# GPU Math Conformance Tests
+
+## Overview
+
+This test suite provides a framework to systematically measure the accuracy of math functions on GPUs and verify their conformance with standards like OpenCL.
+
+While the primary focus is validating the implementations in the C standard math library (LLVM-libm), these tests can also be executed against other math library providers, such as CUDA Math and HIP Math, for comparison.
+
+The goals of this project are to empower LLVM-libm contributors with a robust tool for validating their implementations and to build trust with end-users by providing transparent accuracy data.
+
+### Table of Contents
+
+- [Getting Started](#getting-started)
+- [Running the Tests](#running-the-tests)
+- [Adding New Tests](#adding-new-tests)
+
+## Getting Started
+
+This guide covers how to build the necessary dependencies, which include the new Offload API and the C standard library for both host and GPU targets.
+
+### System Requirements
+
+Before you begin, ensure your system meets the following requirements:
+
+- A system with an AMD or NVIDIA GPU.
+- The latest proprietary GPU drivers installed.
+- The corresponding development SDK for your hardware:
+  - **AMD:** [ROCm SDK](https://rocm.docs.amd.com)
+  - **NVIDIA:** [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
+
+### Building the Dependencies
+
+The official documentation for building LLVM-libc for GPUs provides a detailed guide and should be considered the primary reference. Please follow the instructions in the **"Standard runtimes build"** section of that guide:
+
+- [Building the GPU C library (Official Documentation)](https://libc.llvm.org/gpu/building.html)
+
+> [!IMPORTANT]
+> For the conformance tests, the standard `cmake` command from the official documentation must be adapted slightly. You must also add `libc` to the main `-DLLVM_ENABLE_RUNTIMES` list. This is a crucial step because the tests need a host-side build of `libc` to use as the reference oracle for validating GPU results.
+
+## Running the Tests
+
+### Default Test
+
+To build and run the conformance test for a given function (e.g., `logf`) against the default provider, `llvm-libm` (the C standard math library), use the following command. This will execute the test on all available and supported platforms.
+
+```bash
+ninja -C build/runtimes/runtimes-bins offload.conformance.logf
+```
+
+### Testing Other Providers
+
+Once the test binary has been built, you can run it against other math library providers using the `--test-configs` flag.
+
+- **For `cuda-math` on an NVIDIA GPU:**
+
+  ```bash
+  ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=cuda-math:cuda
+  ```
+
+- **For `hip-math` on an AMD GPU:**
+
+  ```bash
+  ./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=hip-math:amdgpu
+  ```
+
+You can also run all available configurations for a test with:
+
+```bash
+./build/runtimes/runtimes-bins/offload/logf.conformance --test-configs=all
+```
+
+## Adding New Tests
+
+To add a conformance test for a new math function, follow these steps:
+
+1. **Implement the Device Kernels**: Create a kernel wrapper for the new function in each provider's source file. For CUDA Math and HIP Math, you must also add a forward declaration for the vendor function in `/device_code/DeviceAPIs.hpp`.
+
+2. **Implement the Host Test**: Create a new `.cpp` file in `/tests`. This file defines the `FunctionConfig` (function and kernel names, as well as ULP tolerance) and the input generation strategy.
+
+    - Use **exhaustive testing** (`ExhaustiveGenerator`) for functions with small input spaces (e.g., half-precision functions and single-precision univariate functions). This strategy iterates over every representable point in the input space, ensuring complete coverage.
+    - Use **randomized testing** (`RandomGenerator`) for functions with large input spaces (e.g., single-precision bivariate and double-precision functions), where exhaustive testing is computationally infeasible. Although not exhaustive, this strategy is deterministic, using a fixed seed to sample a large, reproducible subset of points from the input space.
+
+3. **Add the Build Target**: Add a new `add_conformance_test(...)` entry to `/tests/CMakeLists.txt` to make the test buildable.
diff --git a/offload/unittests/Conformance/device_code/CUDAMath.cpp b/offload/unittests/Conformance/device_code/CUDAMath.cpp
index a351e92..d80660b 100644
--- a/offload/unittests/Conformance/device_code/CUDAMath.cpp
+++ b/offload/unittests/Conformance/device_code/CUDAMath.cpp
@@ -26,6 +26,22 @@ using namespace kernels;
 // Helpers
 //===----------------------------------------------------------------------===//
 
+static inline float powfRoundedExponent(float Base, float Exponent) {
+  return __nv_powf(Base, __nv_roundf(Exponent));
+}
+
+static inline double sincosSin(double X) {
+  double SinX, CosX;
+  __nv_sincos(X, &SinX, &CosX);
+  return SinX;
+}
+
+static inline double sincosCos(double X) {
+  double SinX, CosX;
+  __nv_sincos(X, &SinX, &CosX);
+  return CosX;
+}
+
 static inline float sincosfSin(float X) {
   float SinX, CosX;
   __nv_sincosf(X, &SinX, &CosX);
@@ -44,6 +60,11 @@ static inline float sincosfCos(float X) {
 
 extern "C" {
 
+__gpu_kernel void acosKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_acos>(NumElements, Out, X);
+}
+
 __gpu_kernel void acosfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_acosf>(NumElements, Out, X);
@@ -54,6 +75,11 @@ __gpu_kernel void acoshfKernel(const float *X, float *Out,
   runKernelBody<__nv_acoshf>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_asin>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_asinf>(NumElements, Out, X);
@@ -69,16 +95,31 @@ __gpu_kernel void atanfKernel(const float *X, float *Out,
   runKernelBody<__nv_atanf>(NumElements, Out, X);
 }
 
+__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_atan2f>(NumElements, Out, X, Y);
+}
+
 __gpu_kernel void atanhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_atanhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void cbrtKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_cbrt>(NumElements, Out, X);
+}
+
 __gpu_kernel void cbrtfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_cbrtf>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_cos>(NumElements, Out, X);
+}
+
 __gpu_kernel void cosfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_cosf>(NumElements, Out, X);
@@ -99,51 +140,127 @@ __gpu_kernel void erffKernel(const float *X, float *Out,
   runKernelBody<__nv_erff>(NumElements, Out, X);
 }
 
+__gpu_kernel void expKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_exp>(NumElements, Out, X);
+}
+
 __gpu_kernel void expfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_expf>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_exp10>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_exp10f>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_exp2>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_exp2f>(NumElements, Out, X);
 }
 
+__gpu_kernel void expm1Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_expm1>(NumElements, Out, X);
+}
+
 __gpu_kernel void expm1fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_expm1f>(NumElements, Out, X);
 }
 
+__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_hypot>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_hypotf>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void logKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_log>(NumElements, Out, X);
+}
+
 __gpu_kernel void logfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_logf>(NumElements, Out, X);
 }
 
+__gpu_kernel void log10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_log10>(NumElements, Out, X);
+}
+
 __gpu_kernel void log10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_log10f>(NumElements, Out, X);
 }
 
+__gpu_kernel void log1pKernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_log1p>(NumElements, Out, X);
+}
+
 __gpu_kernel void log1pfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__nv_log1pf>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_log2>(NumElements, Out, X);
+}
+
 __gpu_kernel void log2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__nv_log2f>(NumElements, Out, X);
 }
 
+__gpu_kernel void powfKernel(const float *X, const float *Y, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_powf>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void powfRoundedExponentKernel(const float *X, const float *Y,
+                                            float *Out,
+                                            size_t NumElements) noexcept {
+  runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void sinKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_sin>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_sinf>(NumElements, Out, X);
 }
 
+__gpu_kernel void sincosSinKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosCosKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosCos>(NumElements, Out, X);
+}
+
 __gpu_kernel void sincosfSinKernel(const float *X, float *Out,
                                    size_t NumElements) noexcept {
   runKernelBody<sincosfSin>(NumElements, Out, X);
@@ -164,6 +281,11 @@ __gpu_kernel void sinpifKernel(const float *X, float *Out,
   runKernelBody<__nv_sinpif>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__nv_tan>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__nv_tanf>(NumElements, Out, X);
diff --git a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
index 8476dcb..894652a 100644
--- a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
+++ b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
@@ -48,29 +48,49 @@ extern const inline uint32_t __oclc_ISA_version = 9000;
 
 extern "C" {
 
+double __nv_acos(double);
 float __nv_acosf(float);
 float __nv_acoshf(float);
+double __nv_asin(double);
 float __nv_asinf(float);
 float __nv_asinhf(float);
 float __nv_atanf(float);
+float __nv_atan2f(float, float);
 float __nv_atanhf(float);
+double __nv_cbrt(double);
 float __nv_cbrtf(float);
+double __nv_cos(double);
 float __nv_cosf(float);
 float __nv_coshf(float);
 float __nv_cospif(float);
 float __nv_erff(float);
+double __nv_exp(double);
 float __nv_expf(float);
+double __nv_exp10(double);
 float __nv_exp10f(float);
+double __nv_exp2(double);
 float __nv_exp2f(float);
+double __nv_expm1(double);
 float __nv_expm1f(float);
+double __nv_hypot(double, double);
+float __nv_hypotf(float, float);
+double __nv_log(double);
 float __nv_logf(float);
+double __nv_log10(double);
 float __nv_log10f(float);
+double __nv_log1p(double);
 float __nv_log1pf(float);
+double __nv_log2(double);
 float __nv_log2f(float);
+float __nv_powf(float, float);
+float __nv_roundf(float);
+double __nv_sin(double);
 float __nv_sinf(float);
+void __nv_sincos(double, double *, double *);
 void __nv_sincosf(float, float *, float *);
 float __nv_sinhf(float);
 float __nv_sinpif(float);
+double __nv_tan(double);
 float __nv_tanf(float);
 float __nv_tanhf(float);
 } // extern "C"
@@ -81,31 +101,70 @@ float __nv_tanhf(float);
 
 extern "C" {
 
+double __ocml_acos_f64(double);
 float __ocml_acos_f32(float);
+float16 __ocml_acos_f16(float16);
 float __ocml_acosh_f32(float);
+float16 __ocml_acosh_f16(float16);
+double __ocml_asin_f64(double);
 float __ocml_asin_f32(float);
+float16 __ocml_asin_f16(float16);
 float __ocml_asinh_f32(float);
+float16 __ocml_asinh_f16(float16);
 float __ocml_atan_f32(float);
+float16 __ocml_atan_f16(float16);
+float __ocml_atan2_f32(float, float);
 float __ocml_atanh_f32(float);
+float16 __ocml_atanh_f16(float16);
+double __ocml_cbrt_f64(double);
 float __ocml_cbrt_f32(float);
+double __ocml_cos_f64(double);
 float __ocml_cos_f32(float);
+float16 __ocml_cos_f16(float16);
 float __ocml_cosh_f32(float);
+float16 __ocml_cosh_f16(float16);
 float __ocml_cospi_f32(float);
 float __ocml_erf_f32(float);
+double __ocml_exp_f64(double);
 float __ocml_exp_f32(float);
+float16 __ocml_exp_f16(float16);
+double __ocml_exp10_f64(double);
 float __ocml_exp10_f32(float);
+float16 __ocml_exp10_f16(float16);
+double __ocml_exp2_f64(double);
 float __ocml_exp2_f32(float);
+float16 __ocml_exp2_f16(float16);
+double __ocml_expm1_f64(double);
 float __ocml_expm1_f32(float);
+float16 __ocml_expm1_f16(float16);
+double __ocml_hypot_f64(double, double);
+float __ocml_hypot_f32(float, float);
+double __ocml_log_f64(double);
 float __ocml_log_f32(float);
+float16 __ocml_log_f16(float16);
+double __ocml_log10_f64(double);
 float __ocml_log10_f32(float);
+float16 __ocml_log10_f16(float16);
+double __ocml_log1p_f64(double);
 float __ocml_log1p_f32(float);
+double __ocml_log2_f64(double);
 float __ocml_log2_f32(float);
+float16 __ocml_log2_f16(float16);
+float __ocml_pow_f32(float, float);
+float __ocml_round_f32(float);
+double __ocml_sin_f64(double);
 float __ocml_sin_f32(float);
+float16 __ocml_sin_f16(float16);
+double __ocml_sincos_f64(double, double *);
 float __ocml_sincos_f32(float, float *);
 float __ocml_sinh_f32(float);
+float16 __ocml_sinh_f16(float16);
 float __ocml_sinpi_f32(float);
+double __ocml_tan_f64(double);
 float __ocml_tan_f32(float);
+float16 __ocml_tan_f16(float16);
 float __ocml_tanh_f32(float);
+float16 __ocml_tanh_f16(float16);
 } // extern "C"
 
 #endif // HIP_MATH_FOUND
diff --git a/offload/unittests/Conformance/device_code/HIPMath.cpp b/offload/unittests/Conformance/device_code/HIPMath.cpp
index 36efe6b..7cc0ad5 100644
--- a/offload/unittests/Conformance/device_code/HIPMath.cpp
+++ b/offload/unittests/Conformance/device_code/HIPMath.cpp
@@ -26,6 +26,22 @@ using namespace kernels;
 // Helpers
 //===----------------------------------------------------------------------===//
 
+static inline float powfRoundedExponent(float Base, float Exponent) {
+  return __ocml_pow_f32(Base, __ocml_round_f32(Exponent));
+}
+
+static inline double sincosSin(double X) {
+  double CosX;
+  double SinX = __ocml_sincos_f64(X, &CosX);
+  return SinX;
+}
+
+static inline double sincosCos(double X) {
+  double CosX;
+  double SinX = __ocml_sincos_f64(X, &CosX);
+  return CosX;
+}
+
 static inline float sincosfSin(float X) {
   float CosX;
   float SinX = __ocml_sincos_f32(X, &CosX);
@@ -44,51 +60,116 @@ static inline float sincosfCos(float X) {
 
 extern "C" {
 
+__gpu_kernel void acosKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_acos_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void acosfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_acos_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void acosf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_acos_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void acoshfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_acosh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void acoshf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_acosh_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_asin_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_asin_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_asin_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_asinh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinhf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_asinh_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void atanfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_atan_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void atanf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_atan_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_atan2_f32>(NumElements, Out, X, Y);
+}
+
 __gpu_kernel void atanhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_atanh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void atanhf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_atanh_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void cbrtKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_cbrt_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void cbrtfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_cbrt_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_cos_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void cosfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_cos_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_cos_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void coshfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_cosh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void coshf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_cosh_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void cospifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_cospi_f32>(NumElements, Out, X);
@@ -99,51 +180,167 @@ __gpu_kernel void erffKernel(const float *X, float *Out,
   runKernelBody<__ocml_erf_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void expKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void expfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_exp_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void expf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp10_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_exp10_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp10f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp10_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp2_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_exp2_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp2f16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp2_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void expm1Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_expm1_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void expm1fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_expm1_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void expm1f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_expm1_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_hypot_f64>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_hypot_f32>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void logKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_log_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void logfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_log_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void logf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_log_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void log10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_log10_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void log10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_log10_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void log10f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<__ocml_log10_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void log1pKernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_log1p_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void log1pfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_log1p_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_log2_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void log2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_log2_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2f16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_log2_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void powfKernel(const float *X, const float *Y, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_pow_f32>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void powfRoundedExponentKernel(const float *X, const float *Y,
+                                            float *Out,
+                                            size_t NumElements) noexcept {
+  runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void sinKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_sin_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_sin_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_sin_f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosSinKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosCosKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosCos>(NumElements, Out, X);
+}
+
 __gpu_kernel void sincosfSinKernel(const float *X, float *Out,
                                    size_t NumElements) noexcept {
   runKernelBody<sincosfSin>(NumElements, Out, X);
@@ -159,20 +356,40 @@ __gpu_kernel void sinhfKernel(const float *X, float *Out,
   runKernelBody<__ocml_sinh_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinhf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_sinh_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinpifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<__ocml_sinpi_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<__ocml_tan_f64>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<__ocml_tan_f32>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_tan_f16>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanhfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<__ocml_tanh_f32>(NumElements, Out, X);
 }
+
+__gpu_kernel void tanhf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<__ocml_tanh_f16>(NumElements, Out, X);
+}
 } // extern "C"
 
 #endif // HIP_MATH_FOUND
diff --git a/offload/unittests/Conformance/device_code/LLVMLibm.cpp b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
index 8869d87..8673d80 100644
--- a/offload/unittests/Conformance/device_code/LLVMLibm.cpp
+++ b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
@@ -25,6 +25,22 @@ using namespace kernels;
 // Helpers
 //===----------------------------------------------------------------------===//
 
+static inline float powfRoundedExponent(float Base, float Exponent) {
+  return powf(Base, roundf(Exponent));
+}
+
+static inline double sincosSin(double X) {
+  double SinX, CosX;
+  sincos(X, &SinX, &CosX);
+  return SinX;
+}
+
+static inline double sincosCos(double X) {
+  double SinX, CosX;
+  sincos(X, &SinX, &CosX);
+  return CosX;
+}
+
 static inline float sincosfSin(float X) {
   float SinX, CosX;
   sincosf(X, &SinX, &CosX);
@@ -43,111 +59,302 @@ static inline float sincosfCos(float X) {
 
 extern "C" {
 
+__gpu_kernel void acosKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<acos>(NumElements, Out, X);
+}
+
 __gpu_kernel void acosfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<acosf>(NumElements, Out, X);
 }
 
+__gpu_kernel void acosf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<acosf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void acoshfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<acoshf>(NumElements, Out, X);
 }
 
+__gpu_kernel void acoshf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<acoshf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void acospif16Kernel(const float16 *X, float16 *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<acospif16>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<asin>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<asinf>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<asinf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void asinhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<asinhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void asinhf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<asinhf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void atanfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<atanf>(NumElements, Out, X);
 }
 
+__gpu_kernel void atanf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<atanf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void atan2fKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<atan2f>(NumElements, Out, X, Y);
+}
+
 __gpu_kernel void atanhfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<atanhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void atanhf16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<atanhf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void cbrtKernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<cbrt>(NumElements, Out, X);
+}
+
 __gpu_kernel void cbrtfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<cbrtf>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<cos>(NumElements, Out, X);
+}
+
 __gpu_kernel void cosfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<cosf>(NumElements, Out, X);
 }
 
+__gpu_kernel void cosf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<cosf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void coshfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<coshf>(NumElements, Out, X);
 }
 
+__gpu_kernel void coshf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<coshf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void cospifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<cospif>(NumElements, Out, X);
 }
 
+__gpu_kernel void cospif16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<cospif16>(NumElements, Out, X);
+}
+
 __gpu_kernel void erffKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<erff>(NumElements, Out, X);
 }
 
+__gpu_kernel void expKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<exp>(NumElements, Out, X);
+}
+
 __gpu_kernel void expfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<expf>(NumElements, Out, X);
 }
 
+__gpu_kernel void expf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<expf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<exp10>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<exp10f>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp10f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<exp10f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<exp2>(NumElements, Out, X);
+}
+
 __gpu_kernel void exp2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<exp2f>(NumElements, Out, X);
 }
 
+__gpu_kernel void exp2f16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<exp2f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void expm1Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<expm1>(NumElements, Out, X);
+}
+
 __gpu_kernel void expm1fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<expm1f>(NumElements, Out, X);
 }
 
-__gpu_kernel void hypotf16Kernel(const float16 *X, float16 *Y, float16 *Out,
+__gpu_kernel void expm1f16Kernel(const float16 *X, float16 *Out,
                                  size_t NumElements) noexcept {
+  runKernelBody<expm1f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void hypotKernel(const double *X, const double *Y, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<hypot>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void hypotfKernel(const float *X, const float *Y, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<hypotf>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void hypotf16Kernel(const float16 *X, const float16 *Y,
+                                 float16 *Out, size_t NumElements) noexcept {
   runKernelBody<hypotf16>(NumElements, Out, X, Y);
 }
 
+__gpu_kernel void logKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<log>(NumElements, Out, X);
+}
+
 __gpu_kernel void logfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<logf>(NumElements, Out, X);
 }
 
+__gpu_kernel void logf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<logf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void log10Kernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<log10>(NumElements, Out, X);
+}
+
 __gpu_kernel void log10fKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<log10f>(NumElements, Out, X);
 }
 
+__gpu_kernel void log10f16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<log10f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void log1pKernel(const double *X, double *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<log1p>(NumElements, Out, X);
+}
+
 __gpu_kernel void log1pfKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<log1pf>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2Kernel(const double *X, double *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<log2>(NumElements, Out, X);
+}
+
 __gpu_kernel void log2fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<log2f>(NumElements, Out, X);
 }
 
+__gpu_kernel void log2f16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<log2f16>(NumElements, Out, X);
+}
+
+__gpu_kernel void powfKernel(const float *X, const float *Y, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<powf>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void powfRoundedExponentKernel(const float *X, const float *Y,
+                                            float *Out,
+                                            size_t NumElements) noexcept {
+  runKernelBody<powfRoundedExponent>(NumElements, Out, X, Y);
+}
+
+__gpu_kernel void sinKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<sin>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<sinf>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<sinf16>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosSinKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosCosKernel(const double *X, double *Out,
+                                  size_t NumElements) noexcept {
+  runKernelBody<sincosCos>(NumElements, Out, X);
+}
+
 __gpu_kernel void sincosfSinKernel(const float *X, float *Out,
                                    size_t NumElements) noexcept {
   runKernelBody<sincosfSin>(NumElements, Out, X);
@@ -163,23 +370,53 @@ __gpu_kernel void sinhfKernel(const float *X, float *Out,
   runKernelBody<sinhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinhf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<sinhf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void sinpifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<sinpif>(NumElements, Out, X);
 }
 
+__gpu_kernel void sinpif16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<sinpif16>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanKernel(const double *X, double *Out,
+                            size_t NumElements) noexcept {
+  runKernelBody<tan>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
   runKernelBody<tanf>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanf16Kernel(const float16 *X, float16 *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<tanf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanhfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
   runKernelBody<tanhf>(NumElements, Out, X);
 }
 
+__gpu_kernel void tanhf16Kernel(const float16 *X, float16 *Out,
+                                size_t NumElements) noexcept {
+  runKernelBody<tanhf16>(NumElements, Out, X);
+}
+
 __gpu_kernel void tanpifKernel(const float *X, float *Out,
                                size_t NumElements) noexcept {
   runKernelBody<tanpif>(NumElements, Out, X);
 }
+
+__gpu_kernel void tanpif16Kernel(const float16 *X, float16 *Out,
+                                 size_t NumElements) noexcept {
+  runKernelBody<tanpif16>(NumElements, Out, X);
+}
 } // extern "C"
diff --git a/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp b/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp
index 6f7f7a9..39c6838 100644
--- a/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp
+++ b/offload/unittests/Conformance/include/mathtest/ExhaustiveGenerator.hpp
@@ -8,8 +8,8 @@
 ///
 /// \file
 /// This file contains the definition of the ExhaustiveGenerator class, a
-/// concrete input generator that exhaustively creates inputs from a given
-/// sequence of ranges.
+/// concrete range-based generator that exhaustively creates inputs from a
+/// given sequence of ranges.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -17,89 +17,62 @@
 #define MATHTEST_EXHAUSTIVEGENERATOR_HPP
 
 #include "mathtest/IndexedRange.hpp"
-#include "mathtest/InputGenerator.hpp"
+#include "mathtest/RangeBasedGenerator.hpp"
 
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/Parallel.h"
-
-#include <algorithm>
 #include <array>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <optional>
 #include <tuple>
 
 namespace mathtest {
 
 template <typename... InTypes>
 class [[nodiscard]] ExhaustiveGenerator final
-    : public InputGenerator<InTypes...> {
-  static constexpr std::size_t NumInputs = sizeof...(InTypes);
-  static_assert(NumInputs > 0, "The number of inputs must be at least 1");
+    : public RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...> {
+
+  friend class RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...>;
+
+  using Base = RangeBasedGenerator<ExhaustiveGenerator<InTypes...>, InTypes...>;
+  using IndexArrayType = std::array<uint64_t, Base::NumInputs>;
+
+  using Base::RangesTuple;
+  using Base::Size;
 
 public:
   explicit constexpr ExhaustiveGenerator(
       const IndexedRange<InTypes> &...Ranges) noexcept
-      : RangesTuple(Ranges...) {
-    bool Overflowed = getSizeWithOverflow(Ranges..., Size);
+      : Base(Ranges...) {
+    const auto MaybeSize = getInputSpaceSize(Ranges...);
+
+    assert(MaybeSize.has_value() && "The size is too large");
+    Size = *MaybeSize;
 
-    assert(!Overflowed && "The input space size is too large");
-    assert((Size > 0) && "The input space size must be at least 1");
+    assert((Size > 0) && "The size must be at least 1");
 
     IndexArrayType DimSizes = {};
     std::size_t DimIndex = 0;
     ((DimSizes[DimIndex++] = Ranges.getSize()), ...);
 
-    Strides[NumInputs - 1] = 1;
-    if constexpr (NumInputs > 1)
-      for (int Index = static_cast<int>(NumInputs) - 2; Index >= 0; --Index)
+    Strides[Base::NumInputs - 1] = 1;
+    if constexpr (Base::NumInputs > 1)
+      for (int Index = static_cast<int>(Base::NumInputs) - 2; Index >= 0;
+           --Index)
         Strides[Index] = Strides[Index + 1] * DimSizes[Index + 1];
   }
 
-  void reset() noexcept override { NextFlatIndex = 0; }
-
-  [[nodiscard]] std::size_t
-  fill(llvm::MutableArrayRef<InTypes>... Buffers) noexcept override {
-    const std::array<std::size_t, NumInputs> BufferSizes = {Buffers.size()...};
-    const std::size_t BufferSize = BufferSizes[0];
-    assert((BufferSize != 0) && "Buffer size cannot be zero");
-    assert(std::all_of(BufferSizes.begin(), BufferSizes.end(),
-                       [&](std::size_t Size) { return Size == BufferSize; }) &&
-           "All input buffers must have the same size");
-
-    if (NextFlatIndex >= Size)
-      return 0;
-
-    const auto BatchSize = std::min<uint64_t>(BufferSize, Size - NextFlatIndex);
-    const auto CurrentFlatIndex = NextFlatIndex;
-    NextFlatIndex += BatchSize;
-
-    auto BufferPtrsTuple = std::make_tuple(Buffers.data()...);
-
-    llvm::parallelFor(0, BatchSize, [&](std::size_t Offset) {
-      writeInputs(CurrentFlatIndex, Offset, BufferPtrsTuple);
-    });
-
-    return static_cast<std::size_t>(BatchSize);
-  }
-
 private:
-  using RangesTupleType = std::tuple<IndexedRange<InTypes>...>;
-  using IndexArrayType = std::array<uint64_t, NumInputs>;
-
-  static bool getSizeWithOverflow(const IndexedRange<InTypes> &...Ranges,
-                                  uint64_t &Size) noexcept {
-    Size = 1;
-    bool Overflowed = false;
-
-    auto Multiplier = [&](const uint64_t RangeSize) {
-      if (!Overflowed)
-        Overflowed = __builtin_mul_overflow(Size, RangeSize, &Size);
-    };
+  [[nodiscard]] constexpr IndexArrayType
+  getNDIndex(uint64_t FlatIndex) const noexcept {
+    IndexArrayType NDIndex;
 
-    (Multiplier(Ranges.getSize()), ...);
+    for (std::size_t Index = 0; Index < Base::NumInputs; ++Index) {
+      NDIndex[Index] = FlatIndex / Strides[Index];
+      FlatIndex -= NDIndex[Index] * Strides[Index];
+    }
 
-    return Overflowed;
+    return NDIndex;
   }
 
   template <typename BufferPtrsTupleType>
@@ -109,31 +82,37 @@ private:
     writeInputsImpl<0>(NDIndex, Offset, BufferPtrsTuple);
   }
 
-  constexpr IndexArrayType getNDIndex(uint64_t FlatIndex) const noexcept {
-    IndexArrayType NDIndex;
-
-    for (std::size_t Index = 0; Index < NumInputs; ++Index) {
-      NDIndex[Index] = FlatIndex / Strides[Index];
-      FlatIndex -= NDIndex[Index] * Strides[Index];
-    }
-
-    return NDIndex;
-  }
-
   template <std::size_t Index, typename BufferPtrsTupleType>
   void writeInputsImpl(IndexArrayType NDIndex, uint64_t Offset,
                        BufferPtrsTupleType BufferPtrsTuple) const noexcept {
-    if constexpr (Index < NumInputs) {
+    if constexpr (Index < Base::NumInputs) {
       const auto &Range = std::get<Index>(RangesTuple);
       std::get<Index>(BufferPtrsTuple)[Offset] = Range[NDIndex[Index]];
+
       writeInputsImpl<Index + 1>(NDIndex, Offset, BufferPtrsTuple);
     }
   }
 
-  uint64_t Size = 1;
-  RangesTupleType RangesTuple;
+  [[nodiscard]] static constexpr std::optional<uint64_t>
+  getInputSpaceSize(const IndexedRange<InTypes> &...Ranges) noexcept {
+    uint64_t InputSpaceSize = 1;
+    bool Overflowed = false;
+
+    auto Multiplier = [&](const uint64_t RangeSize) {
+      if (!Overflowed)
+        Overflowed =
+            __builtin_mul_overflow(InputSpaceSize, RangeSize, &InputSpaceSize);
+    };
+
+    (Multiplier(Ranges.getSize()), ...);
+
+    if (Overflowed)
+      return std::nullopt;
+
+    return InputSpaceSize;
+  }
+
   IndexArrayType Strides = {};
-  uint64_t NextFlatIndex = 0;
 };
 } // namespace mathtest
 
diff --git a/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp b/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp
new file mode 100644
index 0000000..436cd05
--- /dev/null
+++ b/offload/unittests/Conformance/include/mathtest/RandomGenerator.hpp
@@ -0,0 +1,106 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definition of the RandomGenerator class, a concrete
+/// range-based generator that randomly creates inputs from a given sequence of
+/// ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef MATHTEST_RANDOMGENERATOR_HPP
+#define MATHTEST_RANDOMGENERATOR_HPP
+
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/RangeBasedGenerator.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace mathtest {
+
+/// Generator that fills every input with a pseudo-randomly chosen element of
+/// the corresponding range.
+///
+/// Each element gets its own RandomState, seeded with the base seed XOR the
+/// element's flat index, so the values written for a position depend only on
+/// the base seed and on that position.
+template <typename... InTypes>
+class [[nodiscard]] RandomGenerator final
+    : public RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...> {
+
+  // The base class calls the private writeInputs() of this class.
+  friend class RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...>;
+
+  using Base = RangeBasedGenerator<RandomGenerator<InTypes...>, InTypes...>;
+
+  using Base::RangesTuple;
+  using Base::Size;
+
+public:
+  /// Forwards \p Size and \p Ranges to the base class and keeps \p BaseSeed
+  /// as the root of all per-element seeds.
+  explicit constexpr RandomGenerator(
+      SeedTy BaseSeed, uint64_t Size,
+      const IndexedRange<InTypes> &...Ranges) noexcept
+      : Base(Size, Ranges...), BaseSeed(BaseSeed) {}
+
+private:
+  /// Returns an index in [0, RangeSize) computed from numbers drawn from
+  /// \p RNG, or 0 when \p RangeSize is 0.
+  [[nodiscard]] static uint64_t getRandomIndex(RandomState &RNG,
+                                               uint64_t RangeSize) noexcept {
+    if (RangeSize == 0)
+      return 0;
+
+    // Unsigned negation gives 2^64 - RangeSize, so this is 2^64 % RangeSize.
+    // Draws below it are rejected so that the final modulo has no bias.
+    const uint64_t RejectBelow = (-RangeSize) % RangeSize;
+
+    for (;;) {
+      const uint64_t Candidate = RNG.next();
+      if (Candidate >= RejectBelow)
+        return Candidate % RangeSize;
+    }
+  }
+
+  /// Writes one value per input into slot \p Offset of the buffers, using an
+  /// RNG seeded from the base seed and \p CurrentFlatIndex + \p Offset.
+  template <typename BufferPtrsTupleType>
+  void writeInputs(uint64_t CurrentFlatIndex, uint64_t Offset,
+                   BufferPtrsTupleType BufferPtrsTuple) const noexcept {
+    const uint64_t ElementIndex = CurrentFlatIndex + Offset;
+    const SeedTy ElementSeed{BaseSeed.Value ^ ElementIndex};
+
+    RandomState RNG(ElementSeed);
+    writeInputsImpl<0>(RNG, Offset, BufferPtrsTuple);
+  }
+
+  /// Stores a random element of the range number \p Index into slot
+  /// \p Offset of the buffer number \p Index, then handles the next input.
+  /// All inputs of one element draw from the same \p RNG, in input order.
+  template <std::size_t Index, typename BufferPtrsTupleType>
+  void writeInputsImpl(RandomState &RNG, uint64_t Offset,
+                       BufferPtrsTupleType BufferPtrsTuple) const noexcept {
+    if constexpr (Index < Base::NumInputs) {
+      const auto &InputRange = std::get<Index>(RangesTuple);
+      auto &Buffer = std::get<Index>(BufferPtrsTuple);
+
+      Buffer[Offset] = InputRange[getRandomIndex(RNG, InputRange.getSize())];
+
+      writeInputsImpl<Index + 1>(RNG, Offset, BufferPtrsTuple);
+    }
+  }
+
+  SeedTy BaseSeed;
+};
+} // namespace mathtest
+
+#endif // MATHTEST_RANDOMGENERATOR_HPP
diff --git a/offload/unittests/Conformance/include/mathtest/RandomState.hpp b/offload/unittests/Conformance/include/mathtest/RandomState.hpp
new file mode 100644
index 0000000..322d531
--- /dev/null
+++ b/offload/unittests/Conformance/include/mathtest/RandomState.hpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definition of the RandomState class, a fast and
+/// lightweight pseudo-random number generator.
+///
+/// The implementation is based on the xorshift* generator, seeded using the
+/// SplitMix64 generator for robust initialization. For more details on the
+/// algorithm, see: https://en.wikipedia.org/wiki/Xorshift
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef MATHTEST_RANDOMSTATE_HPP
+#define MATHTEST_RANDOMSTATE_HPP
+
+#include <cstdint>
+
+/// Strongly-typed wrapper around the 64-bit value used to seed a RandomState.
+struct SeedTy {
+  uint64_t Value;
+};
+
+/// A 64-bit xorshift* pseudo-random number generator.
+///
+/// The internal state is derived from the seed by a SplitMix64 mixing step
+/// and is never zero, since an all-zero state would make the xorshift update
+/// in next() produce zeros forever.
+class [[nodiscard]] RandomState {
+  uint64_t State;
+
+  /// Scrambles \p X with one SplitMix64 step. A zero result is replaced by a
+  /// nonzero constant so that the value is always usable as a xorshift state.
+  [[nodiscard]] static constexpr uint64_t splitMix64(uint64_t X) noexcept {
+    X += 0x9E3779B97F4A7C15ULL;
+    X = (X ^ (X >> 30)) * 0xBF58476D1CE4E5B9ULL;
+    X = (X ^ (X >> 27)) * 0x94D049BB133111EBULL;
+    X = (X ^ (X >> 31));
+    return X ? X : 0x9E3779B97F4A7C15ULL;
+  }
+
+public:
+  /// Creates a generator whose initial state is the SplitMix64 scramble of
+  /// \p Seed.
+  explicit constexpr RandomState(SeedTy Seed) noexcept
+      : State(splitMix64(Seed.Value)) {}
+
+  /// Advances the state by one xorshift step (shifts 12, 25, 27) and returns
+  /// the new state multiplied by the xorshift* output constant.
+  [[nodiscard]] uint64_t next() noexcept {
+    uint64_t X = State;
+    X ^= X >> 12;
+    X ^= X << 25;
+    X ^= X >> 27;
+    State = X;
+    return X * 0x2545F4914F6CDD1DULL;
+  }
+};
+
+#endif // MATHTEST_RANDOMSTATE_HPP
diff --git a/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp b/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp
new file mode 100644
index 0000000..5e1e113
--- /dev/null
+++ b/offload/unittests/Conformance/include/mathtest/RangeBasedGenerator.hpp
@@ -0,0 +1,100 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the definition of the RangeBasedGenerator class, a base
+/// class for input generators that operate on a sequence of ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef MATHTEST_RANGEBASEDGENERATOR_HPP
+#define MATHTEST_RANGEBASEDGENERATOR_HPP
+
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/InputGenerator.hpp"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Parallel.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace mathtest {
+
+/// CRTP base for generators that produce Size inputs from a tuple of ranges.
+///
+/// The base class owns the batching logic. \p Derived must provide a member
+/// writeInputs(BatchStart, Offset, BufferPtrsTuple), which fill() calls once
+/// for every Offset of a batch.
+template <typename Derived, typename... InTypes>
+class [[nodiscard]] RangeBasedGenerator : public InputGenerator<InTypes...> {
+public:
+  /// Restarts the generation from the first input.
+  void reset() noexcept override { NextFlatIndex = 0; }
+
+  /// Writes the next batch of inputs into \p Buffers, which must all have the
+  /// same nonzero size, and returns the number of inputs written. Returns 0
+  /// once all Size inputs have been generated.
+  [[nodiscard]] std::size_t
+  fill(llvm::MutableArrayRef<InTypes>... Buffers) noexcept override {
+    const std::array<std::size_t, NumInputs> BufferSizes = {Buffers.size()...};
+    const std::size_t BufferSize = BufferSizes[0];
+    assert((BufferSize != 0) && "Buffer size cannot be zero");
+    assert(std::all_of(BufferSizes.begin(), BufferSizes.end(),
+                       [&](std::size_t Size) { return Size == BufferSize; }) &&
+           "All input buffers must have the same size");
+
+    if (NextFlatIndex >= Size)
+      return 0;
+
+    // The last batch may be shorter than the buffers.
+    const uint64_t Remaining = Size - NextFlatIndex;
+    const uint64_t BatchSize = std::min<uint64_t>(BufferSize, Remaining);
+    const uint64_t BatchStart = NextFlatIndex;
+    NextFlatIndex = BatchStart + BatchSize;
+
+    auto BufferPtrsTuple = std::make_tuple(Buffers.data()...);
+    auto &Self = static_cast<Derived &>(*this);
+
+    llvm::parallelFor(0, BatchSize, [&](std::size_t Offset) {
+      Self.writeInputs(BatchStart, Offset, BufferPtrsTuple);
+    });
+
+    return static_cast<std::size_t>(BatchSize);
+  }
+
+protected:
+  using RangesTupleType = std::tuple<IndexedRange<InTypes>...>;
+
+  static constexpr std::size_t NumInputs = sizeof...(InTypes);
+  static_assert(NumInputs > 0, "The number of inputs must be at least 1");
+
+  /// Stores \p Ranges and leaves Size at its default of 0; a derived class
+  /// using this constructor has to assign Size itself.
+  explicit constexpr RangeBasedGenerator(
+      const IndexedRange<InTypes> &...Ranges) noexcept
+      : RangesTuple(Ranges...) {}
+
+  /// Stores \p Ranges and sets the number of inputs to generate to \p Size.
+  explicit constexpr RangeBasedGenerator(
+      uint64_t Size, const IndexedRange<InTypes> &...Ranges) noexcept
+      : RangesTuple(Ranges...), Size(Size) {}
+
+  RangesTupleType RangesTuple;
+  uint64_t Size = 0; // Total number of inputs to generate
+
+private:
+  uint64_t NextFlatIndex = 0; // Index of the first input of the next batch
+};
+} // namespace mathtest
+
+#endif // MATHTEST_RANGEBASEDGENERATOR_HPP
diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp
index a0068c3..6c3425f 100644
--- a/offload/unittests/Conformance/lib/DeviceContext.cpp
+++ b/offload/unittests/Conformance/lib/DeviceContext.cpp
@@ -55,13 +55,14 @@ static OffloadInitWrapper Wrapper{};
 
 [[nodiscard]] std::string getDeviceName(ol_device_handle_t DeviceHandle) {
   std::size_t PropSize = 0;
-  OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_NAME, &PropSize));
+  OL_CHECK(olGetDeviceInfoSize(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME,
+                               &PropSize));
 
   if (PropSize == 0)
     return "";
 
   std::string PropValue(PropSize, '\0');
-  OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_NAME, PropSize,
+  OL_CHECK(olGetDeviceInfo(DeviceHandle, OL_DEVICE_INFO_PRODUCT_NAME, PropSize,
                            PropValue.data()));
   PropValue.pop_back(); // Remove the null terminator
 
diff --git a/offload/unittests/Conformance/tests/AcosTest.cpp b/offload/unittests/Conformance/tests/AcosTest.cpp
new file mode 100644
index 0000000..bc0d1d2
--- /dev/null
+++ b/offload/unittests/Conformance/tests/AcosTest.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acos function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// The cast disambiguates the overloaded 'acos' by selecting the double version;
+// the resulting constant is the template argument used further down the file.
+constexpr auto acosd // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(acos);
+} // namespace
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for the double overload of 'acos'.
+template <> struct FunctionConfig<acosd> {
+  static constexpr llvm::StringRef Name = "acos";
+  static constexpr llvm::StringRef KernelName = "acosKernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  // Hand the raw arguments to LLVM's command-line parser before anything else.
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the acos function");
+  using namespace mathtest;
+
+  // Random generator: fixed seed, size 2^32, closed double range [-1, 1].
+  uint64_t RngSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> Domain(/*Begin=*/-1.0, /*End=*/1.0, /*Inclusive=*/true);
+  RandomGenerator<double> Inputs(SeedTy{RngSeed}, GenSize, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Map the overall verdict onto the process exit status.
+  if (runTests<acosd>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Acosf16Test.cpp b/offload/unittests/Conformance/tests/Acosf16Test.cpp
new file mode 100644
index 0000000..ce11cc2
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acosf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acosf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acosf16(float16);
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for 'acosf16' (float16 -> float16).
+template <> struct FunctionConfig<acosf16> {
+  static constexpr llvm::StringRef Name = "acosf16";
+  static constexpr llvm::StringRef KernelName = "acosf16Kernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  // Hand the raw arguments to LLVM's command-line parser before anything else.
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the acosf16 function");
+  using namespace mathtest;
+
+  // The ExhaustiveGenerator is given the closed float16 range [-1, 1].
+  IndexedRange<float16> Domain(/*Begin=*/float16(-1.0), /*End=*/float16(1.0),
+                               /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Map the overall verdict onto the process exit status.
+  if (runTests<acosf16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/AcosfTest.cpp b/offload/unittests/Conformance/tests/AcosfTest.cpp
index e69ee3b..65b2d18 100644
--- a/offload/unittests/Conformance/tests/AcosfTest.cpp
+++ b/offload/unittests/Conformance/tests/AcosfTest.cpp
@@ -40,7 +40,9 @@ int main(int argc, const char **argv) {
 
   using namespace mathtest;
 
-  IndexedRange<float> Range;
+  IndexedRange<float> Range(/*Begin=*/-1.0f,
+                            /*End=*/1.0f,
+                            /*Inclusive=*/true);
   ExhaustiveGenerator<float> Generator(Range);
 
   const auto Configs = cl::getTestConfigs();
diff --git a/offload/unittests/Conformance/tests/Acoshf16Test.cpp b/offload/unittests/Conformance/tests/Acoshf16Test.cpp
new file mode 100644
index 0000000..8043447
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acoshf16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acoshf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acoshf16(float16);
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for 'acoshf16' (float16 -> float16).
+template <> struct FunctionConfig<acoshf16> {
+  static constexpr llvm::StringRef Name = "acoshf16";
+  static constexpr llvm::StringRef KernelName = "acoshf16Kernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the acoshf16 function");
+  using namespace mathtest;
+
+  // Closed float16 range [1.0, getMaxOrInf<float16>()] for the generator.
+  IndexedRange<float16> Domain(/*Begin=*/float16(1.0),
+                               /*End=*/getMaxOrInf<float16>(),
+                               /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Map the overall verdict onto the process exit status.
+  if (runTests<acoshf16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Acospif16Test.cpp b/offload/unittests/Conformance/tests/Acospif16Test.cpp
new file mode 100644
index 0000000..c5871e2
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Acospif16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the acospif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 acospif16(float16);
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for 'acospif16' (float16 -> float16).
+template <> struct FunctionConfig<acospif16> {
+  static constexpr llvm::StringRef Name = "acospif16";
+  static constexpr llvm::StringRef KernelName = "acospif16Kernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  // Hand the raw arguments to LLVM's command-line parser before anything else.
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the acospif16 function");
+  using namespace mathtest;
+
+  // The ExhaustiveGenerator is given the closed float16 range [-1, 1].
+  IndexedRange<float16> Domain(/*Begin=*/float16(-1.0), /*End=*/float16(1.0),
+                               /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Map the overall verdict onto the process exit status.
+  if (runTests<acospif16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/AsinTest.cpp b/offload/unittests/Conformance/tests/AsinTest.cpp
new file mode 100644
index 0000000..aaaa37a
--- /dev/null
+++ b/offload/unittests/Conformance/tests/AsinTest.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asin function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// The cast disambiguates the overloaded 'asin' by selecting the double version;
+// the resulting constant is the template argument used further down the file.
+constexpr auto asind // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(asin);
+} // namespace
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for the double overload of 'asin'.
+template <> struct FunctionConfig<asind> {
+  static constexpr llvm::StringRef Name = "asin";
+  static constexpr llvm::StringRef KernelName = "asinKernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  // Hand the raw arguments to LLVM's command-line parser before anything else.
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the asin function");
+  using namespace mathtest;
+
+  // Random generator: fixed seed, size 2^32, closed double range [-1, 1].
+  uint64_t RngSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> Domain(/*Begin=*/-1.0, /*End=*/1.0, /*Inclusive=*/true);
+  RandomGenerator<double> Inputs(SeedTy{RngSeed}, GenSize, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Map the overall verdict onto the process exit status.
+  if (runTests<asind>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Asinf16Test.cpp b/offload/unittests/Conformance/tests/Asinf16Test.cpp
new file mode 100644
index 0000000..5784d6b
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Asinf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asinf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 asinf16(float16);
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for 'asinf16' (float16 -> float16).
+template <> struct FunctionConfig<asinf16> {
+  static constexpr llvm::StringRef Name = "asinf16";
+  static constexpr llvm::StringRef KernelName = "asinf16Kernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  // Hand the raw arguments to LLVM's command-line parser before anything else.
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the asinf16 function");
+  using namespace mathtest;
+
+  // The ExhaustiveGenerator is given the closed float16 range [-1, 1].
+  IndexedRange<float16> Domain(/*Begin=*/float16(-1.0), /*End=*/float16(1.0),
+                               /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Map the overall verdict onto the process exit status.
+  if (runTests<asinf16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/AsinfTest.cpp b/offload/unittests/Conformance/tests/AsinfTest.cpp
index 991f79b..aeee648 100644
--- a/offload/unittests/Conformance/tests/AsinfTest.cpp
+++ b/offload/unittests/Conformance/tests/AsinfTest.cpp
@@ -40,7 +40,9 @@ int main(int argc, const char **argv) {
 
   using namespace mathtest;
 
-  IndexedRange<float> Range;
+  IndexedRange<float> Range(/*Begin=*/-1.0f,
+                            /*End=*/1.0f,
+                            /*Inclusive=*/true);
   ExhaustiveGenerator<float> Generator(Range);
 
   const auto Configs = cl::getTestConfigs();
diff --git a/offload/unittests/Conformance/tests/Asinhf16Test.cpp b/offload/unittests/Conformance/tests/Asinhf16Test.cpp
new file mode 100644
index 0000000..0af9bcb
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Asinhf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the asinhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 asinhf16(float16);
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for 'asinhf16' (float16 -> float16).
+template <> struct FunctionConfig<asinhf16> {
+  static constexpr llvm::StringRef Name = "asinhf16";
+  static constexpr llvm::StringRef KernelName = "asinhf16Kernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  // Hand the raw arguments to LLVM's command-line parser before anything else.
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the asinhf16 function");
+  using namespace mathtest;
+
+  // Default-constructed range; NOTE(review): presumably all float16 values.
+  IndexedRange<float16> Domain;
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (runTests<asinhf16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Atan2fTest.cpp b/offload/unittests/Conformance/tests/Atan2fTest.cpp
new file mode 100644
index 0000000..4a46f9a
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atan2fTest.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atan2f function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for the two-argument 'atan2f'.
+template <> struct FunctionConfig<atan2f> {
+  static constexpr llvm::StringRef Name = "atan2f";
+  static constexpr llvm::StringRef KernelName = "atan2fKernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 6;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the atan2f function");
+  using namespace mathtest;
+
+  // Random generator: fixed seed, size 2^32, one default range per argument.
+  uint64_t RngSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<float> XRange;
+  IndexedRange<float> YRange;
+  RandomGenerator<float, float> Inputs(SeedTy{RngSeed}, GenSize, XRange,
+                                       YRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (runTests<atan2f>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Atanf16Test.cpp b/offload/unittests/Conformance/tests/Atanf16Test.cpp
new file mode 100644
index 0000000..3d3fa38
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atanf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atanf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 atanf16(float16);
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for 'atanf16' (float16 -> float16).
+template <> struct FunctionConfig<atanf16> {
+  static constexpr llvm::StringRef Name = "atanf16";
+  static constexpr llvm::StringRef KernelName = "atanf16Kernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  // Hand the raw arguments to LLVM's command-line parser before anything else.
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the atanf16 function");
+  using namespace mathtest;
+
+  // Default-constructed range; NOTE(review): presumably all float16 values.
+  IndexedRange<float16> Domain;
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (runTests<atanf16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Atanhf16Test.cpp b/offload/unittests/Conformance/tests/Atanhf16Test.cpp
new file mode 100644
index 0000000..86a0f82
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Atanhf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the atanhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 atanhf16(float16);
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for 'atanhf16' (float16 -> float16).
+template <> struct FunctionConfig<atanhf16> {
+  static constexpr llvm::StringRef Name = "atanhf16";
+  static constexpr llvm::StringRef KernelName = "atanhf16Kernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  // Hand the raw arguments to LLVM's command-line parser before anything else.
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the atanhf16 function");
+  using namespace mathtest;
+
+  // The ExhaustiveGenerator is given the closed float16 range [-1, 1].
+  IndexedRange<float16> Domain(/*Begin=*/float16(-1.0), /*End=*/float16(1.0),
+                               /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> Inputs(Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  // Map the overall verdict onto the process exit status.
+  if (runTests<atanhf16>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/CMakeLists.txt b/offload/unittests/Conformance/tests/CMakeLists.txt
index 8c0109ba..ad94df8 100644
--- a/offload/unittests/Conformance/tests/CMakeLists.txt
+++ b/offload/unittests/Conformance/tests/CMakeLists.txt
@@ -3,30 +3,72 @@ if(NOT TARGET libc)
     return()
 endif()
 
+add_conformance_test(acos AcosTest.cpp)
 add_conformance_test(acosf AcosfTest.cpp)
+add_conformance_test(acosf16 Acosf16Test.cpp)
 add_conformance_test(acoshf AcoshfTest.cpp)
+add_conformance_test(acoshf16 Acoshf16Test.cpp)
+add_conformance_test(acospif16 Acospif16Test.cpp)
+add_conformance_test(asin AsinTest.cpp)
 add_conformance_test(asinf AsinfTest.cpp)
+add_conformance_test(asinf16 Asinf16Test.cpp)
 add_conformance_test(asinhf AsinhfTest.cpp)
+add_conformance_test(asinhf16 Asinhf16Test.cpp)
 add_conformance_test(atanf AtanfTest.cpp)
+add_conformance_test(atanf16 Atanf16Test.cpp)
+add_conformance_test(atan2f Atan2fTest.cpp)
 add_conformance_test(atanhf AtanhfTest.cpp)
+add_conformance_test(atanhf16 Atanhf16Test.cpp)
+add_conformance_test(cbrt CbrtTest.cpp)
 add_conformance_test(cbrtf CbrtfTest.cpp)
+add_conformance_test(cos CosTest.cpp)
 add_conformance_test(cosf CosfTest.cpp)
+add_conformance_test(cosf16 Cosf16Test.cpp)
 add_conformance_test(coshf CoshfTest.cpp)
+add_conformance_test(coshf16 Coshf16Test.cpp)
 add_conformance_test(cospif CospifTest.cpp)
+add_conformance_test(cospif16 Cospif16Test.cpp)
 add_conformance_test(erff ErffTest.cpp)
+add_conformance_test(exp ExpTest.cpp)
 add_conformance_test(expf ExpfTest.cpp)
+add_conformance_test(expf16 Expf16Test.cpp)
+add_conformance_test(exp10 Exp10Test.cpp)
 add_conformance_test(exp10f Exp10fTest.cpp)
+add_conformance_test(exp10f16 Exp10f16Test.cpp)
+add_conformance_test(exp2 Exp2Test.cpp)
 add_conformance_test(exp2f Exp2fTest.cpp)
+add_conformance_test(exp2f16 Exp2f16Test.cpp)
+add_conformance_test(expm1 Expm1Test.cpp)
 add_conformance_test(expm1f Expm1fTest.cpp)
+add_conformance_test(expm1f16 Expm1f16Test.cpp)
+add_conformance_test(hypot HypotTest.cpp)
+add_conformance_test(hypotf HypotfTest.cpp)
 add_conformance_test(hypotf16 Hypotf16Test.cpp)
+add_conformance_test(log LogTest.cpp)
 add_conformance_test(logf LogfTest.cpp)
+add_conformance_test(logf16 Logf16Test.cpp)
+add_conformance_test(log10 Log10Test.cpp)
 add_conformance_test(log10f Log10fTest.cpp)
+add_conformance_test(log10f16 Log10f16Test.cpp)
+add_conformance_test(log1p Log1pTest.cpp)
 add_conformance_test(log1pf Log1pfTest.cpp)
+add_conformance_test(log2 Log2Test.cpp)
 add_conformance_test(log2f Log2fTest.cpp)
+add_conformance_test(log2f16 Log2f16Test.cpp)
+add_conformance_test(powf PowfTest.cpp)
+add_conformance_test(sin SinTest.cpp)
 add_conformance_test(sinf SinfTest.cpp)
+add_conformance_test(sinf16 Sinf16Test.cpp)
+add_conformance_test(sincos SincosTest.cpp)
 add_conformance_test(sincosf SincosfTest.cpp)
 add_conformance_test(sinhf SinhfTest.cpp)
+add_conformance_test(sinhf16 Sinhf16Test.cpp)
 add_conformance_test(sinpif SinpifTest.cpp)
+add_conformance_test(sinpif16 Sinpif16Test.cpp)
+add_conformance_test(tan TanTest.cpp)
 add_conformance_test(tanf TanfTest.cpp)
+add_conformance_test(tanf16 Tanf16Test.cpp)
 add_conformance_test(tanhf TanhfTest.cpp)
+add_conformance_test(tanhf16 Tanhf16Test.cpp)
 add_conformance_test(tanpif TanpifTest.cpp)
+add_conformance_test(tanpif16 Tanpif16Test.cpp)
diff --git a/offload/unittests/Conformance/tests/CbrtTest.cpp b/offload/unittests/Conformance/tests/CbrtTest.cpp
new file mode 100644
index 0000000..3a6523b
--- /dev/null
+++ b/offload/unittests/Conformance/tests/CbrtTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cbrt function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// The cast disambiguates the overloaded 'cbrt' by selecting the double version;
+// the resulting constant is the template argument used further down the file.
+constexpr auto cbrtd // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(cbrt);
+} // namespace
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for the double overload of 'cbrt'.
+template <> struct FunctionConfig<cbrtd> {
+  static constexpr llvm::StringRef Name = "cbrt";
+  static constexpr llvm::StringRef KernelName = "cbrtKernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the cbrt function");
+  using namespace mathtest;
+
+  // Random generator: fixed seed, size 2^32, default-constructed double range.
+  uint64_t RngSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> Domain;
+  RandomGenerator<double> Inputs(SeedTy{RngSeed}, GenSize, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (runTests<cbrtd>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/CosTest.cpp b/offload/unittests/Conformance/tests/CosTest.cpp
new file mode 100644
index 0000000..e3d3d3d
--- /dev/null
+++ b/offload/unittests/Conformance/tests/CosTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cos function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+// The cast disambiguates the overloaded 'cos' by selecting the double version;
+// the resulting constant is the template argument used further down the file.
+constexpr auto cosd // NOLINT(readability-identifier-naming)
+    = static_cast<double (*)(double)>(cos);
+} // namespace
+
+namespace mathtest {
+// Explicit FunctionConfig specialization for the double overload of 'cos'.
+template <> struct FunctionConfig<cosd> {
+  static constexpr llvm::StringRef Name = "cos";
+  static constexpr llvm::StringRef KernelName = "cosKernel";
+  // The tolerance set below is the value taken from the reference cited here.
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the cos function");
+  using namespace mathtest;
+
+  // Random generator: fixed seed, size 2^32, default-constructed double range.
+  uint64_t RngSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> Domain;
+  RandomGenerator<double> Inputs(SeedTy{RngSeed}, GenSize, Domain);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  if (runTests<cosd>(Inputs, TestConfigs, BinaryDir, Verbose))
+    return EXIT_SUCCESS;
+  return EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Cosf16Test.cpp b/offload/unittests/Conformance/tests/Cosf16Test.cpp
new file mode 100644
index 0000000..680e4b9
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Cosf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cosf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+// Function under test, declared with C linkage (float16 -> float16).
+extern "C" float16 cosf16(float16);
+
+namespace mathtest {
+/// Configuration of the 'cosf16' (float16) conformance test.
+template <> struct FunctionConfig<cosf16> {
+  static constexpr llvm::StringRef Name = "cosf16";
+  static constexpr llvm::StringRef KernelName = "cosf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the cosf16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<cosf16>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Coshf16Test.cpp b/offload/unittests/Conformance/tests/Coshf16Test.cpp
new file mode 100644
index 0000000..1b378b5
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Coshf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the coshf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+// Function under test, declared with C linkage (float16 -> float16).
+extern "C" float16 coshf16(float16);
+
+namespace mathtest {
+/// Configuration of the 'coshf16' (float16) conformance test.
+template <> struct FunctionConfig<coshf16> {
+  static constexpr llvm::StringRef Name = "coshf16";
+  static constexpr llvm::StringRef KernelName = "coshf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the coshf16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<coshf16>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Cospif16Test.cpp b/offload/unittests/Conformance/tests/Cospif16Test.cpp
new file mode 100644
index 0000000..84aa682
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Cospif16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the cospif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+// Function under test, declared with C linkage (float16 -> float16).
+extern "C" float16 cospif16(float16);
+
+namespace mathtest {
+/// Configuration of the 'cospif16' (float16) conformance test.
+template <> struct FunctionConfig<cospif16> {
+  static constexpr llvm::StringRef Name = "cospif16";
+  static constexpr llvm::StringRef KernelName = "cospif16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the cospif16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<cospif16>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Exp10Test.cpp b/offload/unittests/Conformance/tests/Exp10Test.cpp
new file mode 100644
index 0000000..05af478
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Exp10Test.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp10 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// 'exp10' is overloaded; the pointer type below picks its double version.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*exp10d)(double) = exp10;
+} // namespace
+
+namespace mathtest {
+/// Configuration of the 'exp10' (double) conformance test.
+template <> struct FunctionConfig<exp10d> {
+  static constexpr llvm::StringRef Name = "exp10";
+  static constexpr llvm::StringRef KernelName = "exp10Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 3;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the exp10 function");
+
+  // Random generator inputs: seed 42, size 2^32, default-constructed range.
+  uint64_t RngSeed = 42;
+  uint64_t NumInputs = uint64_t{1} << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{RngSeed}, NumInputs, InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<exp10d>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Exp10f16Test.cpp b/offload/unittests/Conformance/tests/Exp10f16Test.cpp
new file mode 100644
index 0000000..7d61ad0
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Exp10f16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp10f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+// Function under test, declared with C linkage (float16 -> float16).
+extern "C" float16 exp10f16(float16);
+
+namespace mathtest {
+/// Configuration of the 'exp10f16' (float16) conformance test.
+template <> struct FunctionConfig<exp10f16> {
+  static constexpr llvm::StringRef Name = "exp10f16";
+  static constexpr llvm::StringRef KernelName = "exp10f16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the exp10f16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<exp10f16>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Exp2Test.cpp b/offload/unittests/Conformance/tests/Exp2Test.cpp
new file mode 100644
index 0000000..bb2fa10
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Exp2Test.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp2 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// 'exp2' is overloaded; the pointer type below picks its double version.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*exp2d)(double) = exp2;
+} // namespace
+
+namespace mathtest {
+/// Configuration of the 'exp2' (double) conformance test.
+template <> struct FunctionConfig<exp2d> {
+  static constexpr llvm::StringRef Name = "exp2";
+  static constexpr llvm::StringRef KernelName = "exp2Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 3;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the exp2 function");
+
+  // Random generator inputs: seed 42, size 2^32, default-constructed range.
+  uint64_t RngSeed = 42;
+  uint64_t NumInputs = uint64_t{1} << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{RngSeed}, NumInputs, InputRange);
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<exp2d>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Exp2f16Test.cpp b/offload/unittests/Conformance/tests/Exp2f16Test.cpp
new file mode 100644
index 0000000..9ea9256
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Exp2f16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp2f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+// Function under test, declared with C linkage (float16 -> float16).
+extern "C" float16 exp2f16(float16);
+
+namespace mathtest {
+/// Configuration of the 'exp2f16' (float16) conformance test.
+template <> struct FunctionConfig<exp2f16> {
+  static constexpr llvm::StringRef Name = "exp2f16";
+  static constexpr llvm::StringRef KernelName = "exp2f16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the exp2f16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<exp2f16>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/ExpTest.cpp b/offload/unittests/Conformance/tests/ExpTest.cpp
new file mode 100644
index 0000000..9aa52b1
--- /dev/null
+++ b/offload/unittests/Conformance/tests/ExpTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the exp function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// 'exp' is overloaded; the pointer type below picks its double version.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*expd)(double) = exp;
+} // namespace
+
+namespace mathtest {
+/// Configuration of the 'exp' (double) conformance test.
+template <> struct FunctionConfig<expd> {
+  static constexpr llvm::StringRef Name = "exp";
+  static constexpr llvm::StringRef KernelName = "expKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 3;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the exp function");
+
+  // Random generator inputs: seed 42, size 2^32, default-constructed range.
+  uint64_t RngSeed = 42;
+  uint64_t NumInputs = uint64_t{1} << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{RngSeed}, NumInputs, InputRange);
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<expd>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Expf16Test.cpp b/offload/unittests/Conformance/tests/Expf16Test.cpp
new file mode 100644
index 0000000..8938815
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Expf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the expf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+// Function under test, declared with C linkage (float16 -> float16).
+extern "C" float16 expf16(float16);
+
+namespace mathtest {
+/// Configuration of the 'expf16' (float16) conformance test.
+template <> struct FunctionConfig<expf16> {
+  static constexpr llvm::StringRef Name = "expf16";
+  static constexpr llvm::StringRef KernelName = "expf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the expf16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<expf16>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Expm1Test.cpp b/offload/unittests/Conformance/tests/Expm1Test.cpp
new file mode 100644
index 0000000..a27944b
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Expm1Test.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the expm1 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// 'expm1' is overloaded; the pointer type below picks its double version.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*expm1d)(double) = expm1;
+} // namespace
+
+namespace mathtest {
+/// Configuration of the 'expm1' (double) conformance test.
+template <> struct FunctionConfig<expm1d> {
+  static constexpr llvm::StringRef Name = "expm1";
+  static constexpr llvm::StringRef KernelName = "expm1Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 3;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the expm1 function");
+
+  // Random generator inputs: seed 42, size 2^32, default-constructed range.
+  uint64_t RngSeed = 42;
+  uint64_t NumInputs = uint64_t{1} << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{RngSeed}, NumInputs, InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<expm1d>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Expm1f16Test.cpp b/offload/unittests/Conformance/tests/Expm1f16Test.cpp
new file mode 100644
index 0000000..447196bb
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Expm1f16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the expm1f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+// Function under test, declared with C linkage (float16 -> float16).
+extern "C" float16 expm1f16(float16);
+
+namespace mathtest {
+/// Configuration of the 'expm1f16' (float16) conformance test.
+template <> struct FunctionConfig<expm1f16> {
+  static constexpr llvm::StringRef Name = "expm1f16";
+  static constexpr llvm::StringRef KernelName = "expm1f16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the expm1f16 function");
+
+  // Exhaustive generator over a default-constructed float16 range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<expm1f16>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/HypotTest.cpp b/offload/unittests/Conformance/tests/HypotTest.cpp
new file mode 100644
index 0000000..0417ad9
--- /dev/null
+++ b/offload/unittests/Conformance/tests/HypotTest.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the hypot function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// 'hypot' is overloaded; the pointer type below picks its double version.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*hypotd)(double, double) = hypot;
+} // namespace
+
+namespace mathtest {
+/// Configuration of the 'hypot' (double) conformance test.
+template <> struct FunctionConfig<hypotd> {
+  static constexpr llvm::StringRef Name = "hypot";
+  static constexpr llvm::StringRef KernelName = "hypotKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the hypot function");
+
+  // Random generator inputs: seed 42, size 2^32, two default-constructed ranges.
+  uint64_t RngSeed = 42;
+  uint64_t NumInputs = uint64_t{1} << 32;
+  IndexedRange<double> XRange;
+  IndexedRange<double> YRange;
+  RandomGenerator<double, double> InputGen(SeedTy{RngSeed}, NumInputs, XRange,
+                                           YRange);
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<hypotd>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/HypotfTest.cpp b/offload/unittests/Conformance/tests/HypotfTest.cpp
new file mode 100644
index 0000000..98a4e90
--- /dev/null
+++ b/offload/unittests/Conformance/tests/HypotfTest.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the hypotf function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace mathtest {
+/// Configuration of the 'hypotf' (float) conformance test.
+template <> struct FunctionConfig<hypotf> {
+  static constexpr llvm::StringRef Name = "hypotf";
+  static constexpr llvm::StringRef KernelName = "hypotfKernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 4;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the hypotf function");
+
+  // Random generator inputs: seed 42, size 2^32, two default-constructed ranges.
+  uint64_t RngSeed = 42;
+  uint64_t NumInputs = uint64_t{1} << 32;
+  IndexedRange<float> XRange;
+  IndexedRange<float> YRange;
+  RandomGenerator<float, float> InputGen(SeedTy{RngSeed}, NumInputs, XRange,
+                                         YRange);
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<hypotf>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log10Test.cpp b/offload/unittests/Conformance/tests/Log10Test.cpp
new file mode 100644
index 0000000..bf46f11
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log10Test.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log10 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <limits>
+#include <math.h>
+
+namespace {
+
+// 'log10' is overloaded; the pointer type below picks its double version.
+// NOLINTNEXTLINE(readability-identifier-naming)
+constexpr double (*log10d)(double) = log10;
+} // namespace
+
+namespace mathtest {
+/// Configuration of the 'log10' (double) conformance test.
+template <> struct FunctionConfig<log10d> {
+  static constexpr llvm::StringRef Name = "log10";
+  static constexpr llvm::StringRef KernelName = "log10Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 3;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log10 function");
+
+  // Random generator inputs: seed 42, size 2^32, range 0.0 to +inf, inclusive.
+  uint64_t RngSeed = 42;
+  uint64_t NumInputs = uint64_t{1} << 32;
+  IndexedRange<double> InputRange(
+      /*Begin=*/0.0, /*End=*/std::numeric_limits<double>::infinity(),
+      /*Inclusive=*/true);
+  RandomGenerator<double> InputGen(SeedTy{RngSeed}, NumInputs, InputRange);
+
+  const auto TestConfigs = cl::getTestConfigs();
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const bool Verbose = cl::IsVerbose;
+
+  const bool Ok = runTests<log10d>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!Ok)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log10f16Test.cpp b/offload/unittests/Conformance/tests/Log10f16Test.cpp
new file mode 100644
index 0000000..605e1ae
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log10f16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log10f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 log10f16(float16);
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the 'log10f16' test.
+template <> struct FunctionConfig<log10f16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef Name{"log10f16"};
+  static constexpr llvm::StringRef KernelName{"log10f16Kernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the log10f16 function");
+
+  // Inputs come from an ExhaustiveGenerator over [0, getMaxOrInf<float16>()].
+  IndexedRange<float16> InputRange(
+      /*Begin=*/float16(0.0), /*End=*/getMaxOrInf<float16>(),
+      /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log10f16>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log1pTest.cpp b/offload/unittests/Conformance/tests/Log1pTest.cpp
new file mode 100644
index 0000000..023b67e
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log1pTest.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log1p function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <limits>
+#include <math.h>
+
+namespace {
+
+// 'log1p' is overloaded; bind the double(double) version to a unique name
+constexpr double (*log1pd)(double) // NOLINT(readability-identifier-naming)
+    = log1p;
+} // namespace
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the double-precision 'log1p' test.
+template <> struct FunctionConfig<log1pd> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef Name{"log1p"};
+  static constexpr llvm::StringRef KernelName{"log1pKernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log1p function");
+
+  // Fixed seed and size (2^32) for the generator; inputs span [-1, +inf].
+  uint64_t GenSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> InputRange(
+      /*Begin=*/-1.0, /*End=*/std::numeric_limits<double>::infinity(),
+      /*Inclusive=*/true);
+  RandomGenerator<double> InputGen(SeedTy{GenSeed}, GenSize, InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log1pd>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log2Test.cpp b/offload/unittests/Conformance/tests/Log2Test.cpp
new file mode 100644
index 0000000..2ae7e5c
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log2Test.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log2 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <limits>
+#include <math.h>
+
+namespace {
+
+// 'log2' is overloaded; bind the double(double) version to a unique name
+constexpr double (*log2d)(double) // NOLINT(readability-identifier-naming)
+    = log2;
+} // namespace
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the double-precision 'log2' test.
+template <> struct FunctionConfig<log2d> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{3};
+  static constexpr llvm::StringRef Name{"log2"};
+  static constexpr llvm::StringRef KernelName{"log2Kernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log2 function");
+
+  // Fixed seed and size (2^32) for the generator; inputs span [0, +inf].
+  uint64_t GenSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> InputRange(
+      /*Begin=*/0.0, /*End=*/std::numeric_limits<double>::infinity(),
+      /*Inclusive=*/true);
+  RandomGenerator<double> InputGen(SeedTy{GenSeed}, GenSize, InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log2d>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Log2f16Test.cpp b/offload/unittests/Conformance/tests/Log2f16Test.cpp
new file mode 100644
index 0000000..5ce4696
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Log2f16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log2f16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 log2f16(float16);
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the 'log2f16' test.
+template <> struct FunctionConfig<log2f16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef Name{"log2f16"};
+  static constexpr llvm::StringRef KernelName{"log2f16Kernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log2f16 function");
+
+  // Inputs come from an ExhaustiveGenerator over [0, getMaxOrInf<float16>()].
+  IndexedRange<float16> InputRange(
+      /*Begin=*/float16(0.0), /*End=*/getMaxOrInf<float16>(),
+      /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<log2f16>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/LogTest.cpp b/offload/unittests/Conformance/tests/LogTest.cpp
new file mode 100644
index 0000000..ae568e2
--- /dev/null
+++ b/offload/unittests/Conformance/tests/LogTest.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the log function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <limits>
+#include <math.h>
+
+namespace {
+
+// 'log' is overloaded; bind the double(double) version to a unique name
+constexpr double (*logd)(double) // NOLINT(readability-identifier-naming)
+    = log;
+} // namespace
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the double-precision 'log' test.
+template <> struct FunctionConfig<logd> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{3};
+  static constexpr llvm::StringRef Name{"log"};
+  static constexpr llvm::StringRef KernelName{"logKernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the log function");
+
+  // Fixed seed and size (2^32) for the generator; inputs span [0, +inf].
+  uint64_t GenSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> InputRange(
+      /*Begin=*/0.0, /*End=*/std::numeric_limits<double>::infinity(),
+      /*Inclusive=*/true);
+  RandomGenerator<double> InputGen(SeedTy{GenSeed}, GenSize, InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<logd>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Logf16Test.cpp b/offload/unittests/Conformance/tests/Logf16Test.cpp
new file mode 100644
index 0000000..372dccb
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Logf16Test.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the logf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/Numerics.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 logf16(float16);
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the 'logf16' test.
+template <> struct FunctionConfig<logf16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef Name{"logf16"};
+  static constexpr llvm::StringRef KernelName{"logf16Kernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the logf16 function");
+
+  // Inputs come from an ExhaustiveGenerator over [0, getMaxOrInf<float16>()].
+  IndexedRange<float16> InputRange(
+      /*Begin=*/float16(0.0), /*End=*/getMaxOrInf<float16>(),
+      /*Inclusive=*/true);
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<logf16>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/PowfTest.cpp b/offload/unittests/Conformance/tests/PowfTest.cpp
new file mode 100644
index 0000000..246801e
--- /dev/null
+++ b/offload/unittests/Conformance/tests/PowfTest.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the powf function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+static inline float powfRoundedExponent(float Base, float Exponent) {
+  return powf(Base, roundf(Exponent)); // roundf makes the exponent integral
+}
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for 'powf' as called directly.
+template <> struct FunctionConfig<powf> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{16};
+  static constexpr llvm::StringRef Name{"powf (real exponents)"};
+  static constexpr llvm::StringRef KernelName{"powfKernel"};
+};
+
+/// Name, kernel name and ULP tolerance for 'powf' with roundf'ed exponents.
+template <> struct FunctionConfig<powfRoundedExponent> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 65, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{16};
+  static constexpr llvm::StringRef Name{"powf (integer exponents)"};
+  static constexpr llvm::StringRef KernelName{"powfRoundedExponentKernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the powf function");
+
+  // Each variant gets its own generator (seeds 42 and 51) of size 2^32.
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<float> XRange;
+  IndexedRange<float> YRange;
+  RandomGenerator<float, float> RealGen(SeedTy{42}, GenSize, XRange, YRange);
+  RandomGenerator<float, float> IntGen(SeedTy{51}, GenSize, XRange, YRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  // Both variants always run; the exit code reflects the combined outcome.
+  bool RealOk = runTests<powf>(RealGen, TestConfigs, BinaryDir, Verbose);
+  bool IntegerOk = runTests<powfRoundedExponent>(IntGen, TestConfigs,
+                                                 BinaryDir, Verbose);
+  if (!RealOk || !IntegerOk)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/SinTest.cpp b/offload/unittests/Conformance/tests/SinTest.cpp
new file mode 100644
index 0000000..36897d7
--- /dev/null
+++ b/offload/unittests/Conformance/tests/SinTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sin function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// 'sin' is overloaded; bind the double(double) version to a unique name
+constexpr double (*sind)(double) // NOLINT(readability-identifier-naming)
+    = sin;
+} // namespace
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the double-precision 'sin' test.
+template <> struct FunctionConfig<sind> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{4};
+  static constexpr llvm::StringRef Name{"sin"};
+  static constexpr llvm::StringRef KernelName{"sinKernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the sin function");
+
+  // Fixed seed and size (2^32) for the generator; default-constructed range.
+  uint64_t GenSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{GenSeed}, GenSize, InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<sind>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/SincosTest.cpp b/offload/unittests/Conformance/tests/SincosTest.cpp
new file mode 100644
index 0000000..a3d1650
--- /dev/null
+++ b/offload/unittests/Conformance/tests/SincosTest.cpp
@@ -0,0 +1,80 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sincos function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+static inline double sincosSin(double X) {
+  double Sine = 0.0, UnusedCosine = 0.0; // both are overwritten by sincos
+  sincos(X, &Sine, &UnusedCosine);
+  return Sine;
+}
+
+static inline double sincosCos(double X) {
+  double UnusedSine = 0.0, Cosine = 0.0; // both are overwritten by sincos
+  sincos(X, &UnusedSine, &Cosine);
+  return Cosine;
+}
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the sine output of 'sincos'.
+template <> struct FunctionConfig<sincosSin> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{4};
+  static constexpr llvm::StringRef Name{"sincos (sin part)"};
+  static constexpr llvm::StringRef KernelName{"sincosSinKernel"};
+};
+
+/// Name, kernel name and ULP tolerance for the cosine output of 'sincos'.
+template <> struct FunctionConfig<sincosCos> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{4};
+  static constexpr llvm::StringRef Name{"sincos (cos part)"};
+  static constexpr llvm::StringRef KernelName{"sincosCosKernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the sincos function");
+
+  // Fixed seed and size (2^32) for the generator; default-constructed range.
+  uint64_t GenSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{GenSeed}, GenSize, InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  // Both parts always run and are handed the same generator object.
+  bool SinOk = runTests<sincosSin>(InputGen, TestConfigs, BinaryDir, Verbose);
+  bool CosOk = runTests<sincosCos>(InputGen, TestConfigs, BinaryDir, Verbose);
+  if (!SinOk || !CosOk)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Sinf16Test.cpp b/offload/unittests/Conformance/tests/Sinf16Test.cpp
new file mode 100644
index 0000000..4c5fb22
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Sinf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sinf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 sinf16(float16);
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the 'sinf16' test.
+template <> struct FunctionConfig<sinf16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef Name{"sinf16"};
+  static constexpr llvm::StringRef KernelName{"sinf16Kernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the sinf16 function");
+
+  // Inputs come from an ExhaustiveGenerator over a default-constructed range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<sinf16>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Sinhf16Test.cpp b/offload/unittests/Conformance/tests/Sinhf16Test.cpp
new file mode 100644
index 0000000..fe6f7dd
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Sinhf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sinhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 sinhf16(float16);
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the 'sinhf16' test.
+template <> struct FunctionConfig<sinhf16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef Name{"sinhf16"};
+  static constexpr llvm::StringRef KernelName{"sinhf16Kernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the sinhf16 function");
+
+  // Inputs come from an ExhaustiveGenerator over a default-constructed range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<sinhf16>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Sinpif16Test.cpp b/offload/unittests/Conformance/tests/Sinpif16Test.cpp
new file mode 100644
index 0000000..ff9c93c
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Sinpif16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the sinpif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 sinpif16(float16);
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the 'sinpif16' test.
+template <> struct FunctionConfig<sinpif16> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{2};
+  static constexpr llvm::StringRef Name{"sinpif16"};
+  static constexpr llvm::StringRef KernelName{"sinpif16Kernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the sinpif16 function");
+
+  // Inputs come from an ExhaustiveGenerator over a default-constructed range.
+  IndexedRange<float16> InputRange;
+  ExhaustiveGenerator<float16> InputGen(InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<sinpif16>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/TanTest.cpp b/offload/unittests/Conformance/tests/TanTest.cpp
new file mode 100644
index 0000000..3a9a058
--- /dev/null
+++ b/offload/unittests/Conformance/tests/TanTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the tan function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/RandomGenerator.hpp"
+#include "mathtest/RandomState.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+namespace {
+
+// 'tan' is overloaded; bind the double(double) version to a unique name
+constexpr double (*tand)(double) // NOLINT(readability-identifier-naming)
+    = tan;
+} // namespace
+
+namespace mathtest {
+
+/// Name, kernel name and ULP tolerance for the double-precision 'tan' test.
+template <> struct FunctionConfig<tand> {
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 68, Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance{5};
+  static constexpr llvm::StringRef Name{"tan"};
+  static constexpr llvm::StringRef KernelName{"tanKernel"};
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  using namespace mathtest;
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the tan function");
+
+  // Fixed seed and size (2^32) for the generator; default-constructed range.
+  uint64_t GenSeed = 42;
+  uint64_t GenSize = 1ULL << 32;
+  IndexedRange<double> InputRange;
+  RandomGenerator<double> InputGen(SeedTy{GenSeed}, GenSize, InputRange);
+
+  const llvm::StringRef BinaryDir = DEVICE_BINARY_DIR;
+  const auto TestConfigs = cl::getTestConfigs();
+  const bool Verbose = cl::IsVerbose;
+
+  if (!runTests<tand>(InputGen, TestConfigs, BinaryDir, Verbose))
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/offload/unittests/Conformance/tests/Tanf16Test.cpp b/offload/unittests/Conformance/tests/Tanf16Test.cpp
new file mode 100644
index 0000000..eae9818
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Tanf16Test.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the tanf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 tanf16(float16);
+
+namespace mathtest {
+
+template <> struct FunctionConfig<tanf16> {
+  static constexpr llvm::StringRef Name = "tanf16";
+  static constexpr llvm::StringRef KernelName = "tanf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  // Note:   The minimum accuracy at the source is 2.5 ULP, but we round it
+  //         down to ensure conformance.
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the tanf16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range;
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<tanf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Tanhf16Test.cpp b/offload/unittests/Conformance/tests/Tanhf16Test.cpp
new file mode 100644
index 0000000..1a11f3d
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Tanhf16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the tanhf16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 tanhf16(float16);
+
+namespace mathtest {
+
+template <> struct FunctionConfig<tanhf16> {
+  static constexpr llvm::StringRef Name = "tanhf16";
+  static constexpr llvm::StringRef KernelName = "tanhf16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Conformance test of the tanhf16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range;
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<tanhf16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/Conformance/tests/Tanpif16Test.cpp b/offload/unittests/Conformance/tests/Tanpif16Test.cpp
new file mode 100644
index 0000000..7637480
--- /dev/null
+++ b/offload/unittests/Conformance/tests/Tanpif16Test.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the conformance test of the tanpif16 function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mathtest/CommandLineExtras.hpp"
+#include "mathtest/ExhaustiveGenerator.hpp"
+#include "mathtest/IndexedRange.hpp"
+#include "mathtest/TestConfig.hpp"
+#include "mathtest/TestRunner.hpp"
+#include "mathtest/TypeExtras.hpp"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cstdlib>
+#include <math.h>
+
+using namespace mathtest;
+
+extern "C" float16 tanpif16(float16);
+
+namespace mathtest {
+
+template <> struct FunctionConfig<tanpif16> {
+  static constexpr llvm::StringRef Name = "tanpif16";
+  static constexpr llvm::StringRef KernelName = "tanpif16Kernel";
+
+  // Source: The Khronos Group, The OpenCL C Specification v3.0.19, Sec. 7.4,
+  //         Table 69 (Full Profile), Khronos Registry [July 10, 2025].
+  static constexpr uint64_t UlpTolerance = 2;
+};
+} // namespace mathtest
+
+int main(int argc, const char **argv) {
+  llvm::cl::ParseCommandLineOptions(
+      argc, argv, "Conformance test of the tanpif16 function");
+
+  using namespace mathtest;
+
+  IndexedRange<float16> Range;
+  ExhaustiveGenerator<float16> Generator(Range);
+
+  const auto Configs = cl::getTestConfigs();
+  const llvm::StringRef DeviceBinaryDir = DEVICE_BINARY_DIR;
+  const bool IsVerbose = cl::IsVerbose;
+
+  bool Passed =
+      runTests<tanpif16>(Generator, Configs, DeviceBinaryDir, IsVerbose);
+
+  return Passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 8f0267e..b2d51442 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -20,10 +20,12 @@ add_offload_unittest("init"
 target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
 
 add_offload_unittest("kernel"
+    kernel/olCalculateOptimalOccupancy.cpp
     kernel/olLaunchKernel.cpp)
 
 add_offload_unittest("memory"
     memory/olMemAlloc.cpp
+    memory/olMemFill.cpp
     memory/olMemFree.cpp
     memory/olMemcpy.cpp)
 
@@ -41,7 +43,8 @@ add_offload_unittest("queue"
     queue/olDestroyQueue.cpp
     queue/olGetQueueInfo.cpp
     queue/olGetQueueInfoSize.cpp
-    queue/olWaitEvents.cpp)
+    queue/olWaitEvents.cpp
+    queue/olLaunchHostFunction.cpp)
 
 add_offload_unittest("symbol"
     symbol/olGetSymbol.cpp
diff --git a/offload/unittests/OffloadAPI/common/Environment.cpp b/offload/unittests/OffloadAPI/common/Environment.cpp
index ef092cd..8007713 100644
--- a/offload/unittests/OffloadAPI/common/Environment.cpp
+++ b/offload/unittests/OffloadAPI/common/Environment.cpp
@@ -41,9 +41,9 @@ raw_ostream &operator<<(raw_ostream &Out,
 
 raw_ostream &operator<<(raw_ostream &Out, const ol_device_handle_t &Device) {
   size_t Size;
-  olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size);
+  olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size);
   std::vector<char> Name(Size);
-  olGetDeviceInfo(Device, OL_DEVICE_INFO_NAME, Size, Name.data());
+  olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data());
   Out << Name.data();
   return Out;
 }
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index 43240fa..0538e60f 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -26,6 +26,20 @@
   } while (0)
 #endif
 
+#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED
+#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL)                                  \
+  do {                                                                         \
+    ol_result_t Res = ACTUAL;                                                  \
+    if (Res && Res->Code == OL_ERRC_UNSUPPORTED) {                             \
+      GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test";          \
+      return;                                                                  \
+    } else if (Res && Res->Code != OL_ERRC_SUCCESS) {                          \
+      GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": "                \
+                   << Res->Details;                                            \
+    }                                                                          \
+  } while (0)
+#endif
+
 // TODO: rework this so the EXPECTED/ACTUAL results are readable
 #ifndef ASSERT_ERROR
 #define ASSERT_ERROR(EXPECTED, ACTUAL)                                         \
@@ -75,6 +89,40 @@ template <typename Fn> inline void threadify(Fn body) {
   }
 }
 
+/// Enqueues a task to the queue that can be manually resolved: it blocks
+/// until `trigger` is called or a one second timeout expires.
+struct ManuallyTriggeredTask {
+  std::mutex M;
+  std::condition_variable CV;
+  bool Flag = false; // Set by trigger(); always accessed while holding M.
+  ol_event_handle_t CompleteEvent;
+
+  ol_result_t enqueue(ol_queue_handle_t Queue) {
+    if (auto Err = olLaunchHostFunction(
+            Queue,
+            [](void *That) {
+              static_cast<ManuallyTriggeredTask *>(That)->wait();
+            },
+            this))
+      return Err;
+    return olCreateEvent(Queue, &CompleteEvent);
+  }
+
+  void wait() {
+    std::unique_lock<std::mutex> Lock(M);
+    CV.wait_for(Lock, std::chrono::milliseconds(1000), [&] { return Flag; });
+    EXPECT_TRUE(Flag);
+  }
+
+  ol_result_t trigger() {
+    std::unique_lock<std::mutex> Lock(M); // wait() reads Flag under M.
+    Flag = true;
+    Lock.unlock(); // Release before syncing: the queued task re-locks M.
+    CV.notify_one();
+    return olSyncEvent(CompleteEvent);
+  }
+};
+
 struct OffloadTest : ::testing::Test {
   ol_device_handle_t Host = TestEnvironment::getHostDevice();
 };
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
index 5657320..8cb0b80 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
@@ -13,6 +13,38 @@
 using olGetDeviceInfoTest = OffloadDeviceTest;
 OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetDeviceInfoTest);
 
+#define OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Dev,   \
+                                          Expr)                                \
+  TEST_P(olGetDeviceInfoTest, Test##Dev##TestName) {                           \
+    PropType Value;                                                            \
+    ASSERT_SUCCESS(olGetDeviceInfo(Dev, PropName, sizeof(Value), &Value));     \
+    Expr;                                                                      \
+  }
+
+#define OL_DEVICE_INFO_TEST_DEVICE_SUCCESS(TestName, PropType, PropName)       \
+  OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Device, {})
+
+#define OL_DEVICE_INFO_TEST_HOST_SUCCESS(TestName, PropType, PropName)         \
+  OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Host, {})
+
+#define OL_DEVICE_INFO_TEST_SUCCESS(TestName, PropType, PropName)              \
+  OL_DEVICE_INFO_TEST_DEVICE_SUCCESS(TestName, PropType, PropName)             \
+  OL_DEVICE_INFO_TEST_HOST_SUCCESS(TestName, PropType, PropName)
+
+#define OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(TestName, PropType, PropName,      \
+                                            LowBound)                          \
+  OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Device,      \
+                                    ASSERT_GT(Value, LowBound))
+
+#define OL_DEVICE_INFO_TEST_HOST_VALUE_GT(TestName, PropType, PropName,        \
+                                          LowBound)                            \
+  OL_DEVICE_INFO_TEST_SUCCESS_CHECK(TestName, PropType, PropName, Host,        \
+                                    ASSERT_GT(Value, LowBound))
+
+#define OL_DEVICE_INFO_TEST_VALUE_GT(TestName, PropType, PropName, LowBound)   \
+  OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(TestName, PropType, PropName, LowBound)  \
+  OL_DEVICE_INFO_TEST_HOST_VALUE_GT(TestName, PropType, PropName, LowBound)
+
 TEST_P(olGetDeviceInfoTest, SuccessType) {
   ol_device_type_t DeviceType;
   ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_TYPE,
@@ -54,6 +86,29 @@ TEST_P(olGetDeviceInfoTest, HostName) {
   ASSERT_EQ(std::strlen(Name.data()), Size - 1);
 }
 
+TEST_P(olGetDeviceInfoTest, SuccessProductName) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(
+      olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PRODUCT_NAME, &Size));
+  ASSERT_GT(Size, 0ul);
+  std::vector<char> Name;
+  Name.resize(Size);
+  ASSERT_SUCCESS(
+      olGetDeviceInfo(Device, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data()));
+  ASSERT_EQ(std::strlen(Name.data()), Size - 1);
+}
+
+TEST_P(olGetDeviceInfoTest, HostProductName) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size));
+  ASSERT_GT(Size, 0ul);
+  std::vector<char> Name;
+  Name.resize(Size);
+  ASSERT_SUCCESS(
+      olGetDeviceInfo(Host, OL_DEVICE_INFO_PRODUCT_NAME, Size, Name.data()));
+  ASSERT_EQ(std::strlen(Name.data()), Size - 1);
+}
+
 TEST_P(olGetDeviceInfoTest, SuccessVendor) {
   size_t Size = 0;
   ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size));
@@ -77,12 +132,8 @@ TEST_P(olGetDeviceInfoTest, SuccessDriverVersion) {
   ASSERT_EQ(std::strlen(DriverVersion.data()), Size - 1);
 }
 
-TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSize) {
-  uint32_t Value;
-  ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
-                                 sizeof(Value), &Value));
-  ASSERT_GT(Value, 0u);
-}
+OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkGroupSize, uint32_t,
+                             OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, 0);
 
 TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) {
   ol_dimensions_t Value{0, 0, 0};
@@ -94,6 +145,59 @@ TEST_P(olGetDeviceInfoTest, SuccessMaxWorkGroupSizePerDimension) {
   ASSERT_GT(Value.z, 0u);
 }
 
+OL_DEVICE_INFO_TEST_VALUE_GT(MaxWorkSize, uint32_t,
+                             OL_DEVICE_INFO_MAX_WORK_SIZE, 0);
+
+TEST_P(olGetDeviceInfoTest, SuccessMaxWorkSizePerDimension) {
+  ol_dimensions_t Value{0, 0, 0};
+  ASSERT_SUCCESS(olGetDeviceInfo(Device,
+                                 OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION,
+                                 sizeof(Value), &Value));
+  ASSERT_GT(Value.x, 0u);
+  ASSERT_GT(Value.y, 0u);
+  ASSERT_GT(Value.z, 0u);
+}
+
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(VendorId, uint32_t,
+                                    OL_DEVICE_INFO_VENDOR_ID, 0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID);
+OL_DEVICE_INFO_TEST_VALUE_GT(NumComputeUnits, uint32_t,
+                             OL_DEVICE_INFO_NUM_COMPUTE_UNITS, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(SingleFPConfig, ol_device_fp_capability_flags_t,
+                             OL_DEVICE_INFO_SINGLE_FP_CONFIG, 0);
+OL_DEVICE_INFO_TEST_SUCCESS(HalfFPConfig, ol_device_fp_capability_flags_t,
+                            OL_DEVICE_INFO_HALF_FP_CONFIG);
+OL_DEVICE_INFO_TEST_VALUE_GT(DoubleFPConfig, ol_device_fp_capability_flags_t,
+                             OL_DEVICE_INFO_DOUBLE_FP_CONFIG, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthChar, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthShort, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthInt, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthLong, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthFloat, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(NativeVectorWidthDouble, uint32_t,
+                             OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE, 0);
+OL_DEVICE_INFO_TEST_SUCCESS(NativeVectorWidthHalf, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF);
+OL_DEVICE_INFO_TEST_VALUE_GT(MaxClockFrequency, uint32_t,
+                             OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(MemoryClockRate, uint32_t,
+                             OL_DEVICE_INFO_MEMORY_CLOCK_RATE, 0);
+OL_DEVICE_INFO_TEST_VALUE_GT(AddressBits, uint32_t, OL_DEVICE_INFO_ADDRESS_BITS,
+                             0);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(MaxMemAllocSize, uint64_t,
+                                    OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, 0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(MaxMemAllocSize, uint64_t,
+                                 OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t,
+                                    OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t,
+                                 OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+
 TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) {
   ol_device_type_t DeviceType;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
index 4e29978..c4a3c2d 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
@@ -13,48 +13,76 @@
 using olGetDeviceInfoSizeTest = OffloadDeviceTest;
 OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetDeviceInfoSizeTest);
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessType) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_TYPE, &Size));
-  ASSERT_EQ(Size, sizeof(ol_device_type_t));
-}
+#define OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, Expr)                     \
+  TEST_P(olGetDeviceInfoSizeTest, Success##TestName) {                         \
+    size_t Size = 0;                                                           \
+    ASSERT_SUCCESS(olGetDeviceInfoSize(Device, PropName, &Size));              \
+    Expr;                                                                      \
+  }
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessPlatform) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_PLATFORM, &Size));
-  ASSERT_EQ(Size, sizeof(ol_platform_handle_t));
-}
+#define OL_DEVICE_INFO_SIZE_TEST_EQ(TestName, PropType, PropName)              \
+  OL_DEVICE_INFO_SIZE_TEST(TestName, PropName,                                 \
+                           ASSERT_EQ(Size, sizeof(PropType)));
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessName) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_NAME, &Size));
-  ASSERT_NE(Size, 0ul);
-}
-
-TEST_P(olGetDeviceInfoSizeTest, SuccessVendor) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size));
-  ASSERT_NE(Size, 0ul);
-}
+#define OL_DEVICE_INFO_SIZE_TEST_NONZERO(TestName, PropName)                   \
+  OL_DEVICE_INFO_SIZE_TEST(TestName, PropName, ASSERT_NE(Size, 0ul));
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessDriverVersion) {
-  size_t Size = 0;
-  ASSERT_SUCCESS(
-      olGetDeviceInfoSize(Device, OL_DEVICE_INFO_DRIVER_VERSION, &Size));
-  ASSERT_NE(Size, 0ul);
-}
+OL_DEVICE_INFO_SIZE_TEST_EQ(Type, ol_device_type_t, OL_DEVICE_INFO_TYPE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t,
+                            OL_DEVICE_INFO_PLATFORM);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR);
+OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t,
+                            OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkSize, uint32_t,
+                            OL_DEVICE_INFO_MAX_WORK_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(VendorId, uint32_t, OL_DEVICE_INFO_VENDOR_ID);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NumComputeUnits, uint32_t,
+                            OL_DEVICE_INFO_NUM_COMPUTE_UNITS);
+OL_DEVICE_INFO_SIZE_TEST_EQ(SingleFPConfig, ol_device_fp_capability_flags_t,
+                            OL_DEVICE_INFO_SINGLE_FP_CONFIG);
+OL_DEVICE_INFO_SIZE_TEST_EQ(HalfFPConfig, ol_device_fp_capability_flags_t,
+                            OL_DEVICE_INFO_HALF_FP_CONFIG);
+OL_DEVICE_INFO_SIZE_TEST_EQ(DoubleFPConfig, ol_device_fp_capability_flags_t,
+                            OL_DEVICE_INFO_DOUBLE_FP_CONFIG);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthChar, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthShort, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthInt, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthLong, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthFloat, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthDouble, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(NativeVectorWidthHalf, uint32_t,
+                            OL_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxClockFrequency, uint32_t,
+                            OL_DEVICE_INFO_MAX_CLOCK_FREQUENCY);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MemoryClockRate, uint32_t,
+                            OL_DEVICE_INFO_MEMORY_CLOCK_RATE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(AddressBits, uint32_t, OL_DEVICE_INFO_ADDRESS_BITS);
+OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t,
+                            OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t,
+                            OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSize) {
+TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
   size_t Size = 0;
-  ASSERT_SUCCESS(
-      olGetDeviceInfoSize(Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE, &Size));
-  ASSERT_EQ(Size, sizeof(uint32_t));
+  ASSERT_SUCCESS(olGetDeviceInfoSize(
+      Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, &Size));
+  ASSERT_EQ(Size, sizeof(ol_dimensions_t));
+  ASSERT_EQ(Size, sizeof(uint32_t) * 3);
 }
 
-TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
+TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkSizePerDimension) {
   size_t Size = 0;
   ASSERT_SUCCESS(olGetDeviceInfoSize(
-      Device, OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION, &Size));
+      Device, OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION, &Size));
   ASSERT_EQ(Size, sizeof(ol_dimensions_t));
   ASSERT_EQ(Size, sizeof(uint32_t) * 3);
 }
diff --git a/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp b/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp
index 908d2dc..b86d15f 100644
--- a/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp
+++ b/offload/unittests/OffloadAPI/event/olGetEventInfo.cpp
@@ -13,13 +13,22 @@
 using olGetEventInfoTest = OffloadEventTest;
 OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetEventInfoTest);
 
-TEST_P(olGetEventInfoTest, SuccessDevice) {
+TEST_P(olGetEventInfoTest, SuccessQueue) {
   ol_queue_handle_t RetrievedQueue;
   ASSERT_SUCCESS(olGetEventInfo(Event, OL_EVENT_INFO_QUEUE,
                                 sizeof(ol_queue_handle_t), &RetrievedQueue));
   ASSERT_EQ(Queue, RetrievedQueue);
 }
 
+TEST_P(olGetEventInfoTest, SuccessIsComplete) {
+  bool Complete = false;
+  while (!Complete) { // NOTE(review): unbounded poll; hangs if never complete.
+    ASSERT_SUCCESS(olGetEventInfo(Event, OL_EVENT_INFO_IS_COMPLETE,
+                                  sizeof(Complete), &Complete));
+  }
+  ASSERT_EQ(Complete, true);
+}
+
 TEST_P(olGetEventInfoTest, InvalidNullHandle) {
   ol_queue_handle_t RetrievedQueue;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp b/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp
index d7dee58..36f36c3 100644
--- a/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/event/olGetEventInfoSize.cpp
@@ -19,6 +19,12 @@ TEST_P(olGetEventInfoSizeTest, SuccessQueue) {
   ASSERT_EQ(Size, sizeof(ol_queue_handle_t));
 }
 
+TEST_P(olGetEventInfoSizeTest, SuccessIsComplete) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetEventInfoSize(Event, OL_EVENT_INFO_IS_COMPLETE, &Size));
+  ASSERT_EQ(Size, sizeof(bool));
+}
+
 TEST_P(olGetEventInfoSizeTest, InvalidNullHandle) {
   size_t Size = 0;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp
new file mode 100644
index 0000000..17fa383
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp
@@ -0,0 +1,45 @@
+//===------- Offload API tests - olCalculateOptimalOccupancy --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olCalculateOptimalOccupancyTest = OffloadKernelTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest);
+
+TEST_P(olCalculateOptimalOccupancyTest, Success) {
+  size_t Size{0};
+  ASSERT_SUCCESS_OR_UNSUPPORTED(
+      olCalculateOptimalOccupancy(Device, Kernel, 0, &Size));
+  ASSERT_GT(Size, 0u);
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) {
+  size_t Size{0};
+  ASSERT_SUCCESS_OR_UNSUPPORTED(
+      olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size));
+  ASSERT_GT(Size, 0u);
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullKernel) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olCalculateOptimalOccupancy(Device, nullptr, 0, &Size));
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullDevice) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size));
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullOutput) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
new file mode 100644
index 0000000..a84ed3d78
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
@@ -0,0 +1,193 @@
+//===------- Offload API tests - olMemFill --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+struct olMemFillTest : OffloadQueueTest {
+  /// Fills a Size-byte managed allocation with PatternVal and checks each copy.
+  template <typename PatternTy, PatternTy PatternVal, size_t Size,
+            bool Block = false>
+  void test_body() {
+    ManuallyTriggeredTask Manual;
+
+    // Block/enqueue tests ensure that the fill has been enqueued to a queue
+    // (rather than being done synchronously if the queue happens to be empty)
+    if constexpr (Block) {
+      ASSERT_SUCCESS(Manual.enqueue(Queue));
+    }
+
+    void *Alloc;
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+    PatternTy Pattern = PatternVal;
+    ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+    if constexpr (Block) {
+      ASSERT_SUCCESS(Manual.trigger());
+    }
+    ASSERT_SUCCESS(olSyncQueue(Queue));
+
+    const size_t N = Size / sizeof(Pattern);
+    auto *AllocPtr = reinterpret_cast<PatternTy *>(Alloc);
+    for (size_t I = 0; I < N; ++I) {
+      ASSERT_EQ(AllocPtr[I], Pattern);
+    }
+
+    ASSERT_SUCCESS(olMemFree(Alloc));
+  }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);
+
+TEST_P(olMemFillTest, Success8) { test_body<uint8_t, 0x42, 1024>(); }
+TEST_P(olMemFillTest, Success8NotMultiple4) {
+  test_body<uint8_t, 0x42, 1023>();
+}
+TEST_P(olMemFillTest, Success8Enqueue) {
+  test_body<uint8_t, 0x42, 1024, true>();
+}
+TEST_P(olMemFillTest, Success8NotMultiple4Enqueue) {
+  test_body<uint8_t, 0x42, 1023, true>();
+}
+
+TEST_P(olMemFillTest, Success16) { test_body<uint16_t, 0x4243, 1024>(); }
+TEST_P(olMemFillTest, Success16NotMultiple4) {
+  test_body<uint16_t, 0x4243, 1022>();
+}
+TEST_P(olMemFillTest, Success16Enqueue) {
+  test_body<uint16_t, 0x4243, 1024, true>();
+}
+TEST_P(olMemFillTest, Success16NotMultiple4Enqueue) {
+  test_body<uint16_t, 0x4243, 1022, true>();
+}
+
+TEST_P(olMemFillTest, Success32) { test_body<uint32_t, 0xDEADBEEF, 1024>(); }
+TEST_P(olMemFillTest, Success32Enqueue) {
+  test_body<uint32_t, 0xDEADBEEF, 1024, true>();
+}
+
+// Fills the allocation with a 16-byte pattern and verifies every copy of it.
+TEST_P(olMemFillTest, SuccessLarge) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct PatternT {
+    uint64_t A;
+    uint64_t B;
+  } Pattern{UINT64_MAX, UINT64_MAX};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  const size_t N = Size / sizeof(Pattern);
+  auto *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+  for (size_t I = 0; I < N; ++I) {
+    ASSERT_EQ(AllocPtr[I].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[I].B, UINT64_MAX);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Alloc));
+}
+
+// Same as SuccessLarge, but the fill is queued behind a blocked host task.
+TEST_P(olMemFillTest, SuccessLargeEnqueue) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ManuallyTriggeredTask Manual;
+  ASSERT_SUCCESS(Manual.enqueue(Queue));
+
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct PatternT {
+    uint64_t A;
+    uint64_t B;
+  } Pattern{UINT64_MAX, UINT64_MAX};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+  ASSERT_SUCCESS(Manual.trigger());
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  const size_t N = Size / sizeof(Pattern);
+  auto *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+  for (size_t I = 0; I < N; ++I) {
+    ASSERT_EQ(AllocPtr[I].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[I].B, UINT64_MAX);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Alloc));
+}
+
+// Fills with a packed 17-byte pattern (odd size) and verifies every copy.
+TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+  constexpr size_t Size = 17 * 64;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct __attribute__((packed)) PatternT {
+    uint64_t A;
+    uint64_t B;
+    uint8_t C;
+  } Pattern{UINT64_MAX, UINT64_MAX, 255};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  const size_t N = Size / sizeof(Pattern);
+  auto *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+  for (size_t I = 0; I < N; ++I) {
+    ASSERT_EQ(AllocPtr[I].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[I].B, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[I].C, 255);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Alloc));
+}
+
+// Same as SuccessLargeByteAligned, but queued behind a blocked host task.
+TEST_P(olMemFillTest, SuccessLargeByteAlignedEnqueue) {
+  constexpr size_t Size = 17 * 64;
+  void *Alloc;
+  ManuallyTriggeredTask Manual;
+  ASSERT_SUCCESS(Manual.enqueue(Queue));
+
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct __attribute__((packed)) PatternT {
+    uint64_t A;
+    uint64_t B;
+    uint8_t C;
+  } Pattern{UINT64_MAX, UINT64_MAX, 255};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+  ASSERT_SUCCESS(Manual.trigger());
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  const size_t N = Size / sizeof(Pattern);
+  auto *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+  for (size_t I = 0; I < N; ++I) {
+    ASSERT_EQ(AllocPtr[I].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[I].B, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[I].C, 255);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Alloc));
+}
+
+TEST_P(olMemFillTest, InvalidPatternSize) {
+  constexpr size_t Size = 1025;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint16_t Pattern = 0x4242;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+  olMemFree(Alloc);
+}
diff --git a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
index 0dc8527..aa9e372 100644
--- a/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
+++ b/offload/unittests/OffloadAPI/queue/olDestroyQueue.cpp
@@ -18,6 +18,15 @@ TEST_P(olDestroyQueueTest, Success) {
   Queue = nullptr;
 }
 
+TEST_P(olDestroyQueueTest, SuccessDelayedResolution) {
+  ManuallyTriggeredTask Manual;
+  ASSERT_SUCCESS(Manual.enqueue(Queue));
+  ASSERT_SUCCESS(olDestroyQueue(Queue));
+  Queue = nullptr; // NOTE(review): presumably so teardown skips the handle.
+  // The enqueued task is only triggered after the queue has been destroyed.
+  ASSERT_SUCCESS(Manual.trigger());
+}
+
 TEST_P(olDestroyQueueTest, InvalidNullHandle) {
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, olDestroyQueue(nullptr));
 }
diff --git a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
new file mode 100644
index 0000000..aa86750
--- /dev/null
+++ b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
@@ -0,0 +1,109 @@
+//===------- Offload API tests - olLaunchHostFunction ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <atomic>
+#include <chrono>
+#include <gtest/gtest.h>
+#include <thread>
+
+struct olLaunchHostFunctionTest : OffloadQueueTest {}; // Tests use Queue.
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchHostFunctionTest);
+
+struct olLaunchHostFunctionKernelTest : OffloadKernelTest {}; // Uses Kernel.
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchHostFunctionKernelTest);
+
+TEST_P(olLaunchHostFunctionTest, Success) { // No-op launch must succeed.
+  ASSERT_SUCCESS(olLaunchHostFunction(Queue, [](void *) {}, nullptr));
+}
+
+TEST_P(olLaunchHostFunctionTest, SuccessSequence) {
+  // Each queued task fills one slot from the two slots before it, so the
+  // final check only holds if the tasks ran in submission order.
+  uint32_t Seq[16] = {1, 1};
+  for (uint32_t Idx = 2; Idx < 16; ++Idx) {
+    ASSERT_SUCCESS(olLaunchHostFunction(
+        Queue,
+        [](void *Slot) {
+          auto *Cur = static_cast<uint32_t *>(Slot);
+          *Cur = Cur[-1] + Cur[-2];
+        },
+        &Seq[Idx]));
+  }
+
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  for (uint32_t Idx = 2; Idx < 16; ++Idx)
+    ASSERT_EQ(Seq[Idx], Seq[Idx - 1] + Seq[Idx - 2]);
+}
+
+TEST_P(olLaunchHostFunctionKernelTest, SuccessBlocking) {
+  // Verify that a host function can block execution: a host task is queued
+  // that only returns once Block is cleared, and the kernel queued behind it
+  // must leave Data untouched until then.
+  ol_kernel_launch_size_args_t LaunchArgs;
+  LaunchArgs.Dimensions = 1;
+  LaunchArgs.GroupSize = {64, 1, 1};
+  LaunchArgs.NumGroups = {1, 1, 1};
+  LaunchArgs.DynSharedMemory = 0;
+
+  ol_queue_handle_t Queue;
+  ASSERT_SUCCESS(olCreateQueue(Device, &Queue));
+
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+
+  uint32_t *Data = static_cast<uint32_t *>(Mem);
+  for (uint32_t i = 0; i < 64; i++)
+    Data[i] = 0;
+
+  // The host function polls Block while this thread later clears it, so the
+  // flag must be atomic; volatile does not make that access race-free.
+  std::atomic<bool> Block{true};
+  ASSERT_SUCCESS(olLaunchHostFunction(
+      Queue,
+      [](void *Ptr) {
+        while (static_cast<std::atomic<bool> *>(Ptr)->load())
+          std::this_thread::yield();
+      },
+      &Block));
+
+  struct {
+    void *Mem;
+  } Args{Mem};
+  ASSERT_SUCCESS(
+      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+
+  // Non-fatal checks: returning before Block is cleared would leave the host
+  // function spinning on a flag that no longer exists.
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));
+  for (uint32_t i = 0; i < 64; i++)
+    EXPECT_EQ(Data[i], 0u);
+
+  Block.store(false);
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  // After the sync the kernel is expected to have stored each element's index.
+  for (uint32_t i = 0; i < 64; i++)
+    ASSERT_EQ(Data[i], i);
+
+  ASSERT_SUCCESS(olDestroyQueue(Queue));
+  ASSERT_SUCCESS(olMemFree(Mem));
+}
+
+TEST_P(olLaunchHostFunctionTest, InvalidNullCallback) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, // Callback (2nd arg) is null.
+               olLaunchHostFunction(Queue, nullptr, nullptr));
+}
+
+TEST_P(olLaunchHostFunctionTest, InvalidNullQueue) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, // Queue (1st arg) is null.
+               olLaunchHostFunction(nullptr, [](void *) {}, nullptr));
+}