Diffstat (limited to 'flang/lib/Optimizer')
37 files changed, 2046 insertions, 758 deletions
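The first file below, AliasAnalysis.cpp, replaces hand-written source-tracking cases (fir.box_addr, fir.array_coor/fir.coordinate_of, fir.embox/fir.rebox, hlfir.designate) with a single generic case over the new fir::FortranObjectViewOpInterface. As a reading aid, here is a minimal sketch of how such a walk consumes the interface; the standalone helper trackToSource is hypothetical, while getViewSource/getViewOffset are the accessors the patch itself uses:

// Sketch only (not part of the patch): generic source tracking through
// view-like operations via FortranObjectViewOpInterface.
static mlir::Value trackToSource(mlir::Value v, bool &approximateSource) {
  while (mlir::Operation *defOp = v.getDefiningOp()) {
    auto view = mlir::dyn_cast<fir::FortranObjectViewOpInterface>(defOp);
    if (!view)
      break;
    auto opResult = mlir::cast<mlir::OpResult>(v);
    // getViewOffset() returns 0 when the result starts at the same address
    // as its source operand; std::nullopt means the offset is unknown, in
    // which case the tracked source is only approximate.
    std::optional<std::int64_t> offset = view.getViewOffset(opResult);
    if (!offset || *offset != 0)
      approximateSource = true;
    // getViewSource() yields the operand that the result is a view of.
    v = view.getViewSource(opResult);
  }
  return v;
}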
diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index ef98942..0e956d8 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -234,6 +234,17 @@ AliasResult AliasAnalysis::alias(Source lhsSrc, Source rhsSrc, mlir::Value lhs, << " aliasing because same source kind and origin\n"); if (approximateSource) return AliasResult::MayAlias; + // One should be careful about relying on MustAlias. + // The LLVM definition implies that the two MustAlias + // memory objects start at exactly the same location. + // With Fortran array slices two objects may have + // the same starting location, but otherwise represent + // partially overlapping memory locations, e.g.: + // integer :: a(10) + // ... a(5:1:-1) ! starts at a(5) and addresses a(5), ..., a(1) + // ... a(5:10:1) ! starts at a(5) and addresses a(5), ..., a(10) + // The current implementation of FIR alias analysis will always + // return MayAlias for such cases. return AliasResult::MustAlias; } // If one value is the address of a composite, and if the other value is the @@ -554,18 +565,28 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, Source::Attributes attributes; mlir::Operation *instantiationPoint{nullptr}; while (defOp && !breakFromLoop) { - ty = defOp->getResultTypes()[0]; // Value-scoped allocation detection via effects. if (classifyAllocateFromEffects(defOp, v) == SourceKind::Allocate) { type = SourceKind::Allocate; break; } + // Operations may have multiple results, so we need to analyze + // the result for which the source is queried. + auto opResult = mlir::cast<OpResult>(v); + assert(opResult.getOwner() == defOp && "v must be a result of defOp"); + ty = opResult.getType(); llvm::TypeSwitch<Operation *>(defOp) .Case<hlfir::AsExprOp>([&](auto op) { + // TODO: we should probably always report hlfir.as_expr + // as a unique source, and let the codegen decide whether + // to use the original buffer or create a copy. v = op.getVar(); defOp = v.getDefiningOp(); }) .Case<hlfir::AssociateOp>([&](auto op) { + assert(opResult != op.getMustFreeStrorageFlag() && + "MustFreeStorageFlag result is not an aliasing candidate"); + mlir::Value source = op.getSource(); if (fir::isa_trivial(source.getType())) { // Trivial values will always use distinct temp memory, @@ -579,11 +600,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); } }) - .Case<fir::ConvertOp>([&](auto op) { - // Skip ConvertOp's and track further through the operand. 
- v = op->getOperand(0); - defOp = v.getDefiningOp(); - }) .Case<fir::PackArrayOp>([&](auto op) { // The packed array is not distinguishable from the original // array, so skip PackArrayOp and track further through @@ -592,28 +608,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); approximateSource = true; }) - .Case<fir::BoxAddrOp>([&](auto op) { - v = op->getOperand(0); - defOp = v.getDefiningOp(); - if (mlir::isa<fir::BaseBoxType>(v.getType())) - followBoxData = true; - }) - .Case<fir::ArrayCoorOp, fir::CoordinateOp>([&](auto op) { - if (isPointerReference(ty)) - attributes.set(Attribute::Pointer); - v = op->getOperand(0); - defOp = v.getDefiningOp(); - if (mlir::isa<fir::BaseBoxType>(v.getType())) - followBoxData = true; - approximateSource = true; - }) - .Case<fir::EmboxOp, fir::ReboxOp>([&](auto op) { - if (followBoxData) { - v = op->getOperand(0); - defOp = v.getDefiningOp(); - } else - breakFromLoop = true; - }) .Case<fir::LoadOp>([&](auto op) { // If load is inside target and it points to mapped item, // continue tracking. @@ -690,6 +684,9 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, breakFromLoop = true; }) .Case<hlfir::DeclareOp, fir::DeclareOp>([&](auto op) { + // The declare operations support FortranObjectViewOpInterface, + // but their handling is more complex. Maybe we can find better + // abstractions to handle them in a general fashion. bool isPrivateItem = false; if (omp::BlockArgOpenMPOpInterface argIface = dyn_cast<omp::BlockArgOpenMPOpInterface>(op->getParentOp())) { @@ -740,7 +737,7 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, // currently provide any useful information. The host associated // access will end up dereferencing the host association tuple, // so we may as well stop right now. - v = defOp->getResult(0); + v = opResult; // TODO: if the host associated variable is a dummy argument // of the host, I think, we can treat it as SourceKind::Argument // for the purpose of alias analysis inside the internal procedure. @@ -775,21 +772,45 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, v = op.getMemref(); defOp = v.getDefiningOp(); }) - .Case<hlfir::DesignateOp>([&](auto op) { - auto varIf = llvm::cast<fir::FortranVariableOpInterface>(defOp); - attributes |= getAttrsFromVariable(varIf); - // Track further through the memory indexed into - // => if the source arrays/structures don't alias then nor do the - // results of hlfir.designate - v = op.getMemref(); + .Case<fir::FortranObjectViewOpInterface>([&](auto op) { + // This case must be located after the cases for concrete + // operations that support FortranObjectViewOpInterface, + // so that their special handling kicks in. + + // fir.embox/rebox case: this is the only case where we check + // for followBoxData. + // TODO: it looks like we do not have LIT tests that fail + // upon removal of the followBoxData code. We should come up + // with a test or remove this code. + if (!followBoxData && + (mlir::isa<fir::EmboxOp>(op) || mlir::isa<fir::ReboxOp>(op))) { + breakFromLoop = true; + return; + } + + // Collect attributes from FortranVariableOpInterface operations. + if (auto varIf = + mlir::dyn_cast<fir::FortranVariableOpInterface>(defOp)) + attributes |= getAttrsFromVariable(varIf); + // Set Pointer attribute based on the reference type. + if (isPointerReference(ty)) + attributes.set(Attribute::Pointer); + + // Update v to point to the operand that represents the object + // referenced by the operation's result.
+ v = op.getViewSource(opResult); defOp = v.getDefiningOp(); - // TODO: there will be some cases which provably don't alias if one - // takes into account the component or indices, which are currently - // ignored here - leading to false positives - // because of this limitation, we need to make sure we never return - // MustAlias after going through a designate operation - approximateSource = true; - if (mlir::isa<fir::BaseBoxType>(v.getType())) + // If the resulting object reference is (or may be) offset + // from the input reference, then set approximateSource. + auto offset = op.getViewOffset(opResult); + if (!offset || *offset != 0) + approximateSource = true; + + // If the source is a box, and the result is not a box, + // then this is one of the box "unpacking" operations, + // so we should set followBoxData. + if (mlir::isa<fir::BaseBoxType>(v.getType()) && + !mlir::isa<fir::BaseBoxType>(ty)) followBoxData = true; }) .Default([&](auto op) { diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index 323d1ef..3c86a9d 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -17,6 +17,8 @@ #include "flang/Evaluate/common.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/MutableBox.h" +#include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" @@ -51,6 +53,8 @@ static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; +static constexpr unsigned kTMAAlignment = 16; + // CUDA specific intrinsic handlers.
static constexpr IntrinsicHandler cudaHandlers[]{ {"__ldca_i4x4", @@ -195,7 +199,7 @@ static constexpr IntrinsicHandler cudaHandlers[]{ false}, {"atomicadd_r4x4", static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( - &CI::genAtomicAddVector<4>), + &CI::genAtomicAddVector4x4), {{{"a", asAddr}, {"v", asAddr}}}, false}, {"atomicaddd", @@ -368,6 +372,16 @@ static constexpr IntrinsicHandler cudaHandlers[]{ &CI::genNVVMTime<mlir::NVVM::Clock64Op>), {}, /*isElemental=*/false}, + {"cluster_block_index", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genClusterBlockIndex), + {}, + /*isElemental=*/false}, + {"cluster_dim_blocks", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genClusterDimBlocks), + {}, + /*isElemental=*/false}, {"fence_proxy_async", static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( &CI::genFenceProxyAsync), @@ -457,6 +471,10 @@ static constexpr IntrinsicHandler cudaHandlers[]{ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp), {}, /*isElemental=*/false}, + {"this_cluster", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisCluster), + {}, + /*isElemental=*/false}, {"this_grid", static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid), {}, @@ -744,6 +762,56 @@ fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector( return fir::ArrayBoxValue(res, {ext}); } +// ATOMICADDVECTOR4x4 +fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector4x4( + mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) + a = fir::BoxAddrOp::create(builder, loc, a); + + const unsigned extent = 4; + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + mlir::Value ptr = builder.createConvert(loc, llvmPtrTy, a); + mlir::Type f32Ty = builder.getF32Type(); + mlir::Type idxTy = builder.getIndexType(); + mlir::Type refTy = fir::ReferenceType::get(f32Ty); + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + + auto inlinePtx = mlir::NVVM::InlinePtxOp::create( + builder, loc, {f32Ty, f32Ty, f32Ty, f32Ty}, + {ptr, values[0], values[1], values[2], values[3]}, {}, + "atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};", {}); + + llvm::SmallVector<mlir::Value> results; + results.push_back(inlinePtx.getResult(0)); + results.push_back(inlinePtx.getResult(1)); + results.push_back(inlinePtx.getResult(2)); + results.push_back(inlinePtx.getResult(3)); + + mlir::Type vecF32Ty = mlir::VectorType::get({extent}, f32Ty); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF32Ty); + mlir::Type i32Ty = builder.getI32Type(); + for (unsigned i = 0; i < extent; ++i) + undef = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, results[i], + builder.createIntegerConstant(loc, i32Ty, i)); + + auto i128Ty = builder.getIntegerType(128); + auto i128VecTy = mlir::VectorType::get({1}, i128Ty); + mlir::Value vec128 = + mlir::vector::BitCastOp::create(builder, loc, i128VecTy, undef); + return mlir::vector::ExtractOp::create(builder, loc, vec128, + mlir::ArrayRef<int64_t>{0}); +} + mlir::Value CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType, llvm::ArrayRef<mlir::Value> 
args) { @@ -892,7 +960,7 @@ CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) - .getResult(); + .getResult(0); } // BARRIER_ARRIVE_CNT @@ -981,6 +1049,60 @@ CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, .getResult(0); } +static void insertValueAtPos(fir::FirOpBuilder &builder, mlir::Location loc, + fir::RecordType recTy, mlir::Value base, + mlir::Value dim, unsigned fieldPos) { + auto fieldName = recTy.getTypeList()[fieldPos].first; + mlir::Type fieldTy = recTy.getTypeList()[fieldPos].second; + mlir::Type fieldIndexType = fir::FieldType::get(base.getContext()); + mlir::Value fieldIndex = + fir::FieldIndexOp::create(builder, loc, fieldIndexType, fieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value coord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(fieldTy), base, fieldIndex); + fir::StoreOp::create(builder, loc, dim, coord); +} + +// CLUSTER_BLOCK_INDEX +mlir::Value +CUDAIntrinsicLibrary::genClusterBlockIndex(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Value x = mlir::NVVM::BlockInClusterIdXOp::create(builder, loc, i32Ty); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + x = mlir::arith::AddIOp::create(builder, loc, x, one); + insertValueAtPos(builder, loc, recTy, res, x, 0); + mlir::Value y = mlir::NVVM::BlockInClusterIdYOp::create(builder, loc, i32Ty); + y = mlir::arith::AddIOp::create(builder, loc, y, one); + insertValueAtPos(builder, loc, recTy, res, y, 1); + mlir::Value z = mlir::NVVM::BlockInClusterIdZOp::create(builder, loc, i32Ty); + z = mlir::arith::AddIOp::create(builder, loc, z, one); + insertValueAtPos(builder, loc, recTy, res, z, 2); + return res; +} + +// CLUSTER_DIM_BLOCKS +mlir::Value +CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Value x = mlir::NVVM::ClusterDimBlocksXOp::create(builder, loc, i32Ty); + insertValueAtPos(builder, loc, recTy, res, x, 0); + mlir::Value y = mlir::NVVM::ClusterDimBlocksYOp::create(builder, loc, i32Ty); + insertValueAtPos(builder, loc, recTy, res, y, 1); + mlir::Value z = mlir::NVVM::ClusterDimBlocksZOp::create(builder, loc, i32Ty); + insertValueAtPos(builder, loc, recTy, res, z, 2); + return res; +} + // FENCE_PROXY_ASYNC void CUDAIntrinsicLibrary::genFenceProxyAsync( llvm::ArrayRef<fir::ExtendedValue> args) { @@ -1080,42 +1202,39 @@ void CUDAIntrinsicLibrary::genSyncThreads( mlir::Value CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg =
builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); + mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]); + return mlir::NVVM::BarrierOp::create( + builder, loc, resultType, {}, {}, + mlir::NVVM::BarrierReductionAttr::get( + builder.getContext(), mlir::NVVM::BarrierReduction::AND), + arg) + .getResult(0); } // SYNCTHREADS_COUNT mlir::Value CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); + mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]); + return mlir::NVVM::BarrierOp::create( + builder, loc, resultType, {}, {}, + mlir::NVVM::BarrierReductionAttr::get( + builder.getContext(), mlir::NVVM::BarrierReduction::POPC), + arg) + .getResult(0); } // SYNCTHREADS_OR mlir::Value CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); + mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]); + return mlir::NVVM::BarrierOp::create( + builder, loc, resultType, {}, {}, + mlir::NVVM::BarrierReductionAttr::get( + builder.getContext(), mlir::NVVM::BarrierReduction::OR), + arg) + .getResult(0); } // SYNCWARP @@ -1125,6 +1244,44 @@ void CUDAIntrinsicLibrary::genSyncWarp( mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0])); } +// THIS_CLUSTER +mlir::Value +CUDAIntrinsicLibrary::genThisCluster(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // SIZE + mlir::Value size = mlir::NVVM::ClusterDim::create(builder, loc, i32Ty); + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + // RANK + mlir::Value rank = mlir::NVVM::ClusterId::create(builder, loc, i32Ty); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
+ mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + + return res; +} + // THIS_GRID mlir::Value CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, @@ -1336,6 +1493,13 @@ void CUDAIntrinsicLibrary::genTMABulkG2S( builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); } +static void setAlignment(mlir::Value ptr, unsigned alignment) { + if (auto declareOp = mlir::dyn_cast<hlfir::DeclareOp>(ptr.getDefiningOp())) + if (auto sharedOp = mlir::dyn_cast<cuf::SharedMemoryOp>( + declareOp.getMemref().getDefiningOp())) + sharedOp.setAlignment(alignment); +} + static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value barrier, mlir::Value src, mlir::Value dst, mlir::Value nelem, @@ -1343,6 +1507,7 @@ static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); barrier = builder.createConvert(loc, llvmPtrTy, barrier); + setAlignment(dst, kTMAAlignment); dst = builder.createConvert(loc, llvmPtrTy, dst); src = builder.createConvert(loc, llvmPtrTy, src); mlir::NVVM::InlinePtxOp::create( @@ -1446,6 +1611,7 @@ static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value src, mlir::Value dst, mlir::Value count, mlir::Value eleSize) { mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + setAlignment(src, kTMAAlignment); src = convertPtrToNVVMSpace(builder, loc, src, mlir::NVVM::NVVMMemorySpace::Shared); dst = convertPtrToNVVMSpace(builder, loc, dst, diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index 461deb8..2266f4d 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -114,3 +114,44 @@ int cuf::computeElementByteSize(mlir::Location loc, mlir::Type type, mlir::emitError(loc, "unsupported type"); return 0; } + +mlir::Value cuf::computeElementCount(mlir::PatternRewriter &rewriter, + mlir::Location loc, + mlir::Value shapeOperand, + mlir::Type seqType, + mlir::Type targetType) { + if (shapeOperand) { + // Dynamic extent - extract from shape operand + llvm::SmallVector<mlir::Value> extents; + if (auto shapeOp = + mlir::dyn_cast<fir::ShapeOp>(shapeOperand.getDefiningOp())) { + extents = shapeOp.getExtents(); + } else if (auto shapeShiftOp = mlir::dyn_cast<fir::ShapeShiftOp>( + shapeOperand.getDefiningOp())) { + for (auto i : llvm::enumerate(shapeShiftOp.getPairs())) + if (i.index() & 1) + extents.push_back(i.value()); + } + + if (extents.empty()) + return mlir::Value(); + + // Compute total element count by multiplying all dimensions + mlir::Value count = + fir::ConvertOp::create(rewriter, loc, targetType, extents[0]); + for (unsigned i = 1; i < extents.size(); ++i) { + auto operand = + fir::ConvertOp::create(rewriter, loc, targetType, extents[i]); + count = mlir::arith::MulIOp::create(rewriter, loc, count, operand); + } + return count; + } else { + // Static extent - use constant array size + if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(seqType)) { + mlir::IntegerAttr attr = + rewriter.getIntegerAttr(targetType, seqTy.getConstantArraySize()); + return 
mlir::arith::ConstantOp::create(rewriter, loc, targetType, attr); + } + } + return mlir::Value(); +} diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 5da27d1..c704ac7 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -427,7 +427,8 @@ mlir::Value fir::FirOpBuilder::genTempDeclareOp( builder, loc, memref.getType(), memref, shape, typeParams, /*dummy_scope=*/nullptr, /*storage=*/nullptr, - /*storage_offset=*/0, nameAttr, fortranAttrs, cuf::DataAttributeAttr{}); + /*storage_offset=*/0, nameAttr, fortranAttrs, cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); } mlir::Value fir::FirOpBuilder::genStackSave(mlir::Location loc) { @@ -1392,12 +1393,10 @@ fir::ExtendedValue fir::factory::arraySectionElementToExtendedValue( return fir::factory::componentToExtendedValue(builder, loc, element); } -void fir::factory::genScalarAssignment(fir::FirOpBuilder &builder, - mlir::Location loc, - const fir::ExtendedValue &lhs, - const fir::ExtendedValue &rhs, - bool needFinalization, - bool isTemporaryLHS) { +void fir::factory::genScalarAssignment( + fir::FirOpBuilder &builder, mlir::Location loc, + const fir::ExtendedValue &lhs, const fir::ExtendedValue &rhs, + bool needFinalization, bool isTemporaryLHS, mlir::ArrayAttr accessGroups) { assert(lhs.rank() == 0 && rhs.rank() == 0 && "must be scalars"); auto type = fir::unwrapSequenceType( fir::unwrapPassByRefType(fir::getBase(lhs).getType())); @@ -1419,7 +1418,9 @@ void fir::factory::genScalarAssignment(fir::FirOpBuilder &builder, mlir::Value lhsAddr = fir::getBase(lhs); rhsVal = builder.createConvert(loc, fir::unwrapRefType(lhsAddr.getType()), rhsVal); - fir::StoreOp::create(builder, loc, rhsVal, lhsAddr); + fir::StoreOp store = fir::StoreOp::create(builder, loc, rhsVal, lhsAddr); + if (accessGroups) + store.setAccessGroupsAttr(accessGroups); } } @@ -1670,6 +1671,26 @@ mlir::Value fir::factory::createZeroValue(fir::FirOpBuilder &builder, "numeric or logical type"); } +mlir::Value fir::factory::createOneValue(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Type type) { + mlir::Type i1 = builder.getIntegerType(1); + if (mlir::isa<fir::LogicalType>(type) || type == i1) + return builder.createConvert(loc, type, builder.createBool(loc, true)); + if (fir::isa_integer(type)) + return builder.createIntegerConstant(loc, type, 1); + if (fir::isa_real(type)) + return builder.createRealOneConstant(loc, type); + if (fir::isa_complex(type)) { + fir::factory::Complex complexHelper(builder, loc); + mlir::Type partType = complexHelper.getComplexPartType(type); + mlir::Value realPart = builder.createRealOneConstant(loc, partType); + mlir::Value imagPart = builder.createRealZeroConstant(loc, partType); + return complexHelper.createComplex(type, realPart, imagPart); + } + fir::emitFatalError(loc, "internal: trying to generate one value of non " + "numeric or logical type"); +} + std::optional<std::int64_t> fir::factory::getExtentFromTriplet(mlir::Value lb, mlir::Value ub, mlir::Value stride) { diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 793be32..a345dcb 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -250,7 +250,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, fir::FortranVariableFlagsAttr flags, mlir::Value dummyScope, mlir::Value storage, 
std::uint64_t storageOffset, - cuf::DataAttributeAttr dataAttr) { + cuf::DataAttributeAttr dataAttr, unsigned dummyArgNo) { mlir::Value base = fir::getBase(exv); assert(fir::conformsWithPassByRef(base.getType()) && @@ -281,7 +281,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, [](const auto &) {}); auto declareOp = hlfir::DeclareOp::create( builder, loc, base, name, shapeOrShift, lenParams, dummyScope, storage, - storageOffset, flags, dataAttr); + storageOffset, flags, dataAttr, dummyArgNo); return mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation()); } @@ -402,9 +402,9 @@ hlfir::Entity hlfir::genVariableBox(mlir::Location loc, fir::BoxType::get(var.getElementOrSequenceType(), isVolatile); if (forceBoxType) { boxType = forceBoxType; - mlir::Type baseType = - fir::ReferenceType::get(fir::unwrapRefType(forceBoxType.getEleTy())); - addr = builder.createConvert(loc, baseType, addr); + mlir::Type baseType = fir::ReferenceType::get( + fir::unwrapRefType(forceBoxType.getEleTy()), forceBoxType.isVolatile()); + addr = builder.createConvertWithVolatileCast(loc, baseType, addr); } auto embox = fir::EmboxOp::create(builder, loc, boxType, addr, shape, /*slice=*/mlir::Value{}, typeParams); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 3eb6044..75a74ee 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -91,6 +91,11 @@ static bool isStaticallyAbsent(llvm::ArrayRef<mlir::Value> args, size_t argIndex) { return args.size() <= argIndex || !args[argIndex]; } +static bool isOptional(mlir::Value value) { + auto varIface = mlir::dyn_cast_or_null<fir::FortranVariableOpInterface>( + value.getDefiningOp()); + return varIface && varIface.isOptional(); +} /// Test if an ExtendedValue is present. This is used to test if an intrinsic /// argument is present at compile time. 
This does not imply that the related @@ -303,6 +308,10 @@ static constexpr IntrinsicHandler handlers[]{ {"back", asValue, handleDynamicOptional}}}, /*isElemental=*/false}, {"floor", &I::genFloor}, + {"flush", + &I::genFlush, + {{{"unit", asAddr}}}, + /*isElemental=*/false}, {"fraction", &I::genFraction}, {"free", &I::genFree}, {"fseek", @@ -340,6 +349,10 @@ static constexpr IntrinsicHandler handlers[]{ {"trim_name", asAddr, handleDynamicOptional}, {"errmsg", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, + {"get_team", + &I::genGetTeam, + {{{"level", asValue, handleDynamicOptional}}}, + /*isElemental=*/false}, {"getcwd", &I::genGetCwd, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -486,6 +499,10 @@ static constexpr IntrinsicHandler handlers[]{ {"dim", asValue}, {"mask", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, + {"irand", + &I::genIrand, + {{{"i", asAddr, handleDynamicOptional}}}, + /*isElemental=*/false}, {"is_contiguous", &I::genIsContiguous, {{{"array", asBox}}}, @@ -612,6 +629,10 @@ static constexpr IntrinsicHandler handlers[]{ &I::genPutenv, {{{"str", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, + {"rand", + &I::genRand, + {{{"i", asAddr, handleDynamicOptional}}}, + /*isElemental=*/false}, {"random_init", &I::genRandomInit, {{{"repeatable", asValue}, {"image_distinct", asValue}}}, @@ -706,6 +727,10 @@ static constexpr IntrinsicHandler handlers[]{ {"shifta", &I::genShiftA}, {"shiftl", &I::genShift<mlir::arith::ShLIOp>}, {"shiftr", &I::genShift<mlir::arith::ShRUIOp>}, + {"show_descriptor", + &I::genShowDescriptor, + {{{"d", asBox}}}, + /*isElemental=*/false}, {"sign", &I::genSign}, {"signal", &I::genSignalSubroutine, @@ -749,6 +774,10 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"tand", &I::genTand}, {"tanpi", &I::genTanpi}, + {"team_number", + &I::genTeamNumber, + {{{"team", asBox, handleDynamicOptional}}}, + /*isElemental=*/false}, {"this_image", &I::genThisImage, {{{"coarray", asBox}, @@ -3934,6 +3963,40 @@ mlir::Value IntrinsicLibrary::genFloor(mlir::Type resultType, return builder.createConvert(loc, resultType, floor); } +// FLUSH +void IntrinsicLibrary::genFlush(llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + + mlir::Value unit; + if (isStaticallyAbsent(args[0])) + // Give a sentinel value of `-1` for the `()` case.
+ unit = builder.createIntegerConstant(loc, builder.getI32Type(), -1); + else { + unit = fir::getBase(args[0]); + if (isOptional(unit)) { + mlir::Value isPresent = + fir::IsPresentOp::create(builder, loc, builder.getI1Type(), unit); + unit = builder + .genIfOp(loc, builder.getI32Type(), isPresent, + /*withElseRegion=*/true) + .genThen([&]() { + mlir::Value loaded = fir::LoadOp::create(builder, loc, unit); + fir::ResultOp::create(builder, loc, loaded); + }) + .genElse([&]() { + mlir::Value negOne = builder.createIntegerConstant( + loc, builder.getI32Type(), -1); + fir::ResultOp::create(builder, loc, negOne); + }) + .getResults()[0]; + } else { + unit = fir::LoadOp::create(builder, loc, unit); + } + } + + fir::runtime::genFlush(builder, loc, unit); +} + // FRACTION mlir::Value IntrinsicLibrary::genFraction(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -4013,6 +4076,15 @@ IntrinsicLibrary::genFtell(std::optional<mlir::Type> resultType, } } +// GET_TEAM +mlir::Value IntrinsicLibrary::genGetTeam(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + converter->checkCoarrayEnabled(); + assert(args.size() == 1); + return mif::GetTeamOp::create(builder, loc, fir::BoxType::get(resultType), + /*level*/ args[0]); +} + // GETCWD fir::ExtendedValue IntrinsicLibrary::genGetCwd(std::optional<mlir::Type> resultType, @@ -6098,6 +6170,20 @@ IntrinsicLibrary::genIparity(mlir::Type resultType, "IPARITY", resultType, args); } +// IRAND +fir::ExtendedValue +IntrinsicLibrary::genIrand(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::Value i = + isStaticallyPresent(args[0]) + ? fir::getBase(args[0]) + : fir::AbsentOp::create(builder, loc, + builder.getRefType(builder.getI32Type())) + .getResult(); + return fir::runtime::genIrand(builder, loc, i); +} + // IS_CONTIGUOUS fir::ExtendedValue IntrinsicLibrary::genIsContiguous(mlir::Type resultType, @@ -6281,12 +6367,6 @@ IntrinsicLibrary::genCharacterCompare(mlir::Type resultType, fir::getBase(args[1]), fir::getLen(args[1])); } -static bool isOptional(mlir::Value value) { - auto varIface = mlir::dyn_cast_or_null<fir::FortranVariableOpInterface>( - value.getDefiningOp()); - return varIface && varIface.isOptional(); -} - // LOC fir::ExtendedValue IntrinsicLibrary::genLoc(mlir::Type resultType, @@ -6509,11 +6589,9 @@ static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { auto mod = builder.getModule(); - bool dontUseFastRealMod = false; - bool canUseApprox = mlir::arith::bitEnumContainsAny( - builder.getFastMathFlags(), mlir::arith::FastMathFlags::afn); - if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.no_fast_real_mod")) - dontUseFastRealMod = attr.getValue(); + bool useFastRealMod = false; + if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.fast_real_mod")) + useFastRealMod = attr.getValue(); assert(args.size() == 2); if (resultType.isUnsignedInteger()) { @@ -6526,7 +6604,7 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType, if (mlir::isa<mlir::IntegerType>(resultType)) return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]); - if (resultType.isFloat() && canUseApprox && !dontUseFastRealMod) { + if (resultType.isFloat() && useFastRealMod) { // Treat MOD as an approximate function and code-gen inline code // instead of calling into the Fortran runtime library. 
return builder.createConvert(loc, resultType, @@ -7132,6 +7210,19 @@ IntrinsicLibrary::genPutenv(std::optional<mlir::Type> resultType, return {}; } +// RAND +fir::ExtendedValue +IntrinsicLibrary::genRand(mlir::Type, llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::Value i = + isStaticallyPresent(args[0]) + ? fir::getBase(args[0]) + : fir::AbsentOp::create(builder, loc, + builder.getRefType(builder.getI32Type())) + .getResult(); + return fir::runtime::genRand(builder, loc, i); +} + // RANDOM_INIT void IntrinsicLibrary::genRandomInit(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 2); @@ -7797,6 +7888,16 @@ mlir::Value IntrinsicLibrary::genShiftA(mlir::Type resultType, return result; } +void IntrinsicLibrary::genShowDescriptor( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1 && "expected single argument for show_descriptor"); + const mlir::Value descriptor = fir::getBase(args[0]); + + assert(fir::isa_box_type(descriptor.getType()) && + "argument must have been lowered to box type"); + fir::runtime::genShowDescriptor(builder, loc, descriptor); +} + // SIGNAL void IntrinsicLibrary::genSignalSubroutine( llvm::ArrayRef<fir::ExtendedValue> args) { @@ -7953,6 +8054,16 @@ mlir::Value IntrinsicLibrary::genTanpi(mlir::Type resultType, return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); } +// TEAM_NUMBER +fir::ExtendedValue +IntrinsicLibrary::genTeamNumber(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue> args) { + converter->checkCoarrayEnabled(); + assert(args.size() == 1); + return mif::TeamNumberOp::create(builder, loc, + /*team*/ fir::getBase(args[0])); +} + // THIS_IMAGE fir::ExtendedValue IntrinsicLibrary::genThisImage(mlir::Type resultType, diff --git a/flang/lib/Optimizer/Builder/Runtime/Character.cpp b/flang/lib/Optimizer/Builder/Runtime/Character.cpp index 540ecba..2f1772f 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Character.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Character.cpp @@ -94,27 +94,34 @@ fir::runtime::genCharCompare(fir::FirOpBuilder &builder, mlir::Location loc, mlir::arith::CmpIPredicate cmp, mlir::Value lhsBuff, mlir::Value lhsLen, mlir::Value rhsBuff, mlir::Value rhsLen) { - mlir::func::FuncOp beginFunc; - switch (discoverKind(lhsBuff.getType())) { + int lhsKind = discoverKind(lhsBuff.getType()); + int rhsKind = discoverKind(rhsBuff.getType()); + if (lhsKind != rhsKind) { + fir::emitFatalError(loc, "runtime does not support comparison of different " + "CHARACTER kind values"); + } + mlir::func::FuncOp func; + switch (lhsKind) { case 1: - beginFunc = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar1)>( + func = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar1)>( loc, builder); break; case 2: - beginFunc = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar2)>( + func = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar2)>( loc, builder); break; case 4: - beginFunc = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar4)>( + func = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar4)>( loc, builder); break; default: - llvm_unreachable("runtime does not support CHARACTER KIND"); + fir::emitFatalError( + loc, "unsupported CHARACTER kind value. 
Runtime expects 1, 2, or 4."); } - auto fTy = beginFunc.getFunctionType(); + auto fTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, lhsBuff, rhsBuff, lhsLen, rhsLen); - auto tri = fir::CallOp::create(builder, loc, beginFunc, args).getResult(0); + auto tri = fir::CallOp::create(builder, loc, func, args).getResult(0); auto zero = builder.createIntegerConstant(loc, tri.getType(), 0); return mlir::arith::CmpIOp::create(builder, loc, cmp, tri, zero); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index 110b1b2..a5f16f8 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -137,6 +137,15 @@ void fir::runtime::genEtime(fir::FirOpBuilder &builder, mlir::Location loc, fir::CallOp::create(builder, loc, runtimeFunc, args); } +void fir::runtime::genFlush(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value unit) { + auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Flush)>(loc, builder); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, runtimeFunc.getFunctionType(), unit); + + fir::CallOp::create(builder, loc, runtimeFunc, args); +} + void fir::runtime::genFree(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value ptr) { auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Free)>(loc, builder); @@ -461,3 +470,34 @@ mlir::Value fir::runtime::genChdir(fir::FirOpBuilder &builder, fir::runtime::createArguments(builder, loc, func.getFunctionType(), name); return fir::CallOp::create(builder, loc, func, args).getResult(0); } + +mlir::Value fir::runtime::genIrand(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value i) { + auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Irand)>(loc, builder); + mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType(); + + llvm::SmallVector<mlir::Value> args = + fir::runtime::createArguments(builder, loc, runtimeFuncTy, i); + return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0); +} + +mlir::Value fir::runtime::genRand(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value i) { + auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Rand)>(loc, builder); + mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType(); + + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(2)); + + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, runtimeFuncTy, i, sourceFile, sourceLine); + return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0); +} + +void fir::runtime::genShowDescriptor(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value descAddr) { + mlir::func::FuncOp func{ + fir::runtime::getRuntimeFunc<mkRTKey(ShowDescriptor)>(loc, builder)}; + fir::CallOp::create(builder, loc, func, descAddr); +} diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp index 157d435..343d848 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp @@ -1841,7 +1841,7 @@ mlir::Value fir::runtime::genReduce(fir::FirOpBuilder &builder, assert((fir::isa_real(eleTy) || fir::isa_integer(eleTy) || mlir::isa<fir::LogicalType>(eleTy)) && - "expect real, interger or logical"); + "expect real, integer or logical"); auto [cat, 
kind] = fir::mlirTypeToCategoryKind(loc, eleTy); mlir::func::FuncOp func; diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index ca4aefb..f96d45d 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -692,6 +692,10 @@ struct CallOpConversion : public fir::FIROpConversion<fir::CallOp> { } } + if (std::optional<mlir::ArrayAttr> optionalAccessGroups = + call.getAccessGroups()) + llvmCall.setAccessGroups(*optionalAccessGroups); + if (memAttr) llvmCall.setMemoryEffectsAttr( mlir::cast<mlir::LLVM::MemoryEffectsAttr>(memAttr)); @@ -3402,6 +3406,9 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> { loadOp.setTBAATags(*optionalTag); else attachTBAATag(loadOp, load.getType(), load.getType(), nullptr); + if (std::optional<mlir::ArrayAttr> optionalAccessGroups = + load.getAccessGroups()) + loadOp.setAccessGroups(*optionalAccessGroups); rewriter.replaceOp(load, loadOp.getResult()); } return mlir::success(); @@ -3733,6 +3740,10 @@ struct StoreOpConversion : public fir::FIROpConversion<fir::StoreOp> { if (store.getNontemporal()) storeOp.setNontemporal(true); + if (std::optional<mlir::ArrayAttr> optionalAccessGroups = + store.getAccessGroups()) + storeOp.setAccessGroups(*optionalAccessGroups); + newOp = storeOp; } if (std::optional<mlir::ArrayAttr> optionalTag = store.getTbaa()) diff --git a/flang/lib/Optimizer/CodeGen/PassDetail.h b/flang/lib/Optimizer/CodeGen/PassDetail.h index f703013..252da02 100644 --- a/flang/lib/Optimizer/CodeGen/PassDetail.h +++ b/flang/lib/Optimizer/CodeGen/PassDetail.h @@ -18,7 +18,7 @@ namespace fir { -#define GEN_PASS_CLASSES +#define GEN_PASS_DECL #include "flang/Optimizer/CodeGen/CGPasses.h.inc" } // namespace fir diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp index 1b1d43c..3b137d1 100644 --- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp @@ -302,11 +302,16 @@ public: else return mlir::failure(); } + // Extract dummy_arg_no attribute if present + mlir::IntegerAttr dummyArgNoAttr; + if (auto attr = declareOp->getAttrOfType<mlir::IntegerAttr>("dummy_arg_no")) + dummyArgNoAttr = attr; // FIXME: Add FortranAttrs and CudaAttrs auto xDeclOp = fir::cg::XDeclareOp::create( rewriter, loc, declareOp.getType(), declareOp.getMemref(), shapeOpers, shiftOpers, declareOp.getTypeparams(), declareOp.getDummyScope(), - declareOp.getUniqName()); + declareOp.getStorage(), declareOp.getStorageOffset(), + declareOp.getUniqName(), dummyArgNoAttr); LLVM_DEBUG(llvm::dbgs() << "rewriting " << declareOp << " to " << xDeclOp << '\n'); rewriter.replaceOp(declareOp, xDeclOp.getOperation()->getResults()); diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp index 687007d..97f7f76a 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp @@ -333,7 +333,8 @@ void cuf::SharedMemoryOp::build( bindcName.empty() ? 
mlir::StringAttr{} : builder.getStringAttr(bindcName); build(builder, result, wrapAllocaResultType(inType), mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape, - /*offset=*/mlir::Value{}); + /*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{}, + /*isStatic=*/nullptr); result.addAttributes(attributes); } diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 4f97aca..4e797d6 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -834,6 +834,11 @@ void fir::ArrayCoorOp::getCanonicalizationPatterns( patterns.add<SimplifyArrayCoorOp>(context); } +std::optional<std::int64_t> fir::ArrayCoorOp::getViewOffset(mlir::OpResult) { + // TODO: we can try to compute the constant offset. + return std::nullopt; +} + //===----------------------------------------------------------------------===// // ArrayLoadOp //===----------------------------------------------------------------------===// @@ -1086,6 +1091,13 @@ mlir::OpFoldResult fir::BoxAddrOp::fold(FoldAdaptor adaptor) { return {}; } +std::optional<std::int64_t> fir::BoxAddrOp::getViewOffset(mlir::OpResult) { + // fir.box_addr just returns the base address stored inside a box, + // so the direct accesses through the base address and through the box + // are not offsetted. + return 0; +} + //===----------------------------------------------------------------------===// // BoxCharLenOp //===----------------------------------------------------------------------===// @@ -1820,6 +1832,11 @@ fir::CoordinateIndicesAdaptor fir::CoordinateOp::getIndices() { return CoordinateIndicesAdaptor(getFieldIndicesAttr(), getCoor()); } +std::optional<std::int64_t> fir::CoordinateOp::getViewOffset(mlir::OpResult) { + // TODO: we can try to compute the constant offset. + return std::nullopt; +} + //===----------------------------------------------------------------------===// // DispatchOp //===----------------------------------------------------------------------===// @@ -2066,6 +2083,14 @@ bool fir::isContiguousEmbox(fir::EmboxOp embox, bool checkWhole) { return false; } +std::optional<std::int64_t> fir::EmboxOp::getViewOffset(mlir::OpResult) { + // The address offset is zero, unless there is a slice. + // TODO: we can handle slices that leave the base address untouched. + if (!getSlice()) + return 0; + return std::nullopt; +} + //===----------------------------------------------------------------------===// // EmboxCharOp //===----------------------------------------------------------------------===// @@ -3205,11 +3230,19 @@ mlir::ParseResult fir::DTEntryOp::parse(mlir::OpAsmParser &parser, parser.parseAttribute(calleeAttr, fir::DTEntryOp::getProcAttrNameStr(), result.attributes)) return mlir::failure(); + + // Optional "deferred" keyword. + if (succeeded(parser.parseOptionalKeyword("deferred"))) { + result.addAttribute(fir::DTEntryOp::getDeferredAttrNameStr(), + parser.getBuilder().getUnitAttr()); + } return mlir::success(); } void fir::DTEntryOp::print(mlir::OpAsmPrinter &p) { p << ' ' << getMethodAttr() << ", " << getProcAttr(); + if ((*this)->getAttr(fir::DTEntryOp::getDeferredAttrNameStr())) + p << " deferred"; } //===----------------------------------------------------------------------===// @@ -3313,6 +3346,14 @@ llvm::LogicalResult fir::ReboxOp::verify() { return mlir::success(); } +std::optional<std::int64_t> fir::ReboxOp::getViewOffset(mlir::OpResult) { + // The address offset is zero, unless there is a slice. 
+ // TODO: we can handle slices that leave the base address untouched. + if (!getSlice()) + return 0; + return std::nullopt; +} + //===----------------------------------------------------------------------===// // ReboxAssumedRankOp //===----------------------------------------------------------------------===// @@ -4252,7 +4293,7 @@ llvm::LogicalResult fir::StoreOp::verify() { void fir::StoreOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value value, mlir::Value memref) { - build(builder, result, value, memref, {}); + build(builder, result, value, memref, {}, {}, {}); } void fir::StoreOp::getEffects( diff --git a/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp b/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp index c6cc2e8..5f68f3d 100644 --- a/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp +++ b/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp @@ -15,9 +15,6 @@ #include "mlir/IR/PatternMatch.h" #include "llvm/ADT/SmallVector.h" -#define GET_OP_CLASSES -#include "flang/Optimizer/Dialect/MIF/MIFOps.cpp.inc" - //===----------------------------------------------------------------------===// // NumImagesOp //===----------------------------------------------------------------------===// @@ -151,3 +148,60 @@ llvm::LogicalResult mif::CoSumOp::verify() { return emitOpError("`A` shall be of numeric type."); return mlir::success(); } + +//===----------------------------------------------------------------------===// +// ChangeTeamOp +//===----------------------------------------------------------------------===// + +void mif::ChangeTeamOp::build(mlir::OpBuilder &builder, + mlir::OperationState &result, mlir::Value team, + bool ensureTerminator, + llvm::ArrayRef<mlir::NamedAttribute> attributes) { + build(builder, result, team, /*stat*/ mlir::Value{}, /*errmsg*/ mlir::Value{}, + ensureTerminator, attributes); +} + +void mif::ChangeTeamOp::build(mlir::OpBuilder &builder, + mlir::OperationState &result, mlir::Value team, + mlir::Value stat, mlir::Value errmsg, + bool ensureTerminator, + llvm::ArrayRef<mlir::NamedAttribute> attributes) { + std::int32_t argStat = 0, argErrmsg = 0; + result.addOperands(team); + if (stat) { + result.addOperands(stat); + argStat++; + } + if (errmsg) { + result.addOperands(errmsg); + argErrmsg++; + } + + mlir::Region *bodyRegion = result.addRegion(); + bodyRegion->push_back(new mlir::Block{}); + if (ensureTerminator) + ChangeTeamOp::ensureTerminator(*bodyRegion, builder, result.location); + + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr({1, argStat, argErrmsg})); + result.addAttributes(attributes); +} + +static mlir::ParseResult parseChangeTeamOpBody(mlir::OpAsmParser &parser, + mlir::Region &body) { + if (parser.parseRegion(body)) + return mlir::failure(); + + auto &builder = parser.getBuilder(); + mif::ChangeTeamOp::ensureTerminator(body, builder, builder.getUnknownLoc()); + return mlir::success(); +} + +static void printChangeTeamOpBody(mlir::OpAsmPrinter &p, mif::ChangeTeamOp op, + mlir::Region &body) { + p.printRegion(op.getRegion(), /*printEntryBlockArgs=*/true, + /*printBlockTerminators=*/true); +} + +#define GET_OP_CLASSES +#include "flang/Optimizer/Dialect/MIF/MIFOps.cpp.inc" diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 1332dc5..e42c064 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -261,14 +261,12 @@ updateDeclaredInputTypeWithVolatility(mlir::Type inputType, mlir::Value memref, return 
std::make_pair(inputType, memref); } -void hlfir::DeclareOp::build(mlir::OpBuilder &builder, - mlir::OperationState &result, mlir::Value memref, - llvm::StringRef uniq_name, mlir::Value shape, - mlir::ValueRange typeparams, - mlir::Value dummy_scope, mlir::Value storage, - std::uint64_t storage_offset, - fir::FortranVariableFlagsAttr fortran_attrs, - cuf::DataAttributeAttr data_attr) { +void hlfir::DeclareOp::build( + mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value memref, + llvm::StringRef uniq_name, mlir::Value shape, mlir::ValueRange typeparams, + mlir::Value dummy_scope, mlir::Value storage, std::uint64_t storage_offset, + fir::FortranVariableFlagsAttr fortran_attrs, + cuf::DataAttributeAttr data_attr, unsigned dummy_arg_no) { auto nameAttr = builder.getStringAttr(uniq_name); mlir::Type inputType = memref.getType(); bool hasExplicitLbs = hasExplicitLowerBounds(shape); @@ -279,9 +277,12 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder, } auto [hlfirVariableType, firVarType] = getDeclareOutputTypes(inputType, hasExplicitLbs); + mlir::IntegerAttr argNoAttr; + if (dummy_arg_no > 0) + argNoAttr = builder.getUI32IntegerAttr(dummy_arg_no); build(builder, result, {hlfirVariableType, firVarType}, memref, shape, typeparams, dummy_scope, storage, storage_offset, nameAttr, - fortran_attrs, data_attr, /*skip_rebox=*/mlir::UnitAttr{}); + fortran_attrs, data_attr, /*skip_rebox=*/mlir::UnitAttr{}, argNoAttr); } llvm::LogicalResult hlfir::DeclareOp::verify() { @@ -591,6 +592,12 @@ llvm::LogicalResult hlfir::DesignateOp::verify() { return mlir::success(); } +std::optional<std::int64_t> hlfir::DesignateOp::getViewOffset(mlir::OpResult) { + // TODO: we can compute the constant offset + // based on the component/indices/etc. + return std::nullopt; +} + //===----------------------------------------------------------------------===// // ParentComponentOp //===----------------------------------------------------------------------===// diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index 6a57bf2..8bdf13e 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -149,13 +149,18 @@ public: !assignOp.isTemporaryLHS() && mlir::isa<fir::RecordType>(fir::getElementTypeOf(lhsExv)); + mlir::ArrayAttr accessGroups; + if (auto attrs = assignOp.getOperation()->getAttrOfType<mlir::ArrayAttr>( + "access_groups")) + accessGroups = attrs; + // genScalarAssignment() must take care of potential overlap // between LHS and RHS. Note that the overlap is possible // also for components of LHS/RHS, and the Assign() runtime // must take care of it. - fir::factory::genScalarAssignment(builder, loc, lhsExv, rhsExv, - needFinalization, - assignOp.isTemporaryLHS()); + fir::factory::genScalarAssignment( + builder, loc, lhsExv, rhsExv, needFinalization, + assignOp.isTemporaryLHS(), accessGroups); } rewriter.eraseOp(assignOp); return mlir::success(); @@ -308,7 +313,8 @@ public: declareOp.getTypeparams(), declareOp.getDummyScope(), /*storage=*/declareOp.getStorage(), /*storage_offset=*/declareOp.getStorageOffset(), - declareOp.getUniqName(), fortranAttrs, dataAttr); + declareOp.getUniqName(), fortranAttrs, dataAttr, + declareOp.getDummyArgNoAttr()); // Propagate other attributes from hlfir.declare to fir.declare. // OpenACC's acc.declare is one example. 
Right now, the propagation diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp index ce8ebaa..4fa8103 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp @@ -931,6 +931,37 @@ private: mlir::Value genScalarAdd(mlir::Value value1, mlir::Value value2); }; +/// Reduction converter for Product. +class ProductAsElementalConverter + : public NumericReductionAsElementalConverterBase<hlfir::ProductOp> { + using Base = NumericReductionAsElementalConverterBase; + +public: + ProductAsElementalConverter(hlfir::ProductOp op, + mlir::PatternRewriter &rewriter) + : Base{op, rewriter} {} + +private: + virtual llvm::SmallVector<mlir::Value> genReductionInitValues( + [[maybe_unused]] mlir::ValueRange oneBasedIndices, + [[maybe_unused]] const llvm::SmallVectorImpl<mlir::Value> &extents) + final { + return {fir::factory::createOneValue(builder, loc, getResultElementType())}; + } + virtual llvm::SmallVector<mlir::Value> + reduceOneElement(const llvm::SmallVectorImpl<mlir::Value> &currentValue, + hlfir::Entity array, + mlir::ValueRange oneBasedIndices) final { + checkReductions(currentValue); + hlfir::Entity elementValue = + hlfir::loadElementAt(loc, builder, array, oneBasedIndices); + return {genScalarMult(currentValue[0], elementValue)}; + } + + // Generate scalar multiplication of the two values (of the same data type). + mlir::Value genScalarMult(mlir::Value value1, mlir::Value value2); +}; + /// Base class for logical reductions like ALL, ANY, COUNT. /// They do not have MASK and FastMathFlags. template <typename OpT> @@ -1194,6 +1225,20 @@ mlir::Value SumAsElementalConverter::genScalarAdd(mlir::Value value1, llvm_unreachable("unsupported SUM reduction type"); } +mlir::Value ProductAsElementalConverter::genScalarMult(mlir::Value value1, + mlir::Value value2) { + mlir::Type ty = value1.getType(); + assert(ty == value2.getType() && "reduction values' types do not match"); + if (mlir::isa<mlir::FloatType>(ty)) + return mlir::arith::MulFOp::create(builder, loc, value1, value2); + else if (mlir::isa<mlir::ComplexType>(ty)) + return fir::MulcOp::create(builder, loc, value1, value2); + else if (mlir::isa<mlir::IntegerType>(ty)) + return mlir::arith::MulIOp::create(builder, loc, value1, value2); + + llvm_unreachable("unsupported MUL reduction type"); +} + mlir::Value ReductionAsElementalConverter::genMaskValue( mlir::Value mask, mlir::Value isPresentPred, mlir::ValueRange indices) { mlir::OpBuilder::InsertionGuard guard(builder); @@ -1265,6 +1310,9 @@ public: } else if constexpr (std::is_same_v<Op, hlfir::SumOp>) { SumAsElementalConverter converter{op, rewriter}; return converter.convert(); + } else if constexpr (std::is_same_v<Op, hlfir::ProductOp>) { + ProductAsElementalConverter converter{op, rewriter}; + return converter.convert(); } return rewriter.notifyMatchFailure(op, "unexpected reduction operation"); } @@ -3158,6 +3206,7 @@ public: mlir::RewritePatternSet patterns(context); patterns.insert<TransposeAsElementalConversion>(context); patterns.insert<ReductionConversion<hlfir::SumOp>>(context); + patterns.insert<ReductionConversion<hlfir::ProductOp>>(context); patterns.insert<ArrayShiftConversion<hlfir::CShiftOp>>(context); patterns.insert<ArrayShiftConversion<hlfir::EOShiftOp>>(context); patterns.insert<CmpCharOpConversion>(context); diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp
b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp index c1734be..e4d02e9 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp @@ -14,6 +14,9 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/Support/InternalNames.h" +#include "mlir/IR/SymbolTable.h" +#include "llvm/ADT/SmallSet.h" namespace fir::acc { @@ -59,4 +62,111 @@ bool PartialEntityAccessModel<hlfir::DeclareOp>::isCompleteView( return !getBaseEntity(op); } +mlir::SymbolRefAttr AddressOfGlobalModel::getSymbol(mlir::Operation *op) const { + return mlir::cast<fir::AddrOfOp>(op).getSymbolAttr(); +} + +bool GlobalVariableModel::isConstant(mlir::Operation *op) const { + auto globalOp = mlir::cast<fir::GlobalOp>(op); + return globalOp.getConstant().has_value(); +} + +mlir::Region *GlobalVariableModel::getInitRegion(mlir::Operation *op) const { + auto globalOp = mlir::cast<fir::GlobalOp>(op); + return globalOp.hasInitializationBody() ? &globalOp.getRegion() : nullptr; +} + +// Helper to recursively process address-of operations in derived type +// descriptors and collect all needed fir.globals. +static void processAddrOfOpInDerivedTypeDescriptor( + fir::AddrOfOp addrOfOp, mlir::SymbolTable &symTab, + llvm::SmallSet<mlir::Operation *, 16> &globalsSet, + llvm::SmallVectorImpl<mlir::SymbolRefAttr> &symbols) { + if (auto globalOp = symTab.lookup<fir::GlobalOp>( + addrOfOp.getSymbol().getLeafReference().getValue())) { + if (globalsSet.contains(globalOp)) + return; + globalsSet.insert(globalOp); + symbols.push_back(addrOfOp.getSymbolAttr()); + globalOp.walk([&](fir::AddrOfOp op) { + processAddrOfOpInDerivedTypeDescriptor(op, symTab, globalsSet, symbols); + }); + } +} + +// Utility to collect referenced symbols for type descriptors of derived types. +// This is the common logic for operations that may require type descriptor +// globals. 
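
For reference, the recursion in processAddrOfOpInDerivedTypeDescriptor above reduces to a standard visited-set graph walk. A minimal sketch under that reading, using plain C++ containers and a toy Node type in place of the MLIR operations (all names in this sketch are illustrative, not part of the patch):

#include <string>
#include <unordered_set>
#include <vector>

struct Node {
  std::string symbol;
  std::vector<const Node *> referenced; // nested fir.addr_of targets
};

static void collect(const Node *node, std::unordered_set<const Node *> &seen,
                    std::vector<std::string> &symbols) {
  // The visited set both deduplicates repeated references and guarantees
  // termination if descriptor globals happen to reference each other
  // cyclically.
  if (!node || !seen.insert(node).second)
    return;
  symbols.push_back(node->symbol);
  for (const Node *dep : node->referenced)
    collect(dep, seen, symbols);
}
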
+static void collectReferencedSymbolsForType( + mlir::Type ty, mlir::Operation *op, + llvm::SmallVectorImpl<mlir::SymbolRefAttr> &symbols, + mlir::SymbolTable *symbolTable) { + ty = fir::getDerivedType(fir::unwrapRefType(ty)); + + // Look for type descriptor globals only if it's a derived (record) type + if (auto recTy = mlir::dyn_cast_if_present<fir::RecordType>(ty)) { + // If no symbol table provided, simply add the type descriptor name + if (!symbolTable) { + symbols.push_back(mlir::SymbolRefAttr::get( + op->getContext(), + fir::NameUniquer::getTypeDescriptorName(recTy.getName()))); + return; + } + + // Otherwise, do full lookup and recursive processing + llvm::SmallSet<mlir::Operation *, 16> globalsSet; + + fir::GlobalOp globalOp = symbolTable->lookup<fir::GlobalOp>( + fir::NameUniquer::getTypeDescriptorName(recTy.getName())); + if (!globalOp) + globalOp = symbolTable->lookup<fir::GlobalOp>( + fir::NameUniquer::getTypeDescriptorAssemblyName(recTy.getName())); + + if (globalOp) { + globalsSet.insert(globalOp); + symbols.push_back( + mlir::SymbolRefAttr::get(op->getContext(), globalOp.getSymName())); + globalOp.walk([&](fir::AddrOfOp addrOp) { + processAddrOfOpInDerivedTypeDescriptor(addrOp, *symbolTable, globalsSet, + symbols); + }); + } + } +} + +template <> +void IndirectGlobalAccessModel<fir::AllocaOp>::getReferencedSymbols( + mlir::Operation *op, llvm::SmallVectorImpl<mlir::SymbolRefAttr> &symbols, + mlir::SymbolTable *symbolTable) const { + auto allocaOp = mlir::cast<fir::AllocaOp>(op); + collectReferencedSymbolsForType(allocaOp.getType(), op, symbols, symbolTable); +} + +template <> +void IndirectGlobalAccessModel<fir::EmboxOp>::getReferencedSymbols( + mlir::Operation *op, llvm::SmallVectorImpl<mlir::SymbolRefAttr> &symbols, + mlir::SymbolTable *symbolTable) const { + auto emboxOp = mlir::cast<fir::EmboxOp>(op); + collectReferencedSymbolsForType(emboxOp.getMemref().getType(), op, symbols, + symbolTable); +} + +template <> +void IndirectGlobalAccessModel<fir::ReboxOp>::getReferencedSymbols( + mlir::Operation *op, llvm::SmallVectorImpl<mlir::SymbolRefAttr> &symbols, + mlir::SymbolTable *symbolTable) const { + auto reboxOp = mlir::cast<fir::ReboxOp>(op); + collectReferencedSymbolsForType(reboxOp.getBox().getType(), op, symbols, + symbolTable); +} + +template <> +void IndirectGlobalAccessModel<fir::TypeDescOp>::getReferencedSymbols( + mlir::Operation *op, llvm::SmallVectorImpl<mlir::SymbolRefAttr> &symbols, + mlir::SymbolTable *symbolTable) const { + auto typeDescOp = mlir::cast<fir::TypeDescOp>(op); + collectReferencedSymbolsForType(typeDescOp.getInType(), op, symbols, + symbolTable); +} + } // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp index ae0f5fb8..9fcc7d3 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp @@ -1014,4 +1014,114 @@ template bool OpenACCPointerLikeModel<fir::LLVMPointerType>::genCopy( mlir::TypedValue<mlir::acc::PointerLikeType> source, mlir::Type varType) const; +template <typename Ty> +mlir::Value OpenACCPointerLikeModel<Ty>::genLoad( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::TypedValue<mlir::acc::PointerLikeType> srcPtr, + mlir::Type valueType) const { + + // Unwrap to get the pointee type. 
+ mlir::Type pointeeTy = fir::dyn_cast_ptrEleTy(pointer); + assert(pointeeTy && "expected pointee type to be extractable"); + + // Box types contain both a descriptor and referenced data. The genLoad API + // handles simple loads and cannot properly manage both parts. + if (fir::isa_box_type(pointeeTy)) + return {}; + + // Unlimited polymorphic (class(*)) cannot be handled because type is unknown. + if (fir::isUnlimitedPolymorphicType(pointeeTy)) + return {}; + + // Return empty for dynamic size types because the load logic + // cannot be determined simply from the type. + if (fir::hasDynamicSize(pointeeTy)) + return {}; + + mlir::Value loadedValue = fir::LoadOp::create(builder, loc, srcPtr); + + // If valueType is provided and differs from the loaded type, insert a convert + if (valueType && loadedValue.getType() != valueType) + return fir::ConvertOp::create(builder, loc, valueType, loadedValue); + + return loadedValue; +} + +template mlir::Value OpenACCPointerLikeModel<fir::ReferenceType>::genLoad( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::TypedValue<mlir::acc::PointerLikeType> srcPtr, + mlir::Type valueType) const; + +template mlir::Value OpenACCPointerLikeModel<fir::PointerType>::genLoad( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::TypedValue<mlir::acc::PointerLikeType> srcPtr, + mlir::Type valueType) const; + +template mlir::Value OpenACCPointerLikeModel<fir::HeapType>::genLoad( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::TypedValue<mlir::acc::PointerLikeType> srcPtr, + mlir::Type valueType) const; + +template mlir::Value OpenACCPointerLikeModel<fir::LLVMPointerType>::genLoad( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::TypedValue<mlir::acc::PointerLikeType> srcPtr, + mlir::Type valueType) const; + +template <typename Ty> +bool OpenACCPointerLikeModel<Ty>::genStore( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::Value valueToStore, + mlir::TypedValue<mlir::acc::PointerLikeType> destPtr) const { + + // Unwrap to get the pointee type. + mlir::Type pointeeTy = fir::dyn_cast_ptrEleTy(pointer); + assert(pointeeTy && "expected pointee type to be extractable"); + + // Box types contain both a descriptor and referenced data. The genStore API + // handles simple stores and cannot properly manage both parts. + if (fir::isa_box_type(pointeeTy)) + return false; + + // Unlimited polymorphic (class(*)) cannot be handled because type is unknown. + if (fir::isUnlimitedPolymorphicType(pointeeTy)) + return false; + + // Return false for dynamic size types because the store logic + // cannot be determined simply from the type. 
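
The tail of genLoad above is a load-then-convert idiom, and the same shape recurs in genStore below with the conversion on the input side. A hedged sketch of the idiom in isolation (the wrapper name loadAs is illustrative; only the fir::LoadOp/fir::ConvertOp builder calls come from the patch):

static mlir::Value loadAs(mlir::OpBuilder &builder, mlir::Location loc,
                          mlir::Value ref, mlir::Type wantedType) {
  mlir::Value loaded = fir::LoadOp::create(builder, loc, ref);
  // Insert a conversion only when the caller requested a different type.
  if (wantedType && loaded.getType() != wantedType)
    return fir::ConvertOp::create(builder, loc, wantedType, loaded);
  return loaded;
}
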
+ if (fir::hasDynamicSize(pointeeTy)) + return false; + + // Get the type from the value being stored + mlir::Type valueType = valueToStore.getType(); + mlir::Value convertedValue = valueToStore; + + // If the value type differs from the pointee type, insert a convert + if (valueType != pointeeTy) + convertedValue = + fir::ConvertOp::create(builder, loc, pointeeTy, valueToStore); + + fir::StoreOp::create(builder, loc, convertedValue, destPtr); + return true; +} + +template bool OpenACCPointerLikeModel<fir::ReferenceType>::genStore( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::Value valueToStore, + mlir::TypedValue<mlir::acc::PointerLikeType> destPtr) const; + +template bool OpenACCPointerLikeModel<fir::PointerType>::genStore( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::Value valueToStore, + mlir::TypedValue<mlir::acc::PointerLikeType> destPtr) const; + +template bool OpenACCPointerLikeModel<fir::HeapType>::genStore( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::Value valueToStore, + mlir::TypedValue<mlir::acc::PointerLikeType> destPtr) const; + +template bool OpenACCPointerLikeModel<fir::LLVMPointerType>::genStore( + mlir::Type pointer, mlir::OpBuilder &builder, mlir::Location loc, + mlir::Value valueToStore, + mlir::TypedValue<mlir::acc::PointerLikeType> destPtr) const; + } // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp index d71c40d..acd1d01 100644 --- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp @@ -49,6 +49,18 @@ void registerOpenACCExtensions(mlir::DialectRegistry &registry) { PartialEntityAccessModel<fir::CoordinateOp>>(*ctx); fir::DeclareOp::attachInterface<PartialEntityAccessModel<fir::DeclareOp>>( *ctx); + + fir::AddrOfOp::attachInterface<AddressOfGlobalModel>(*ctx); + fir::GlobalOp::attachInterface<GlobalVariableModel>(*ctx); + + fir::AllocaOp::attachInterface<IndirectGlobalAccessModel<fir::AllocaOp>>( + *ctx); + fir::EmboxOp::attachInterface<IndirectGlobalAccessModel<fir::EmboxOp>>( + *ctx); + fir::ReboxOp::attachInterface<IndirectGlobalAccessModel<fir::ReboxOp>>( + *ctx); + fir::TypeDescOp::attachInterface< + IndirectGlobalAccessModel<fir::TypeDescOp>>(*ctx); }); // Register HLFIR operation interfaces diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp index 0d135a9..ad0cfa3 100644 --- a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp +++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp @@ -87,30 +87,26 @@ static void bufferizeRegionArgsAndYields(mlir::Region &region, } } -static void updateRecipeUse(mlir::ArrayAttr recipes, mlir::ValueRange operands, +template <typename OpTy> +static void updateRecipeUse(mlir::ValueRange operands, llvm::StringRef recipeSymName, mlir::Operation *computeOp) { - if (!recipes) - return; - for (auto [recipeSym, oldRes] : llvm::zip(recipes, operands)) { - if (llvm::cast<mlir::SymbolRefAttr>(recipeSym).getLeafReference() != - recipeSymName) + for (auto operand : operands) { + auto op = operand.getDefiningOp<OpTy>(); + if (!op || !op.getRecipe().has_value() || + op.getRecipeAttr().getLeafReference() != recipeSymName) continue; - mlir::Operation *dataOp = oldRes.getDefiningOp(); - assert(dataOp && "dataOp must be paired with
computeOp"); - mlir::Location loc = dataOp->getLoc(); - mlir::OpBuilder builder(dataOp); - llvm::TypeSwitch<mlir::Operation *, void>(dataOp) - .Case<mlir::acc::PrivateOp, mlir::acc::FirstprivateOp, - mlir::acc::ReductionOp>([&](auto privateOp) { - builder.setInsertionPointAfterValue(privateOp.getVar()); - mlir::Value alloca = BufferizeInterface::placeInMemory( - builder, loc, privateOp.getVar()); - privateOp.getVarMutable().assign(alloca); - privateOp.getAccVar().setType(alloca.getType()); - }); + mlir::Location loc = op->getLoc(); + + mlir::OpBuilder builder(op); + builder.setInsertionPointAfterValue(op.getVar()); + mlir::Value alloca = + BufferizeInterface::placeInMemory(builder, loc, op.getVar()); + op.getVarMutable().assign(alloca); + op.getAccVar().setType(alloca.getType()); + mlir::Value oldRes = op.getAccVar(); llvm::SmallVector<mlir::Operation *> users(oldRes.getUsers().begin(), oldRes.getUsers().end()); for (mlir::Operation *useOp : users) { @@ -166,18 +162,15 @@ public: .Case<mlir::acc::LoopOp, mlir::acc::ParallelOp, mlir::acc::SerialOp>( [&](auto computeOp) { for (llvm::StringRef recipeName : recipeNames) { - if (computeOp.getPrivatizationRecipes()) - updateRecipeUse(computeOp.getPrivatizationRecipesAttr(), - computeOp.getPrivateOperands(), recipeName, - op); - if (computeOp.getFirstprivatizationRecipes()) - updateRecipeUse( - computeOp.getFirstprivatizationRecipesAttr(), + if (!computeOp.getPrivateOperands().empty()) + updateRecipeUse<mlir::acc::PrivateOp>( + computeOp.getPrivateOperands(), recipeName, op); + if (!computeOp.getFirstprivateOperands().empty()) + updateRecipeUse<mlir::acc::FirstprivateOp>( computeOp.getFirstprivateOperands(), recipeName, op); - if (computeOp.getReductionRecipes()) - updateRecipeUse(computeOp.getReductionRecipesAttr(), - computeOp.getReductionOperands(), - recipeName, op); + if (!computeOp.getReductionOperands().empty()) + updateRecipeUse<mlir::acc::ReductionOp>( + computeOp.getReductionOperands(), recipeName, op); } }); }); diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt index 35aa87d..d41e99a 100644 --- a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt @@ -15,4 +15,5 @@ add_flang_library(FIROpenACCTransforms MLIRIR MLIRPass MLIROpenACCDialect + MLIROpenACCUtils ) diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 9aad8cd..1012a96 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -848,7 +848,8 @@ private: if (!ompReducer) { ompReducer = mlir::omp::DeclareReductionOp::create( rewriter, firReducer.getLoc(), ompReducerName, - firReducer.getTypeAttr().getValue()); + firReducer.getTypeAttr().getValue(), + firReducer.getByrefElementTypeAttr()); cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(), ompReducer.getAllocRegion()); diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 8382a48..3fe133d 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -347,10 +347,10 @@ class MapInfoFinalizationPass /// base address (BoxOffsetOp) and a MapInfoOp for it. The most /// important thing to note is that we normally move the bounds from /// the descriptor map onto the base address map. 
- mlir::omp::MapInfoOp genBaseAddrMap(mlir::Value descriptor, - mlir::OperandRange bounds, - mlir::omp::ClauseMapFlags mapType, - fir::FirOpBuilder &builder) { + mlir::omp::MapInfoOp + genBaseAddrMap(mlir::Value descriptor, mlir::OperandRange bounds, + mlir::omp::ClauseMapFlags mapType, fir::FirOpBuilder &builder, + mlir::FlatSymbolRefAttr mapperId = mlir::FlatSymbolRefAttr()) { mlir::Location loc = descriptor.getLoc(); mlir::Value baseAddrAddr = fir::BoxOffsetOp::create( builder, loc, descriptor, fir::BoxFieldAttr::base_addr); @@ -372,7 +372,7 @@ class MapInfoFinalizationPass mlir::omp::VariableCaptureKind::ByRef), baseAddrAddr, /*members=*/mlir::SmallVector<mlir::Value>{}, /*membersIndex=*/mlir::ArrayAttr{}, bounds, - /*mapperId*/ mlir::FlatSymbolRefAttr(), + /*mapperId=*/mapperId, /*name=*/builder.getStringAttr(""), /*partial_map=*/builder.getBoolAttr(false)); } @@ -437,6 +437,20 @@ class MapInfoFinalizationPass mapFlags flags = mapFlags::to | (mapTypeFlag & (mapFlags::implicit | mapFlags::always)); + + // Descriptors for objects will always be copied. This is because the + // descriptor can be rematerialized by the compiler, and so the address + // of the descriptor for a given object at one place in the code may + // differ from that address in another place. The contents of the + // descriptor (the base address in particular) will remain unchanged + // though. + // TODO/FIXME: We currently cannot have MAP_CLOSE and MAP_ALWAYS on + // the descriptor at once, these are mutually exclusive and when + // both are applied the runtime will fail to map. + flags |= ((mapFlags(mapTypeFlag) & mapFlags::close) == mapFlags::close) + ? mapFlags::close + : mapFlags::always; + // For unified_shared_memory, we additionally add `CLOSE` on the descriptor // to ensure device-local placement where required by tests relying on USM + // close semantics. @@ -578,6 +592,7 @@ class MapInfoFinalizationPass // from the descriptor to be used verbatim, i.e. without additional // remapping. To avoid this remapping, simply don't generate any map // information for the descriptor members. + mlir::FlatSymbolRefAttr mapperId = op.getMapperIdAttr(); if (!mapMemberUsers.empty()) { // Currently, there should only be one user per map when this pass // is executed. 
Either a parent map, holding the current map in its @@ -588,8 +603,8 @@ class MapInfoFinalizationPass assert(mapMemberUsers.size() == 1 && "OMPMapInfoFinalization currently only supports single users of a " "MapInfoOp"); - auto baseAddr = - genBaseAddrMap(descriptor, op.getBounds(), op.getMapType(), builder); + auto baseAddr = genBaseAddrMap(descriptor, op.getBounds(), + op.getMapType(), builder, mapperId); ParentAndPlacement mapUser = mapMemberUsers[0]; adjustMemberIndices(memberIndices, mapUser.index); llvm::SmallVector<mlir::Value> newMemberOps; @@ -602,8 +617,8 @@ class MapInfoFinalizationPass mapUser.parent.setMembersIndexAttr( builder.create2DI64ArrayAttr(memberIndices)); } else if (!isHasDeviceAddrFlag) { - auto baseAddr = - genBaseAddrMap(descriptor, op.getBounds(), op.getMapType(), builder); + auto baseAddr = genBaseAddrMap(descriptor, op.getBounds(), + op.getMapType(), builder, mapperId); newMembers.push_back(baseAddr); if (!op.getMembers().empty()) { for (auto &indices : memberIndices) @@ -635,7 +650,7 @@ class MapInfoFinalizationPass getDescriptorMapType(mapType, target)), op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{}, newMembers, newMembersAttr, /*bounds=*/mlir::SmallVector<mlir::Value>{}, - /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(), + /*mapperId=*/mlir::FlatSymbolRefAttr(), op.getNameAttr(), /*partial_map=*/builder.getBoolAttr(false)); op.replaceAllUsesWith(newDescParentMapOp.getResult()); op->erase(); diff --git a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp index 0972861..6404e18 100644 --- a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp +++ b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp @@ -104,21 +104,31 @@ class MapsForPrivatizedSymbolsPass llvm::SmallVector<mlir::Value> boundsOps; if (needsBoundsOps(varPtr)) genBoundsOps(builder, varPtr, boundsOps); + mlir::Type varType = varPtr.getType(); mlir::omp::VariableCaptureKind captureKind = mlir::omp::VariableCaptureKind::ByRef; - if (fir::isa_trivial(fir::unwrapRefType(varPtr.getType())) || - fir::isa_char(fir::unwrapRefType(varPtr.getType()))) { - if (canPassByValue(fir::unwrapRefType(varPtr.getType()))) { + if (fir::isa_trivial(fir::unwrapRefType(varType)) || + fir::isa_char(fir::unwrapRefType(varType))) { + if (canPassByValue(fir::unwrapRefType(varType))) { captureKind = mlir::omp::VariableCaptureKind::ByCopy; } } + // Use tofrom if what we are mapping is not a trivial type. 
In all + // likelihood, it is a descriptor + mlir::omp::ClauseMapFlags mapFlag; + if (fir::isa_trivial(fir::unwrapRefType(varType)) || + fir::isa_char(fir::unwrapRefType(varType))) + mapFlag = mlir::omp::ClauseMapFlags::to; + else + mapFlag = mlir::omp::ClauseMapFlags::to | mlir::omp::ClauseMapFlags::from; + return omp::MapInfoOp::create( - builder, loc, varPtr.getType(), varPtr, - TypeAttr::get(llvm::cast<omp::PointerLikeType>(varPtr.getType()) - .getElementType()), - builder.getAttr<omp::ClauseMapFlagsAttr>(omp::ClauseMapFlags::to), + builder, loc, varType, varPtr, + TypeAttr::get( + llvm::cast<omp::PointerLikeType>(varType).getElementType()), + builder.getAttr<omp::ClauseMapFlagsAttr>(mapFlag), builder.getAttr<omp::VariableCaptureKindAttr>(captureKind), /*varPtrPtr=*/Value{}, /*members=*/SmallVector<Value>{}, diff --git a/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp b/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp index 0b0e6bd..5fa77fb 100644 --- a/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp +++ b/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp @@ -21,6 +21,7 @@ #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/TypeSwitch.h" namespace flangomp { #define GEN_PASS_DEF_MARKDECLARETARGETPASS @@ -31,9 +32,93 @@ namespace { class MarkDeclareTargetPass : public flangomp::impl::MarkDeclareTargetPassBase<MarkDeclareTargetPass> { - void markNestedFuncs(mlir::omp::DeclareTargetDeviceType parentDevTy, - mlir::omp::DeclareTargetCaptureClause parentCapClause, - bool parentAutomap, mlir::Operation *currOp, + struct ParentInfo { + mlir::omp::DeclareTargetDeviceType devTy; + mlir::omp::DeclareTargetCaptureClause capClause; + bool automap; + }; + + void processSymbolRef(mlir::SymbolRefAttr symRef, ParentInfo parentInfo, + llvm::SmallPtrSet<mlir::Operation *, 16> visited) { + if (auto currFOp = + getOperation().lookupSymbol<mlir::func::FuncOp>(symRef)) { + auto current = llvm::dyn_cast<mlir::omp::DeclareTargetInterface>( + currFOp.getOperation()); + + if (current.isDeclareTarget()) { + auto currentDt = current.getDeclareTargetDeviceType(); + + // Found the same function twice, with different device_types, + // mark as Any as it belongs to both + if (currentDt != parentInfo.devTy && + currentDt != mlir::omp::DeclareTargetDeviceType::any) { + current.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::any, + current.getDeclareTargetCaptureClause(), + current.getDeclareTargetAutomap()); + } + } else { + current.setDeclareTarget(parentInfo.devTy, parentInfo.capClause, + parentInfo.automap); + } + + markNestedFuncs(parentInfo, currFOp, visited); + } + } + + void processReductionRefs(std::optional<mlir::ArrayAttr> symRefs, + ParentInfo parentInfo, + llvm::SmallPtrSet<mlir::Operation *, 16> visited) { + if (!symRefs) + return; + + for (auto symRef : symRefs->getAsRange<mlir::SymbolRefAttr>()) { + if (auto declareReductionOp = + getOperation().lookupSymbol<mlir::omp::DeclareReductionOp>( + symRef)) { + markNestedFuncs(parentInfo, declareReductionOp, visited); + } + } + } + + void + processReductionClauses(mlir::Operation *op, ParentInfo parentInfo, + llvm::SmallPtrSet<mlir::Operation *, 16> visited) { + llvm::TypeSwitch<mlir::Operation &>(*op) + .Case([&](mlir::omp::LoopOp op) { + processReductionRefs(op.getReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::ParallelOp op) { + processReductionRefs(op.getReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::SectionsOp op) { + 
processReductionRefs(op.getReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::SimdOp op) { + processReductionRefs(op.getReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::TargetOp op) { + processReductionRefs(op.getInReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::TaskgroupOp op) { + processReductionRefs(op.getTaskReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::TaskloopOp op) { + processReductionRefs(op.getReductionSyms(), parentInfo, visited); + processReductionRefs(op.getInReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::TaskOp op) { + processReductionRefs(op.getInReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::TeamsOp op) { + processReductionRefs(op.getReductionSyms(), parentInfo, visited); + }) + .Case([&](mlir::omp::WsloopOp op) { + processReductionRefs(op.getReductionSyms(), parentInfo, visited); + }) + .Default([](mlir::Operation &) {}); + } + + void markNestedFuncs(ParentInfo parentInfo, mlir::Operation *currOp, llvm::SmallPtrSet<mlir::Operation *, 16> visited) { if (visited.contains(currOp)) return; @@ -43,33 +128,10 @@ class MarkDeclareTargetPass if (auto callOp = llvm::dyn_cast<mlir::CallOpInterface>(op)) { if (auto symRef = llvm::dyn_cast_if_present<mlir::SymbolRefAttr>( callOp.getCallableForCallee())) { - if (auto currFOp = - getOperation().lookupSymbol<mlir::func::FuncOp>(symRef)) { - auto current = llvm::dyn_cast<mlir::omp::DeclareTargetInterface>( - currFOp.getOperation()); - - if (current.isDeclareTarget()) { - auto currentDt = current.getDeclareTargetDeviceType(); - - // Found the same function twice, with different device_types, - // mark as Any as it belongs to both - if (currentDt != parentDevTy && - currentDt != mlir::omp::DeclareTargetDeviceType::any) { - current.setDeclareTarget( - mlir::omp::DeclareTargetDeviceType::any, - current.getDeclareTargetCaptureClause(), - current.getDeclareTargetAutomap()); - } - } else { - current.setDeclareTarget(parentDevTy, parentCapClause, - parentAutomap); - } - - markNestedFuncs(parentDevTy, parentCapClause, parentAutomap, - currFOp, visited); - } + processSymbolRef(symRef, parentInfo, visited); } } + processReductionClauses(op, parentInfo, visited); }); } @@ -82,10 +144,10 @@ class MarkDeclareTargetPass functionOp.getOperation()); if (declareTargetOp.isDeclareTarget()) { llvm::SmallPtrSet<mlir::Operation *, 16> visited; - markNestedFuncs(declareTargetOp.getDeclareTargetDeviceType(), - declareTargetOp.getDeclareTargetCaptureClause(), - declareTargetOp.getDeclareTargetAutomap(), functionOp, - visited); + ParentInfo parentInfo{declareTargetOp.getDeclareTargetDeviceType(), + declareTargetOp.getDeclareTargetCaptureClause(), + declareTargetOp.getDeclareTargetAutomap()}; + markNestedFuncs(parentInfo, functionOp, visited); } } @@ -96,12 +158,13 @@ class MarkDeclareTargetPass // the contents of the device clause getOperation()->walk([&](mlir::omp::TargetOp tarOp) { llvm::SmallPtrSet<mlir::Operation *, 16> visited; - markNestedFuncs( - /*parentDevTy=*/mlir::omp::DeclareTargetDeviceType::nohost, - /*parentCapClause=*/mlir::omp::DeclareTargetCaptureClause::to, - /*parentAutomap=*/false, tarOp, visited); + ParentInfo parentInfo = { + /*devTy=*/mlir::omp::DeclareTargetDeviceType::nohost, + /*capClause=*/mlir::omp::DeclareTargetCaptureClause::to, + /*automap=*/false, + }; + markNestedFuncs(parentInfo, tarOp, visited); }); } }; - } // namespace diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp 
b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index e006d2e..7491b7b 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -53,7 +53,7 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase<AddDebugInfoPass> { mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scopeAttr, fir::DebugTypeGenerator &typeGen, - mlir::SymbolTable *symbolTable); + mlir::SymbolTable *symbolTable, mlir::Value dummyScope); public: AddDebugInfoPass(fir::AddDebugInfoOptions options) : Base(options) {} @@ -144,69 +144,88 @@ bool AddDebugInfoPass::createCommonBlockGlobal( fir::DebugTypeGenerator &typeGen, mlir::SymbolTable *symbolTable) { mlir::MLIRContext *context = &getContext(); mlir::OpBuilder builder(context); - std::optional<std::int64_t> optint; - mlir::Operation *op = declOp.getMemref().getDefiningOp(); - if (auto conOp = mlir::dyn_cast_if_present<fir::ConvertOp>(op)) - op = conOp.getValue().getDefiningOp(); + std::optional<std::int64_t> offset; + mlir::Value storage = declOp.getStorage(); + if (!storage) + return false; + + // Extract offset from storage_offset attribute + uint64_t storageOffset = declOp.getStorageOffset(); + if (storageOffset != 0) + offset = static_cast<std::int64_t>(storageOffset); + + // Get the GlobalOp from the storage value. + // The storage may be wrapped in ConvertOp, so unwrap it first. + mlir::Operation *storageOp = storage.getDefiningOp(); + if (auto convertOp = mlir::dyn_cast_if_present<fir::ConvertOp>(storageOp)) + storageOp = convertOp.getValue().getDefiningOp(); + + auto addrOfOp = mlir::dyn_cast_if_present<fir::AddrOfOp>(storageOp); + if (!addrOfOp) + return false; + + mlir::SymbolRefAttr sym = addrOfOp.getSymbol(); + fir::GlobalOp global = + symbolTable->lookup<fir::GlobalOp>(sym.getRootReference()); + if (!global) + return false; + + // Check if the global is actually a common block by demangling its name. + // Module EQUIVALENCE variables also use storage operands but are mangled + // as VARIABLE type, so we reject them to avoid treating them as common + // blocks. + llvm::StringRef globalSymbol = sym.getRootReference(); + auto globalResult = fir::NameUniquer::deconstruct(globalSymbol); + if (globalResult.first == fir::NameUniquer::NameKind::VARIABLE) + return false; + + // FIXME: We are trying to extract the name of the common block from the + // name of the global. As part of mangling, GetCommonBlockObjectName can + // add a trailing _ in the name of that global. The demangle function + // does not seem to handle such cases. So the following hack is used to + // remove the trailing '_'. + llvm::StringRef commonName = globalSymbol; + if (commonName != Fortran::common::blankCommonObjectName && + !commonName.empty() && commonName.back() == '_') + commonName = commonName.drop_back(); + + // Create the debug attributes. 
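
A nonzero storage offset is lowered just below into the DWARF fragment DW_OP_plus_uconst <offset>. A minimal sketch of that construction (the helper name makeOffsetExpr is illustrative; the attribute calls are the ones used in this hunk):

static mlir::LLVM::DIExpressionAttr makeOffsetExpr(mlir::MLIRContext *context,
                                                   std::int64_t offset) {
  llvm::SmallVector<mlir::LLVM::DIExpressionElemAttr> ops;
  // DW_OP_plus_uconst adds a constant byte offset to the common block base.
  ops.push_back(mlir::LLVM::DIExpressionElemAttr::get(
      context, llvm::dwarf::DW_OP_plus_uconst, offset));
  return mlir::LLVM::DIExpressionAttr::get(context, ops);
}
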
+ unsigned line = getLineFromLoc(global.getLoc()); + mlir::LLVM::DICommonBlockAttr commonBlock = + getOrCreateCommonBlockAttr(commonName, fileAttr, scopeAttr, line); + + mlir::LLVM::DITypeAttr diType = typeGen.convertType( + fir::unwrapRefType(declOp.getType()), fileAttr, scopeAttr, declOp); + + line = getLineFromLoc(declOp.getLoc()); + auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get( + context, commonBlock, mlir::StringAttr::get(context, name), + declOp.getUniqName(), fileAttr, line, diType, + /*isLocalToUnit*/ false, /*isDefinition*/ true, /* alignInBits*/ 0); + + // Create DIExpression for offset if needed + mlir::LLVM::DIExpressionAttr expr; + if (offset && *offset != 0) { + llvm::SmallVector<mlir::LLVM::DIExpressionElemAttr> ops; + ops.push_back(mlir::LLVM::DIExpressionElemAttr::get( + context, llvm::dwarf::DW_OP_plus_uconst, *offset)); + expr = mlir::LLVM::DIExpressionAttr::get(context, ops); + } - if (auto cordOp = mlir::dyn_cast_if_present<fir::CoordinateOp>(op)) { - auto coors = cordOp.getCoor(); - if (coors.size() != 1) - return false; - optint = fir::getIntIfConstant(coors[0]); - if (!optint) - return false; - op = cordOp.getRef().getDefiningOp(); - if (auto conOp2 = mlir::dyn_cast_if_present<fir::ConvertOp>(op)) - op = conOp2.getValue().getDefiningOp(); + auto dbgExpr = mlir::LLVM::DIGlobalVariableExpressionAttr::get( + global.getContext(), gvAttr, expr); + globalToGlobalExprsMap[global].push_back(dbgExpr); - if (auto addrOfOp = mlir::dyn_cast_if_present<fir::AddrOfOp>(op)) { - mlir::SymbolRefAttr sym = addrOfOp.getSymbol(); - if (auto global = - symbolTable->lookup<fir::GlobalOp>(sym.getRootReference())) { - - unsigned line = getLineFromLoc(global.getLoc()); - llvm::StringRef commonName(sym.getRootReference()); - // FIXME: We are trying to extract the name of the common block from the - // name of the global. As part of mangling, GetCommonBlockObjectName can - // add a trailing _ in the name of that global. The demangle function - // does not seem to handle such cases. So the following hack is used to - // remove the trailing '_'. 
- if (commonName != Fortran::common::blankCommonObjectName && - commonName.back() == '_') - commonName = commonName.drop_back(); - mlir::LLVM::DICommonBlockAttr commonBlock = - getOrCreateCommonBlockAttr(commonName, fileAttr, scopeAttr, line); - mlir::LLVM::DITypeAttr diType = typeGen.convertType( - fir::unwrapRefType(declOp.getType()), fileAttr, scopeAttr, declOp); - line = getLineFromLoc(declOp.getLoc()); - auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get( - context, commonBlock, mlir::StringAttr::get(context, name), - declOp.getUniqName(), fileAttr, line, diType, - /*isLocalToUnit*/ false, /*isDefinition*/ true, /* alignInBits*/ 0); - mlir::LLVM::DIExpressionAttr expr; - if (*optint != 0) { - llvm::SmallVector<mlir::LLVM::DIExpressionElemAttr> ops; - ops.push_back(mlir::LLVM::DIExpressionElemAttr::get( - context, llvm::dwarf::DW_OP_plus_uconst, *optint)); - expr = mlir::LLVM::DIExpressionAttr::get(context, ops); - } - auto dbgExpr = mlir::LLVM::DIGlobalVariableExpressionAttr::get( - global.getContext(), gvAttr, expr); - globalToGlobalExprsMap[global].push_back(dbgExpr); - return true; - } - } - } - return false; + return true; } void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scopeAttr, fir::DebugTypeGenerator &typeGen, - mlir::SymbolTable *symbolTable) { + mlir::SymbolTable *symbolTable, + mlir::Value dummyScope) { mlir::MLIRContext *context = &getContext(); mlir::OpBuilder builder(context); auto result = fir::NameUniquer::deconstruct(declOp.getUniqName()); @@ -228,24 +247,11 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, } } - // FIXME: There may be cases where an argument is processed a bit before - // DeclareOp is generated. In that case, DeclareOp may point to an - // intermediate op and not to BlockArgument. - // Moreover, with MLIR inlining we cannot use the BlockArgument - // position to identify the original number of the dummy argument. - // If we want to keep running AddDebugInfoPass late, the dummy argument - // position in the argument list has to be expressed in FIR (e.g. as a - // constant attribute of [hl]fir.declare/fircg.ext_declare operation that has - // a dummy_scope operand). + // Get the dummy argument position from the explicit attribute. unsigned argNo = 0; - if (declOp.getDummyScope()) { - if (auto arg = llvm::dyn_cast<mlir::BlockArgument>(declOp.getMemref())) { - // Check if it is the BlockArgument of the function's entry block. - if (auto funcLikeOp = - declOp->getParentOfType<mlir::FunctionOpInterface>()) - if (arg.getOwner() == &funcLikeOp.front()) - argNo = arg.getArgNumber() + 1; - } + if (dummyScope && declOp.getDummyScope() == dummyScope) { + if (auto argNoOpt = declOp.getDummyArgNo()) + argNo = *argNoOpt; } auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()), @@ -623,6 +629,21 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, funcOp->setLoc(builder.getFusedLoc({l}, spAttr)); addTargetOpDISP(/*lineTableOnly=*/false, entities); + // Find the first dummy_scope definition. This is the one of the current + // function. The other ones may come from inlined calls. The variables inside + // those inlined calls should not be identified as arguments of the current + // function. + mlir::Value dummyScope; + funcOp.walk([&](fir::UndefOp undef) -> mlir::WalkResult { + // TODO: delay fir.dummy_scope translation to undefined until + // codegeneration. This is nicer and safer to match. 
+ if (llvm::isa<fir::DummyScopeType>(undef.getType())) { + dummyScope = undef; + return mlir::WalkResult::interrupt(); + } + return mlir::WalkResult::advance(); + }); + funcOp.walk([&](fir::cg::XDeclareOp declOp) { mlir::LLVM::DISubprogramAttr spTy = spAttr; if (auto tOp = declOp->getParentOfType<mlir::omp::TargetOp>()) { @@ -632,7 +653,7 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, spTy = sp; } } - handleDeclareOp(declOp, fileAttr, spTy, typeGen, symbolTable); + handleDeclareOp(declOp, fileAttr, spTy, typeGen, symbolTable, dummyScope); }); // commonBlockMap ensures that we don't create multiple DICommonBlockAttr of // the same name in one function. But it is ok (rather required) to create diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 0388439..619f3adc 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -9,6 +9,7 @@ add_flang_library(FIRTransforms CompilerGeneratedNames.cpp ConstantArgumentGlobalisation.cpp ControlFlowConverter.cpp + CUDA/CUFAllocationConversion.cpp CUFAddConstructor.cpp CUFDeviceGlobal.cpp CUFOpConversion.cpp diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp new file mode 100644 index 0000000..6579c23 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp @@ -0,0 +1,438 @@ +//===-- CUFAllocationConversion.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Transforms/CUDA/CUFAllocationConversion.h" +#include "flang/Optimizer/Builder/CUFCommon.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h" +#include "flang/Optimizer/Builder/Runtime/RTBuilder.h" +#include "flang/Optimizer/CodeGen/TypeConverter.h" +#include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/Support/DataLayout.h" +#include "flang/Runtime/CUDA/allocatable.h" +#include "flang/Runtime/CUDA/common.h" +#include "flang/Runtime/CUDA/descriptor.h" +#include "flang/Runtime/CUDA/memory.h" +#include "flang/Runtime/CUDA/pointer.h" +#include "flang/Runtime/allocatable.h" +#include "flang/Runtime/allocator-registry-consts.h" +#include "flang/Support/Fortran.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Matchers.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace fir { +#define GEN_PASS_DEF_CUFALLOCATIONCONVERSION +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +using namespace fir; +using namespace mlir; +using namespace Fortran::runtime; +using namespace Fortran::runtime::cuda; + +namespace { + +template <typename OpTy> +static bool isPinned(OpTy op) { + if (op.getDataAttr() && *op.getDataAttr() == cuf::DataAttribute::Pinned) + return true; + return false; +} + +static inline unsigned getMemType(cuf::DataAttribute attr) { + if (attr == cuf::DataAttribute::Device) + return 
kMemTypeDevice; + if (attr == cuf::DataAttribute::Managed) + return kMemTypeManaged; + if (attr == cuf::DataAttribute::Pinned) + return kMemTypePinned; + if (attr == cuf::DataAttribute::Unified) + return kMemTypeUnified; + llvm_unreachable("unsupported memory type"); +} + +static bool inDeviceContext(mlir::Operation *op) { + if (op->getParentOfType<cuf::KernelOp>()) + return true; + if (auto funcOp = op->getParentOfType<mlir::gpu::GPUFuncOp>()) + return true; + if (auto funcOp = op->getParentOfType<mlir::gpu::LaunchOp>()) + return true; + if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) { + if (auto cudaProcAttr = + funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>( + cuf::getProcAttrName())) { + return cudaProcAttr.getValue() != cuf::ProcAttribute::Host && + cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice; + } + } + return false; +} + +template <typename OpTy> +static mlir::LogicalResult convertOpToCall(OpTy op, + mlir::PatternRewriter &rewriter, + mlir::func::FuncOp func) { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + auto fTy = func.getFunctionType(); + + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine; + if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) + sourceLine = fir::factory::locationToLineNo( + builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6)); + else + sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); + + mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true) + : builder.createBool(loc, false); + + mlir::Value errmsg; + if (op.getErrmsg()) { + errmsg = op.getErrmsg(); + } else { + mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); + errmsg = fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); + } + llvm::SmallVector<mlir::Value> args; + if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) { + mlir::Value pinned = + op.getPinned() + ? op.getPinned() + : builder.createNullConstant( + loc, fir::ReferenceType::get( + mlir::IntegerType::get(op.getContext(), 1))); + if (op.getSource()) { + mlir::Value stream = + op.getStream() ? op.getStream() + : builder.createNullConstant(loc, fTy.getInput(2)); + args = fir::runtime::createArguments( + builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned, + hasStat, errmsg, sourceFile, sourceLine); + } else { + mlir::Value stream = + op.getStream() ? 
op.getStream() + : builder.createNullConstant(loc, fTy.getInput(1)); + args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(), + stream, pinned, hasStat, errmsg, + sourceFile, sourceLine); + } + } else { + args = + fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat, + errmsg, sourceFile, sourceLine); + } + auto callOp = fir::CallOp::create(builder, loc, func, args); + rewriter.replaceOp(op, callOp); + return mlir::success(); +} + +struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { + using OpRewritePattern::OpRewritePattern; + + CUFAllocOpConversion(mlir::MLIRContext *context, mlir::DataLayout *dl, + const fir::LLVMTypeConverter *typeConverter) + : OpRewritePattern(context), dl{dl}, typeConverter{typeConverter} {} + + mlir::LogicalResult + matchAndRewrite(cuf::AllocOp op, + mlir::PatternRewriter &rewriter) const override { + + mlir::Location loc = op.getLoc(); + + if (inDeviceContext(op.getOperation())) { + // In device context just replace the cuf.alloc operation with a fir.alloc + // the cuf.free will be removed. + auto allocaOp = + fir::AllocaOp::create(rewriter, loc, op.getInType(), + op.getUniqName() ? *op.getUniqName() : "", + op.getBindcName() ? *op.getBindcName() : "", + op.getTypeparams(), op.getShape()); + allocaOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); + rewriter.replaceOp(op, allocaOp); + return mlir::success(); + } + + auto mod = op->getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + + if (!mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType())) { + // Convert scalar and known size array allocations. + mlir::Value bytes; + fir::KindMapping kindMap{fir::getKindMapping(mod)}; + if (fir::isa_trivial(op.getInType())) { + int width = cuf::computeElementByteSize(loc, op.getInType(), kindMap); + bytes = + builder.createIntegerConstant(loc, builder.getIndexType(), width); + } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>( + op.getInType())) { + std::size_t size = 0; + if (fir::isa_derived(seqTy.getEleTy())) { + mlir::Type structTy = typeConverter->convertType(seqTy.getEleTy()); + size = dl->getTypeSizeInBits(structTy) / 8; + } else { + size = cuf::computeElementByteSize(loc, seqTy.getEleTy(), kindMap); + } + mlir::Value width = + builder.createIntegerConstant(loc, builder.getIndexType(), size); + mlir::Value nbElem; + if (fir::sequenceWithNonConstantShape(seqTy)) { + assert(!op.getShape().empty() && "expect shape with dynamic arrays"); + nbElem = builder.loadIfRef(loc, op.getShape()[0]); + for (unsigned i = 1; i < op.getShape().size(); ++i) { + nbElem = mlir::arith::MulIOp::create( + rewriter, loc, nbElem, + builder.loadIfRef(loc, op.getShape()[i])); + } + } else { + nbElem = builder.createIntegerConstant(loc, builder.getIndexType(), + seqTy.getConstantArraySize()); + } + bytes = mlir::arith::MulIOp::create(rewriter, loc, nbElem, width); + } else if (fir::isa_derived(op.getInType())) { + mlir::Type structTy = typeConverter->convertType(op.getInType()); + std::size_t structSize = dl->getTypeSizeInBits(structTy) / 8; + bytes = builder.createIntegerConstant(loc, builder.getIndexType(), + structSize); + } else if (fir::isa_char(op.getInType())) { + mlir::Type charTy = typeConverter->convertType(op.getInType()); + std::size_t charSize = dl->getTypeSizeInBits(charTy) / 8; + bytes = builder.createIntegerConstant(loc, builder.getIndexType(), + charSize); + } else { + mlir::emitError(loc, 
"unsupported type in cuf.alloc\n"); + } + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc<mkRTKey(CUFMemAlloc)>(loc, builder); + auto fTy = func.getFunctionType(); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); + mlir::Value memTy = builder.createIntegerConstant( + loc, builder.getI32Type(), getMemType(op.getDataAttr())); + llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( + builder, loc, fTy, bytes, memTy, sourceFile, sourceLine)}; + auto callOp = fir::CallOp::create(builder, loc, func, args); + callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); + auto convOp = builder.createConvert(loc, op.getResult().getType(), + callOp.getResult(0)); + rewriter.replaceOp(op, convOp); + return mlir::success(); + } + + // Convert descriptor allocations to function call. + auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType()); + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDescriptor)>(loc, builder); + auto fTy = func.getFunctionType(); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + + mlir::Type structTy = typeConverter->convertBoxTypeAsStruct(boxTy); + std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8; + mlir::Value sizeInBytes = + builder.createIntegerConstant(loc, builder.getIndexType(), boxSize); + + llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( + builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)}; + auto callOp = fir::CallOp::create(builder, loc, func, args); + callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); + auto convOp = builder.createConvert(loc, op.getResult().getType(), + callOp.getResult(0)); + rewriter.replaceOp(op, convOp); + return mlir::success(); + } + +private: + mlir::DataLayout *dl; + const fir::LLVMTypeConverter *typeConverter; +}; + +struct CUFFreeOpConversion : public mlir::OpRewritePattern<cuf::FreeOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(cuf::FreeOp op, + mlir::PatternRewriter &rewriter) const override { + if (inDeviceContext(op.getOperation())) { + rewriter.eraseOp(op); + return mlir::success(); + } + + if (!mlir::isa<fir::ReferenceType>(op.getDevptr().getType())) + return failure(); + + auto mod = op->getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + + auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getDevptr().getType()); + if (!mlir::isa<fir::BaseBoxType>(refTy.getEleTy())) { + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc<mkRTKey(CUFMemFree)>(loc, builder); + auto fTy = func.getFunctionType(); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); + mlir::Value memTy = builder.createIntegerConstant( + loc, builder.getI32Type(), getMemType(op.getDataAttr())); + llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( + builder, loc, fTy, op.getDevptr(), memTy, sourceFile, sourceLine)}; + fir::CallOp::create(builder, loc, func, args); + rewriter.eraseOp(op); + return mlir::success(); + } + + // Convert cuf.free on descriptors. 
+ mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDescriptor)>(loc, builder); + auto fTy = func.getFunctionType(); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( + builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)}; + auto callOp = fir::CallOp::create(builder, loc, func, args); + callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); + rewriter.eraseOp(op); + return mlir::success(); + } +}; + +struct CUFAllocateOpConversion + : public mlir::OpRewritePattern<cuf::AllocateOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(cuf::AllocateOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + bool isPointer = op.getPointer(); + if (op.getHasDoubleDescriptor()) { + // Allocation for module variable are done with custom runtime entry point + // so the descriptors can be synchronized. + mlir::func::FuncOp func; + if (op.getSource()) { + func = isPointer ? fir::runtime::getRuntimeFunc<mkRTKey( + CUFPointerAllocateSourceSync)>(loc, builder) + : fir::runtime::getRuntimeFunc<mkRTKey( + CUFAllocatableAllocateSourceSync)>(loc, builder); + } else { + func = + isPointer + ? fir::runtime::getRuntimeFunc<mkRTKey(CUFPointerAllocateSync)>( + loc, builder) + : fir::runtime::getRuntimeFunc<mkRTKey( + CUFAllocatableAllocateSync)>(loc, builder); + } + return convertOpToCall<cuf::AllocateOp>(op, rewriter, func); + } + + mlir::func::FuncOp func; + if (op.getSource()) { + func = + isPointer + ? fir::runtime::getRuntimeFunc<mkRTKey(CUFPointerAllocateSource)>( + loc, builder) + : fir::runtime::getRuntimeFunc<mkRTKey( + CUFAllocatableAllocateSource)>(loc, builder); + } else { + func = + isPointer + ? fir::runtime::getRuntimeFunc<mkRTKey(CUFPointerAllocate)>( + loc, builder) + : fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>( + loc, builder); + } + + return convertOpToCall<cuf::AllocateOp>(op, rewriter, func); + } +}; + +struct CUFDeallocateOpConversion + : public mlir::OpRewritePattern<cuf::DeallocateOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(cuf::DeallocateOp op, + mlir::PatternRewriter &rewriter) const override { + + auto mod = op->getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + if (op.getHasDoubleDescriptor()) { + // Deallocation for module variable are done with custom runtime entry + // point so the descriptors can be synchronized. + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableDeallocate)>( + loc, builder); + return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func); + } + + // Deallocation for local descriptor falls back on the standard runtime + // AllocatableDeallocate as the dedicated deallocator is set in the + // descriptor before the call. 
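
CUFAllocateOpConversion above selects among eight runtime entry points based on three properties of the allocation. A hedged summary of that decision as a standalone sketch (pickAllocateEntryPoint is illustrative; the entry-point names are the ones used in the pattern):

static const char *pickAllocateEntryPoint(bool isPointer, bool hasSource,
                                          bool hasDoubleDescriptor) {
  // Double-descriptor (module variable) allocations use the *Sync variants
  // so the host and device descriptors stay synchronized.
  if (hasDoubleDescriptor)
    return isPointer ? (hasSource ? "CUFPointerAllocateSourceSync"
                                  : "CUFPointerAllocateSync")
                     : (hasSource ? "CUFAllocatableAllocateSourceSync"
                                  : "CUFAllocatableAllocateSync");
  return isPointer ? (hasSource ? "CUFPointerAllocateSource"
                                : "CUFPointerAllocate")
                   : (hasSource ? "CUFAllocatableAllocateSource"
                                : "CUFAllocatableAllocate");
}
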
+ mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc<mkRTKey(AllocatableDeallocate)>(loc, + builder); + return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func); + } +}; + +class CUFAllocationConversion + : public fir::impl::CUFAllocationConversionBase<CUFAllocationConversion> { +public: + void runOnOperation() override { + auto *ctx = &getContext(); + mlir::RewritePatternSet patterns(ctx); + mlir::ConversionTarget target(*ctx); + + mlir::Operation *op = getOperation(); + mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op); + if (!module) + return signalPassFailure(); + mlir::SymbolTable symtab(module); + + std::optional<mlir::DataLayout> dl = fir::support::getOrSetMLIRDataLayout( + module, /*allowDefaultLayout=*/false); + fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false, + /*forceUnifiedTBAATree=*/false, *dl); + target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect, + mlir::gpu::GPUDialect>(); + target.addLegalOp<cuf::StreamCastOp>(); + cuf::populateCUFAllocationConversionPatterns(typeConverter, *dl, symtab, + patterns); + if (mlir::failed(mlir::applyPartialConversion(getOperation(), target, + std::move(patterns)))) { + mlir::emitError(mlir::UnknownLoc::get(ctx), + "error in CUF allocation conversion\n"); + signalPassFailure(); + } + } +}; + +} // namespace + +void cuf::populateCUFAllocationConversionPatterns( + const fir::LLVMTypeConverter &converter, mlir::DataLayout &dl, + const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { + patterns.insert<CUFAllocOpConversion>(patterns.getContext(), &dl, &converter); + patterns.insert<CUFFreeOpConversion, CUFAllocateOpConversion, + CUFDeallocateOpConversion>(patterns.getContext()); +} diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp index a644945..7bae060 100644 --- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp +++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp @@ -46,6 +46,43 @@ static bool isAssumedSize(mlir::ValueRange shape) { return false; } +static void createSharedMemoryGlobal(fir::FirOpBuilder &builder, + mlir::Location loc, llvm::StringRef prefix, + llvm::StringRef suffix, + mlir::gpu::GPUModuleOp gpuMod, + mlir::Type sharedMemType, unsigned size, + unsigned align, bool isDynamic) { + std::string sharedMemGlobalName = + isDynamic ? (prefix + llvm::Twine(cudaSharedMemSuffix)).str() + : (prefix + llvm::Twine(cudaSharedMemSuffix) + suffix).str(); + + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToEnd(gpuMod.getBody()); + + mlir::StringAttr linkage = isDynamic ? 
builder.createExternalLinkage() + : builder.createInternalLinkage(); + llvm::SmallVector<mlir::NamedAttribute> attrs; + auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), + gpuMod.getContext()); + attrs.push_back(mlir::NamedAttribute( + fir::GlobalOp::getDataAttrAttrName(globalOpName), + cuf::DataAttributeAttr::get(gpuMod.getContext(), + cuf::DataAttribute::Shared))); + + mlir::DenseElementsAttr init = {}; + mlir::Type i8Ty = builder.getI8Type(); + if (size > 0) { + auto vecTy = mlir::VectorType::get( + static_cast<fir::SequenceType::Extent>(size), i8Ty); + mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0); + init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero)); + } + auto sharedMem = + fir::GlobalOp::create(builder, loc, sharedMemGlobalName, false, false, + sharedMemType, init, linkage, attrs); + sharedMem.setAlignment(align); +} + struct CUFComputeSharedMemoryOffsetsAndSize : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase< CUFComputeSharedMemoryOffsetsAndSize> { @@ -108,18 +145,23 @@ struct CUFComputeSharedMemoryOffsetsAndSize crtDynOffset, dynSize); else crtDynOffset = dynSize; - - continue; + } else { + // Static shared memory. + auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash( + loc, sharedOp.getInType(), *dl, kindMap); + createSharedMemoryGlobal( + builder, sharedOp.getLoc(), funcOp.getName(), + *sharedOp.getBindcName(), gpuMod, + fir::SequenceType::get(size, i8Ty), size, + sharedOp.getAlignment() ? *sharedOp.getAlignment() : align, + /*isDynamic=*/false); + mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0); + sharedOp.getOffsetMutable().assign(zero); + if (!sharedOp.getAlignment()) + sharedOp.setAlignment(align); + sharedOp.setIsStatic(true); + ++nbStaticSharedVariables; } - auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash( - sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap); - ++nbStaticSharedVariables; - mlir::Value offset = builder.createIntegerConstant( - loc, i32Ty, llvm::alignTo(sharedMemSize, align)); - sharedOp.getOffsetMutable().assign(offset); - sharedMemSize = - llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align); - alignment = std::max(alignment, align); } if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0) @@ -130,35 +172,13 @@ struct CUFComputeSharedMemoryOffsetsAndSize funcOp.getLoc(), "static and dynamic shared variables in a single kernel"); - mlir::DenseElementsAttr init = {}; - if (sharedMemSize > 0) { - auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty); - mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0); - init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero)); - } + if (nbStaticSharedVariables > 0) + continue; - // Create the shared memory global where each shared variable will point - // to. auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty); - std::string sharedMemGlobalName = - (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str(); - // Dynamic shared memory needs an external linkage while static shared - // memory needs an internal linkage. - mlir::StringAttr linkage = nbDynamicSharedVariables > 0 - ? 
builder.createExternalLinkage() - : builder.createInternalLinkage(); - builder.setInsertionPointToEnd(gpuMod.getBody()); - llvm::SmallVector<mlir::NamedAttribute> attrs; - auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), - gpuMod.getContext()); - attrs.push_back(mlir::NamedAttribute( - fir::GlobalOp::getDataAttrAttrName(globalOpName), - cuf::DataAttributeAttr::get(gpuMod.getContext(), - cuf::DataAttribute::Shared))); - auto sharedMem = fir::GlobalOp::create( - builder, funcOp.getLoc(), sharedMemGlobalName, false, false, - sharedMemType, init, linkage, attrs); - sharedMem.setAlignment(alignment); + createSharedMemoryGlobal(builder, funcOp.getLoc(), funcOp.getName(), "", + gpuMod, sharedMemType, sharedMemSize, alignment, + /*isDynamic=*/true); } } }; diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index 40f180a..d5a8212 100644 --- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -249,8 +249,13 @@ struct CUFSharedMemoryOpConversion "cuf.shared_memory must have an offset for code gen"); auto gpuMod = op->getParentOfType<gpu::GPUModuleOp>(); + std::string sharedGlobalName = - (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str(); + op.getIsStatic() + ? (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix) + + *op.getBindcName()) + .str() + : (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str(); mlir::Value sharedGlobalAddr = createAddressOfOp(rewriter, loc, gpuMod, sharedGlobalName); diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 5b1b0a2..424a8fd 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -16,6 +16,8 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/DataLayout.h" +#include "flang/Optimizer/Transforms/CUDA/CUFAllocationConversion.h" +#include "flang/Optimizer/Transforms/Passes.h" #include "flang/Runtime/CUDA/allocatable.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" @@ -44,207 +46,6 @@ using namespace Fortran::runtime::cuda; namespace { -static inline unsigned getMemType(cuf::DataAttribute attr) { - if (attr == cuf::DataAttribute::Device) - return kMemTypeDevice; - if (attr == cuf::DataAttribute::Managed) - return kMemTypeManaged; - if (attr == cuf::DataAttribute::Unified) - return kMemTypeUnified; - if (attr == cuf::DataAttribute::Pinned) - return kMemTypePinned; - llvm::report_fatal_error("unsupported memory type"); -} - -template <typename OpTy> -static bool isPinned(OpTy op) { - if (op.getDataAttr() && *op.getDataAttr() == cuf::DataAttribute::Pinned) - return true; - return false; -} - -template <typename OpTy> -static bool hasDoubleDescriptors(OpTy op) { - if (auto declareOp = - mlir::dyn_cast_or_null<fir::DeclareOp>(op.getBox().getDefiningOp())) { - if (mlir::isa_and_nonnull<fir::AddrOfOp>( - declareOp.getMemref().getDefiningOp())) { - if (isPinned(declareOp)) - return false; - return true; - } - } else if (auto declareOp = mlir::dyn_cast_or_null<hlfir::DeclareOp>( - op.getBox().getDefiningOp())) { - if (mlir::isa_and_nonnull<fir::AddrOfOp>( - declareOp.getMemref().getDefiningOp())) { - if (isPinned(declareOp)) - return false; - return true; - } - } - return false; -} - -static mlir::Value createConvertOp(mlir::PatternRewriter 
&rewriter, - mlir::Location loc, mlir::Type toTy, - mlir::Value val) { - if (val.getType() != toTy) - return fir::ConvertOp::create(rewriter, loc, toTy, val); - return val; -} - -template <typename OpTy> -static mlir::LogicalResult convertOpToCall(OpTy op, - mlir::PatternRewriter &rewriter, - mlir::func::FuncOp func) { - auto mod = op->template getParentOfType<mlir::ModuleOp>(); - fir::FirOpBuilder builder(rewriter, mod); - mlir::Location loc = op.getLoc(); - auto fTy = func.getFunctionType(); - - mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); - mlir::Value sourceLine; - if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) - sourceLine = fir::factory::locationToLineNo( - builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6)); - else - sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); - - mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true) - : builder.createBool(loc, false); - - mlir::Value errmsg; - if (op.getErrmsg()) { - errmsg = op.getErrmsg(); - } else { - mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); - errmsg = fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); - } - llvm::SmallVector<mlir::Value> args; - if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) { - mlir::Value pinned = - op.getPinned() - ? op.getPinned() - : builder.createNullConstant( - loc, fir::ReferenceType::get( - mlir::IntegerType::get(op.getContext(), 1))); - if (op.getSource()) { - mlir::Value stream = - op.getStream() ? op.getStream() - : builder.createNullConstant(loc, fTy.getInput(2)); - args = fir::runtime::createArguments( - builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned, - hasStat, errmsg, sourceFile, sourceLine); - } else { - mlir::Value stream = - op.getStream() ? op.getStream() - : builder.createNullConstant(loc, fTy.getInput(1)); - args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(), - stream, pinned, hasStat, errmsg, - sourceFile, sourceLine); - } - } else { - args = - fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat, - errmsg, sourceFile, sourceLine); - } - auto callOp = fir::CallOp::create(builder, loc, func, args); - rewriter.replaceOp(op, callOp); - return mlir::success(); -} - -struct CUFAllocateOpConversion - : public mlir::OpRewritePattern<cuf::AllocateOp> { - using OpRewritePattern::OpRewritePattern; - - mlir::LogicalResult - matchAndRewrite(cuf::AllocateOp op, - mlir::PatternRewriter &rewriter) const override { - auto mod = op->getParentOfType<mlir::ModuleOp>(); - fir::FirOpBuilder builder(rewriter, mod); - mlir::Location loc = op.getLoc(); - - bool isPointer = false; - - if (auto declareOp = - mlir::dyn_cast_or_null<fir::DeclareOp>(op.getBox().getDefiningOp())) - if (declareOp.getFortranAttrs() && - bitEnumContainsAny(*declareOp.getFortranAttrs(), - fir::FortranVariableFlagsEnum::pointer)) - isPointer = true; - - if (hasDoubleDescriptors(op)) { - // Allocation for module variable are done with custom runtime entry point - // so the descriptors can be synchronized. - mlir::func::FuncOp func; - if (op.getSource()) { - func = isPointer ? fir::runtime::getRuntimeFunc<mkRTKey( - CUFPointerAllocateSourceSync)>(loc, builder) - : fir::runtime::getRuntimeFunc<mkRTKey( - CUFAllocatableAllocateSourceSync)>(loc, builder); - } else { - func = - isPointer - ? 
fir::runtime::getRuntimeFunc<mkRTKey(CUFPointerAllocateSync)>( - loc, builder) - : fir::runtime::getRuntimeFunc<mkRTKey( - CUFAllocatableAllocateSync)>(loc, builder); - } - return convertOpToCall<cuf::AllocateOp>(op, rewriter, func); - } - - mlir::func::FuncOp func; - if (op.getSource()) { - func = - isPointer - ? fir::runtime::getRuntimeFunc<mkRTKey(CUFPointerAllocateSource)>( - loc, builder) - : fir::runtime::getRuntimeFunc<mkRTKey( - CUFAllocatableAllocateSource)>(loc, builder); - } else { - func = - isPointer - ? fir::runtime::getRuntimeFunc<mkRTKey(CUFPointerAllocate)>( - loc, builder) - : fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>( - loc, builder); - } - - return convertOpToCall<cuf::AllocateOp>(op, rewriter, func); - } -}; - -struct CUFDeallocateOpConversion - : public mlir::OpRewritePattern<cuf::DeallocateOp> { - using OpRewritePattern::OpRewritePattern; - - mlir::LogicalResult - matchAndRewrite(cuf::DeallocateOp op, - mlir::PatternRewriter &rewriter) const override { - - auto mod = op->getParentOfType<mlir::ModuleOp>(); - fir::FirOpBuilder builder(rewriter, mod); - mlir::Location loc = op.getLoc(); - - if (hasDoubleDescriptors(op)) { - // Deallocation for module variable are done with custom runtime entry - // point so the descriptors can be synchronized. - mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableDeallocate)>( - loc, builder); - return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func); - } - - // Deallocation for local descriptor falls back on the standard runtime - // AllocatableDeallocate as the dedicated deallocator is set in the - // descriptor before the call. - mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc<mkRTKey(AllocatableDeallocate)>(loc, - builder); - return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func); - } -}; - static bool inDeviceContext(mlir::Operation *op) { if (op->getParentOfType<cuf::KernelOp>()) return true; @@ -263,121 +64,13 @@ static bool inDeviceContext(mlir::Operation *op) { return false; } -struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { - using OpRewritePattern::OpRewritePattern; - - CUFAllocOpConversion(mlir::MLIRContext *context, mlir::DataLayout *dl, - const fir::LLVMTypeConverter *typeConverter) - : OpRewritePattern(context), dl{dl}, typeConverter{typeConverter} {} - - mlir::LogicalResult - matchAndRewrite(cuf::AllocOp op, - mlir::PatternRewriter &rewriter) const override { - - mlir::Location loc = op.getLoc(); - - if (inDeviceContext(op.getOperation())) { - // In device context just replace the cuf.alloc operation with a fir.alloc - // the cuf.free will be removed. - auto allocaOp = - fir::AllocaOp::create(rewriter, loc, op.getInType(), - op.getUniqName() ? *op.getUniqName() : "", - op.getBindcName() ? *op.getBindcName() : "", - op.getTypeparams(), op.getShape()); - allocaOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); - rewriter.replaceOp(op, allocaOp); - return mlir::success(); - } - - auto mod = op->getParentOfType<mlir::ModuleOp>(); - fir::FirOpBuilder builder(rewriter, mod); - mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); - - if (!mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType())) { - // Convert scalar and known size array allocations. 
- mlir::Value bytes; - fir::KindMapping kindMap{fir::getKindMapping(mod)}; - if (fir::isa_trivial(op.getInType())) { - int width = cuf::computeElementByteSize(loc, op.getInType(), kindMap); - bytes = - builder.createIntegerConstant(loc, builder.getIndexType(), width); - } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>( - op.getInType())) { - std::size_t size = 0; - if (fir::isa_derived(seqTy.getEleTy())) { - mlir::Type structTy = typeConverter->convertType(seqTy.getEleTy()); - size = dl->getTypeSizeInBits(structTy) / 8; - } else { - size = cuf::computeElementByteSize(loc, seqTy.getEleTy(), kindMap); - } - mlir::Value width = - builder.createIntegerConstant(loc, builder.getIndexType(), size); - mlir::Value nbElem; - if (fir::sequenceWithNonConstantShape(seqTy)) { - assert(!op.getShape().empty() && "expect shape with dynamic arrays"); - nbElem = builder.loadIfRef(loc, op.getShape()[0]); - for (unsigned i = 1; i < op.getShape().size(); ++i) { - nbElem = mlir::arith::MulIOp::create( - rewriter, loc, nbElem, - builder.loadIfRef(loc, op.getShape()[i])); - } - } else { - nbElem = builder.createIntegerConstant(loc, builder.getIndexType(), - seqTy.getConstantArraySize()); - } - bytes = mlir::arith::MulIOp::create(rewriter, loc, nbElem, width); - } else if (fir::isa_derived(op.getInType())) { - mlir::Type structTy = typeConverter->convertType(op.getInType()); - std::size_t structSize = dl->getTypeSizeInBits(structTy) / 8; - bytes = builder.createIntegerConstant(loc, builder.getIndexType(), - structSize); - } else { - mlir::emitError(loc, "unsupported type in cuf.alloc\n"); - } - mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc<mkRTKey(CUFMemAlloc)>(loc, builder); - auto fTy = func.getFunctionType(); - mlir::Value sourceLine = - fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); - mlir::Value memTy = builder.createIntegerConstant( - loc, builder.getI32Type(), getMemType(op.getDataAttr())); - llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( - builder, loc, fTy, bytes, memTy, sourceFile, sourceLine)}; - auto callOp = fir::CallOp::create(builder, loc, func, args); - callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); - auto convOp = builder.createConvert(loc, op.getResult().getType(), - callOp.getResult(0)); - rewriter.replaceOp(op, convOp); - return mlir::success(); - } - - // Convert descriptor allocations to function call. 
- auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType()); - mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDescriptor)>(loc, builder); - auto fTy = func.getFunctionType(); - mlir::Value sourceLine = - fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); - - mlir::Type structTy = typeConverter->convertBoxTypeAsStruct(boxTy); - std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8; - mlir::Value sizeInBytes = - builder.createIntegerConstant(loc, builder.getIndexType(), boxSize); - - llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( - builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)}; - auto callOp = fir::CallOp::create(builder, loc, func, args); - callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); - auto convOp = builder.createConvert(loc, op.getResult().getType(), - callOp.getResult(0)); - rewriter.replaceOp(op, convOp); - return mlir::success(); - } - -private: - mlir::DataLayout *dl; - const fir::LLVMTypeConverter *typeConverter; -}; +static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter, + mlir::Location loc, mlir::Type toTy, + mlir::Value val) { + if (val.getType() != toTy) + return fir::ConvertOp::create(rewriter, loc, toTy, val); + return val; +} struct CUFDeviceAddressOpConversion : public mlir::OpRewritePattern<cuf::DeviceAddressOp> { @@ -455,56 +148,6 @@ private: const mlir::SymbolTable &symTab; }; -struct CUFFreeOpConversion : public mlir::OpRewritePattern<cuf::FreeOp> { - using OpRewritePattern::OpRewritePattern; - - mlir::LogicalResult - matchAndRewrite(cuf::FreeOp op, - mlir::PatternRewriter &rewriter) const override { - if (inDeviceContext(op.getOperation())) { - rewriter.eraseOp(op); - return mlir::success(); - } - - if (!mlir::isa<fir::ReferenceType>(op.getDevptr().getType())) - return failure(); - - auto mod = op->getParentOfType<mlir::ModuleOp>(); - fir::FirOpBuilder builder(rewriter, mod); - mlir::Location loc = op.getLoc(); - mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); - - auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getDevptr().getType()); - if (!mlir::isa<fir::BaseBoxType>(refTy.getEleTy())) { - mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc<mkRTKey(CUFMemFree)>(loc, builder); - auto fTy = func.getFunctionType(); - mlir::Value sourceLine = - fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); - mlir::Value memTy = builder.createIntegerConstant( - loc, builder.getI32Type(), getMemType(op.getDataAttr())); - llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( - builder, loc, fTy, op.getDevptr(), memTy, sourceFile, sourceLine)}; - fir::CallOp::create(builder, loc, func, args); - rewriter.eraseOp(op); - return mlir::success(); - } - - // Convert cuf.free on descriptors. 
- mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDescriptor)>(loc, builder); - auto fTy = func.getFunctionType(); - mlir::Value sourceLine = - fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); - llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments( - builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)}; - auto callOp = fir::CallOp::create(builder, loc, func, args); - callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); - rewriter.eraseOp(op); - return mlir::success(); - } -}; - static bool isDstGlobal(cuf::DataTransferOp op) { if (auto declareOp = op.getDst().getDefiningOp<fir::DeclareOp>()) if (declareOp.getMemref().getDefiningOp<fir::AddrOfOp>()) @@ -651,31 +294,8 @@ struct CUFDataTransferOpConversion } mlir::Type i64Ty = builder.getI64Type(); - mlir::Value nbElement; - if (op.getShape()) { - llvm::SmallVector<mlir::Value> extents; - if (auto shapeOp = - mlir::dyn_cast<fir::ShapeOp>(op.getShape().getDefiningOp())) { - extents = shapeOp.getExtents(); - } else if (auto shapeShiftOp = mlir::dyn_cast<fir::ShapeShiftOp>( - op.getShape().getDefiningOp())) { - for (auto i : llvm::enumerate(shapeShiftOp.getPairs())) - if (i.index() & 1) - extents.push_back(i.value()); - } - - nbElement = fir::ConvertOp::create(rewriter, loc, i64Ty, extents[0]); - for (unsigned i = 1; i < extents.size(); ++i) { - auto operand = - fir::ConvertOp::create(rewriter, loc, i64Ty, extents[i]); - nbElement = - mlir::arith::MulIOp::create(rewriter, loc, nbElement, operand); - } - } else { - if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(dstTy)) - nbElement = builder.createIntegerConstant( - loc, i64Ty, seqTy.getConstantArraySize()); - } + mlir::Value nbElement = + cuf::computeElementCount(rewriter, loc, op.getShape(), dstTy, i64Ty); unsigned width = 0; if (fir::isa_derived(fir::unwrapSequenceType(dstTy))) { mlir::Type structTy = @@ -914,6 +534,8 @@ struct CUFSyncDescriptorOpConversion }; class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> { + using CUFOpConversionBase::CUFOpConversionBase; + public: void runOnOperation() override { auto *ctx = &getContext(); @@ -935,6 +557,9 @@ public: target.addLegalOp<cuf::StreamCastOp>(); cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab, patterns); + if (allocationConversion) + cuf::populateCUFAllocationConversionPatterns(typeConverter, *dl, symtab, + patterns); if (mlir::failed(mlir::applyPartialConversion(getOperation(), target, std::move(patterns)))) { mlir::emitError(mlir::UnknownLoc::get(ctx), @@ -974,10 +599,7 @@ public: void cuf::populateCUFToFIRConversionPatterns( const fir::LLVMTypeConverter &converter, mlir::DataLayout &dl, const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { - patterns.insert<CUFAllocOpConversion>(patterns.getContext(), &dl, &converter); - patterns.insert<CUFAllocateOpConversion, CUFDeallocateOpConversion, - CUFFreeOpConversion, CUFSyncDescriptorOpConversion>( - patterns.getContext()); + patterns.insert<CUFSyncDescriptorOpConversion>(patterns.getContext()); patterns.insert<CUFDataTransferOpConversion>(patterns.getContext(), symtab, &dl, &converter); patterns.insert<CUFLaunchOpConversion, CUFDeviceAddressOpConversion>( diff --git a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp index 70d6ebb..d38bedc 100644 --- a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp +++ b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp @@ -18,6 +18,8 @@ namespace fir { namespace { class FIRToSCFPass : public 
fir::impl::FIRToSCFPassBase<FIRToSCFPass> { + using FIRToSCFPassBase::FIRToSCFPassBase; + public: void runOnOperation() override; }; @@ -25,11 +27,18 @@ public: struct DoLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> { using OpRewritePattern<fir::DoLoopOp>::OpRewritePattern; + DoLoopConversion(mlir::MLIRContext *context, + bool parallelUnorderedLoop = false, + mlir::PatternBenefit benefit = 1) + : OpRewritePattern<fir::DoLoopOp>(context, benefit), + parallelUnorderedLoop(parallelUnorderedLoop) {} + mlir::LogicalResult matchAndRewrite(fir::DoLoopOp doLoopOp, mlir::PatternRewriter &rewriter) const override { mlir::Location loc = doLoopOp.getLoc(); bool hasFinalValue = doLoopOp.getFinalValue().has_value(); + bool isUnordered = doLoopOp.getUnordered().has_value(); // Get loop values from the DoLoopOp mlir::Value low = doLoopOp.getLowerBound(); @@ -53,39 +62,54 @@ struct DoLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> { mlir::arith::DivSIOp::create(rewriter, loc, distance, step); auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); - auto scfForOp = - mlir::scf::ForOp::create(rewriter, loc, zero, tripCount, one, iterArgs); + // Create the scf.for or scf.parallel operation + mlir::Operation *scfLoopOp = nullptr; + if (isUnordered && parallelUnorderedLoop) { + scfLoopOp = mlir::scf::ParallelOp::create(rewriter, loc, {zero}, + {tripCount}, {one}, iterArgs); + } else { + scfLoopOp = mlir::scf::ForOp::create(rewriter, loc, zero, tripCount, one, + iterArgs); + } + + // Move the body of the fir.do_loop to the scf.for or scf.parallel auto &loopOps = doLoopOp.getBody()->getOperations(); auto resultOp = mlir::cast<fir::ResultOp>(doLoopOp.getBody()->getTerminator()); auto results = resultOp.getOperands(); - mlir::Block *loweredBody = scfForOp.getBody(); + auto scfLoopLikeOp = mlir::cast<mlir::LoopLikeOpInterface>(scfLoopOp); + mlir::Block &scfLoopBody = scfLoopLikeOp.getLoopRegions().front()->front(); - loweredBody->getOperations().splice(loweredBody->begin(), loopOps, - loopOps.begin(), - std::prev(loopOps.end())); + scfLoopBody.getOperations().splice(scfLoopBody.begin(), loopOps, + loopOps.begin(), + std::prev(loopOps.end())); - rewriter.setInsertionPointToStart(loweredBody); + rewriter.setInsertionPointToStart(&scfLoopBody); mlir::Value iv = mlir::arith::MulIOp::create( - rewriter, loc, scfForOp.getInductionVar(), step); + rewriter, loc, scfLoopLikeOp.getSingleInductionVar().value(), step); iv = mlir::arith::AddIOp::create(rewriter, loc, low, iv); if (!results.empty()) { - rewriter.setInsertionPointToEnd(loweredBody); + rewriter.setInsertionPointToEnd(&scfLoopBody); mlir::scf::YieldOp::create(rewriter, resultOp->getLoc(), results); } doLoopOp.getInductionVar().replaceAllUsesWith(iv); - rewriter.replaceAllUsesWith(doLoopOp.getRegionIterArgs(), - hasFinalValue - ? scfForOp.getRegionIterArgs().drop_front() - : scfForOp.getRegionIterArgs()); - - // Copy all the attributes from the old to new op. - scfForOp->setAttrs(doLoopOp->getAttrs()); - rewriter.replaceOp(doLoopOp, scfForOp); + rewriter.replaceAllUsesWith( + doLoopOp.getRegionIterArgs(), + hasFinalValue ? scfLoopLikeOp.getRegionIterArgs().drop_front() + : scfLoopLikeOp.getRegionIterArgs()); + + // Copy loop annotations from the fir.do_loop to scf loop op. 
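+ // Note that no other fir.do_loop attributes are carried over to the
+ // new scf loop operation.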
+ if (auto ann = doLoopOp.getLoopAnnotation()) + scfLoopOp->setAttr("loop_annotation", *ann); + + rewriter.replaceOp(doLoopOp, scfLoopOp); return mlir::success(); } + +private: + bool parallelUnorderedLoop; }; struct IterWhileConversion : public mlir::OpRewritePattern<fir::IterWhileOp> { @@ -102,6 +126,7 @@ struct IterWhileConversion : public mlir::OpRewritePattern<fir::IterWhileOp> { mlir::Value okInit = iterWhileOp.getIterateIn(); mlir::ValueRange iterArgs = iterWhileOp.getInitArgs(); + bool hasFinalValue = iterWhileOp.getFinalValue().has_value(); mlir::SmallVector<mlir::Value> initVals; initVals.push_back(lowerBound); @@ -128,10 +153,23 @@ struct IterWhileConversion : public mlir::OpRewritePattern<fir::IterWhileOp> { rewriter.setInsertionPointToStart(&beforeBlock); - mlir::Value inductionCmp = mlir::arith::CmpIOp::create( + // The comparison depends on the sign of the step value. We fully expect + // this expression to be folded by the optimizer or LLVM. This expression + // is written this way so that `step == 0` always returns `false`. + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto compl0 = mlir::arith::CmpIOp::create( + rewriter, loc, mlir::arith::CmpIPredicate::slt, zero, step); + auto compl1 = mlir::arith::CmpIOp::create( rewriter, loc, mlir::arith::CmpIPredicate::sle, ivInBefore, upperBound); - mlir::Value cond = mlir::arith::AndIOp::create(rewriter, loc, inductionCmp, - earlyExitInBefore); + auto compl2 = mlir::arith::CmpIOp::create( + rewriter, loc, mlir::arith::CmpIPredicate::slt, step, zero); + auto compl3 = mlir::arith::CmpIOp::create( + rewriter, loc, mlir::arith::CmpIPredicate::sge, ivInBefore, upperBound); + auto cmp0 = mlir::arith::AndIOp::create(rewriter, loc, compl0, compl1); + auto cmp1 = mlir::arith::AndIOp::create(rewriter, loc, compl2, compl3); + auto cmp2 = mlir::arith::OrIOp::create(rewriter, loc, cmp0, cmp1); + mlir::Value cond = + mlir::arith::AndIOp::create(rewriter, loc, earlyExitInBefore, cmp2); mlir::scf::ConditionOp::create(rewriter, loc, cond, argsInBefore); @@ -140,17 +178,22 @@ struct IterWhileConversion : public mlir::OpRewritePattern<fir::IterWhileOp> { auto *afterBody = scfWhileOp.getAfterBody(); auto resultOp = mlir::cast<fir::ResultOp>(afterBody->getTerminator()); - mlir::SmallVector<mlir::Value> results(resultOp->getOperands()); - mlir::Value ivInAfter = scfWhileOp.getAfterArguments()[0]; + mlir::SmallVector<mlir::Value> results; + mlir::Value iv = scfWhileOp.getAfterArguments()[0]; rewriter.setInsertionPointToStart(afterBody); - results[0] = mlir::arith::AddIOp::create(rewriter, loc, ivInAfter, step); + results.push_back(mlir::arith::AddIOp::create(rewriter, loc, iv, step)); + llvm::append_range(results, hasFinalValue + ? resultOp->getOperands().drop_front() + : resultOp->getOperands()); rewriter.setInsertionPointToEnd(afterBody); rewriter.replaceOpWithNewOp<mlir::scf::YieldOp>(resultOp, results); scfWhileOp->setAttrs(iterWhileOp->getAttrs()); - rewriter.replaceOp(iterWhileOp, scfWhileOp); + rewriter.replaceOp(iterWhileOp, + hasFinalValue ? 
scfWhileOp->getResults() + : scfWhileOp->getResults().drop_front()); return mlir::success(); } }; @@ -197,13 +240,14 @@ struct IfConversion : public mlir::OpRewritePattern<fir::IfOp> { }; } // namespace +void fir::populateFIRToSCFRewrites(mlir::RewritePatternSet &patterns, + bool parallelUnordered) { + patterns.add<IterWhileConversion, IfConversion>(patterns.getContext()); + patterns.add<DoLoopConversion>(patterns.getContext(), parallelUnordered); +} + void FIRToSCFPass::runOnOperation() { mlir::RewritePatternSet patterns(&getContext()); - patterns.add<DoLoopConversion, IterWhileConversion, IfConversion>( - patterns.getContext()); + fir::populateFIRToSCFRewrites(patterns, parallelUnordered); walkAndApplyPatterns(getOperation(), std::move(patterns)); } - -std::unique_ptr<mlir::Pass> fir::createFIRToSCFPass() { - return std::make_unique<FIRToSCFPass>(); -} diff --git a/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp b/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp index 206cb9b..0d3d2f6c 100644 --- a/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp @@ -67,6 +67,13 @@ genErrmsgPRIF(fir::FirOpBuilder &builder, mlir::Location loc, return {errMsg, errMsgAlloc}; } +static mlir::Value genStatPRIF(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value stat) { + if (!stat) + return fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + return stat; +} + /// Convert mif.init operation to runtime call of 'prif_init' struct MIFInitOpConversion : public mlir::OpRewritePattern<mif::InitOp> { using OpRewritePattern::OpRewritePattern; @@ -210,9 +217,7 @@ struct MIFSyncAllOpConversion : public mlir::OpRewritePattern<mif::SyncAllOp> { auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -261,9 +266,7 @@ struct MIFSyncImagesOpConversion } auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, imageSet, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -293,9 +296,7 @@ struct MIFSyncMemoryOpConversion auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -303,6 +304,37 @@ struct MIFSyncMemoryOpConversion } }; +/// Convert mif.sync_team operation to runtime call of 'prif_sync_team' +struct MIFSyncTeamOpConversion + : public mlir::OpRewritePattern<mif::SyncTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::SyncTeamOp op, + mlir::PatternRewriter &rewriter) const 
override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {boxTy, getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("sync_team"), ftype); + + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, op.getTeam(), stat, errmsgArg, errmsgAllocArg); + rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); + return mlir::success(); + } +}; + /// Generate call to collective subroutines except co_reduce /// A must be lowered as a box static fir::CallOp genCollectiveSubroutine(fir::FirOpBuilder &builder, @@ -432,6 +464,208 @@ struct MIFCoSumOpConversion : public mlir::OpRewritePattern<mif::CoSumOp> { } }; +/// Convert mif.form_team operation to runtime call of 'prif_form_team' +struct MIFFormTeamOpConversion + : public mlir::OpRewritePattern<mif::FormTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::FormTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ + {builder.getRefType(builder.getI64Type()), boxTy, + builder.getRefType(builder.getI32Type()), getPRIFStatType(builder), + errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("form_team"), ftype); + + mlir::Type i64Ty = builder.getI64Type(); + mlir::Value teamNumber = builder.createTemporary(loc, i64Ty); + mlir::Value t = + (op.getTeamNumber().getType() == i64Ty) + ? op.getTeamNumber() + : fir::ConvertOp::create(builder, loc, i64Ty, op.getTeamNumber()); + fir::StoreOp::create(builder, loc, t, teamNumber); + + mlir::Type i32Ty = builder.getI32Type(); + mlir::Value newIndex; + if (op.getNewIndex()) { + newIndex = builder.createTemporary(loc, i32Ty); + mlir::Value ni = + (op.getNewIndex().getType() == i32Ty) + ? 
op.getNewIndex() + : fir::ConvertOp::create(builder, loc, i32Ty, op.getNewIndex()); + fir::StoreOp::create(builder, loc, ni, newIndex); + } else + newIndex = fir::AbsentOp::create(builder, loc, builder.getRefType(i32Ty)); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, teamNumber, op.getTeamVar(), newIndex, stat, + errmsgArg, errmsgAllocArg); + fir::CallOp callOp = fir::CallOp::create(builder, loc, funcOp, args); + rewriter.replaceOp(op, callOp); + return mlir::success(); + } +}; + +/// Convert mif.change_team operation to runtime call of 'prif_change_team' +struct MIFChangeTeamOpConversion + : public mlir::OpRewritePattern<mif::ChangeTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::ChangeTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + builder.setInsertionPoint(op); + + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {boxTy, getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("change_team"), ftype); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, op.getTeam(), stat, errmsgArg, errmsgAllocArg); + fir::CallOp::create(builder, loc, funcOp, args); + + mlir::Operation *changeOp = op.getOperation(); + auto &bodyRegion = op.getRegion(); + mlir::Block &bodyBlock = bodyRegion.front(); + + rewriter.inlineBlockBefore(&bodyBlock, changeOp); + rewriter.eraseOp(op); + return mlir::success(); + } +}; + +/// Convert mif.end_team operation to runtime call of 'prif_end_team' +struct MIFEndTeamOpConversion : public mlir::OpRewritePattern<mif::EndTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::EndTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("end_team"), ftype); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); + fir::CallOp callOp = fir::CallOp::create(builder, loc, funcOp, args); + rewriter.replaceOp(op, callOp); + return mlir::success(); + } +}; + +/// Convert mif.get_team operation to runtime call of 'prif_get_team' +struct MIFGetTeamOpConversion : public mlir::OpRewritePattern<mif::GetTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + 
matchAndRewrite(mif::GetTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::Type lvlTy = builder.getRefType(builder.getI32Type()); + mlir::FunctionType ftype = + mlir::FunctionType::get(builder.getContext(), + /*inputs*/ {lvlTy, boxTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("get_team"), ftype); + + mlir::Value level = op.getLevel(); + if (!level) + level = fir::AbsentOp::create(builder, loc, lvlTy); + else { + mlir::Value cst = op.getLevel(); + mlir::Type i32Ty = builder.getI32Type(); + level = builder.createTemporary(loc, i32Ty); + if (cst.getType() != i32Ty) + cst = builder.createConvert(loc, i32Ty, cst); + fir::StoreOp::create(builder, loc, cst, level); + } + mlir::Type resultType = op.getResult().getType(); + mlir::Type baseTy = fir::unwrapRefType(resultType); + mlir::Value team = builder.createTemporary(loc, baseTy); + fir::EmboxOp box = fir::EmboxOp::create(builder, loc, resultType, team); + + llvm::SmallVector<mlir::Value> args = + fir::runtime::createArguments(builder, loc, ftype, level, box); + fir::CallOp::create(builder, loc, funcOp, args); + + rewriter.replaceOp(op, box); + return mlir::success(); + } +}; + +/// Convert mif.team_number operation to runtime call of 'prif_team_number' +struct MIFTeamNumberOpConversion + : public mlir::OpRewritePattern<mif::TeamNumberOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::TeamNumberOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type i64Ty = builder.getI64Type(); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = + mlir::FunctionType::get(builder.getContext(), + /*inputs*/ {boxTy, builder.getRefType(i64Ty)}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("team_number"), ftype); + + mlir::Value team = op.getTeam(); + if (!team) + team = fir::AbsentOp::create(builder, loc, boxTy); + + mlir::Value result = builder.createTemporary(loc, i64Ty); + llvm::SmallVector<mlir::Value> args = + fir::runtime::createArguments(builder, loc, ftype, team, result); + fir::CallOp::create(builder, loc, funcOp, args); + fir::LoadOp load = fir::LoadOp::create(builder, loc, result); + rewriter.replaceOp(op, load); + return mlir::success(); + } +}; + class MIFOpConversion : public fir::impl::MIFOpConversionBase<MIFOpConversion> { public: void runOnOperation() override { @@ -458,7 +692,10 @@ void mif::populateMIFOpConversionPatterns(mlir::RewritePatternSet &patterns) { patterns.insert<MIFInitOpConversion, MIFThisImageOpConversion, MIFNumImagesOpConversion, MIFSyncAllOpConversion, MIFSyncImagesOpConversion, MIFSyncMemoryOpConversion, - MIFCoBroadcastOpConversion, MIFCoMaxOpConversion, - MIFCoMinOpConversion, MIFCoSumOpConversion>( + MIFSyncTeamOpConversion, MIFCoBroadcastOpConversion, + MIFCoMaxOpConversion, MIFCoMinOpConversion, + MIFCoSumOpConversion, MIFFormTeamOpConversion, + MIFChangeTeamOpConversion, MIFEndTeamOpConversion, + MIFGetTeamOpConversion, MIFTeamNumberOpConversion>( patterns.getContext()); } diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp 
b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
index 8c0acc5..c9d52c4 100644
--- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
@@ -247,7 +247,8 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
     rewriter.replaceOpWithNewOp<fir::CallOp>(
         dispatch, resTypes, nullptr, args, dispatch.getArgAttrsAttr(),
         dispatch.getResAttrsAttr(), dispatch.getProcedureAttrsAttr(),
-        /*inline_attr*/ fir::FortranInlineEnumAttr{});
+        /*inline_attr*/ fir::FortranInlineEnumAttr{},
+        /*accessGroups*/ mlir::ArrayAttr{});
     return mlir::success();
   }
diff --git a/flang/lib/Optimizer/Transforms/SetRuntimeCallAttributes.cpp b/flang/lib/Optimizer/Transforms/SetRuntimeCallAttributes.cpp
index 378037e..4ba2ea5 100644
--- a/flang/lib/Optimizer/Transforms/SetRuntimeCallAttributes.cpp
+++ b/flang/lib/Optimizer/Transforms/SetRuntimeCallAttributes.cpp
@@ -85,7 +85,10 @@ static mlir::LLVM::MemoryEffectsAttr getGenericMemoryAttr(fir::CallOp callOp) {
         callOp->getContext(),
         {/*other=*/mlir::LLVM::ModRefInfo::NoModRef,
          /*argMem=*/mlir::LLVM::ModRefInfo::ModRef,
-         /*inaccessibleMem=*/mlir::LLVM::ModRefInfo::ModRef});
+         /*inaccessibleMem=*/mlir::LLVM::ModRefInfo::ModRef,
+         /*errnoMem=*/mlir::LLVM::ModRefInfo::NoModRef,
+         /*targetMem0=*/mlir::LLVM::ModRefInfo::NoModRef,
+         /*targetMem1=*/mlir::LLVM::ModRefInfo::NoModRef});
   return {};
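A reading aid for the IterWhileConversion hunk above: the rewritten continuation test pairs the sign of the step with the matching bound comparison, so a zero step yields false and the loop never runs. A minimal standalone C++ sketch of that predicate (keepIterating and its scalar types are illustrative names, not part of the patch):

#include <cstdio>

// Mirrors: ok && ((0 < step && iv <= ub) || (step < 0 && iv >= ub))
static bool keepIterating(long iv, long ub, long step, bool ok) {
  bool countingUp = (0 < step) && (iv <= ub);   // positive step, iv below bound
  bool countingDown = (step < 0) && (iv >= ub); // negative step, iv above bound
  return ok && (countingUp || countingDown);    // step == 0: both clauses false
}

int main() {
  std::printf("%d\n", keepIterating(1, 10, 1, true));  // 1: iterate upward
  std::printf("%d\n", keepIterating(11, 10, 1, true)); // 0: past upper bound
  std::printf("%d\n", keepIterating(5, 1, -1, true));  // 1: iterate downward
  std::printf("%d\n", keepIterating(5, 10, 0, true));  // 0: zero step never runs
  return 0;
}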
