88 files changed, 2912 insertions, 567 deletions
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 4a831fd..ab9dde8 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -47,10 +47,6 @@ static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
   return kDefaultAllocator;
 }
 
-void initializeDeviceComponentAllocator(
-    Fortran::lower::AbstractConverter &converter,
-    const Fortran::semantics::Symbol &sym, const fir::MutableBoxValue &box);
-
 mlir::Type gatherDeviceComponentCoordinatesAndType(
     fir::FirOpBuilder &builder, mlir::Location loc,
     const Fortran::semantics::Symbol &sym, fir::RecordType recTy,
diff --git a/flang/include/flang/Lower/HlfirIntrinsics.h b/flang/include/flang/Lower/HlfirIntrinsics.h
index f01f1c7d..930bbeb 100644
--- a/flang/include/flang/Lower/HlfirIntrinsics.h
+++ b/flang/include/flang/Lower/HlfirIntrinsics.h
@@ -58,6 +58,14 @@ struct PreparedActualArgument {
   /// call, the current element value will be returned.
   hlfir::Entity getActual(mlir::Location loc, fir::FirOpBuilder &builder) const;
 
+  mlir::Type getFortranElementType() {
+    if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual))
+      return hlfir::getFortranElementType(actualEntity->getType());
+    mlir::Value entity =
+        std::get<hlfir::ElementalAddrOp>(actual).getElementEntity();
+    return hlfir::getFortranElementType(entity.getType());
+  }
+
   void derefPointersAndAllocatables(mlir::Location loc,
                                     fir::FirOpBuilder &builder) {
     if (auto *actualEntity = std::get_if<hlfir::Entity>(&actual))
diff --git a/flang/include/flang/Lower/OpenMP.h b/flang/include/flang/Lower/OpenMP.h
index 581c93f..df01a7b 100644
--- a/flang/include/flang/Lower/OpenMP.h
+++ b/flang/include/flang/Lower/OpenMP.h
@@ -80,7 +80,6 @@ void genOpenMPDeclarativeConstruct(AbstractConverter &,
 void genOpenMPSymbolProperties(AbstractConverter &converter,
                                const pft::Variable &var);
 
-int64_t getCollapseValue(const Fortran::parser::OmpClauseList &clauseList);
 void genThreadprivateOp(AbstractConverter &, const pft::Variable &);
 void genDeclareTargetIntGlobal(AbstractConverter &, const pft::Variable &);
 bool isOpenMPTargetConstruct(const parser::OpenMPConstruct &);
diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h
index 1ab594f..6388468 100644
--- a/flang/include/flang/Lower/OpenMP/Clauses.h
+++ b/flang/include/flang/Lower/OpenMP/Clauses.h
@@ -229,6 +229,8 @@ using Firstprivate = tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>;
 using From = tomp::clause::FromT<TypeTy, IdTy, ExprTy>;
 using Full = tomp::clause::FullT<TypeTy, IdTy, ExprTy>;
 using Grainsize = tomp::clause::GrainsizeT<TypeTy, IdTy, ExprTy>;
+using GraphId = tomp::clause::GraphIdT<TypeTy, IdTy, ExprTy>;
+using GraphReset = tomp::clause::GraphResetT<TypeTy, IdTy, ExprTy>;
 using HasDeviceAddr = tomp::clause::HasDeviceAddrT<TypeTy, IdTy, ExprTy>;
 using Hint = tomp::clause::HintT<TypeTy, IdTy, ExprTy>;
 using Holds = tomp::clause::HoldsT<TypeTy, IdTy, ExprTy>;
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index cd73798d..3c020ab 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -246,6 +246,10 @@ struct IntrinsicLibrary {
   template <mlir::arith::CmpIPredicate pred>
   fir::ExtendedValue genCPtrCompare(mlir::Type,
                                     llvm::ArrayRef<fir::ExtendedValue>);
+  void genCoBroadcast(llvm::ArrayRef<fir::ExtendedValue>);
+  void genCoMax(llvm::ArrayRef<fir::ExtendedValue>);
+  void genCoMin(llvm::ArrayRef<fir::ExtendedValue>);
+  void genCoSum(llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genCosd(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genCospi(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genDateAndTime(llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
index 43dca65..bdeb757 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
@@ -31,10 +31,6 @@ void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
 void genDescriptorCheckSection(fir::FirOpBuilder &builder, mlir::Location loc,
                                mlir::Value desc);
 
-/// Generate runtime call to set the allocator index in the descriptor.
-void genSetAllocatorIndex(fir::FirOpBuilder &builder, mlir::Location loc,
-                          mlir::Value desc, mlir::Value index);
-
 } // namespace fir::runtime::cuda
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Character.h b/flang/include/flang/Optimizer/Builder/Runtime/Character.h
index d1c521d..261ac34 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Character.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Character.h
@@ -66,6 +66,14 @@ mlir::Value genIndex(fir::FirOpBuilder &builder, mlir::Location loc, int kind,
                      mlir::Value back);
 
 /// Generate call to INDEX runtime.
+/// This calls the simple runtime entry points based on the KIND of the string.
+/// A version of interface taking a `boxchar` for string and substring.
+/// Uses no-descriptors flow.
+mlir::Value genIndex(fir::FirOpBuilder &builder, mlir::Location loc,
+                     const fir::ExtendedValue &str,
+                     const fir::ExtendedValue &substr, mlir::Value back);
+
+/// Generate call to INDEX runtime.
 /// This calls the descriptor based runtime call implementation for the index
 /// intrinsic.
 void genIndexDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h b/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h
index 23bb378..10ed503 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h
@@ -34,6 +34,11 @@ namespace fir::runtime {
     return fir::NameUniquer::doProcedure({"prif"}, {}, oss.str());             \
   }()
 
+#define PRIF_STAT_TYPE builder.getRefType(builder.getI32Type())
+#define PRIF_ERRMSG_TYPE                                                       \
+  fir::BoxType::get(fir::CharacterType::get(builder.getContext(), 1,           \
+                                            fir::CharacterType::unknownLen()))
+
 /// Generate Call to runtime prif_init
 mlir::Value genInitCoarray(fir::FirOpBuilder &builder, mlir::Location loc);
 
@@ -49,5 +54,22 @@ mlir::Value getNumImagesWithTeam(fir::FirOpBuilder &builder, mlir::Location loc,
 mlir::Value getThisImage(fir::FirOpBuilder &builder, mlir::Location loc,
                          mlir::Value team = {});
 
+/// Generate call to runtime subroutine prif_co_broadcast
+void genCoBroadcast(fir::FirOpBuilder &builder, mlir::Location loc,
+                    mlir::Value A, mlir::Value sourceImage, mlir::Value stat,
+                    mlir::Value errmsg);
+
+/// Generate call to runtime subroutine prif_co_max and prif_co_max_character
+void genCoMax(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value A,
+              mlir::Value resultImage, mlir::Value stat, mlir::Value errmsg);
+
+/// Generate call to runtime subroutine prif_co_min or prif_co_min_character
+void genCoMin(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value A,
+              mlir::Value resultImage, mlir::Value stat, mlir::Value errmsg);
+
+/// Generate call to runtime subroutine prif_co_sum
+void genCoSum(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value A,
+              mlir::Value resultImage, mlir::Value stat, mlir::Value errmsg);
+
 } // namespace fir::runtime
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_COARRAY_H
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 23ab8826..e3873823 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -388,25 +388,4 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
   let hasVerifier = 1;
 }
 
-def cuf_SetAllocatorIndexOp : cuf_Op<"set_allocator_idx", []> {
-  let summary = "Set the allocator index in a descriptor";
-
-  let description = [{
-    Allocator index in the Fortran descriptor is used to retrived the correct
-    CUDA allocator to allocate the memory on the device.
-    In many cases the allocator index is set when the descriptor is created. For
-    device components, the descriptor is part of the derived-type itself and
-    needs to be set after the derived-type is allocated in managed memory.
-  }];
-
-  let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
-      cuf_DataAttributeAttr:$data_attr);
-
-  let assemblyFormat = [{
-    $box `:` qualified(type($box)) attr-dict
-  }];
-
-  let hasVerifier = 1;
-}
-
 #endif // FORTRAN_DIALECT_CUF_CUF_OPS
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index 9a22b2d..9051258 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -394,6 +394,27 @@ def hlfir_CharTrimOp
   let builders = [OpBuilder<(ins "mlir::Value":$chr)>];
 }
 
+def hlfir_IndexOp
+    : hlfir_Op<"index", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
+  let summary = "index transformational intrinsic";
+  let description = [{
+    Search for a substring position within a string, optionally backward
+    if back is set to true.
+  }];
+
+  let arguments = (ins AnyScalarCharacterEntity:$substr,
+      AnyScalarCharacterEntity:$str,
+      Optional<Type<AnyLogicalLike.predicate>>:$back);
+
+  let results = (outs AnyIntegerType);
+
+  let assemblyFormat = [{
+      $substr `in` $str  (`back` $back^)? attr-dict `:` functional-type(operands, results)
+  }];
+
+  let hasVerifier = 1;
+}
+
 def hlfir_AllOp : hlfir_Op<"all", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
   let summary = "ALL transformational intrinsic";
   let description = [{
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 27be500..d2ab7cb 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -583,6 +583,8 @@ public:
   NODE(OmpFromClause, Modifier)
   NODE(parser, OmpGrainsizeClause)
   NODE(OmpGrainsizeClause, Modifier)
+  NODE(parser, OmpGraphIdClause)
+  NODE(parser, OmpGraphResetClause)
   NODE(parser, OmpHintClause)
   NODE(parser, OmpHoldsClause)
   NODE(parser, OmpIfClause)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 61fdcfe..622b5f9 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4430,6 +4430,22 @@ struct OmpGrainsizeClause {
   std::tuple<MODIFIERS(), ScalarIntExpr> t;
 };
 
+// Ref: [6.0:438]
+//
+// graph_id-clause ->
+//    GRAPH_ID(graph-id-value)                      // since 6.0
+struct OmpGraphIdClause {
+  WRAPPER_CLASS_BOILERPLATE(OmpGraphIdClause, common::Indirection<Expr>);
+};
+
+// Ref: [6.0:438-439]
+//
+// graph_reset-clause ->
+//    GRAPH_RESET[(graph-reset-expression)]         // since 6.0
+struct OmpGraphResetClause {
+  WRAPPER_CLASS_BOILERPLATE(OmpGraphResetClause, common::Indirection<Expr>);
+};
+
 // Ref: [5.0:234-242], [5.1:266-275], [5.2:299], [6.0:472-473]
 struct OmpHintClause {
   WRAPPER_CLASS_BOILERPLATE(OmpHintClause, ScalarIntConstantExpr);
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index 7555f27..06e4a464 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -41,10 +41,6 @@ void RTDECL(CUFSyncGlobalDescriptor)(
 void RTDECL(CUFDescriptorCheckSection)(
     const Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
 
-/// Set the allocator index with the provided value.
-void RTDECL(CUFSetAllocatorIndex)(Descriptor *, int index,
-    const char *sourceFile = nullptr, int sourceLine = 0);
-
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index cb1def3..db73a85 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -225,12 +225,18 @@ inline bool HasCUDAAttr(const Symbol &sym) {
 
 bool HasCUDAComponent(const Symbol &sym);
 
+inline bool IsCUDADevice(const Symbol &sym) {
+  if (const auto *details{sym.GetUltimate().detailsIf<ObjectEntityDetails>()}) {
+    return details->cudaDataAttr() &&
+        *details->cudaDataAttr() == common::CUDADataAttr::Device;
+  }
+  return false;
+}
+
 inline bool IsCUDAShared(const Symbol &sym) {
   if (const auto *details{sym.GetUltimate().detailsIf<ObjectEntityDetails>()}) {
-    if (details->cudaDataAttr() &&
-        *details->cudaDataAttr() == common::CUDADataAttr::Shared) {
-      return true;
-    }
+    return details->cudaDataAttr() &&
+        *details->cudaDataAttr() == common::CUDADataAttr::Shared;
   }
   return false;
 }
diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp
index 394a033..8931cbe 100644
--- a/flang/lib/Evaluate/check-expression.cpp
+++ b/flang/lib/Evaluate/check-expression.cpp
@@ -1026,18 +1026,40 @@ public:
     if (x.base().Rank() == 0) {
       return (*this)(x.GetLastSymbol());
     } else {
-      if (Result baseIsContiguous{(*this)(x.base())}) {
+      const DataRef &base{x.base()};
+      if (Result baseIsContiguous{(*this)(base)}) {
         if (!*baseIsContiguous) {
           return false;
+        } else {
+          bool sizeKnown{false};
+          if (auto constShape{GetConstantExtents(context_, x)}) {
+            sizeKnown = true;
+            if (GetSize(*constShape) <= 1) {
+              return true; // empty or singleton
+            }
+          }
+          const Symbol &last{base.GetLastSymbol()};
+          if (auto type{DynamicType::From(last)}) {
+            CHECK(type->category() == TypeCategory::Derived);
+            if (!type->IsPolymorphic()) {
+              const auto &derived{type->GetDerivedTypeSpec()};
+              if (const auto *scope{derived.scope()}) {
+                auto iter{scope->begin()};
+                if (++iter == scope->end()) {
+                  return true; // type has but one component
+                } else if (sizeKnown) {
+                  return false; // multiple components & array size is known > 1
+                }
+              }
+            }
+          }
         }
-        // TODO: should be true if base is contiguous and this is only
-        // component, or when the base has at most one element
       }
       return std::nullopt;
     }
   }
   Result operator()(const ComplexPart &x) const {
-    // TODO: should be true when base is empty array, too
+    // TODO: should be true when base is empty array or singleton, too
     return x.complex().Rank() == 0;
   }
   Result operator()(const Substring &x) const {
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 444b5b6..53239cb 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -450,9 +450,6 @@ private:
     if (alloc.getSymbol().test(Fortran::semantics::Symbol::Flag::AccDeclare))
       Fortran::lower::attachDeclarePostAllocAction(converter, builder,
                                                    alloc.getSymbol());
-    if (Fortran::semantics::HasCUDAComponent(alloc.getSymbol()))
-      Fortran::lower::initializeDeviceComponentAllocator(
-          converter, alloc.getSymbol(), box);
   }
 
   void setPinnedToFalse() {
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index e91fa2d..6125ea9 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -72,6 +72,7 @@
 #include "mlir/Parser/Parser.h"
 #include "mlir/Support/StateStack.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/CommandLine.h"
@@ -2198,6 +2199,11 @@ private:
     // Loops with induction variables inside OpenACC compute constructs
     // need special handling to ensure that the IVs are privatized.
     if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) {
+      // Open up a new scope for the loop variables.
+      localSymbols.pushScope();
+      auto scopeGuard =
+          llvm::make_scope_exit([&]() { localSymbols.popScope(); });
+
       mlir::Operation *loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct(
           *this, bridge.getSemanticsContext(), localSymbols, doConstruct, eval);
       bool success = loopOp != nullptr;
@@ -2214,6 +2220,8 @@ private:
         for (auto end = --eval.getNestedEvaluations().end(); iter != end;
              ++iter)
           genFIR(*iter, unstructuredContext);
+
+        builder->setInsertionPointAfter(loopOp);
         return;
       }
       // Fall back to normal loop handling.
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 1293d2c..bb4bdee 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -17,95 +17,6 @@
 
 #define DEBUG_TYPE "flang-lower-cuda"
 
-void Fortran::lower::initializeDeviceComponentAllocator(
-    Fortran::lower::AbstractConverter &converter,
-    const Fortran::semantics::Symbol &sym, const fir::MutableBoxValue &box) {
-  if (const auto *details{
-          sym.GetUltimate()
-              .detailsIf<Fortran::semantics::ObjectEntityDetails>()}) {
-    const Fortran::semantics::DeclTypeSpec *type{details->type()};
-    const Fortran::semantics::DerivedTypeSpec *derived{type ? type->AsDerived()
-                                                            : nullptr};
-    if (derived) {
-      if (!FindCUDADeviceAllocatableUltimateComponent(*derived))
-        return; // No device components.
-
-      fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-      mlir::Location loc = converter.getCurrentLocation();
-
-      mlir::Type baseTy = fir::unwrapRefType(box.getAddr().getType());
-
-      // Only pointer and allocatable needs post allocation initialization
-      // of components descriptors.
-      if (!fir::isAllocatableType(baseTy) && !fir::isPointerType(baseTy))
-        return;
-
-      // Extract the derived type.
-      mlir::Type ty = fir::getDerivedType(baseTy);
-      auto recTy = mlir::dyn_cast<fir::RecordType>(ty);
-      assert(recTy && "expected fir::RecordType");
-
-      if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(baseTy))
-        baseTy = boxTy.getEleTy();
-      baseTy = fir::unwrapRefType(baseTy);
-
-      Fortran::semantics::UltimateComponentIterator components{*derived};
-      mlir::Value loadedBox = fir::LoadOp::create(builder, loc, box.getAddr());
-      mlir::Value addr;
-      if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(baseTy)) {
-        mlir::Type idxTy = builder.getIndexType();
-        mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
-        mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0);
-        llvm::SmallVector<fir::DoLoopOp> loops;
-        llvm::SmallVector<mlir::Value> indices;
-        llvm::SmallVector<mlir::Value> extents;
-        for (unsigned i = 0; i < seqTy.getDimension(); ++i) {
-          mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i);
-          auto dimInfo = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy,
-                                                idxTy, loadedBox, dim);
-          mlir::Value lbub = mlir::arith::AddIOp::create(
-              builder, loc, dimInfo.getResult(0), dimInfo.getResult(1));
-          mlir::Value ext =
-              mlir::arith::SubIOp::create(builder, loc, lbub, one);
-          mlir::Value cmp = mlir::arith::CmpIOp::create(
-              builder, loc, mlir::arith::CmpIPredicate::sgt, ext, zero);
-          ext = mlir::arith::SelectOp::create(builder, loc, cmp, ext, zero);
-          extents.push_back(ext);
-
-          auto loop = fir::DoLoopOp::create(
-              builder, loc, dimInfo.getResult(0), dimInfo.getResult(1),
-              dimInfo.getResult(2), /*isUnordered=*/true,
-              /*finalCount=*/false, mlir::ValueRange{});
-          loops.push_back(loop);
-          indices.push_back(loop.getInductionVar());
-          builder.setInsertionPointToStart(loop.getBody());
-        }
-        mlir::Value boxAddr = fir::BoxAddrOp::create(builder, loc, loadedBox);
-        auto shape = fir::ShapeOp::create(builder, loc, extents);
-        addr = fir::ArrayCoorOp::create(
-            builder, loc, fir::ReferenceType::get(recTy), boxAddr, shape,
-            /*slice=*/mlir::Value{}, indices, /*typeparms=*/mlir::ValueRange{});
-      } else {
-        addr = fir::BoxAddrOp::create(builder, loc, loadedBox);
-      }
-      for (const auto &compSym : components) {
-        if (Fortran::semantics::IsDeviceAllocatable(compSym)) {
-          llvm::SmallVector<mlir::Value> coord;
-          mlir::Type fieldTy = gatherDeviceComponentCoordinatesAndType(
-              builder, loc, compSym, recTy, coord);
-          assert(coord.size() == 1 && "expect one coordinate");
-          mlir::Value comp = fir::CoordinateOp::create(
-              builder, loc, builder.getRefType(fieldTy), addr, coord[0]);
-          cuf::DataAttributeAttr dataAttr =
-              Fortran::lower::translateSymbolCUFDataAttribute(
-                  builder.getContext(), compSym);
-          cuf::SetAllocatorIndexOp::create(builder, loc, comp, dataAttr);
-        }
-      }
-    }
-  }
-}
-
 mlir::Type Fortran::lower::gatherDeviceComponentCoordinatesAndType(
     fir::FirOpBuilder &builder, mlir::Location loc,
     const Fortran::semantics::Symbol &sym, fir::RecordType recTy,
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index cf8458f..e82d4ea 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -515,10 +515,19 @@ Fortran::lower::genCallOpAndResult(
     // arguments of any type and vice versa.
     mlir::Value cast;
     auto *context = builder.getContext();
-    if (mlir::isa<fir::BoxProcType>(snd) &&
-        mlir::isa<mlir::FunctionType>(fst.getType())) {
-      auto funcTy = mlir::FunctionType::get(context, {}, {});
-      auto boxProcTy = builder.getBoxProcType(funcTy);
+
+    // Special handling for %VAL arguments: internal procedures expect
+    // reference parameters. When %VAL is used, the argument should be
+    // passed by value. Pass the originally loaded value.
+    if (fir::isa_ref_type(snd) && !fir::isa_ref_type(fst.getType()) &&
+        fir::dyn_cast_ptrEleTy(snd) == fst.getType()) {
+      auto loadOp = mlir::cast<fir::LoadOp>(fst.getDefiningOp());
+      mlir::Value originalStorage = loadOp.getMemref();
+      cast = originalStorage;
+    } else if (mlir::isa<fir::BoxProcType>(snd) &&
+               mlir::isa<mlir::FunctionType>(fst.getType())) {
+      mlir::FunctionType funcTy = mlir::FunctionType::get(context, {}, {});
+      fir::BoxProcType boxProcTy = builder.getBoxProcType(funcTy);
       if (mlir::Value host = argumentHostAssocs(converter, fst)) {
         cast = fir::EmboxProcOp::create(builder, loc, boxProcTy,
                                         llvm::ArrayRef<mlir::Value>{fst, host});
@@ -630,9 +639,18 @@ Fortran::lower::genCallOpAndResult(
               caller.getCallDescription().chevrons()[2], stmtCtx)));
 
     mlir::Value stream; // stream is optional.
-    if (caller.getCallDescription().chevrons().size() > 3)
+    if (caller.getCallDescription().chevrons().size() > 3) {
       stream = fir::getBase(converter.genExprAddr(
           caller.getCallDescription().chevrons()[3], stmtCtx));
+      if (!fir::unwrapRefType(stream.getType()).isInteger(64)) {
+        auto i64Ty = mlir::IntegerType::get(builder.getContext(), 64);
+        mlir::Value newStream = builder.createTemporary(loc, i64Ty);
+        mlir::Value load = fir::LoadOp::create(builder, loc, stream);
+        mlir::Value conv = fir::ConvertOp::create(builder, loc, i64Ty, load);
+        fir::StoreOp::create(builder, loc, conv, newStream);
+        stream = newStream;
+      }
+    }
 
     cuf::KernelLaunchOp::create(builder, loc, funcType.getResults(),
                                 funcSymbolAttr, grid_x, grid_y, grid_z, block_x,
@@ -1658,7 +1676,19 @@ void prepareUserCallArguments(
           (*cleanup)();
         break;
       }
-      caller.placeInput(arg, builder.createConvert(loc, argTy, value));
+      // For %VAL arguments, we should pass the value directly without
+      // conversion to reference types. If argTy is different from value type,
+      // it might be due to signature mismatch with internal procedures.
+      if (argTy == value.getType())
+        caller.placeInput(arg, value);
+      else if (fir::isa_ref_type(argTy) &&
+               fir::dyn_cast_ptrEleTy(argTy) == value.getType()) {
+        auto loadOp = mlir::cast<fir::LoadOp>(value.getDefiningOp());
+        mlir::Value originalStorage = loadOp.getMemref();
+        caller.placeInput(arg, originalStorage);
+      } else
+        caller.placeInput(arg, builder.createConvert(loc, argTy, value));
+
     } break;
     case PassBy::BaseAddressValueAttribute:
     case PassBy::CharBoxValueAttribute:
@@ -2193,10 +2223,15 @@ static std::optional<hlfir::EntityWithAttributes> genHLFIRIntrinsicRefCore(
     const std::string intrinsicName = callContext.getProcedureName();
     const fir::IntrinsicArgumentLoweringRules *argLowering =
         intrinsicEntry.getArgumentLoweringRules();
+    mlir::Type resultType =
+        callContext.isElementalProcWithArrayArgs()
+            ? hlfir::getFortranElementType(*callContext.resultType)
+            : *callContext.resultType;
+
     std::optional<hlfir::EntityWithAttributes> res =
         Fortran::lower::lowerHlfirIntrinsic(builder, loc, intrinsicName,
                                             loweredActuals, argLowering,
-                                            *callContext.resultType);
+                                            resultType);
     if (res)
       return res;
   }
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index c79c9b1..ccfde16 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -786,62 +786,6 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
   return res;
 }
 
-/// Device allocatable components in a derived-type don't have the correct
-/// allocator index in their descriptor when they are created. After
-/// initialization, cuf.set_allocator_idx operations are inserted to set the
-/// correct allocator index for each device component.
-static void
-initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter,
-                                   const Fortran::semantics::Symbol &symbol,
-                                   Fortran::lower::SymMap &symMap) {
-  if (const auto *details{
-          symbol.GetUltimate()
-              .detailsIf<Fortran::semantics::ObjectEntityDetails>()}) {
-    const Fortran::semantics::DeclTypeSpec *type{details->type()};
-    const Fortran::semantics::DerivedTypeSpec *derived{type ? type->AsDerived()
-                                                            : nullptr};
-    if (derived) {
-      if (!FindCUDADeviceAllocatableUltimateComponent(*derived))
-        return; // No device components.
-
-      fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-      mlir::Location loc = converter.getCurrentLocation();
-
-      fir::ExtendedValue exv =
-          converter.getSymbolExtendedValue(symbol.GetUltimate(), &symMap);
-      mlir::Type baseTy = fir::unwrapRefType(fir::getBase(exv).getType());
-      if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(baseTy))
-        baseTy = boxTy.getEleTy();
-      baseTy = fir::unwrapRefType(baseTy);
-
-      if (fir::isAllocatableType(fir::getBase(exv).getType()) ||
-          fir::isPointerType(fir::getBase(exv).getType()))
-        return; // Allocator index need to be set after allocation.
-
-      auto recTy =
-          mlir::dyn_cast<fir::RecordType>(fir::unwrapSequenceType(baseTy));
-      assert(recTy && "expected fir::RecordType");
-
-      Fortran::semantics::UltimateComponentIterator components{*derived};
-      for (const auto &sym : components) {
-        if (Fortran::semantics::IsDeviceAllocatable(sym)) {
-          llvm::SmallVector<mlir::Value> coord;
-          mlir::Type fieldTy =
-              Fortran::lower::gatherDeviceComponentCoordinatesAndType(
-                  builder, loc, sym, recTy, coord);
-          mlir::Value base = fir::getBase(exv);
-          mlir::Value comp = fir::CoordinateOp::create(
-              builder, loc, builder.getRefType(fieldTy), base, coord);
-          cuf::DataAttributeAttr dataAttr =
-              Fortran::lower::translateSymbolCUFDataAttribute(
-                  builder.getContext(), sym);
-          cuf::SetAllocatorIndexOp::create(builder, loc, comp, dataAttr);
-        }
-      }
-    }
-  }
-}
-
 /// Must \p var be default initialized at runtime when entering its scope.
 static bool
 mustBeDefaultInitializedAtRuntime(const Fortran::lower::pft::Variable &var) {
@@ -898,7 +842,8 @@ void Fortran::lower::defaultInitializeAtRuntime(
             Fortran::semantics::DeclTypeSpec::Category::TypeDerived &&
         !mlir::isa<fir::SequenceType>(symTy) &&
         !sym.test(Fortran::semantics::Symbol::Flag::OmpPrivate) &&
-        !sym.test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate)) {
+        !sym.test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate) &&
+        !Fortran::semantics::HasCUDAComponent(sym)) {
       std::string globalName = fir::NameUniquer::doGenerated(
           (converter.mangleName(*declTy->AsDerived()) + fir::kNameSeparator +
            fir::kDerivedTypeInitSuffix)
@@ -1164,9 +1109,6 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
   if (mustBeDefaultInitializedAtRuntime(var))
     Fortran::lower::defaultInitializeAtRuntime(converter, var.getSymbol(),
                                                symMap);
-  if (converter.getFoldingContext().languageFeatures().IsEnabled(
-          Fortran::common::LanguageFeature::CUDA))
-    initializeDeviceComponentAllocator(converter, var.getSymbol(), symMap);
   auto *builder = &converter.getFirOpBuilder();
   if (needCUDAAlloc(var.getSymbol()) &&
       !cuf::isCUDADeviceContext(builder->getRegion())) {
@@ -1426,9 +1368,6 @@ static void instantiateAlias(Fortran::lower::AbstractConverter &converter,
   if (mustBeDefaultInitializedAtRuntime(var))
     Fortran::lower::defaultInitializeAtRuntime(converter, var.getSymbol(),
                                                symMap);
-  if (converter.getFoldingContext().languageFeatures().IsEnabled(
-          Fortran::common::LanguageFeature::CUDA))
-    initializeDeviceComponentAllocator(converter, var.getSymbol(), symMap);
 }
 
 //===--------------------------------------------------------------===//
diff --git a/flang/lib/Lower/HlfirIntrinsics.cpp b/flang/lib/Lower/HlfirIntrinsics.cpp
index b9731e9..27c8bb8 100644
--- a/flang/lib/Lower/HlfirIntrinsics.cpp
+++ b/flang/lib/Lower/HlfirIntrinsics.cpp
@@ -69,6 +69,11 @@ protected:
   mlir::Value loadBoxAddress(
       const std::optional<Fortran::lower::PreparedActualArgument> &arg);
 
+  mlir::Value
+  loadTrivialScalar(const Fortran::lower::PreparedActualArgument &arg);
+
+  mlir::Value loadOptionalValue(Fortran::lower::PreparedActualArgument &arg);
+
   void addCleanup(std::optional<hlfir::CleanupFunction> cleanup) {
     if (cleanup)
       cleanupFns.emplace_back(std::move(*cleanup));
@@ -204,6 +209,17 @@ protected:
             mlir::Type stmtResultType) override;
 };
 
+class HlfirIndexLowering : public HlfirTransformationalIntrinsic {
+public:
+  using HlfirTransformationalIntrinsic::HlfirTransformationalIntrinsic;
+
+protected:
+  mlir::Value
+  lowerImpl(const Fortran::lower::PreparedActualArguments &loweredActuals,
+            const fir::IntrinsicArgumentLoweringRules *argLowering,
+            mlir::Type stmtResultType) override;
+};
+
 } // namespace
 
 mlir::Value HlfirTransformationalIntrinsic::loadBoxAddress(
@@ -239,19 +255,22 @@ mlir::Value HlfirTransformationalIntrinsic::loadBoxAddress(
   return boxOrAbsent;
 }
 
-static mlir::Value loadOptionalValue(
-    mlir::Location loc, fir::FirOpBuilder &builder,
-    const std::optional<Fortran::lower::PreparedActualArgument> &arg,
-    hlfir::Entity actual) {
-  if (!arg->handleDynamicOptional())
-    return hlfir::loadTrivialScalar(loc, builder, actual);
+mlir::Value HlfirTransformationalIntrinsic::loadOptionalValue(
+    Fortran::lower::PreparedActualArgument &arg) {
+  mlir::Type eleType = arg.getFortranElementType();
 
-  mlir::Value isPresent = arg->getIsPresent();
-  mlir::Type eleType = hlfir::getFortranElementType(actual.getType());
+  // For an elemental call, getActual() may produce
+  // a designator denoting the array element to be passed
+  // to the subprogram. If the actual array is dynamically
+  // optional the designator must be generated under
+  // isPresent check (see also genIntrinsicRefCore).
   return builder
-      .genIfOp(loc, {eleType}, isPresent,
+      .genIfOp(loc, {eleType}, arg.getIsPresent(),
                /*withElseRegion=*/true)
       .genThen([&]() {
+        hlfir::Entity actual = arg.getActual(loc, builder);
+        assert(eleType == actual.getFortranElementType() &&
+               "result type mismatch in genOptionalValue");
         assert(actual.isScalar() && fir::isa_trivial(eleType) &&
                "must be a numerical or logical scalar");
         hlfir::Entity val = hlfir::loadTrivialScalar(loc, builder, actual);
@@ -264,6 +283,12 @@ static mlir::Value loadOptionalValue(
       .getResults()[0];
 }
 
+mlir::Value HlfirTransformationalIntrinsic::loadTrivialScalar(
+    const Fortran::lower::PreparedActualArgument &arg) {
+  hlfir::Entity actual = arg.getActual(loc, builder);
+  return hlfir::loadTrivialScalar(loc, builder, actual);
+}
+
 llvm::SmallVector<mlir::Value> HlfirTransformationalIntrinsic::getOperandVector(
     const Fortran::lower::PreparedActualArguments &loweredActuals,
     const fir::IntrinsicArgumentLoweringRules *argLowering) {
@@ -277,29 +302,33 @@ llvm::SmallVector<mlir::Value> HlfirTransformationalIntrinsic::getOperandVector(
       operands.emplace_back();
       continue;
     }
-    hlfir::Entity actual = arg->getActual(loc, builder);
     mlir::Value valArg;
-
     if (!argLowering) {
-      valArg = hlfir::loadTrivialScalar(loc, builder, actual);
-    } else {
-      fir::ArgLoweringRule argRules =
-          fir::lowerIntrinsicArgumentAs(*argLowering, i);
-      if (argRules.lowerAs == fir::LowerIntrinsicArgAs::Box)
-        valArg = loadBoxAddress(arg);
-      else if (!argRules.handleDynamicOptional &&
-               argRules.lowerAs != fir::LowerIntrinsicArgAs::Inquired)
-        valArg = hlfir::derefPointersAndAllocatables(loc, builder, actual);
-      else if (argRules.handleDynamicOptional &&
-               argRules.lowerAs == fir::LowerIntrinsicArgAs::Value)
-        valArg = loadOptionalValue(loc, builder, arg, actual);
-      else if (argRules.handleDynamicOptional)
+      valArg = loadTrivialScalar(*arg);
+      operands.emplace_back(valArg);
+      continue;
+    }
+    fir::ArgLoweringRule argRules =
+        fir::lowerIntrinsicArgumentAs(*argLowering, i);
+    if (argRules.lowerAs == fir::LowerIntrinsicArgAs::Box) {
+      valArg = loadBoxAddress(arg);
+    } else if (argRules.handleDynamicOptional) {
+      if (argRules.lowerAs == fir::LowerIntrinsicArgAs::Value) {
+        if (arg->handleDynamicOptional())
+          valArg = loadOptionalValue(*arg);
+        else
+          valArg = loadTrivialScalar(*arg);
+      } else {
         TODO(loc, "hlfir transformational intrinsic dynamically optional "
                   "argument without box lowering");
+      }
+    } else {
+      hlfir::Entity actual = arg->getActual(loc, builder);
+      if (argRules.lowerAs != fir::LowerIntrinsicArgAs::Inquired)
+        valArg = hlfir::derefPointersAndAllocatables(loc, builder, actual);
       else
         valArg = actual.getBase();
     }
-
     operands.emplace_back(valArg);
   }
   return operands;
@@ -513,6 +542,22 @@ mlir::Value HlfirReshapeLowering::lowerImpl(
                                     operands[2], operands[3]);
 }
 
+mlir::Value HlfirIndexLowering::lowerImpl(
+    const Fortran::lower::PreparedActualArguments &loweredActuals,
+    const fir::IntrinsicArgumentLoweringRules *argLowering,
+    mlir::Type stmtResultType) {
+  auto operands = getOperandVector(loweredActuals, argLowering);
+  // 'kind' optional operand is unused here as it has already been
+  // translated into result type.
+  assert(operands.size() == 4);
+  mlir::Value substr = operands[1];
+  mlir::Value str = operands[0];
+  mlir::Value back = operands[2];
+  mlir::Value result =
+      createOp<hlfir::IndexOp>(stmtResultType, substr, str, back);
+  return result;
+}
+
 std::optional<hlfir::EntityWithAttributes> Fortran::lower::lowerHlfirIntrinsic(
     fir::FirOpBuilder &builder, mlir::Location loc, const std::string &name,
     const Fortran::lower::PreparedActualArguments &loweredActuals,
@@ -567,6 +612,10 @@ std::optional<hlfir::EntityWithAttributes> Fortran::lower::lowerHlfirIntrinsic(
   if (name == "reshape")
     return HlfirReshapeLowering{builder, loc}.lower(loweredActuals, argLowering,
                                                     stmtResultType);
+  if (name == "index")
+    return HlfirIndexLowering{builder, loc}.lower(loweredActuals, argLowering,
+                                                  stmtResultType);
+
   if (mlir::isa<fir::CharacterType>(stmtResultType)) {
     if (name == "min")
       return HlfirCharExtremumLowering{builder, loc,
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index bbe749f..d8a0e4d 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -61,6 +61,11 @@ static llvm::cl::opt<bool> strideIncludeLowerExtent(
         "Whether to include the lower dimensions extents in the stride."),
     llvm::cl::init(true));
 
+static llvm::cl::opt<bool> lowerDoLoopToAccLoop(
+    "openacc-do-loop-to-acc-loop",
+    llvm::cl::desc("Whether to lower do loops as `acc.loop` operations."),
+    llvm::cl::init(true));
+
 // Special value for * passed in device_type or gang clauses.
 static constexpr std::int64_t starCst = -1;
 
@@ -5005,6 +5010,9 @@ mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
     Fortran::semantics::SemanticsContext &semanticsContext,
     Fortran::lower::SymMap &localSymbols,
     const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval) {
+  if (!lowerDoLoopToAccLoop)
+    return nullptr;
+
   // Only convert loops which have induction variables that need privatized.
   if (!doConstruct.IsDoNormal() && !doConstruct.IsDoConcurrent())
     return nullptr;
@@ -5027,10 +5035,6 @@ mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
     return nullptr;
   }
 
-  // Open up a new scope for the loop variables.
-  localSymbols.pushScope();
-  auto scopeGuard = llvm::make_scope_exit([&]() { localSymbols.popScope(); });
-
   // Prepare empty operand vectors since there are no associated `acc loop`
   // clauses with the Fortran do loops being handled here.
   llvm::SmallVector<mlir::Value> privateOperands, gangOperands,
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 23f0ca1..a96884f 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -273,10 +273,15 @@ bool ClauseProcessor::processCancelDirectiveName(
 
 bool ClauseProcessor::processCollapse(
     mlir::Location currentLocation, lower::pft::Evaluation &eval,
-    mlir::omp::LoopRelatedClauseOps &result,
+    mlir::omp::LoopRelatedClauseOps &loopResult,
+    mlir::omp::CollapseClauseOps &collapseResult,
     llvm::SmallVectorImpl<const semantics::Symbol *> &iv) const {
-  return collectLoopRelatedInfo(converter, currentLocation, eval, clauses,
-                                result, iv);
+
+  int64_t numCollapse = collectLoopRelatedInfo(converter, currentLocation, eval,
+                                               clauses, loopResult, iv);
+  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+  collapseResult.collapseNumLoops = firOpBuilder.getI64IntegerAttr(numCollapse);
+  return numCollapse > 1;
 }
 
 bool ClauseProcessor::processDevice(lower::StatementContext &stmtCtx,
@@ -522,6 +527,13 @@ bool ClauseProcessor::processProcBind(
   return false;
 }
 
+bool ClauseProcessor::processTileSizes(
+    lower::pft::Evaluation &eval, mlir::omp::LoopNestOperands &result) const {
+  auto *ompCons{eval.getIf<parser::OpenMPConstruct>()};
+  collectTileSizesFromOpenMPConstruct(ompCons, result.tileSizes, semaCtx);
+  return !result.tileSizes.empty();
+}
+
 bool ClauseProcessor::processSafelen(
     mlir::omp::SafelenClauseOps &result) const {
   if (auto *clause = findUniqueClause<omp::clause::Safelen>()) {
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index c46bdb3..324ea3c 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -63,7 +63,8 @@ public:
       mlir::omp::CancelDirectiveNameClauseOps &result) const;
   bool
   processCollapse(mlir::Location currentLocation, lower::pft::Evaluation &eval,
-                  mlir::omp::LoopRelatedClauseOps &result,
+                  mlir::omp::LoopRelatedClauseOps &loopResult,
+                  mlir::omp::CollapseClauseOps &collapseResult,
                   llvm::SmallVectorImpl<const semantics::Symbol *> &iv) const;
   bool processDevice(lower::StatementContext &stmtCtx,
                      mlir::omp::DeviceClauseOps &result) const;
@@ -98,6 +99,8 @@ public:
   bool processPriority(lower::StatementContext &stmtCtx,
                        mlir::omp::PriorityClauseOps &result) const;
   bool processProcBind(mlir::omp::ProcBindClauseOps &result) const;
+  bool processTileSizes(lower::pft::Evaluation &eval,
+                        mlir::omp::LoopNestOperands &result) const;
   bool processSafelen(mlir::omp::SafelenClauseOps &result) const;
   bool processSchedule(lower::StatementContext &stmtCtx,
                        mlir::omp::ScheduleClauseOps &result) const;
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 1a16e1c..cecc1a9 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -221,6 +221,8 @@ MAKE_EMPTY_CLASS(Capture, Capture);
 MAKE_EMPTY_CLASS(Compare, Compare);
 MAKE_EMPTY_CLASS(DynamicAllocators, DynamicAllocators);
 MAKE_EMPTY_CLASS(Full, Full);
+MAKE_EMPTY_CLASS(GraphId, GraphId);
+MAKE_EMPTY_CLASS(GraphReset, GraphReset);
 MAKE_EMPTY_CLASS(Inbranch, Inbranch);
 MAKE_EMPTY_CLASS(Mergeable, Mergeable);
 MAKE_EMPTY_CLASS(Nogroup, Nogroup);
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index def6cff..0ec33e6 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -503,7 +503,7 @@ static void processHostEvalClauses(lower::AbstractConverter &converter,
       [[fallthrough]];
     case OMPD_distribute:
     case OMPD_distribute_simd:
-      cp.processCollapse(loc, eval, hostInfo->ops, hostInfo->iv);
+      cp.processCollapse(loc, eval, hostInfo->ops, hostInfo->ops, hostInfo->iv);
       break;
 
     case OMPD_teams:
@@ -522,7 +522,7 @@ static void processHostEvalClauses(lower::AbstractConverter &converter,
       [[fallthrough]];
     case OMPD_target_teams_distribute:
     case OMPD_target_teams_distribute_simd:
-      cp.processCollapse(loc, eval, hostInfo->ops, hostInfo->iv);
+      cp.processCollapse(loc, eval, hostInfo->ops, hostInfo->ops, hostInfo->iv);
       cp.processNumTeams(stmtCtx, hostInfo->ops);
       break;
 
@@ -533,7 +533,7 @@ static void processHostEvalClauses(lower::AbstractConverter &converter,
       cp.processNumTeams(stmtCtx, hostInfo->ops);
       [[fallthrough]];
     case OMPD_loop:
-      cp.processCollapse(loc, eval, hostInfo->ops, hostInfo->iv);
+      cp.processCollapse(loc, eval, hostInfo->ops, hostInfo->ops, hostInfo->iv);
       break;
 
     case OMPD_teams_workdistribute:
@@ -1569,9 +1569,10 @@ genLoopNestClauses(lower::AbstractConverter &converter,
 
   HostEvalInfo *hostEvalInfo = getHostEvalInfoStackTop(converter);
   if (!hostEvalInfo || !hostEvalInfo->apply(clauseOps, iv))
-    cp.processCollapse(loc, eval, clauseOps, iv);
+    cp.processCollapse(loc, eval, clauseOps, clauseOps, iv);
 
   clauseOps.loopInclusive = converter.getFirOpBuilder().getUnitAttr();
+  cp.processTileSizes(eval, clauseOps);
 }
 
 static void genLoopClauses(
@@ -1948,9 +1949,9 @@ static mlir::omp::LoopNestOp genLoopNestOp(
     return llvm::SmallVector<const semantics::Symbol *>(iv);
   };
 
-  auto *nestedEval =
-      getCollapsedLoopEval(eval, getCollapseValue(item->clauses));
-
+  uint64_t nestValue = getCollapseValue(item->clauses);
+  nestValue = nestValue < iv.size() ? iv.size() : nestValue;
+  auto *nestedEval = getCollapsedLoopEval(eval, nestValue);
   return genOpWithBody<mlir::omp::LoopNestOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, *nestedEval,
                         directive)
@@ -3843,8 +3844,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
           parser::omp::GetOmpDirectiveName(*ompNestedLoopCons).v;
       switch (nestedDirective) {
       case llvm::omp::Directive::OMPD_tile:
-        // Emit the omp.loop_nest with annotation for tiling
-        genOMP(converter, symTable, semaCtx, eval, ompNestedLoopCons->value());
+        // Skip OMPD_tile since the tile sizes will be retrieved when
+        // generating the omp.loop_nest op.
         break;
       default: {
         unsigned version = semaCtx.langOptions().OpenMPVersion;
@@ -3957,18 +3958,6 @@ void Fortran::lower::genOpenMPSymbolProperties(
     lower::genDeclareTargetIntGlobal(converter, var);
 }
 
-int64_t
-Fortran::lower::getCollapseValue(const parser::OmpClauseList &clauseList) {
-  for (const parser::OmpClause &clause : clauseList.v) {
-    if (const auto &collapseClause =
-            std::get_if<parser::OmpClause::Collapse>(&clause.u)) {
-      const auto *expr = semantics::GetExpr(collapseClause->v);
-      return evaluate::ToInt64(*expr).value();
-    }
-  }
-  return 1;
-}
-
 void Fortran::lower::genThreadprivateOp(lower::AbstractConverter &converter,
                                         const lower::pft::Variable &var) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index cb6dd57..d1d1cd6 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -13,6 +13,7 @@
 #include "Utils.h"
 
 #include "ClauseFinder.h"
+#include "flang/Evaluate/fold.h"
 #include "flang/Lower/OpenMP/Clauses.h"
 #include <flang/Lower/AbstractConverter.h>
 #include <flang/Lower/ConvertType.h>
@@ -24,11 +25,32 @@
 #include <flang/Parser/parse-tree.h>
 #include <flang/Parser/tools.h>
 #include <flang/Semantics/tools.h>
+#include <flang/Semantics/type.h>
 #include <flang/Utils/OpenMP.h>
 #include <llvm/Support/CommandLine.h>
 
 #include <iterator>
 
+template <typename T>
+Fortran::semantics::MaybeIntExpr
+EvaluateIntExpr(Fortran::semantics::SemanticsContext &context, const T &expr) {
+  if (Fortran::semantics::MaybeExpr maybeExpr{
+          Fold(context.foldingContext(), AnalyzeExpr(context, expr))}) {
+    if (auto *intExpr{
+            Fortran::evaluate::UnwrapExpr<Fortran::semantics::SomeIntExpr>(
+                *maybeExpr)}) {
+      return std::move(*intExpr);
+    }
+  }
+  return std::nullopt;
+}
+
+template <typename T>
+std::optional<std::int64_t>
+EvaluateInt64(Fortran::semantics::SemanticsContext &context, const T &expr) {
+  return Fortran::evaluate::ToInt64(EvaluateIntExpr(context, expr));
+}
+
 llvm::cl::opt<bool> treatIndexAsSection(
     "openmp-treat-index-as-section",
     llvm::cl::desc("In the OpenMP data clauses treat `a(N)` as `a(N:N)`."),
@@ -577,12 +599,64 @@ static void convertLoopBounds(lower::AbstractConverter &converter,
   }
 }
 
-bool collectLoopRelatedInfo(
+// Helper function that finds the sizes clause in a inner OMPD_tile directive
+// and passes the sizes clause to the callback function if found.
+static void processTileSizesFromOpenMPConstruct(
+    const parser::OpenMPConstruct *ompCons,
+    std::function<void(const parser::OmpClause::Sizes *)> processFun) {
+  if (!ompCons)
+    return;
+  if (auto *ompLoop{std::get_if<parser::OpenMPLoopConstruct>(&ompCons->u)}) {
+    const auto &nestedOptional =
+        std::get<std::optional<parser::NestedConstruct>>(ompLoop->t);
+    assert(nestedOptional.has_value() &&
+           "Expected a DoConstruct or OpenMPLoopConstruct");
+    const auto *innerConstruct =
+        std::get_if<common::Indirection<parser::OpenMPLoopConstruct>>(
+            &(nestedOptional.value()));
+    if (innerConstruct) {
+      const auto &innerLoopDirective = innerConstruct->value();
+      const auto &innerBegin =
+          std::get<parser::OmpBeginLoopDirective>(innerLoopDirective.t);
+      const auto &innerDirective =
+          std::get<parser::OmpLoopDirective>(innerBegin.t).v;
+
+      if (innerDirective == llvm::omp::Directive::OMPD_tile) {
+        // Get the size values from parse tree and convert to a vector.
+        const auto &innerClauseList{
+            std::get<parser::OmpClauseList>(innerBegin.t)};
+        for (const auto &clause : innerClauseList.v) {
+          if (const auto tclause{
+                  std::get_if<parser::OmpClause::Sizes>(&clause.u)}) {
+            processFun(tclause);
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+
+/// Populates the sizes vector with values if the given OpenMPConstruct
+/// contains a loop construct with an inner tiling construct.
+void collectTileSizesFromOpenMPConstruct(
+    const parser::OpenMPConstruct *ompCons,
+    llvm::SmallVectorImpl<int64_t> &tileSizes,
+    Fortran::semantics::SemanticsContext &semaCtx) {
+  processTileSizesFromOpenMPConstruct(
+      ompCons, [&](const parser::OmpClause::Sizes *tclause) {
+        for (auto &tval : tclause->v)
+          if (const auto v{EvaluateInt64(semaCtx, tval)})
+            tileSizes.push_back(*v);
+      });
+}
+
+int64_t collectLoopRelatedInfo(
     lower::AbstractConverter &converter, mlir::Location currentLocation,
     lower::pft::Evaluation &eval, const omp::List<omp::Clause> &clauses,
     mlir::omp::LoopRelatedClauseOps &result,
     llvm::SmallVectorImpl<const semantics::Symbol *> &iv) {
-  bool found = false;
+  int64_t numCollapse = 1;
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
   // Collect the loops to collapse.
@@ -595,9 +669,19 @@ bool collectLoopRelatedInfo(
   if (auto *clause =
           ClauseFinder::findUniqueClause<omp::clause::Collapse>(clauses)) {
     collapseValue = evaluate::ToInt64(clause->v).value();
-    found = true;
+    numCollapse = collapseValue;
+  }
+
+  // Collect sizes from tile directive if present.
+  std::int64_t sizesLengthValue = 0l;
+  if (auto *ompCons{eval.getIf<parser::OpenMPConstruct>()}) {
+    processTileSizesFromOpenMPConstruct(
+        ompCons, [&](const parser::OmpClause::Sizes *tclause) {
+          sizesLengthValue = tclause->v.size();
+        });
   }
 
+  collapseValue = std::max(collapseValue, sizesLengthValue);
   std::size_t loopVarTypeSize = 0;
   do {
     lower::pft::Evaluation *doLoop =
@@ -631,7 +715,7 @@ bool collectLoopRelatedInfo(
 
   convertLoopBounds(converter, currentLocation, result, loopVarTypeSize);
 
-  return found;
+  return numCollapse;
 }
 
 } // namespace omp
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index 88371ab..5f191d8 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -159,12 +159,17 @@ void genObjectList(const ObjectList &objects,
 void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp,
                                      mlir::Location loc);
 
-bool collectLoopRelatedInfo(
+int64_t collectLoopRelatedInfo(
     lower::AbstractConverter &converter, mlir::Location currentLocation,
     lower::pft::Evaluation &eval, const omp::List<omp::Clause> &clauses,
     mlir::omp::LoopRelatedClauseOps &result,
     llvm::SmallVectorImpl<const semantics::Symbol *> &iv);
 
+void collectTileSizesFromOpenMPConstruct(
+    const parser::OpenMPConstruct *ompCons,
+    llvm::SmallVectorImpl<int64_t> &tileSizes,
+    Fortran::semantics::SemanticsContext &semaCtx);
+
 } // namespace omp
 } // namespace lower
 } // namespace Fortran
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index e1c9520..6ae48c1 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -397,6 +397,34 @@ static constexpr IntrinsicHandler handlers[]{
     {"cmplx",
      &I::genCmplx,
      {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}},
+    {"co_broadcast",
+     &I::genCoBroadcast,
+     {{{"a", asBox},
+       {"source_image", asAddr},
+       {"stat", asAddr, handleDynamicOptional},
+       {"errmsg", asBox, handleDynamicOptional}}},
+     /*isElemental*/ false},
+    {"co_max",
+     &I::genCoMax,
+     {{{"a", asBox},
+       {"result_image", asAddr, handleDynamicOptional},
+       {"stat", asAddr, handleDynamicOptional},
+       {"errmsg", asBox, handleDynamicOptional}}},
+     /*isElemental*/ false},
+    {"co_min",
+     &I::genCoMin,
+     {{{"a", asBox},
+       {"result_image", asAddr, handleDynamicOptional},
+       {"stat", asAddr, handleDynamicOptional},
+       {"errmsg", asBox, handleDynamicOptional}}},
+     /*isElemental*/ false},
+    {"co_sum",
+     &I::genCoSum,
+     {{{"a", asBox},
+       {"result_image", asAddr, handleDynamicOptional},
+       {"stat", asAddr, handleDynamicOptional},
+       {"errmsg", asBox, handleDynamicOptional}}},
+     /*isElemental*/ false},
     {"command_argument_count", &I::genCommandArgumentCount},
     {"conjg", &I::genConjg},
     {"cosd", &I::genCosd},
@@ -3686,6 +3714,85 @@ mlir::Value IntrinsicLibrary::genCmplx(mlir::Type resultType,
                                                            imag);
 }
 
+// CO_BROADCAST
+void IntrinsicLibrary::genCoBroadcast(llvm::ArrayRef<fir::ExtendedValue> args) {
+  checkCoarrayEnabled();
+  assert(args.size() == 4);
+  mlir::Value sourceImage = fir::getBase(args[1]);
+  mlir::Value status =
+      isStaticallyAbsent(args[2])
+          ? fir::AbsentOp::create(builder, loc,
+                                  builder.getRefType(builder.getI32Type()))
+                .getResult()
+          : fir::getBase(args[2]);
+  mlir::Value errmsg =
+      isStaticallyAbsent(args[3])
+          ? fir::AbsentOp::create(builder, loc, PRIF_ERRMSG_TYPE).getResult()
+          : fir::getBase(args[3]);
+  fir::runtime::genCoBroadcast(builder, loc, fir::getBase(args[0]), sourceImage,
+                               status, errmsg);
+}
+
+// CO_MAX
+void IntrinsicLibrary::genCoMax(llvm::ArrayRef<fir::ExtendedValue> args) {
+  checkCoarrayEnabled();
+  assert(args.size() == 4);
+  mlir::Value refNone =
+      fir::AbsentOp::create(builder, loc,
+                            builder.getRefType(builder.getI32Type()))
+          .getResult();
+  mlir::Value resultImage =
+      isStaticallyAbsent(args[1]) ? refNone : fir::getBase(args[1]);
+  mlir::Value status =
+      isStaticallyAbsent(args[2]) ? refNone : fir::getBase(args[2]);
+  mlir::Value errmsg =
+      isStaticallyAbsent(args[3])
+          ? fir::AbsentOp::create(builder, loc, PRIF_ERRMSG_TYPE).getResult()
+          : fir::getBase(args[3]);
+  fir::runtime::genCoMax(builder, loc, fir::getBase(args[0]), resultImage,
+                         status, errmsg);
+}
+
+// CO_MIN
+void IntrinsicLibrary::genCoMin(llvm::ArrayRef<fir::ExtendedValue> args) {
+  checkCoarrayEnabled();
+  assert(args.size() == 4);
+  mlir::Value refNone =
+      fir::AbsentOp::create(builder, loc,
+                            builder.getRefType(builder.getI32Type()))
+          .getResult();
+  mlir::Value resultImage =
+      isStaticallyAbsent(args[1]) ? refNone : fir::getBase(args[1]);
+  mlir::Value status =
+      isStaticallyAbsent(args[2]) ? refNone : fir::getBase(args[2]);
+  mlir::Value errmsg =
+      isStaticallyAbsent(args[3])
+          ? fir::AbsentOp::create(builder, loc, PRIF_ERRMSG_TYPE).getResult()
+          : fir::getBase(args[3]);
+  fir::runtime::genCoMin(builder, loc, fir::getBase(args[0]), resultImage,
+                         status, errmsg);
+}
+
+// CO_SUM
+void IntrinsicLibrary::genCoSum(llvm::ArrayRef<fir::ExtendedValue> args) {
+  checkCoarrayEnabled();
+  assert(args.size() == 4);
+  mlir::Value absentInt =
+      fir::AbsentOp::create(builder, loc,
+                            builder.getRefType(builder.getI32Type()))
+          .getResult();
+  mlir::Value resultImage =
+      isStaticallyAbsent(args[1]) ? absentInt : fir::getBase(args[1]);
+  mlir::Value status =
+      isStaticallyAbsent(args[2]) ? absentInt : fir::getBase(args[2]);
+  mlir::Value errmsg =
+      isStaticallyAbsent(args[3])
+          ? fir::AbsentOp::create(builder, loc, PRIF_ERRMSG_TYPE).getResult()
+          : fir::getBase(args[3]);
+  fir::runtime::genCoSum(builder, loc, fir::getBase(args[0]), resultImage,
+                         status, errmsg);
+}
+
 // COMMAND_ARGUMENT_COUNT
 fir::ExtendedValue IntrinsicLibrary::genCommandArgumentCount(
     mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp
index 50c945d..d4cdfec 100644
--- a/flang/lib/Optimizer/Builder/MutableBox.cpp
+++ b/flang/lib/Optimizer/Builder/MutableBox.cpp
@@ -603,21 +603,23 @@ void fir::factory::associateMutableBoxWithRemap(
     mlir::ValueRange lbounds, mlir::ValueRange ubounds) {
   // Compute new extents
   llvm::SmallVector<mlir::Value> extents;
-  auto idxTy = builder.getIndexType();
+  mlir::Type idxTy = builder.getIndexType();
+  mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0);
   if (!lbounds.empty()) {
     auto one = builder.createIntegerConstant(loc, idxTy, 1);
     for (auto [lb, ub] : llvm::zip(lbounds, ubounds)) {
-      auto lbi = builder.createConvert(loc, idxTy, lb);
-      auto ubi = builder.createConvert(loc, idxTy, ub);
-      auto diff = mlir::arith::SubIOp::create(builder, loc, idxTy, ubi, lbi);
+
+      mlir::Value lbi = builder.createConvert(loc, idxTy, lb);
+      mlir::Value ubi = builder.createConvert(loc, idxTy, ub);
       extents.emplace_back(
-          mlir::arith::AddIOp::create(builder, loc, idxTy, diff, one));
+          fir::factory::computeExtent(builder, loc, lbi, ubi, zero, one));
     }
   } else {
     // lbounds are default. Upper bounds and extents are the same.
-    for (auto ub : ubounds) {
-      auto cast = builder.createConvert(loc, idxTy, ub);
-      extents.emplace_back(cast);
+    for (mlir::Value ub : ubounds) {
+      mlir::Value cast = builder.createConvert(loc, idxTy, ub);
+      extents.emplace_back(
+          fir::factory::genMaxWithZero(builder, loc, cast, zero));
     }
   }
   const auto newRank = extents.size();
diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
index a6ee986..37e4c5a 100644
--- a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
@@ -47,18 +47,3 @@ void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder,
       builder, loc, fTy, desc, sourceFile, sourceLine)};
   fir::CallOp::create(builder, loc, func, args);
 }
-
-void fir::runtime::cuda::genSetAllocatorIndex(fir::FirOpBuilder &builder,
-                                              mlir::Location loc,
-                                              mlir::Value desc,
-                                              mlir::Value index) {
-  mlir::func::FuncOp func =
-      fir::runtime::getRuntimeFunc<mkRTKey(CUFSetAllocatorIndex)>(loc, builder);
-  auto fTy = func.getFunctionType();
-  mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
-  mlir::Value sourceLine =
-      fir::factory::locationToLineNo(builder, loc, fTy.getInput(3));
-  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
-      builder, loc, fTy, desc, index, sourceFile, sourceLine)};
-  fir::CallOp::create(builder, loc, func, args);
-}
diff --git a/flang/lib/Optimizer/Builder/Runtime/Character.cpp b/flang/lib/Optimizer/Builder/Runtime/Character.cpp
index 57fb0cc..540ecba 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Character.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Character.cpp
@@ -119,23 +119,23 @@ fir::runtime::genCharCompare(fir::FirOpBuilder &builder, mlir::Location loc,
   return mlir::arith::CmpIOp::create(builder, loc, cmp, tri, zero);
 }
 
+static mlir::Value allocateIfNotInMemory(fir::FirOpBuilder &builder,
+                                         mlir::Location loc, mlir::Value base) {
+  if (fir::isa_ref_type(base.getType()))
+    return base;
+  auto mem =
+      fir::AllocaOp::create(builder, loc, base.getType(), /*pinned=*/false);
+  fir::StoreOp::create(builder, loc, base, mem);
+  return mem;
+}
+
 mlir::Value fir::runtime::genCharCompare(fir::FirOpBuilder &builder,
                                          mlir::Location loc,
                                          mlir::arith::CmpIPredicate cmp,
                                          const fir::ExtendedValue &lhs,
                                          const fir::ExtendedValue &rhs) {
-  if (lhs.getBoxOf<fir::BoxValue>() || rhs.getBoxOf<fir::BoxValue>())
-    TODO(loc, "character compare from descriptors");
-  auto allocateIfNotInMemory = [&](mlir::Value base) -> mlir::Value {
-    if (fir::isa_ref_type(base.getType()))
-      return base;
-    auto mem =
-        fir::AllocaOp::create(builder, loc, base.getType(), /*pinned=*/false);
-    fir::StoreOp::create(builder, loc, base, mem);
-    return mem;
-  };
-  auto lhsBuffer = allocateIfNotInMemory(fir::getBase(lhs));
-  auto rhsBuffer = allocateIfNotInMemory(fir::getBase(rhs));
+  auto lhsBuffer = allocateIfNotInMemory(builder, loc, fir::getBase(lhs));
+  auto rhsBuffer = allocateIfNotInMemory(builder, loc, fir::getBase(rhs));
   return genCharCompare(builder, loc, cmp, lhsBuffer, fir::getLen(lhs),
                         rhsBuffer, fir::getLen(rhs));
 }
@@ -168,6 +168,20 @@ mlir::Value fir::runtime::genIndex(fir::FirOpBuilder &builder,
   return fir::CallOp::create(builder, loc, indexFunc, args).getResult(0);
 }
 
+mlir::Value fir::runtime::genIndex(fir::FirOpBuilder &builder,
+                                   mlir::Location loc,
+                                   const fir::ExtendedValue &str,
+                                   const fir::ExtendedValue &substr,
+                                   mlir::Value back) {
+  assert(!substr.getBoxOf<fir::BoxValue>() && !str.getBoxOf<fir::BoxValue>() &&
+         "shall use genIndexDescriptor version");
+  auto strBuffer = allocateIfNotInMemory(builder, loc, fir::getBase(str));
+  auto substrBuffer = allocateIfNotInMemory(builder, loc, fir::getBase(substr));
+  int kind = discoverKind(strBuffer.getType());
+  return genIndex(builder, loc, kind, strBuffer, fir::getLen(str), substrBuffer,
+                  fir::getLen(substr), back);
+}
+
 void fir::runtime::genIndexDescriptor(fir::FirOpBuilder &builder,
                                       mlir::Location loc, mlir::Value resultBox,
                                       mlir::Value stringBox,
diff --git a/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp b/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp
index fb72fc2..9a893d6 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp
@@ -14,6 +14,24 @@
 using namespace Fortran::runtime;
 using namespace Fortran::semantics;
 
+// Most PRIF functions take `errmsg` and `errmsg_alloc` as two optional
+// arguments of intent (out). One is allocatable, the other is not.
+// It is the responsibility of the compiler to ensure that the appropriate
+// optional argument is passed, and at most one must be provided in a given
+// call.
+// Depending on the type of `errmsg`, this function will return the pair
+// corresponding to (`errmsg`, `errmsg_alloc`).
+static std::pair<mlir::Value, mlir::Value>
+genErrmsgPRIF(fir::FirOpBuilder &builder, mlir::Location loc,
+              mlir::Value errmsg) {
+  bool isAllocatableErrmsg = fir::isAllocatableType(errmsg.getType());
+
+  mlir::Value absent = fir::AbsentOp::create(builder, loc, PRIF_ERRMSG_TYPE);
+  mlir::Value errMsg = isAllocatableErrmsg ? absent : errmsg;
+  mlir::Value errMsgAlloc = isAllocatableErrmsg ? errmsg : absent;
+  return {errMsg, errMsgAlloc};
+}
+
 /// Generate Call to runtime prif_init
 mlir::Value fir::runtime::genInitCoarray(fir::FirOpBuilder &builder,
                                          mlir::Location loc) {
@@ -24,8 +42,8 @@ mlir::Value fir::runtime::genInitCoarray(fir::FirOpBuilder &builder,
       builder.createFunction(loc, PRIFNAME_SUB("init"), ftype);
   llvm::SmallVector<mlir::Value> args =
       fir::runtime::createArguments(builder, loc, ftype, result);
-  builder.create<fir::CallOp>(loc, funcOp, args);
-  return builder.create<fir::LoadOp>(loc, result);
+  fir::CallOp::create(builder, loc, funcOp, args);
+  return fir::LoadOp::create(builder, loc, result);
 }
 
 /// Generate Call to runtime prif_num_images
@@ -38,8 +56,8 @@ mlir::Value fir::runtime::getNumImages(fir::FirOpBuilder &builder,
       builder.createFunction(loc, PRIFNAME_SUB("num_images"), ftype);
   llvm::SmallVector<mlir::Value> args =
       fir::runtime::createArguments(builder, loc, ftype, result);
-  builder.create<fir::CallOp>(loc, funcOp, args);
-  return builder.create<fir::LoadOp>(loc, result);
+  fir::CallOp::create(builder, loc, funcOp, args);
+  return fir::LoadOp::create(builder, loc, result);
 }
 
 /// Generate Call to runtime prif_num_images_with_{team|team_number}
@@ -63,8 +81,8 @@ mlir::Value fir::runtime::getNumImagesWithTeam(fir::FirOpBuilder &builder,
     team = builder.createBox(loc, team);
   llvm::SmallVector<mlir::Value> args =
       fir::runtime::createArguments(builder, loc, ftype, team, result);
-  builder.create<fir::CallOp>(loc, funcOp, args);
-  return builder.create<fir::LoadOp>(loc, result);
+  fir::CallOp::create(builder, loc, funcOp, args);
+  return fir::LoadOp::create(builder, loc, result);
 }
 
 /// Generate Call to runtime prif_this_image_no_coarray
@@ -78,9 +96,72 @@ mlir::Value fir::runtime::getThisImage(fir::FirOpBuilder &builder,
 
   mlir::Value result = builder.createTemporary(loc, builder.getI32Type());
   mlir::Value teamArg =
-      !team ? builder.create<fir::AbsentOp>(loc, boxTy) : team;
+      !team ? fir::AbsentOp::create(builder, loc, boxTy) : team;
   llvm::SmallVector<mlir::Value> args =
       fir::runtime::createArguments(builder, loc, ftype, teamArg, result);
-  builder.create<fir::CallOp>(loc, funcOp, args);
-  return builder.create<fir::LoadOp>(loc, result);
+  fir::CallOp::create(builder, loc, funcOp, args);
+  return fir::LoadOp::create(builder, loc, result);
+}
+
+/// Generate call to collective subroutines except co_reduce
+/// A must be lowered as a box
+void genCollectiveSubroutine(fir::FirOpBuilder &builder, mlir::Location loc,
+                             mlir::Value A, mlir::Value rootImage,
+                             mlir::Value stat, mlir::Value errmsg,
+                             std::string coName) {
+  mlir::Type boxTy = fir::BoxType::get(builder.getNoneType());
+  mlir::FunctionType ftype =
+      PRIF_FUNCTYPE(boxTy, builder.getRefType(builder.getI32Type()),
+                    PRIF_STAT_TYPE, PRIF_ERRMSG_TYPE, PRIF_ERRMSG_TYPE);
+  mlir::func::FuncOp funcOp = builder.createFunction(loc, coName, ftype);
+
+  auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, errmsg);
+  llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments(
+      builder, loc, ftype, A, rootImage, stat, errmsgArg, errmsgAllocArg);
+  fir::CallOp::create(builder, loc, funcOp, args);
+}
+
+/// Generate call to runtime subroutine prif_co_broadcast
+void fir::runtime::genCoBroadcast(fir::FirOpBuilder &builder,
+                                  mlir::Location loc, mlir::Value A,
+                                  mlir::Value sourceImage, mlir::Value stat,
+                                  mlir::Value errmsg) {
+  genCollectiveSubroutine(builder, loc, A, sourceImage, stat, errmsg,
+                          PRIFNAME_SUB("co_broadcast"));
+}
+
+/// Generate call to runtime subroutine prif_co_max or prif_co_max_character
+void fir::runtime::genCoMax(fir::FirOpBuilder &builder, mlir::Location loc,
+                            mlir::Value A, mlir::Value resultImage,
+                            mlir::Value stat, mlir::Value errmsg) {
+  mlir::Type argTy =
+      fir::unwrapSequenceType(fir::unwrapPassByRefType(A.getType()));
+  if (mlir::isa<fir::CharacterType>(argTy))
+    genCollectiveSubroutine(builder, loc, A, resultImage, stat, errmsg,
+                            PRIFNAME_SUB("co_max_character"));
+  else
+    genCollectiveSubroutine(builder, loc, A, resultImage, stat, errmsg,
+                            PRIFNAME_SUB("co_max"));
+}
+
+/// Generate call to runtime subroutine prif_co_min or prif_co_min_character
+void fir::runtime::genCoMin(fir::FirOpBuilder &builder, mlir::Location loc,
+                            mlir::Value A, mlir::Value resultImage,
+                            mlir::Value stat, mlir::Value errmsg) {
+  mlir::Type argTy =
+      fir::unwrapSequenceType(fir::unwrapPassByRefType(A.getType()));
+  if (mlir::isa<fir::CharacterType>(argTy))
+    genCollectiveSubroutine(builder, loc, A, resultImage, stat, errmsg,
+                            PRIFNAME_SUB("co_min_character"));
+  else
+    genCollectiveSubroutine(builder, loc, A, resultImage, stat, errmsg,
+                            PRIFNAME_SUB("co_min"));
+}
+
+/// Generate call to runtime subroutine prif_co_sum
+void fir::runtime::genCoSum(fir::FirOpBuilder &builder, mlir::Location loc,
+                            mlir::Value A, mlir::Value resultImage,
+                            mlir::Value stat, mlir::Value errmsg) {
+  genCollectiveSubroutine(builder, loc, A, resultImage, stat, errmsg,
+                          PRIFNAME_SUB("co_sum"));
 }
diff --git a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
index 97912bd..381b2a2 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
@@ -60,6 +60,21 @@ struct MapInfoOpConversion
     : public OpenMPFIROpConversion<mlir::omp::MapInfoOp> {
   using OpenMPFIROpConversion::OpenMPFIROpConversion;
 
+  mlir::omp::MapBoundsOp
+  createBoundsForCharString(mlir::ConversionPatternRewriter &rewriter,
+                            unsigned int len, mlir::Location loc) const {
+    mlir::Type i64Ty = rewriter.getIntegerType(64);
+    auto lBound = mlir::LLVM::ConstantOp::create(rewriter, loc, i64Ty, 0);
+    auto uBoundAndExt =
+        mlir::LLVM::ConstantOp::create(rewriter, loc, i64Ty, len - 1);
+    auto stride = mlir::LLVM::ConstantOp::create(rewriter, loc, i64Ty, 1);
+    auto baseLb = mlir::LLVM::ConstantOp::create(rewriter, loc, i64Ty, 1);
+    auto mapBoundType = rewriter.getType<mlir::omp::MapBoundsType>();
+    return mlir::omp::MapBoundsOp::create(rewriter, loc, mapBoundType, lBound,
+                                          uBoundAndExt, uBoundAndExt, stride,
+                                          /*strideInBytes*/ false, baseLb);
+  }
+
   llvm::LogicalResult
   matchAndRewrite(mlir::omp::MapInfoOp curOp, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
@@ -69,13 +84,79 @@ struct MapInfoOpConversion
       return mlir::failure();
 
     llvm::SmallVector<mlir::NamedAttribute> newAttrs;
-    mlir::omp::MapInfoOp newOp;
+    mlir::omp::MapBoundsOp mapBoundsOp;
     for (mlir::NamedAttribute attr : curOp->getAttrs()) {
       if (auto typeAttr = mlir::dyn_cast<mlir::TypeAttr>(attr.getValue())) {
         mlir::Type newAttr;
         if (fir::isTypeWithDescriptor(typeAttr.getValue())) {
           newAttr = lowerTy().convertBoxTypeAsStruct(
               mlir::cast<fir::BaseBoxType>(typeAttr.getValue()));
+        } else if (fir::isa_char_string(fir::unwrapSequenceType(
+                       fir::unwrapPassByRefType(typeAttr.getValue()))) &&
+                   !characterWithDynamicLen(
+                       fir::unwrapPassByRefType(typeAttr.getValue()))) {
+          // Characters with a LEN param are represented as strings
+          // (array of characters), the lowering to LLVM dialect
+          // doesn't generate bounds for these (and this is not
+          // done at the initial lowering either) and there is
+          // minor inconsistencies in the variable types we
+          // create for the map without this step when converting
+          // to the LLVM dialect.
+          //
+          // For example, given the types:
+          //
+          //  1) CHARACTER(LEN=16), dimension(:,:), allocatable :: char_arr
+          //  2) CHARACTER(LEN=16), dimension(10,10) :: char_arr
+          //
+          // We get the FIR types (note for 1: we already peeled off the
+          // dynamic extents from the type at this stage, but the conversion
+          // to llvm dialect does that in any case, so the final result
+          // is the same):
+          //
+          //  1) !fir.char<1,16>
+          //  2) !fir.array<10x10x!fir.char<1,16>>
+          //
+          // Which are converted to the LLVM dialect types:
+          //
+          // 1) !llvm.array<16 x i8>
+          // 2) llvm.array<10 x array<10 x array<16 x i8>>
+          //
+          // And in both cases, we are missing the innermost bounds for
+          // the !fir.char<1,16> which is expanded into a 16 x i8 array
+          // in the conversion to LLVM dialect.
+          //
+          // The problem with this is that we would like to treat these
+          // cases identically and not have to create specialised
+          // lowerings for either of these in the lowering to LLVM-IR
+          // and treat them like any other array that passes through.
+          //
+          // To do so below, we generate an extra bound for the
+          // innermost array (the char type/string) using the LEN
+          // parameter of the character type. And we "canonicalize"
+          // the type, stripping it down to the base element type,
+          // which in this case is an i8. This effectively allows
+          // the lowering to treat this as a 1-D array with multiple
+          // bounds which it is capable of handling without any special
+          // casing.
+          // TODO: Handle dynamic LEN characters.
+          if (auto ct = mlir::dyn_cast_or_null<fir::CharacterType>(
+                  fir::unwrapSequenceType(typeAttr.getValue()))) {
+            newAttr = converter->convertType(
+                fir::unwrapSequenceType(typeAttr.getValue()));
+            if (auto type = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(newAttr))
+              newAttr = type.getElementType();
+            // We do not generate MapBoundsOps for the device pass, as
+            // MapBoundsOps are not generated for the device pass, as
+            // they're unused in the device lowering.
+            auto offloadMod =
+                llvm::dyn_cast_or_null<mlir::omp::OffloadModuleInterface>(
+                    *curOp->getParentOfType<mlir::ModuleOp>());
+            if (!offloadMod.getIsTargetDevice())
+              mapBoundsOp = createBoundsForCharString(rewriter, ct.getLen(),
+                                                      curOp.getLoc());
+          } else {
+            newAttr = converter->convertType(typeAttr.getValue());
+          }
         } else {
           newAttr = converter->convertType(typeAttr.getValue());
         }
@@ -85,8 +166,13 @@ struct MapInfoOpConversion
       }
     }
 
-    rewriter.replaceOpWithNewOp<mlir::omp::MapInfoOp>(
+    auto newOp = rewriter.replaceOpWithNewOp<mlir::omp::MapInfoOp>(
         curOp, resTypes, adaptor.getOperands(), newAttrs);
+    if (mapBoundsOp) {
+      rewriter.startOpModification(newOp);
+      newOp.getBoundsMutable().append(mlir::ValueRange{mapBoundsOp});
+      rewriter.finalizeOpModification(newOp);
+    }
 
     return mlir::success();
   }
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index ade8071..687007d 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -345,17 +345,6 @@ llvm::LogicalResult cuf::StreamCastOp::verify() {
   return checkStreamType(*this);
 }
 
-//===----------------------------------------------------------------------===//
-// SetAllocatorOp
-//===----------------------------------------------------------------------===//
-
-llvm::LogicalResult cuf::SetAllocatorIndexOp::verify() {
-  if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
-    return emitOpError(
-        "expect box to be a reference to class or box type value");
-  return mlir::success();
-}
-
 // Tablegen operators
 
 #define GET_OP_CLASSES
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index ffec4ffb..1a63b1b 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -879,6 +879,28 @@ void hlfir::CharTrimOp::getEffects(
 }
 
 //===----------------------------------------------------------------------===//
+// IndexOp
+//===----------------------------------------------------------------------===//
+
+llvm::LogicalResult hlfir::IndexOp::verify() {
+  mlir::Value substr = getSubstr();
+  mlir::Value str = getStr();
+
+  unsigned charKind = getCharacterKind(substr.getType());
+  if (charKind != getCharacterKind(str.getType()))
+    return emitOpError("character arguments must have the same KIND");
+
+  return mlir::success();
+}
+
+void hlfir::IndexOp::getEffects(
+    llvm::SmallVectorImpl<
+        mlir::SideEffects::EffectInstance<mlir::MemoryEffects::Effect>>
+        &effects) {
+  getIntrinsicEffects(getOperation(), effects);
+}
+
+//===----------------------------------------------------------------------===//
 // NumericalReductionOp
 //===----------------------------------------------------------------------===//
 
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
index a913cfa..4239e57 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
@@ -613,6 +613,45 @@ class CharTrimOpConversion
   }
 };
 
+class IndexOpConversion : public HlfirIntrinsicConversion<hlfir::IndexOp> {
+  using HlfirIntrinsicConversion<hlfir::IndexOp>::HlfirIntrinsicConversion;
+
+  llvm::LogicalResult
+  matchAndRewrite(hlfir::IndexOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    fir::FirOpBuilder builder{rewriter, op.getOperation()};
+    const mlir::Location &loc = op->getLoc();
+    hlfir::Entity substr{op.getSubstr()};
+    hlfir::Entity str{op.getStr()};
+
+    auto [substrExv, substrCleanUp] =
+        hlfir::translateToExtendedValue(loc, builder, substr);
+    auto [strExv, strCleanUp] =
+        hlfir::translateToExtendedValue(loc, builder, str);
+
+    mlir::Value back = op.getBack();
+    if (!back)
+      back = builder.createBool(loc, false);
+
+    mlir::Value result =
+        fir::runtime::genIndex(builder, loc, strExv, substrExv, back);
+    result = builder.createConvert(loc, op.getType(), result);
+    if (strCleanUp || substrCleanUp) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointAfter(op);
+      if (strCleanUp)
+        (*strCleanUp)();
+      if (substrCleanUp)
+        (*substrCleanUp)();
+    }
+    auto resultEntity = hlfir::EntityWithAttributes{result};
+
+    processReturnValue(op, resultEntity, /*mustBeFreed=*/false, builder,
+                       rewriter);
+    return mlir::success();
+  }
+};
+
 class LowerHLFIRIntrinsics
     : public hlfir::impl::LowerHLFIRIntrinsicsBase<LowerHLFIRIntrinsics> {
 public:
@@ -627,7 +666,7 @@ public:
         MaxvalOpConversion, MinvalOpConversion, MinlocOpConversion,
         MaxlocOpConversion, ArrayShiftOpConversion<hlfir::CShiftOp>,
         ArrayShiftOpConversion<hlfir::EOShiftOp>, ReshapeOpConversion,
-        CmpCharOpConversion, CharTrimOpConversion>(context);
+        CmpCharOpConversion, CharTrimOpConversion, IndexOpConversion>(context);
 
     // While conceptually this pass is performing dialect conversion, we use
     // pattern rewrites here instead of dialect conversion because this pass
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index e0aebd0..b85ee7e 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -26,6 +26,7 @@ add_flang_library(FlangOpenMPTransforms
   FIRSupport
   FortranSupport
   HLFIRDialect
+  FortranUtils
 
   MLIR_DEPS
   ${dialect_libs}
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index de3b8d7..6c71924 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -6,17 +6,23 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "flang/Optimizer/Builder/DirectivesCommon.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/OpenMP/Utils.h"
 #include "flang/Support/OpenMP-utils.h"
+#include "flang/Utils/OpenMP.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
 
 namespace flangomp {
 #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
@@ -107,6 +113,33 @@ private:
 
 using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;
 
+/// Collect the list of values used inside the loop but defined outside of it.
+void collectLoopLiveIns(fir::DoConcurrentLoopOp loop,
+                        llvm::SmallVectorImpl<mlir::Value> &liveIns) {
+  llvm::SmallDenseSet<mlir::Value> seenValues;
+  llvm::SmallPtrSet<mlir::Operation *, 8> seenOps;
+
+  for (auto [lb, ub, st] : llvm::zip_equal(
+           loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
+    liveIns.push_back(lb);
+    liveIns.push_back(ub);
+    liveIns.push_back(st);
+  }
+
+  mlir::visitUsedValuesDefinedAbove(
+      loop.getRegion(), [&](mlir::OpOperand *operand) {
+        if (!seenValues.insert(operand->get()).second)
+          return;
+
+        mlir::Operation *definingOp = operand->get().getDefiningOp();
+        // We want to collect ops corresponding to live-ins only once.
+        if (definingOp && !seenOps.insert(definingOp).second)
+          return;
+
+        liveIns.push_back(operand->get());
+      });
+}
+
 /// Collects values that are local to a loop: "loop-local values". A loop-local
 /// value is one that is used exclusively inside the loop but allocated outside
 /// of it. This usually corresponds to temporary values that are used inside the
@@ -168,6 +201,52 @@ static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
 
 class DoConcurrentConversion
     : public mlir::OpConversionPattern<fir::DoConcurrentOp> {
+private:
+  struct TargetDeclareShapeCreationInfo {
+    // Note: We use `std::vector` (rather than `llvm::SmallVector` as usual) to
+    // interface more easily `ShapeShiftOp::getOrigins()` which returns
+    // `std::vector`.
+    std::vector<mlir::Value> startIndices;
+    std::vector<mlir::Value> extents;
+
+    TargetDeclareShapeCreationInfo(mlir::Value liveIn) {
+      mlir::Value shape = nullptr;
+      mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
+      auto declareOp =
+          mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);
+
+      if (declareOp != nullptr)
+        shape = declareOp.getShape();
+
+      if (!shape)
+        return;
+
+      auto shapeOp =
+          mlir::dyn_cast_if_present<fir::ShapeOp>(shape.getDefiningOp());
+      auto shapeShiftOp =
+          mlir::dyn_cast_if_present<fir::ShapeShiftOp>(shape.getDefiningOp());
+
+      if (!shapeOp && !shapeShiftOp)
+        TODO(liveIn.getLoc(),
+             "Shapes not defined by `fir.shape` or `fir.shape_shift` op's are"
+             "not supported yet.");
+
+      if (shapeShiftOp != nullptr)
+        startIndices = shapeShiftOp.getOrigins();
+
+      extents = shapeOp != nullptr
+                    ? std::vector<mlir::Value>(shapeOp.getExtents().begin(),
+                                               shapeOp.getExtents().end())
+                    : shapeShiftOp.getExtents();
+    }
+
+    bool isShapedValue() const { return !extents.empty(); }
+    bool isShapeShiftedValue() const { return !startIndices.empty(); }
+  };
+
+  using LiveInShapeInfoMap =
+      llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;
+
 public:
   using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern;
 
@@ -182,10 +261,6 @@ public:
   mlir::LogicalResult
   matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
-    if (mapToDevice)
-      return doLoop.emitError(
-          "not yet implemented: Mapping `do concurrent` loops to device");
-
     looputils::InductionVariableInfos ivInfos;
     auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
         doLoop.getRegion().back().getTerminator());
@@ -196,20 +271,72 @@ public:
     for (mlir::Value indVar : *indVars)
       ivInfos.emplace_back(loop, indVar);
 
+    llvm::SmallVector<mlir::Value> loopNestLiveIns;
+    looputils::collectLoopLiveIns(loop, loopNestLiveIns);
+    assert(!loopNestLiveIns.empty());
+
     llvm::SetVector<mlir::Value> locals;
     looputils::collectLoopLocalValues(loop, locals);
 
+    // We do not want to map "loop-local" values to the device through
+    // `omp.map.info` ops. Therefore, we remove them from the list of live-ins.
+    loopNestLiveIns.erase(llvm::remove_if(loopNestLiveIns,
+                                          [&](mlir::Value liveIn) {
+                                            return locals.contains(liveIn);
+                                          }),
+                          loopNestLiveIns.end());
+
+    mlir::omp::TargetOp targetOp;
+    mlir::omp::LoopNestOperands loopNestClauseOps;
+
     mlir::IRMapping mapper;
+
+    if (mapToDevice) {
+      mlir::ModuleOp module = doLoop->getParentOfType<mlir::ModuleOp>();
+      bool isTargetDevice =
+          llvm::cast<mlir::omp::OffloadModuleInterface>(*module)
+              .getIsTargetDevice();
+
+      mlir::omp::TargetOperands targetClauseOps;
+      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
+                           loopNestClauseOps,
+                           isTargetDevice ? nullptr : &targetClauseOps);
+
+      LiveInShapeInfoMap liveInShapeInfoMap;
+      fir::FirOpBuilder builder(
+          rewriter,
+          fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));
+
+      for (mlir::Value liveIn : loopNestLiveIns) {
+        targetClauseOps.mapVars.push_back(
+            genMapInfoOpForLiveIn(builder, liveIn));
+        liveInShapeInfoMap.insert(
+            {liveIn, TargetDeclareShapeCreationInfo(liveIn)});
+      }
+
+      targetOp =
+          genTargetOp(doLoop.getLoc(), rewriter, mapper, loopNestLiveIns,
+                      targetClauseOps, loopNestClauseOps, liveInShapeInfoMap);
+      genTeamsOp(doLoop.getLoc(), rewriter);
+    }
+
     mlir::omp::ParallelOp parallelOp =
         genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
-    mlir::omp::LoopNestOperands loopNestClauseOps;
-    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
-                         loopNestClauseOps);
+
+    // Only set as composite when part of `distribute parallel do`.
+    parallelOp.setComposite(mapToDevice);
+
+    if (!mapToDevice)
+      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
+                           loopNestClauseOps);
 
     for (mlir::Value local : locals)
       looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                         rewriter);
 
+    if (mapToDevice)
+      genDistributeOp(doLoop.getLoc(), rewriter).setComposite(/*val=*/true);
+
     mlir::omp::LoopNestOp ompLoopNest =
         genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
                     /*isComposite=*/mapToDevice);
@@ -284,11 +411,11 @@ private:
     return result;
   }
 
-  void
-  genLoopNestClauseOps(mlir::Location loc,
-                       mlir::ConversionPatternRewriter &rewriter,
-                       fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
-                       mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+  void genLoopNestClauseOps(
+      mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+      fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
+      mlir::omp::LoopNestOperands &loopNestClauseOps,
+      mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
     assert(loopNestClauseOps.loopLowerBounds.empty() &&
            "Loop nest bounds were already emitted!");
 
@@ -297,11 +424,21 @@ private:
       bounds.push_back(var.getDefiningOp()->getResult(0));
     };
 
+    auto hostEvalCapture = [&](mlir::Value var,
+                               llvm::SmallVectorImpl<mlir::Value> &bounds) {
+      populateBounds(var, bounds);
+
+      // Ensure that loop-nest bounds are evaluated in the host and forwarded to
+      // the nested omp constructs when we map to the device.
+      if (targetClauseOps)
+        targetClauseOps->hostEvalVars.push_back(var);
+    };
+
     for (auto [lb, ub, st] : llvm::zip_equal(
              loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
-      populateBounds(lb, loopNestClauseOps.loopLowerBounds);
-      populateBounds(ub, loopNestClauseOps.loopUpperBounds);
-      populateBounds(st, loopNestClauseOps.loopSteps);
+      hostEvalCapture(lb, loopNestClauseOps.loopLowerBounds);
+      hostEvalCapture(ub, loopNestClauseOps.loopUpperBounds);
+      hostEvalCapture(st, loopNestClauseOps.loopSteps);
     }
 
     loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
@@ -439,6 +576,247 @@ private:
     return loopNestOp;
   }
 
+  void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
+                    mlir::Value rawAddr,
+                    llvm::SmallVectorImpl<mlir::Value> &boundsOps) const {
+    fir::ExtendedValue extVal =
+        hlfir::translateToExtendedValue(rawAddr.getLoc(), builder,
+                                        hlfir::Entity{liveIn},
+                                        /*contiguousHint=*/
+                                        true)
+            .first;
+    fir::factory::AddrAndBoundsInfo info = fir::factory::getDataOperandBaseAddr(
+        builder, rawAddr, /*isOptional=*/false, rawAddr.getLoc());
+    boundsOps = fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp,
+                                                   mlir::omp::MapBoundsType>(
+        builder, info, extVal,
+        /*dataExvIsAssumedSize=*/false, rawAddr.getLoc());
+  }
+
+  mlir::omp::MapInfoOp genMapInfoOpForLiveIn(fir::FirOpBuilder &builder,
+                                             mlir::Value liveIn) const {
+    mlir::Value rawAddr = liveIn;
+    llvm::StringRef name;
+
+    mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
+    auto declareOp =
+        mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);
+
+    if (declareOp != nullptr) {
+      // Use the raw address to avoid unboxing `fir.box` values whenever
+      // possible. Put differently, if we have access to the direct value memory
+      // reference/address, we use it.
+      rawAddr = declareOp.getOriginalBase();
+      name = declareOp.getUniqName();
+    }
+
+    if (!llvm::isa<mlir::omp::PointerLikeType>(rawAddr.getType())) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointAfter(liveInDefiningOp);
+      auto copyVal = builder.createTemporary(liveIn.getLoc(), liveIn.getType());
+      builder.createStoreWithConvert(copyVal.getLoc(), liveIn, copyVal);
+      rawAddr = copyVal;
+    }
+
+    mlir::Type liveInType = liveIn.getType();
+    mlir::Type eleType = liveInType;
+    if (auto refType = mlir::dyn_cast<fir::ReferenceType>(liveInType))
+      eleType = refType.getElementType();
+
+    llvm::omp::OpenMPOffloadMappingFlags mapFlag =
+        llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT;
+    mlir::omp::VariableCaptureKind captureKind =
+        mlir::omp::VariableCaptureKind::ByRef;
+
+    if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) {
+      captureKind = mlir::omp::VariableCaptureKind::ByCopy;
+    } else if (!fir::isa_builtin_cptr_type(eleType)) {
+      mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO;
+      mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM;
+    }
+
+    llvm::SmallVector<mlir::Value> boundsOps;
+    genBoundsOps(builder, liveIn, rawAddr, boundsOps);
+
+    return Fortran::utils::openmp::createMapInfoOp(
+        builder, liveIn.getLoc(), rawAddr,
+        /*varPtrPtr=*/{}, name.str(), boundsOps,
+        /*members=*/{},
+        /*membersIndex=*/mlir::ArrayAttr{},
+        static_cast<
+            std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
+            mapFlag),
+        captureKind, rawAddr.getType());
+  }
+
+  mlir::omp::TargetOp
+  genTargetOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+              mlir::IRMapping &mapper, llvm::ArrayRef<mlir::Value> mappedVars,
+              mlir::omp::TargetOperands &clauseOps,
+              mlir::omp::LoopNestOperands &loopNestClauseOps,
+              const LiveInShapeInfoMap &liveInShapeInfoMap) const {
+    auto targetOp = rewriter.create<mlir::omp::TargetOp>(loc, clauseOps);
+    auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp);
+
+    mlir::Region &region = targetOp.getRegion();
+
+    llvm::SmallVector<mlir::Type> regionArgTypes;
+    llvm::SmallVector<mlir::Location> regionArgLocs;
+
+    for (auto var : llvm::concat<const mlir::Value>(clauseOps.hostEvalVars,
+                                                    clauseOps.mapVars)) {
+      regionArgTypes.push_back(var.getType());
+      regionArgLocs.push_back(var.getLoc());
+    }
+
+    rewriter.createBlock(&region, {}, regionArgTypes, regionArgLocs);
+    fir::FirOpBuilder builder(
+        rewriter,
+        fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>()));
+
+    // Within the loop, it is possible that we discover other values that need
+    // to be mapped to the target region (the shape info values for arrays, for
+    // example). Therefore, the map block args might be extended and resized.
+    // Hence, we invoke `argIface.getMapBlockArgs()` every iteration to make
+    // sure we access the proper vector of data.
+    int idx = 0;
+    for (auto [mapInfoOp, mappedVar] :
+         llvm::zip_equal(clauseOps.mapVars, mappedVars)) {
+      auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp());
+      hlfir::DeclareOp liveInDeclare =
+          genLiveInDeclare(builder, targetOp, argIface.getMapBlockArgs()[idx],
+                           miOp, liveInShapeInfoMap.at(mappedVar));
+      ++idx;
+
+      // If `mappedVar.getDefiningOp()` is a `fir::BoxAddrOp`, we probably
+      // need to "unpack" the box by getting the defining op of it's value.
+      // However, we did not hit this case in reality yet so leaving it as a
+      // todo for now.
+      if (mlir::isa<fir::BoxAddrOp>(mappedVar.getDefiningOp()))
+        TODO(mappedVar.getLoc(),
+             "Mapped variabled defined by `BoxAddrOp` are not supported yet");
+
+      auto mapHostValueToDevice = [&](mlir::Value hostValue,
+                                      mlir::Value deviceValue) {
+        if (!llvm::isa<mlir::omp::PointerLikeType>(hostValue.getType()))
+          mapper.map(hostValue,
+                     builder.loadIfRef(hostValue.getLoc(), deviceValue));
+        else
+          mapper.map(hostValue, deviceValue);
+      };
+
+      mapHostValueToDevice(mappedVar, liveInDeclare.getOriginalBase());
+
+      if (auto origDeclareOp = mlir::dyn_cast_if_present<hlfir::DeclareOp>(
+              mappedVar.getDefiningOp()))
+        mapHostValueToDevice(origDeclareOp.getBase(), liveInDeclare.getBase());
+    }
+
+    for (auto [arg, hostEval] : llvm::zip_equal(argIface.getHostEvalBlockArgs(),
+                                                clauseOps.hostEvalVars))
+      mapper.map(hostEval, arg);
+
+    for (unsigned i = 0; i < loopNestClauseOps.loopLowerBounds.size(); ++i) {
+      loopNestClauseOps.loopLowerBounds[i] =
+          mapper.lookup(loopNestClauseOps.loopLowerBounds[i]);
+      loopNestClauseOps.loopUpperBounds[i] =
+          mapper.lookup(loopNestClauseOps.loopUpperBounds[i]);
+      loopNestClauseOps.loopSteps[i] =
+          mapper.lookup(loopNestClauseOps.loopSteps[i]);
+    }
+
+    // Check if cloning the bounds introduced any dependency on the outer
+    // region. If so, then either clone them as well if they are
+    // MemoryEffectFree, or else copy them to a new temporary and add them to
+    // the map and block_argument lists and replace their uses with the new
+    // temporary.
+    Fortran::utils::openmp::cloneOrMapRegionOutsiders(builder, targetOp);
+    rewriter.setInsertionPoint(
+        rewriter.create<mlir::omp::TerminatorOp>(targetOp.getLoc()));
+
+    return targetOp;
+  }
+
+  hlfir::DeclareOp genLiveInDeclare(
+      fir::FirOpBuilder &builder, mlir::omp::TargetOp targetOp,
+      mlir::Value liveInArg, mlir::omp::MapInfoOp liveInMapInfoOp,
+      const TargetDeclareShapeCreationInfo &targetShapeCreationInfo) const {
+    mlir::Type liveInType = liveInArg.getType();
+    std::string liveInName = liveInMapInfoOp.getName().has_value()
+                                 ? liveInMapInfoOp.getName().value().str()
+                                 : std::string("");
+    if (fir::isa_ref_type(liveInType))
+      liveInType = fir::unwrapRefType(liveInType);
+
+    mlir::Value shape = [&]() -> mlir::Value {
+      if (!targetShapeCreationInfo.isShapedValue())
+        return {};
+
+      llvm::SmallVector<mlir::Value> extentOperands;
+      llvm::SmallVector<mlir::Value> startIndexOperands;
+
+      if (targetShapeCreationInfo.isShapeShiftedValue()) {
+        llvm::SmallVector<mlir::Value> shapeShiftOperands;
+
+        size_t shapeIdx = 0;
+        for (auto [startIndex, extent] :
+             llvm::zip_equal(targetShapeCreationInfo.startIndices,
+                             targetShapeCreationInfo.extents)) {
+          shapeShiftOperands.push_back(
+              Fortran::utils::openmp::mapTemporaryValue(
+                  builder, targetOp, startIndex,
+                  liveInName + ".start_idx.dim" + std::to_string(shapeIdx)));
+          shapeShiftOperands.push_back(
+              Fortran::utils::openmp::mapTemporaryValue(
+                  builder, targetOp, extent,
+                  liveInName + ".extent.dim" + std::to_string(shapeIdx)));
+          ++shapeIdx;
+        }
+
+        auto shapeShiftType = fir::ShapeShiftType::get(
+            builder.getContext(), shapeShiftOperands.size() / 2);
+        return builder.create<fir::ShapeShiftOp>(
+            liveInArg.getLoc(), shapeShiftType, shapeShiftOperands);
+      }
+
+      llvm::SmallVector<mlir::Value> shapeOperands;
+      size_t shapeIdx = 0;
+      for (auto extent : targetShapeCreationInfo.extents) {
+        shapeOperands.push_back(Fortran::utils::openmp::mapTemporaryValue(
+            builder, targetOp, extent,
+            liveInName + ".extent.dim" + std::to_string(shapeIdx)));
+        ++shapeIdx;
+      }
+
+      return builder.create<fir::ShapeOp>(liveInArg.getLoc(), shapeOperands);
+    }();
+
+    return builder.create<hlfir::DeclareOp>(liveInArg.getLoc(), liveInArg,
+                                            liveInName, shape);
+  }
+
+  mlir::omp::TeamsOp
+  genTeamsOp(mlir::Location loc,
+             mlir::ConversionPatternRewriter &rewriter) const {
+    auto teamsOp = rewriter.create<mlir::omp::TeamsOp>(
+        loc, /*clauses=*/mlir::omp::TeamsOperands{});
+
+    rewriter.createBlock(&teamsOp.getRegion());
+    rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+
+    return teamsOp;
+  }
+
+  mlir::omp::DistributeOp
+  genDistributeOp(mlir::Location loc,
+                  mlir::ConversionPatternRewriter &rewriter) const {
+    auto distOp = rewriter.create<mlir::omp::DistributeOp>(
+        loc, /*clauses=*/mlir::omp::DistributeOperands{});
+
+    rewriter.createBlock(&distOp.getRegion());
+    return distOp;
+  }
+
   bool mapToDevice;
   llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
   mlir::SymbolTable &moduleSymbolTable;
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 9834b04..609a1fc 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -557,8 +557,8 @@ static mlir::Value emboxSrc(mlir::PatternRewriter &rewriter,
     mlir::Value src = op.getSrc();
     if (srcTy.isInteger(1)) {
       // i1 is not a supported type in the descriptor and it is actually coming
-      // from a LOGICAL constant. Store it as a fir.logical.
-      srcTy = fir::LogicalType::get(rewriter.getContext(), 4);
+      // from a LOGICAL constant. Use the destination type to avoid mismatch.
+      srcTy = dstEleTy;
       src = createConvertOp(rewriter, loc, srcTy, src);
       addr = builder.createTemporary(loc, srcTy);
       fir::StoreOp::create(builder, loc, src, addr);
@@ -650,7 +650,7 @@ struct CUFDataTransferOpConversion
 
       if (fir::isa_trivial(srcTy) && !fir::isa_trivial(dstTy)) {
         // Initialization of an array from a scalar value should be implemented
-        // via a kernel launch. Use the flan runtime via the Assign function
+        // via a kernel launch. Use the flang runtime via the Assign function
         // until we have more infrastructure.
         mlir::Value src = emboxSrc(rewriter, op, symtab);
         mlir::Value dst = emboxDst(rewriter, op, symtab);
@@ -928,34 +928,6 @@ struct CUFSyncDescriptorOpConversion
   }
 };
 
-struct CUFSetAllocatorIndexOpConversion
-    : public mlir::OpRewritePattern<cuf::SetAllocatorIndexOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  mlir::LogicalResult
-  matchAndRewrite(cuf::SetAllocatorIndexOp op,
-                  mlir::PatternRewriter &rewriter) const override {
-    auto mod = op->getParentOfType<mlir::ModuleOp>();
-    fir::FirOpBuilder builder(rewriter, mod);
-    mlir::Location loc = op.getLoc();
-    int idx = kDefaultAllocator;
-    if (op.getDataAttr() == cuf::DataAttribute::Device) {
-      idx = kDeviceAllocatorPos;
-    } else if (op.getDataAttr() == cuf::DataAttribute::Managed) {
-      idx = kManagedAllocatorPos;
-    } else if (op.getDataAttr() == cuf::DataAttribute::Unified) {
-      idx = kUnifiedAllocatorPos;
-    } else if (op.getDataAttr() == cuf::DataAttribute::Pinned) {
-      idx = kPinnedAllocatorPos;
-    }
-    mlir::Value index =
-        builder.createIntegerConstant(loc, builder.getI32Type(), idx);
-    fir::runtime::cuda::genSetAllocatorIndex(builder, loc, op.getBox(), index);
-    op.erase();
-    return mlir::success();
-  }
-};
-
 class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
 public:
   void runOnOperation() override {
@@ -1017,8 +989,8 @@ void cuf::populateCUFToFIRConversionPatterns(
     const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) {
   patterns.insert<CUFAllocOpConversion>(patterns.getContext(), &dl, &converter);
   patterns.insert<CUFAllocateOpConversion, CUFDeallocateOpConversion,
-                  CUFFreeOpConversion, CUFSyncDescriptorOpConversion,
-                  CUFSetAllocatorIndexOpConversion>(patterns.getContext());
+                  CUFFreeOpConversion, CUFSyncDescriptorOpConversion>(
+      patterns.getContext());
   patterns.insert<CUFDataTransferOpConversion>(patterns.getContext(), symtab,
                                                &dl, &converter);
   patterns.insert<CUFLaunchOpConversion, CUFDeviceAddressOpConversion>(
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index ce46a86..68e0acd 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -802,6 +802,10 @@ TYPE_PARSER(construct<OmpFailClause>(
     "RELEASE" >> pure(common::OmpMemoryOrderType::Release) ||
     "SEQ_CST" >> pure(common::OmpMemoryOrderType::Seq_Cst)))
 
+TYPE_PARSER(construct<OmpGraphIdClause>(expr))
+
+TYPE_PARSER(construct<OmpGraphResetClause>(expr))
+
 // 2.5 PROC_BIND (MASTER | CLOSE | PRIMARY | SPREAD)
 TYPE_PARSER(construct<OmpProcBindClause>(
     "CLOSE" >> pure(OmpProcBindClause::AffinityPolicy::Close) ||
@@ -1102,6 +1106,11 @@ TYPE_PARSER( //
     "FULL" >> construct<OmpClause>(construct<OmpClause::Full>()) ||
     "GRAINSIZE" >> construct<OmpClause>(construct<OmpClause::Grainsize>(
                        parenthesized(Parser<OmpGrainsizeClause>{}))) ||
+    "GRAPH_ID" >> construct<OmpClause>(construct<OmpClause::GraphId>(
+                      parenthesized(Parser<OmpGraphIdClause>{}))) ||
+    "GRAPH_RESET" >>
+        construct<OmpClause>(construct<OmpClause::GraphReset>(
+            maybe(parenthesized(Parser<OmpGraphResetClause>{})))) ||
     "HAS_DEVICE_ADDR" >>
         construct<OmpClause>(construct<OmpClause::HasDeviceAddr>(
             parenthesized(Parser<OmpObjectList>{}))) ||
@@ -1872,6 +1881,7 @@ TYPE_PARSER( //
         llvm::omp::Directive::OMPD_target_teams_workdistribute) ||
     MakeBlockConstruct(llvm::omp::Directive::OMPD_target) ||
     MakeBlockConstruct(llvm::omp::Directive::OMPD_task) ||
+    MakeBlockConstruct(llvm::omp::Directive::OMPD_taskgraph) ||
     MakeBlockConstruct(llvm::omp::Directive::OMPD_taskgroup) ||
     MakeBlockConstruct(llvm::omp::Directive::OMPD_teams) ||
     MakeBlockConstruct(llvm::omp::Directive::OMPD_teams_workdistribute) ||
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 85d79a00..d1654a3 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2793,6 +2793,8 @@ CHECK_SIMPLE_CLAUSE(Final, OMPC_final)
 CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush)
 CHECK_SIMPLE_CLAUSE(Full, OMPC_full)
 CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize)
+CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id)
+CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset)
 CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds)
 CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive)
 CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer)
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index ccccf60..3f048ab 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -3700,7 +3700,10 @@ static MaybeExpr NumericUnaryHelper(ExpressionAnalyzer &context,
       analyzer.CheckForNullPointer();
       analyzer.CheckForAssumedRank();
       if (opr == NumericOperator::Add) {
-        return analyzer.MoveExpr(0);
+        // +x -> (x), not a bare x, because the bounds of the argument must
+        // not be exposed to allocatable assignments or structure constructor
+        // components.
+        return Parenthesize(analyzer.MoveExpr(0));
       } else {
         return Negation(context.GetContextualMessages(), analyzer.MoveExpr(0));
       }
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 43f12c2..16b895d 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -15,6 +15,7 @@
 #include "flang/Evaluate/fold.h"
 #include "flang/Evaluate/tools.h"
 #include "flang/Evaluate/type.h"
+#include "flang/Parser/openmp-utils.h"
 #include "flang/Parser/parse-tree-visitor.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Parser/tools.h"
@@ -579,6 +580,12 @@ public:
   bool Pre(const parser::OpenMPAllocatorsConstruct &);
   void Post(const parser::OpenMPAllocatorsConstruct &);
 
+  bool Pre(const parser::OpenMPUtilityConstruct &x) {
+    PushContext(x.source, parser::omp::GetOmpDirectiveName(x).v);
+    return true;
+  }
+  void Post(const parser::OpenMPUtilityConstruct &) { PopContext(); }
+
   bool Pre(const parser::OmpDeclareVariantDirective &x) {
     PushContext(x.source, llvm::omp::Directive::OMPD_declare_variant);
     return true;
@@ -856,7 +863,23 @@ public:
   const parser::OmpClause *GetAssociatedClause() { return associatedClause; }
 
 private:
-  std::int64_t GetAssociatedLoopLevelFromClauses(const parser::OmpClauseList &);
+  /// Given a vector of loop levels and a vector of corresponding clauses find
+  /// the largest loop level and set the associated loop level to the found
+  /// maximum. This is used for error handling to ensure that the number of
+  /// affected loops is not larger that the number of available loops.
+  std::int64_t SetAssociatedMaxClause(llvm::SmallVector<std::int64_t> &,
+      llvm::SmallVector<const parser::OmpClause *> &);
+  std::int64_t GetNumAffectedLoopsFromLoopConstruct(
+      const parser::OpenMPLoopConstruct &);
+  void CollectNumAffectedLoopsFromLoopConstruct(
+      const parser::OpenMPLoopConstruct &, llvm::SmallVector<std::int64_t> &,
+      llvm::SmallVector<const parser::OmpClause *> &);
+  void CollectNumAffectedLoopsFromInnerLoopContruct(
+      const parser::OpenMPLoopConstruct &, llvm::SmallVector<std::int64_t> &,
+      llvm::SmallVector<const parser::OmpClause *> &);
+  void CollectNumAffectedLoopsFromClauses(const parser::OmpClauseList &,
+      llvm::SmallVector<std::int64_t> &,
+      llvm::SmallVector<const parser::OmpClause *> &);
 
   Symbol::Flags dataSharingAttributeFlags{Symbol::Flag::OmpShared,
       Symbol::Flag::OmpPrivate, Symbol::Flag::OmpFirstPrivate,
@@ -1774,6 +1797,7 @@ bool OmpAttributeVisitor::Pre(const parser::OmpBlockConstruct &x) {
   case llvm::omp::Directive::OMPD_target:
   case llvm::omp::Directive::OMPD_target_data:
   case llvm::omp::Directive::OMPD_task:
+  case llvm::omp::Directive::OMPD_taskgraph:
   case llvm::omp::Directive::OMPD_taskgroup:
   case llvm::omp::Directive::OMPD_teams:
   case llvm::omp::Directive::OMPD_workdistribute:
@@ -1868,7 +1892,6 @@ bool OmpAttributeVisitor::Pre(
 bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) {
   const auto &beginLoopDir{std::get<parser::OmpBeginLoopDirective>(x.t)};
   const auto &beginDir{std::get<parser::OmpLoopDirective>(beginLoopDir.t)};
-  const auto &clauseList{std::get<parser::OmpClauseList>(beginLoopDir.t)};
   switch (beginDir.v) {
   case llvm::omp::Directive::OMPD_distribute:
   case llvm::omp::Directive::OMPD_distribute_parallel_do:
@@ -1919,7 +1942,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) {
       beginDir.v == llvm::omp::Directive::OMPD_target_loop)
     IssueNonConformanceWarning(beginDir.v, beginDir.source, 52);
   ClearDataSharingAttributeObjects();
-  SetContextAssociatedLoopLevel(GetAssociatedLoopLevelFromClauses(clauseList));
+  SetContextAssociatedLoopLevel(GetNumAffectedLoopsFromLoopConstruct(x));
 
   if (beginDir.v == llvm::omp::Directive::OMPD_do) {
     auto &optLoopCons = std::get<std::optional<parser::NestedConstruct>>(x.t);
@@ -1933,7 +1956,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) {
     }
   }
   PrivatizeAssociatedLoopIndexAndCheckLoopLevel(x);
-  ordCollapseLevel = GetAssociatedLoopLevelFromClauses(clauseList) + 1;
+  ordCollapseLevel = GetNumAffectedLoopsFromLoopConstruct(x) + 1;
   return true;
 }
 
@@ -2021,44 +2044,111 @@ bool OmpAttributeVisitor::Pre(const parser::DoConstruct &x) {
   return true;
 }
 
-std::int64_t OmpAttributeVisitor::GetAssociatedLoopLevelFromClauses(
-    const parser::OmpClauseList &x) {
-  std::int64_t orderedLevel{0};
-  std::int64_t collapseLevel{0};
+static bool isSizesClause(const parser::OmpClause *clause) {
+  return std::holds_alternative<parser::OmpClause::Sizes>(clause->u);
+}
+
+std::int64_t OmpAttributeVisitor::SetAssociatedMaxClause(
+    llvm::SmallVector<std::int64_t> &levels,
+    llvm::SmallVector<const parser::OmpClause *> &clauses) {
+
+  // Find the tile level to ensure that the COLLAPSE clause value
+  // does not exeed the number of tiled loops.
+  std::int64_t tileLevel = 0;
+  for (auto [level, clause] : llvm::zip_equal(levels, clauses))
+    if (isSizesClause(clause))
+      tileLevel = level;
+
+  std::int64_t maxLevel = 1;
+  const parser::OmpClause *maxClause = nullptr;
+  for (auto [level, clause] : llvm::zip_equal(levels, clauses)) {
+    if (tileLevel > 0 && tileLevel < level) {
+      context_.Say(clause->source,
+          "The value of the parameter in the COLLAPSE clause must"
+          " not be larger than the number of the number of tiled loops"
+          " because collapse currently is limited to independent loop"
+          " iterations."_err_en_US);
+      return 1;
+    }
+
+    if (level > maxLevel) {
+      maxLevel = level;
+      maxClause = clause;
+    }
+  }
+  if (maxClause)
+    SetAssociatedClause(maxClause);
+  return maxLevel;
+}
+
+std::int64_t OmpAttributeVisitor::GetNumAffectedLoopsFromLoopConstruct(
+    const parser::OpenMPLoopConstruct &x) {
+  llvm::SmallVector<std::int64_t> levels;
+  llvm::SmallVector<const parser::OmpClause *> clauses;
+
+  CollectNumAffectedLoopsFromLoopConstruct(x, levels, clauses);
+  return SetAssociatedMaxClause(levels, clauses);
+}
 
-  const parser::OmpClause *ordClause{nullptr};
-  const parser::OmpClause *collClause{nullptr};
+void OmpAttributeVisitor::CollectNumAffectedLoopsFromLoopConstruct(
+    const parser::OpenMPLoopConstruct &x,
+    llvm::SmallVector<std::int64_t> &levels,
+    llvm::SmallVector<const parser::OmpClause *> &clauses) {
+  const auto &beginLoopDir{std::get<parser::OmpBeginLoopDirective>(x.t)};
+  const auto &clauseList{std::get<parser::OmpClauseList>(beginLoopDir.t)};
+
+  CollectNumAffectedLoopsFromClauses(clauseList, levels, clauses);
+  CollectNumAffectedLoopsFromInnerLoopContruct(x, levels, clauses);
+}
+
+void OmpAttributeVisitor::CollectNumAffectedLoopsFromInnerLoopContruct(
+    const parser::OpenMPLoopConstruct &x,
+    llvm::SmallVector<std::int64_t> &levels,
+    llvm::SmallVector<const parser::OmpClause *> &clauses) {
 
+  const auto &nestedOptional =
+      std::get<std::optional<parser::NestedConstruct>>(x.t);
+  assert(nestedOptional.has_value() &&
+      "Expected a DoConstruct or OpenMPLoopConstruct");
+  const auto *innerConstruct =
+      std::get_if<common::Indirection<parser::OpenMPLoopConstruct>>(
+          &(nestedOptional.value()));
+
+  if (innerConstruct) {
+    CollectNumAffectedLoopsFromLoopConstruct(
+        innerConstruct->value(), levels, clauses);
+  }
+}
+
+void OmpAttributeVisitor::CollectNumAffectedLoopsFromClauses(
+    const parser::OmpClauseList &x, llvm::SmallVector<std::int64_t> &levels,
+    llvm::SmallVector<const parser::OmpClause *> &clauses) {
   for (const auto &clause : x.v) {
-    if (const auto *orderedClause{
+    if (const auto oclause{
             std::get_if<parser::OmpClause::Ordered>(&clause.u)}) {
-      if (const auto v{EvaluateInt64(context_, orderedClause->v)}) {
-        orderedLevel = *v;
+      std::int64_t level = 0;
+      if (const auto v{EvaluateInt64(context_, oclause->v)}) {
+        level = *v;
       }
-      ordClause = &clause;
+      levels.push_back(level);
+      clauses.push_back(&clause);
     }
-    if (const auto *collapseClause{
+
+    if (const auto cclause{
             std::get_if<parser::OmpClause::Collapse>(&clause.u)}) {
-      if (const auto v{EvaluateInt64(context_, collapseClause->v)}) {
-        collapseLevel = *v;
+      std::int64_t level = 0;
+      if (const auto v{EvaluateInt64(context_, cclause->v)}) {
+        level = *v;
       }
-      collClause = &clause;
+      levels.push_back(level);
+      clauses.push_back(&clause);
     }
-  }
 
-  if (orderedLevel && (!collapseLevel || orderedLevel >= collapseLevel)) {
-    SetAssociatedClause(ordClause);
-    return orderedLevel;
-  } else if (!orderedLevel && collapseLevel) {
-    SetAssociatedClause(collClause);
-    return collapseLevel;
-  } else {
-    SetAssociatedClause(nullptr);
+    if (const auto tclause{std::get_if<parser::OmpClause::Sizes>(&clause.u)}) {
+      levels.push_back(tclause->v.size());
+      clauses.push_back(&clause);
+    }
   }
-  // orderedLevel < collapseLevel is an error handled in structural
-  // checks
-
-  return 1; // default is outermost loop
 }
 
 // 2.15.1.1 Data-sharing Attribute Rules - Predetermined
@@ -2090,10 +2180,21 @@ void OmpAttributeVisitor::PrivatizeAssociatedLoopIndexAndCheckLoopLevel(
   const parser::OmpClause *clause{GetAssociatedClause()};
   bool hasCollapseClause{
       clause ? (clause->Id() == llvm::omp::OMPC_collapse) : false};
+  const parser::OpenMPLoopConstruct *innerMostLoop = &x;
+  const parser::NestedConstruct *innerMostNest = nullptr;
+  while (auto &optLoopCons{
+      std::get<std::optional<parser::NestedConstruct>>(innerMostLoop->t)}) {
+    innerMostNest = &(optLoopCons.value());
+    if (const auto *innerLoop{
+            std::get_if<common::Indirection<parser::OpenMPLoopConstruct>>(
+                innerMostNest)}) {
+      innerMostLoop = &(innerLoop->value());
+    } else
+      break;
+  }
 
-  auto &optLoopCons = std::get<std::optional<parser::NestedConstruct>>(x.t);
-  if (optLoopCons.has_value()) {
-    if (const auto &outer{std::get_if<parser::DoConstruct>(&*optLoopCons)}) {
+  if (innerMostNest) {
+    if (const auto &outer{std::get_if<parser::DoConstruct>(innerMostNest)}) {
       for (const parser::DoConstruct *loop{&*outer}; loop && level > 0;
           --level) {
         if (loop->IsDoConcurrent()) {
@@ -2129,7 +2230,7 @@ void OmpAttributeVisitor::PrivatizeAssociatedLoopIndexAndCheckLoopLevel(
       CheckAssocLoopLevel(level, GetAssociatedClause());
     } else if (const auto &loop{std::get_if<
                    common::Indirection<parser::OpenMPLoopConstruct>>(
-                   &*optLoopCons)}) {
+                   innerMostNest)}) {
       auto &beginDirective =
           std::get<parser::OmpBeginLoopDirective>(loop->value().t);
       auto &beginLoopDirective =
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 4720932..077bee9 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -646,12 +646,18 @@ public:
     }
     if (symbol->CanReplaceDetails(details)) {
       // update the existing symbol
-      CheckDuplicatedAttrs(name, *symbol, attrs);
-      SetExplicitAttrs(*symbol, attrs);
       if constexpr (std::is_same_v<SubprogramDetails, D>) {
         // Dummy argument defined by explicit interface?
         details.set_isDummy(IsDummy(*symbol));
+        if (symbol->has<ProcEntityDetails>()) {
+          // Bare "EXTERNAL" dummy replaced with explicit INTERFACE
+          context().Warn(common::LanguageFeature::RedundantAttribute, name,
+              "Dummy argument '%s' was declared earlier as EXTERNAL"_warn_en_US,
+              name);
+        }
       }
+      CheckDuplicatedAttrs(name, *symbol, attrs);
+      SetExplicitAttrs(*symbol, attrs);
       symbol->set_details(std::move(details));
       return *symbol;
     } else if constexpr (std::is_same_v<UnknownDetails, D>) {
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 5916a07..b8c3db8 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -771,6 +771,8 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent(
   auto &foldingContext{context_.foldingContext()};
   auto typeAndShape{evaluate::characteristics::TypeAndShape::Characterize(
       symbol, foldingContext)};
+  bool isDevice{object.cudaDataAttr() &&
+      *object.cudaDataAttr() == common::CUDADataAttr::Device};
   CHECK(typeAndShape.has_value());
   auto dyType{typeAndShape->type()};
   int rank{typeAndShape->Rank()};
@@ -883,9 +885,19 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent(
   // Default component initialization
   bool hasDataInit{false};
   if (IsAllocatable(symbol)) {
-    AddValue(values, componentSchema_, "genre"s, GetEnumValue("allocatable"));
+    if (isDevice) {
+      AddValue(values, componentSchema_, "genre"s,
+          GetEnumValue("allocatabledevice"));
+    } else {
+      AddValue(values, componentSchema_, "genre"s, GetEnumValue("allocatable"));
+    }
   } else if (IsPointer(symbol)) {
-    AddValue(values, componentSchema_, "genre"s, GetEnumValue("pointer"));
+    if (isDevice) {
+      AddValue(
+          values, componentSchema_, "genre"s, GetEnumValue("pointerdevice"));
+    } else {
+      AddValue(values, componentSchema_, "genre"s, GetEnumValue("pointer"));
+    }
     hasDataInit = InitializeDataPointer(
         values, symbol, object, scope, dtScope, distinctName);
   } else if (IsAutomatic(symbol)) {
diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp
index a6b402c..6152f61 100644
--- a/flang/lib/Semantics/symbol.cpp
+++ b/flang/lib/Semantics/symbol.cpp
@@ -330,8 +330,14 @@ bool Symbol::CanReplaceDetails(const Details &details) const {
         common::visitors{
             [](const UseErrorDetails &) { return true; },
             [&](const ObjectEntityDetails &) { return has<EntityDetails>(); },
-            [&](const ProcEntityDetails &) { return has<EntityDetails>(); },
+            [&](const ProcEntityDetails &x) { return has<EntityDetails>(); },
             [&](const SubprogramDetails &) {
+              if (const auto *oldProc{this->detailsIf<ProcEntityDetails>()}) {
+                // Can replace bare "EXTERNAL dummy" with explicit INTERFACE
+                return oldProc->isDummy() && !oldProc->procInterface() &&
+                    attrs().test(Attr::EXTERNAL) && !test(Flag::Function) &&
+                    !test(Flag::Subroutine);
+              }
               return has<SubprogramNameDetails>() || has<EntityDetails>();
             },
             [&](const DerivedTypeDetails &) {
@@ -342,11 +348,9 @@ bool Symbol::CanReplaceDetails(const Details &details) const {
               const auto *use{this->detailsIf<UseDetails>()};
               return use && use->symbol() == x.symbol();
             },
-            [&](const HostAssocDetails &) {
-              return this->has<HostAssocDetails>();
-            },
+            [&](const HostAssocDetails &) { return has<HostAssocDetails>(); },
             [&](const UserReductionDetails &) {
-              return this->has<UserReductionDetails>();
+              return has<UserReductionDetails>();
             },
             [](const auto &) { return false; },
         },
diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp b/flang/lib/Semantics/unparse-with-symbols.cpp
index b199481..ec5b3ff 100644
--- a/flang/lib/Semantics/unparse-with-symbols.cpp
+++ b/flang/lib/Semantics/unparse-with-symbols.cpp
@@ -37,6 +37,8 @@ public:
   template <typename T> void Post(const parser::Statement<T> &) {
     currStmt_ = std::nullopt;
   }
+  void Post(const parser::Name &name);
+
   bool Pre(const parser::AccClause &clause) {
     currStmt_ = clause.source;
     return true;
@@ -57,7 +59,6 @@ public:
     return true;
   }
   void Post(const parser::OpenMPThreadprivate &) { currStmt_ = std::nullopt; }
-  void Post(const parser::Name &name);
 
   bool Pre(const parser::OpenMPDeclareMapperConstruct &x) {
     currStmt_ = x.source;
@@ -67,6 +68,14 @@ public:
     currStmt_ = std::nullopt;
   }
 
+  bool Pre(const parser::OpenMPDeclareReductionConstruct &x) {
+    currStmt_ = x.source;
+    return true;
+  }
+  void Post(const parser::OpenMPDeclareReductionConstruct &) {
+    currStmt_ = std::nullopt;
+  }
+
   bool Pre(const parser::OpenMPDeclareTargetConstruct &x) {
     currStmt_ = x.source;
     return true;
@@ -120,6 +129,7 @@ void SymbolDumpVisitor::Indent(llvm::raw_ostream &out, int indent) const {
 void SymbolDumpVisitor::Post(const parser::Name &name) {
   if (const auto *symbol{name.symbol}) {
     if (!symbol->has<MiscDetails>()) {
+      CHECK(currStmt_.has_value());
       symbols_.emplace(currStmt_.value().begin(), symbol);
     }
   }
diff --git a/flang/lib/Utils/CMakeLists.txt b/flang/lib/Utils/CMakeLists.txt
index 4d5000a..96c0375 100644
--- a/flang/lib/Utils/CMakeLists.txt
+++ b/flang/lib/Utils/CMakeLists.txt
@@ -17,6 +17,8 @@ add_flang_library(FortranUtils
   LINK_LIBS
   FIRDialect
   FIRBuilder
+  FortranEvaluate
+  FortranSupport
   HLFIRDialect
  
   MLIR_LIBS
diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
index 6af2a5a..ae8eeef 100644
--- a/flang/module/__fortran_type_info.f90
+++ b/flang/module/__fortran_type_info.f90
@@ -75,7 +75,7 @@ module __fortran_type_info
   end type
 
   enum, bind(c) ! Component::Genre
-    enumerator :: Data = 1, Pointer = 2, Allocatable = 3, Automatic = 4
+    enumerator :: Data = 1, Pointer = 2, Allocatable = 3, Automatic = 4, PointerDevice = 5, AllocatableDevice = 6
   end enum
 
   enum, bind(c) ! common::TypeCategory
diff --git a/flang/test/Driver/target-cpu-features.f90 b/flang/test/Driver/target-cpu-features.f90
index 58ee670..92ad12d 100644
--- a/flang/test/Driver/target-cpu-features.f90
+++ b/flang/test/Driver/target-cpu-features.f90
@@ -74,7 +74,7 @@
 ! CHECK-X86_64H-SAME: "-target-cpu" "x86-64" "-target-feature" "-rdrnd" "-target-feature" "-aes" "-target-feature" "-pclmul" "-target-feature" "-rtm" "-target-feature" "-fsgsbase"
 
 ! CHECK-RV64: "-fc1" "-triple" "riscv64-unknown-linux-gnu"
-! CHECK-RV64-SAME: "-target-cpu" "generic-rv64" "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+c"
+! CHECK-RV64-SAME: "-target-cpu" "generic-rv64" "-target-feature" "+i" "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+c"
 
 ! CHECK-AMDGPU: "-fc1" "-triple" "amdgcn-amd-amdhsa"
 ! CHECK-AMDGPU-SAME: "-target-cpu" "gfx908"
diff --git a/flang/test/Evaluate/bug157379.f90 b/flang/test/Evaluate/bug157379.f90
new file mode 100644
index 0000000..53aac4c
--- /dev/null
+++ b/flang/test/Evaluate/bug157379.f90
@@ -0,0 +1,13 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+program main
+  type t
+    integer, allocatable :: ia(:)
+  end type
+  type(t) x
+  integer, allocatable :: ja(:)
+  allocate(ja(2:2))
+  ja(2) = 2
+  !CHECK: x=t(ia=(ja))
+  x = t(+ja)            ! must be t(ia=(ja)), not t(ia=ja)
+  print *, lbound(x%ia) ! must be 1, not 2
+end
diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir
index 8b6e7d6..31f2ed0 100644
--- a/flang/test/Fir/CUDA/cuda-alloc-free.fir
+++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir
@@ -94,19 +94,4 @@ func.func @_QQalloc_char() attributes {fir.bindc_name = "alloc_char"} {
 // CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64
 // CHECK: fir.call @_FortranACUFMemAlloc(%[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
 
-
-func.func @_QQsetalloc() {
-  %0 = cuf.alloc !fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}> {bindc_name = "d1", data_attr = #cuf.cuda<managed>, uniq_name = "_QFEd1"} -> !fir.ref<!fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>
-  %1 = fir.coordinate_of %0, a2 : (!fir.ref<!fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-  cuf.set_allocator_idx %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
-  return
-}
-
-// CHECK-LABEL:   func.func @_QQsetalloc() {
-// CHECK: %[[DT:.*]] = fir.call @_FortranACUFMemAlloc
-// CHECK: %[[CONV:.*]] = fir.convert %[[DT]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>
-// CHECK: %[[COMP:.*]] = fir.coordinate_of %[[CONV]], a2 : (!fir.ref<!fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-// CHECK: %[[DESC:.*]] = fir.convert %[[COMP]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFSetAllocatorIndex(%[[DESC]], %c2{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> ()
-
 } // end module
diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir
index a724d9f..669300c 100644
--- a/flang/test/Fir/CUDA/cuda-data-transfer.fir
+++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir
@@ -463,13 +463,13 @@ func.func @_QPlogical_cst() {
 }
 
 // CHECK-LABEL: func.func @_QPlogical_cst()
-// CHECK: %[[DESC:.*]] = fir.alloca !fir.box<!fir.logical<4>>
-// CHECK: %[[CONST:.*]] = fir.alloca !fir.logical<4>
-// CHECK: %[[CONV:.*]] = fir.convert %false : (i1) -> !fir.logical<4>
-// CHECK: fir.store %[[CONV]] to %[[CONST]] : !fir.ref<!fir.logical<4>>
-// CHECK: %[[EMBOX:.*]] = fir.embox %[[CONST]] : (!fir.ref<!fir.logical<4>>) -> !fir.box<!fir.logical<4>>
-// CHECK: fir.store %[[EMBOX]] to %[[DESC]] : !fir.ref<!fir.box<!fir.logical<4>>>
-// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref<!fir.box<!fir.logical<4>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[DESC:.*]] = fir.alloca !fir.box<!fir.logical<1>>
+// CHECK: %[[CONST:.*]] = fir.alloca !fir.logical<1>
+// CHECK: %[[CONV:.*]] = fir.convert %false : (i1) -> !fir.logical<1>
+// CHECK: fir.store %[[CONV]] to %[[CONST]] : !fir.ref<!fir.logical<1>>
+// CHECK: %[[EMBOX:.*]] = fir.embox %[[CONST]] : (!fir.ref<!fir.logical<1>>) -> !fir.box<!fir.logical<1>>
+// CHECK: fir.store %[[EMBOX]] to %[[DESC]] : !fir.ref<!fir.box<!fir.logical<1>>>
+// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref<!fir.box<!fir.logical<1>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> ()
 
 func.func @_QPcallkernel(%arg0: !fir.box<!fir.array<?x?xcomplex<f32>>> {fir.bindc_name = "a"}, %arg1: !fir.ref<f32> {fir.bindc_name = "b"}, %arg2: !fir.ref<f32> {fir.bindc_name = "c"}) {
@@ -603,5 +603,53 @@ func.func @_QPsub20() {
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX_ALLOCA]] : (!fir.ref<!fir.box<f32>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFDataTransferCstDesc(%13, %[[BOX_NONE]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> ()
 
+func.func @_QPsub28() {
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = cuf.alloc !fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>> {bindc_name = "id2", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub28Eid2"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>
+  %2 = fir.zero_bits !fir.heap<!fir.array<?x?x!fir.logical<8>>>
+  %c0 = arith.constant 0 : index
+  %3 = fir.shape %c0, %c0 : (index, index) -> !fir.shape<2>
+  %4 = fir.embox %2(%3) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?x!fir.logical<8>>>, !fir.shape<2>) -> !fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>
+  fir.store %4 to %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>
+  %5:2 = hlfir.declare %1 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub28Eid2"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>)
+  %c1 = arith.constant 1 : index
+  %c10_i32 = arith.constant 10 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %6 = fir.convert %5#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>) -> !fir.ref<!fir.box<none>>
+  %7 = fir.convert %c1 : (index) -> i64
+  %8 = fir.convert %c10_i32 : (i32) -> i64
+  fir.call @_FortranAAllocatableSetBounds(%6, %c0_i32, %7, %8) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
+  %c1_0 = arith.constant 1 : index
+  %c10_i32_1 = arith.constant 10 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %9 = fir.convert %5#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>) -> !fir.ref<!fir.box<none>>
+  %10 = fir.convert %c1_0 : (index) -> i64
+  %11 = fir.convert %c10_i32_1 : (i32) -> i64
+  fir.call @_FortranAAllocatableSetBounds(%9, %c1_i32, %10, %11) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
+  %12 = cuf.allocate %5#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>> {data_attr = #cuf.cuda<device>} -> i32
+  %false = arith.constant false
+  cuf.data_transfer %false to %5#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : i1, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>
+  %13 = fir.load %5#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>>
+  %14 = fir.box_addr %13 : (!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>) -> !fir.heap<!fir.array<?x?x!fir.logical<8>>>
+  %15 = fir.convert %14 : (!fir.heap<!fir.array<?x?x!fir.logical<8>>>) -> i64
+  %c0_i64 = arith.constant 0 : i64
+  %16 = arith.cmpi ne, %15, %c0_i64 : i64
+  fir.if %16 {
+    %17 = cuf.deallocate %5#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>> {data_attr = #cuf.cuda<device>} -> i32
+  }
+  cuf.free %5#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.logical<8>>>>> {data_attr = #cuf.cuda<device>}
+  return
+}
+
+// CHECK-LABEL: func.func @_QPsub28()
+// CHECK: %[[DESC:.*]] = fir.alloca !fir.box<!fir.logical<8>>
+// CHECK: %[[L8:.*]] = fir.alloca !fir.logical<8>
+// CHECK: %[[FALSE:.*]] = fir.convert %false{{.*}} : (i1) -> !fir.logical<8>
+// CHECK: fir.store %[[FALSE]] to %[[L8]] : !fir.ref<!fir.logical<8>>
+// CHECK: %[[EMBOX:.*]] = fir.embox %[[L8]] : (!fir.ref<!fir.logical<8>>) -> !fir.box<!fir.logical<8>>
+// CHECK: fir.store %[[EMBOX]] to %[[DESC]] : !fir.ref<!fir.box<!fir.logical<8>>>
+// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref<!fir.box<!fir.logical<8>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> ()
+
 } // end of module
 
diff --git a/flang/test/Fir/OpenMP/bounds-generation-for-char-arrays.f90 b/flang/test/Fir/OpenMP/bounds-generation-for-char-arrays.f90
new file mode 100644
index 0000000..d9d54ee
--- /dev/null
+++ b/flang/test/Fir/OpenMP/bounds-generation-for-char-arrays.f90
@@ -0,0 +1,96 @@
+// RUN: fir-opt --cfg-conversion --fir-to-llvm-ir="target=aarch64-unknown-linux-gnu" %s | FileCheck %s
+
+module attributes {omp.is_target_device = false} {
+  func.func @_QPchar_array(%arg0 : !fir.ref<!fir.array<10x10x!fir.char<1,16>>>) {
+    %c9 = arith.constant 9 : index
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c10 = arith.constant 10 : index
+    %0 = omp.map.bounds lower_bound(%c0 : index) upper_bound(%c9 : index) extent(%c10 : index) stride(%c1 : index) start_idx(%c1 : index)
+    %1 = omp.map.bounds lower_bound(%c0 : index) upper_bound(%c9 : index) extent(%c10 : index) stride(%c1 : index) start_idx(%c1 : index)
+    %2 = omp.map.info var_ptr(%arg0 : !fir.ref<!fir.array<10x10x!fir.char<1,16>>>, !fir.array<10x10x!fir.char<1,16>>) map_clauses(tofrom) capture(ByRef) bounds(%0, %1) -> !fir.ref<!fir.array<10x10x!fir.char<1,16>>> {name = ""}
+    omp.target map_entries(%2 -> %arg1 : !fir.ref<!fir.array<10x10x!fir.char<1,16>>>) {
+      omp.terminator
+    }
+    return
+  }
+
+// CHECK-LABEL:   llvm.func @_QPchar_array(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr) {
+// CHECK:           %[[VAL_0:.*]] = llvm.mlir.constant(9 : index) : i64
+// CHECK:           %[[VAL_1:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK:           %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK:           %[[VAL_3:.*]] = llvm.mlir.constant(10 : index) : i64
+// CHECK:           %[[VAL_4:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_0]] : i64) extent(%[[VAL_3]] : i64) stride(%[[VAL_2]] : i64) start_idx(%[[VAL_2]] : i64)
+// CHECK:           %[[VAL_5:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_0]] : i64) extent(%[[VAL_3]] : i64) stride(%[[VAL_2]] : i64) start_idx(%[[VAL_2]] : i64)
+// CHECK:           %[[VAL_6:.*]] = llvm.mlir.constant(0 : i64) : i64
+// CHECK:           %[[VAL_7:.*]] = llvm.mlir.constant(15 : i64) : i64
+// CHECK:           %[[VAL_8:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:           %[[VAL_9:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:           %[[VAL_10:.*]] = omp.map.bounds lower_bound(%[[VAL_6]] : i64) upper_bound(%[[VAL_7]] : i64) extent(%[[VAL_7]] : i64) stride(%[[VAL_8]] : i64) start_idx(%[[VAL_9]] : i64)
+// CHECK:           %[[VAL_11:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, i8) map_clauses(tofrom) capture(ByRef) bounds(%[[VAL_4]], %[[VAL_5]], %[[VAL_10]]) -> !llvm.ptr {name = ""}
+// CHECK:           omp.target map_entries(%[[VAL_11]] -> %[[VAL_12:.*]] : !llvm.ptr) {
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           llvm.return
+// CHECK:         }
+
+  func.func @_QPallocatable_char_array(%arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>>) {
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>>
+    %1:3 = fir.box_dims %0, %c0 : (!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>, index) -> (index, index, index)
+    %2 = arith.subi %1#1, %c1 : index
+    %3 = omp.map.bounds lower_bound(%c0 : index) upper_bound(%2 : index) extent(%1#1 : index) stride(%1#2 : index) start_idx(%1#0 : index) {stride_in_bytes = true}
+    %4 = arith.muli %1#2, %1#1 : index
+    %5:3 = fir.box_dims %0, %c1 : (!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>, index) -> (index, index, index)
+    %6 = arith.subi %5#1, %c1 : index
+    %7 = omp.map.bounds lower_bound(%c0 : index) upper_bound(%6 : index) extent(%5#1 : index) stride(%4 : index) start_idx(%5#0 : index) {stride_in_bytes = true}
+    %8 = fir.box_offset %arg0 base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?x?x!fir.char<1,16>>>>
+    %9 = omp.map.info var_ptr(%arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>>, !fir.char<1,16>) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%8 : !fir.llvm_ptr<!fir.ref<!fir.array<?x?x!fir.char<1,16>>>>) bounds(%3, %7) -> !fir.llvm_ptr<!fir.ref<!fir.array<?x?x!fir.char<1,16>>>> {name = ""}
+    %10 = omp.map.info var_ptr(%arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>>, !fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>) map_clauses(to) capture(ByRef) members(%9 : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?x?x!fir.char<1,16>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>> {name = "csv_chem_list_a"}
+    omp.target map_entries(%10 -> %arg1, %9 -> %arg2 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,16>>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?x?x!fir.char<1,16>>>>) {
+      omp.terminator
+    }
+    return
+  }
+
+// CHECK-LABEL:   llvm.func @_QPallocatable_char_array(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr) {
+// CHECK:           %[[VAL_0:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:           %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+// CHECK:           %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK:           %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK:           %[[VAL_4:.*]] = llvm.mlir.constant(72 : i32) : i32
+// CHECK:           "llvm.intr.memcpy"(%[[VAL_1]], %[[ARG0]], %[[VAL_4]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK:           %[[VAL_5:.*]] = llvm.getelementptr %[[VAL_1]][0, 7, %[[VAL_3]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK:           %[[VAL_6:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_7:.*]] = llvm.getelementptr %[[VAL_1]][0, 7, %[[VAL_3]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK:           %[[VAL_8:.*]] = llvm.load %[[VAL_7]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_9:.*]] = llvm.getelementptr %[[VAL_1]][0, 7, %[[VAL_3]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK:           %[[VAL_10:.*]] = llvm.load %[[VAL_9]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_11:.*]] = llvm.sub %[[VAL_8]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_12:.*]] = omp.map.bounds lower_bound(%[[VAL_3]] : i64) upper_bound(%[[VAL_11]] : i64) extent(%[[VAL_8]] : i64) stride(%[[VAL_10]] : i64) start_idx(%[[VAL_6]] : i64) {stride_in_bytes = true}
+// CHECK:           %[[VAL_13:.*]] = llvm.mul %[[VAL_10]], %[[VAL_8]] : i64
+// CHECK:           %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_1]][0, 7, %[[VAL_2]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK:           %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_16:.*]] = llvm.getelementptr %[[VAL_1]][0, 7, %[[VAL_2]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK:           %[[VAL_17:.*]] = llvm.load %[[VAL_16]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_18:.*]] = llvm.getelementptr %[[VAL_1]][0, 7, %[[VAL_2]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK:           %[[VAL_19:.*]] = llvm.load %[[VAL_18]] : !llvm.ptr -> i64
+// CHECK:           %[[VAL_20:.*]] = llvm.sub %[[VAL_17]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_21:.*]] = omp.map.bounds lower_bound(%[[VAL_3]] : i64) upper_bound(%[[VAL_20]] : i64) extent(%[[VAL_17]] : i64) stride(%[[VAL_13]] : i64) start_idx(%[[VAL_15]] : i64) {stride_in_bytes = true}
+// CHECK:           %[[VAL_22:.*]] = llvm.getelementptr %[[ARG0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
+// CHECK:           %[[VAL_23:.*]] = llvm.mlir.constant(0 : i64) : i64
+// CHECK:           %[[VAL_24:.*]] = llvm.mlir.constant(15 : i64) : i64
+// CHECK:           %[[VAL_25:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:           %[[VAL_26:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:           %[[VAL_27:.*]] = omp.map.bounds lower_bound(%[[VAL_23]] : i64) upper_bound(%[[VAL_24]] : i64) extent(%[[VAL_24]] : i64) stride(%[[VAL_25]] : i64) start_idx(%[[VAL_26]] : i64)
+// CHECK:           %[[VAL_28:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, i8) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%[[VAL_22]] : !llvm.ptr) bounds(%[[VAL_12]], %[[VAL_21]], %[[VAL_27]]) -> !llvm.ptr {name = ""}
+// CHECK:           %[[VAL_29:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>) map_clauses(to) capture(ByRef) members(%[[VAL_28]] : [0] : !llvm.ptr) -> !llvm.ptr {name = "csv_chem_list_a"}
+// CHECK:           omp.target map_entries(%[[VAL_29]] -> %[[VAL_30:.*]], %[[VAL_28]] -> %[[VAL_31:.*]] : !llvm.ptr, !llvm.ptr) {
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           llvm.return
+// CHECK:         }
+}
diff --git a/flang/test/HLFIR/index-lowering.fir b/flang/test/HLFIR/index-lowering.fir
new file mode 100644
index 0000000..7266513
--- /dev/null
+++ b/flang/test/HLFIR/index-lowering.fir
@@ -0,0 +1,198 @@
+// Test hlfir.index operation lowering to a fir runtime call
+// RUN: fir-opt %s -lower-hlfir-intrinsics | FileCheck %s
+
+func.func @_QPt(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK-LABEL:   func.func @_QPt(
+// CHECK-SAME:                    %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant false
+// CHECK:           %[[VAL_1:.*]] = arith.constant 4 : index
+// CHECK:           %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFtEn"}
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFtEs"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK:           %[[VAL_7:.*]] = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>>
+// CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+    %0 = fir.dummy_scope : !fir.dscope
+    %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFtEn"}
+    %2:2 = hlfir.declare %1 {uniq_name = "_QFtEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+    %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFtEs"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+    %5 = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>>
+    %c4 = arith.constant 4 : index
+    %6:2 = hlfir.declare %5 typeparams %c4 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+    %7 = hlfir.index %6#0 in %4#0 : (!fir.ref<!fir.char<1,4>>, !fir.boxchar<1>) -> i32
+// CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_6]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_5]]#1 : (index) -> i64
+// CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref<!fir.char<1,4>>) -> !fir.ref<i8>
+// CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_1]] : (index) -> i64
+// CHECK:           %[[VAL_13:.*]] = fir.call @_FortranAIndex1(%[[VAL_9]], %[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %[[VAL_0]]) : (!fir.ref<i8>, i64, !fir.ref<i8>, i64, i1) -> i64
+// CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i64) -> i32
+// CHECK:           hlfir.assign %[[VAL_14]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+    hlfir.assign %7 to %2#0 : i32, !fir.ref<i32>
+    return
+}
+
+func.func @_QPt1(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}, %arg1: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) {
+// CHECK-LABEL:   func.func @_QPt1(
+// CHECK-SAME:                     %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"},
+// CHECK-SAME:                     %[[ARG1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 4 : index
+// CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_1]] {uniq_name = "_QFt1Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+// CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"}
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK:           %[[VAL_7:.*]] = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>>
+// CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_0]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+// CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>>
+    %0 = fir.dummy_scope : !fir.dscope
+    %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt1Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+    %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"}
+    %3:2 = hlfir.declare %2 {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+    %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+    %6 = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>>
+    %c4 = arith.constant 4 : index
+    %7:2 = hlfir.declare %6 typeparams %c4 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+    %8 = fir.load %1#0 : !fir.ref<!fir.logical<4>>
+    %9 = hlfir.index %7#0 in %5#0 back %8 : (!fir.ref<!fir.char<1,4>>, !fir.boxchar<1>, !fir.logical<4>) -> i32
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_6]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+// CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_5]]#1 : (index) -> i64
+// CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref<!fir.char<1,4>>) -> !fir.ref<i8>
+// CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_0]] : (index) -> i64
+// CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_9]] : (!fir.logical<4>) -> i1
+// CHECK:           %[[VAL_15:.*]] = fir.call @_FortranAIndex1(%[[VAL_10]], %[[VAL_11]], %[[VAL_12]], %[[VAL_13]], %[[VAL_14]]) : (!fir.ref<i8>, i64, !fir.ref<i8>, i64, i1) -> i64
+// CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i64) -> i32
+// CHECK:           hlfir.assign %[[VAL_16]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+    hlfir.assign %9 to %3#0 : i32, !fir.ref<i32>
+    return
+}
+
+func.func @_QPt2(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}, %arg1: !fir.boxchar<2> {fir.bindc_name = "c"}) {
+// CHECK-LABEL:   func.func @_QPt2(
+// CHECK-SAME:                     %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"},
+// CHECK-SAME:                     %[[ARG1:.*]]: !fir.boxchar<2> {fir.bindc_name = "c"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant false
+// CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+// CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt2Ec"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+// CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"}
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+// CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+    %0 = fir.dummy_scope : !fir.dscope
+    %1:2 = fir.unboxchar %arg1 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+    %2:2 = hlfir.declare %1#0 typeparams %1#1 dummy_scope %0 {uniq_name = "_QFt2Ec"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+    %3 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"}
+    %4:2 = hlfir.declare %3 {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %5:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+    %6:2 = hlfir.declare %5#0 typeparams %5#1 dummy_scope %0 {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+    %false = arith.constant false
+    %7 = hlfir.index %2#0 in %6#0 back %false : (!fir.boxchar<2>, !fir.boxchar<2>, i1) -> i32
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.ref<!fir.char<2,?>>) -> !fir.ref<i16>
+// CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.ref<!fir.char<2,?>>) -> !fir.ref<i16>
+// CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_2]]#1 : (index) -> i64
+// CHECK:           %[[VAL_12:.*]] = fir.call @_FortranAIndex2(%[[VAL_8]], %[[VAL_9]], %[[VAL_10]], %[[VAL_11]], %[[VAL_0]]) : (!fir.ref<i16>, i64, !fir.ref<i16>, i64, i1) -> i64
+// CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i64) -> i32
+// CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+    hlfir.assign %7 to %4#0 : i32, !fir.ref<i32>
+    return
+}
+
+func.func @_QPt3(%arg0: !fir.boxchar<4> {fir.bindc_name = "s"}, %arg1: !fir.boxchar<4> {fir.bindc_name = "c"}) {
+// CHECK-LABEL:   func.func @_QPt3(
+// CHECK-SAME:                     %[[ARG0:.*]]: !fir.boxchar<4> {fir.bindc_name = "s"},
+// CHECK-SAME:                     %[[ARG1:.*]]: !fir.boxchar<4> {fir.bindc_name = "c"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant true
+// CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
+// CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt3Ec"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+// CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"}
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
+// CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+    %0 = fir.dummy_scope : !fir.dscope
+    %1:2 = fir.unboxchar %arg1 : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
+    %2:2 = hlfir.declare %1#0 typeparams %1#1 dummy_scope %0 {uniq_name = "_QFt3Ec"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+    %3 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"}
+    %4:2 = hlfir.declare %3 {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %5:2 = fir.unboxchar %arg0 : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
+    %6:2 = hlfir.declare %5#0 typeparams %5#1 dummy_scope %0 {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+    %true = arith.constant true
+    %7 = hlfir.index %2#0 in %6#0 back %true : (!fir.boxchar<4>, !fir.boxchar<4>, i1) -> i8
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.ref<!fir.char<4,?>>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.ref<!fir.char<4,?>>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_2]]#1 : (index) -> i64
+// CHECK:           %[[VAL_12:.*]] = fir.call @_FortranAIndex4(%[[VAL_8]], %[[VAL_9]], %[[VAL_10]], %[[VAL_11]], %[[VAL_0]]) : (!fir.ref<i32>, i64, !fir.ref<i32>, i64, i1) -> i64
+// CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i64) -> i8
+// CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i8) -> i32
+// CHECK:           hlfir.assign %[[VAL_14]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+    %8 = fir.convert %7 : (i8) -> i32
+    hlfir.assign %8 to %4#0 : i32, !fir.ref<i32>
+    return
+}
+
+func.func @_QPt4(%arg0: !fir.boxchar<1> {fir.bindc_name = "c1"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "c2"}) {
+// CHECK-LABEL:   func.func @_QPt4(
+// CHECK-SAME:                     %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"},
+// CHECK-SAME:                     %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c2"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant false
+// CHECK:           %[[VAL_1:.*]] = arith.constant 3 : index
+// CHECK:           %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_3:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>>
+// CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_5]]) typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt4Ec1"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
+// CHECK:           %[[VAL_7:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>>
+// CHECK:           %[[VAL_9:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_9]]) typeparams %[[VAL_7]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt4Ec2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
+// CHECK:           %[[VAL_11:.*]] = fir.alloca !fir.array<3xi8> {bindc_name = "n", uniq_name = "_QFt4En"}
+// CHECK:           %[[VAL_12:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_12]]) {uniq_name = "_QFt4En"} : (!fir.ref<!fir.array<3xi8>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi8>>, !fir.ref<!fir.array<3xi8>>)
+// CHECK:           %[[VAL_14:.*]] = hlfir.elemental %[[VAL_5]] unordered : (!fir.shape<1>) -> !hlfir.expr<3xi8> {
+// CHECK:           ^bb0(%[[VAL_15:.*]]: index):
+// CHECK:             %[[VAL_16:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_15]])  typeparams %[[VAL_3]]#1 : (!fir.box<!fir.array<3x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
+// CHECK:             %[[VAL_17:.*]] = hlfir.designate %[[VAL_10]]#0 (%[[VAL_15]])  typeparams %[[VAL_7]]#1 : (!fir.box<!fir.array<3x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
+// CHECK:             %[[VAL_18:.*]]:2 = fir.unboxchar %[[VAL_17]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK:             %[[VAL_19:.*]]:2 = fir.unboxchar %[[VAL_16]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+    %0 = fir.dummy_scope : !fir.dscope
+    %1:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+    %2 = fir.convert %1#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>>
+    %c3 = arith.constant 3 : index
+    %3 = fir.shape %c3 : (index) -> !fir.shape<1>
+    %4:2 = hlfir.declare %2(%3) typeparams %1#1 dummy_scope %0 {uniq_name = "_QFt4Ec1"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
+    %5:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+    %6 = fir.convert %5#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>>
+    %c3_0 = arith.constant 3 : index
+    %7 = fir.shape %c3_0 : (index) -> !fir.shape<1>
+    %8:2 = hlfir.declare %6(%7) typeparams %5#1 dummy_scope %0 {uniq_name = "_QFt4Ec2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
+    %c3_1 = arith.constant 3 : index
+    %9 = fir.alloca !fir.array<3xi8> {bindc_name = "n", uniq_name = "_QFt4En"}
+    %10 = fir.shape %c3_1 : (index) -> !fir.shape<1>
+    %11:2 = hlfir.declare %9(%10) {uniq_name = "_QFt4En"} : (!fir.ref<!fir.array<3xi8>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi8>>, !fir.ref<!fir.array<3xi8>>)
+    %12 = hlfir.elemental %3 unordered : (!fir.shape<1>) -> !hlfir.expr<3xi8> {
+    ^bb0(%arg2: index):
+      %13 = hlfir.designate %4#0 (%arg2)  typeparams %1#1 : (!fir.box<!fir.array<3x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
+      %14 = hlfir.designate %8#0 (%arg2)  typeparams %5#1 : (!fir.box<!fir.array<3x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
+      %15 = hlfir.index %14 in %13 : (!fir.boxchar<1>, !fir.boxchar<1>) -> i8
+// CHECK:             %[[VAL_20:.*]] = fir.convert %[[VAL_19]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+// CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64
+// CHECK:             %[[VAL_22:.*]] = fir.convert %[[VAL_18]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+// CHECK:             %[[VAL_23:.*]] = fir.convert %[[VAL_7]]#1 : (index) -> i64
+// CHECK:             %[[VAL_24:.*]] = fir.call @_FortranAIndex1(%[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_0]]) : (!fir.ref<i8>, i64, !fir.ref<i8>, i64, i1) -> i64
+// CHECK:             %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i64) -> i8
+// CHECK:             hlfir.yield_element %[[VAL_25]] : i8
+// CHECK:           }
+// CHECK:           hlfir.assign %[[VAL_14]] to %[[VAL_13]]#0 : !hlfir.expr<3xi8>, !fir.ref<!fir.array<3xi8>>
+// CHECK:           hlfir.destroy %[[VAL_14]] : !hlfir.expr<3xi8>
+      hlfir.yield_element %15 : i8
+    }
+    hlfir.assign %12 to %11#0 : !hlfir.expr<3xi8>, !fir.ref<!fir.array<3xi8>>
+    hlfir.destroy %12 : !hlfir.expr<3xi8>
+    return
+}
diff --git a/flang/test/HLFIR/invalid.fir b/flang/test/HLFIR/invalid.fir
index ea0f3c6..8871139 100644
--- a/flang/test/HLFIR/invalid.fir
+++ b/flang/test/HLFIR/invalid.fir
@@ -308,6 +308,12 @@ func.func @bad_cmpchar_2(%arg0: !fir.ref<!fir.char<1,10>>, %arg1: !fir.ref<!fir.
 }
 
 // -----
+func.func @bad_index_1(%arg0: !fir.ref<!fir.char<1,10>>, %arg1: !fir.ref<!fir.char<2,10>>) {
+  // expected-error@+1 {{'hlfir.index' op character arguments must have the same KIND}}
+  %0 = hlfir.index %arg0 in %arg1 : (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<2,10>>) -> i32
+}
+
+// -----
 func.func @bad_any1(%arg0: !hlfir.expr<?x!fir.logical<4>>) {
   // expected-error@+1 {{'hlfir.any' op result must have the same element type as MASK argument}}
   %0 = hlfir.any %arg0 : (!hlfir.expr<?x!fir.logical<4>>) -> !fir.logical<8>
diff --git a/flang/test/Lower/CUDA/cuda-allocatable-device.cuf b/flang/test/Lower/CUDA/cuda-allocatable-device.cuf
new file mode 100644
index 0000000..57c588e
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-allocatable-device.cuf
@@ -0,0 +1,22 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+module m
+  type device_array
+    real(kind=8), allocatable, dimension(:), device :: ad
+    real(kind=8), pointer, dimension(:), device :: pd
+  end type
+
+  type(device_array), allocatable :: da(:)
+end module
+
+! CHECK-LABEL: fir.global linkonce_odr @_QMmE.c.device_array
+! CHECK: fir.insert_value %{{.*}}, %c6{{.*}}, ["genre"
+! CHECK: fir.insert_value %{{.*}}, %c5{{.*}}, ["genre"
+
+program main
+  use m
+  type(device_array) :: local
+end
+
+! CHECK-LABEL: func.func @_QQmain()
+! CHECK: fir.call @_FortranAInitialize
diff --git a/flang/test/Lower/CUDA/cuda-set-allocator.cuf b/flang/test/Lower/CUDA/cuda-set-allocator.cuf
deleted file mode 100644
index d783f34..0000000
--- a/flang/test/Lower/CUDA/cuda-set-allocator.cuf
+++ /dev/null
@@ -1,66 +0,0 @@
-! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
-
-module m1
-  type ty_device
-    integer, device, allocatable, dimension(:) :: x
-    integer :: y
-    integer, device, allocatable, dimension(:) :: z
-  end type
-contains
-  subroutine sub1()
-    type(ty_device) :: a
-  end subroutine
-
-! CHECK-LABEL: func.func @_QMm1Psub1()
-! CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}> {bindc_name = "a", data_attr = #cuf.cuda<managed>, uniq_name = "_QMm1Fsub1Ea"} -> !fir.ref<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>
-! CHECK: %[[DT:.*]]:2 = hlfir.declare %[[ALLOC]] {data_attr = #cuf.cuda<managed>, uniq_name = "_QMm1Fsub1Ea"} : (!fir.ref<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> (!fir.ref<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>)
-! CHECK: fir.address_of(@_QQ_QMm1Tty_device.DerivedInit)
-! CHECK: fir.copy 
-! CHECK: %[[X:.*]] = fir.coordinate_of %[[DT]]#0, x : (!fir.ref<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK: cuf.set_allocator_idx %[[X]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>}
-! CHECK: %[[Z:.*]] = fir.coordinate_of %[[DT]]#0, z : (!fir.ref<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK: cuf.set_allocator_idx %[[Z]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>}
-
-  subroutine sub2()
-    type(ty_device), pointer :: d1
-    allocate(d1)
-  end subroutine
-
-! CHECK-LABEL: func.func @_QMm1Psub2()
-! CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.box<!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>> {bindc_name = "d1", data_attr = #cuf.cuda<managed>, uniq_name = "_QMm1Fsub2Ed1"} -> !fir.ref<!fir.box<!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMm1Fsub2Ed1"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>)
-! CHECK: cuf.allocate
-! CHECK: %[[LOAD:.*]] = fir.load %[[DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>
-! CHECK: %[[ADDR:.*]] = fir.box_addr %[[LOAD]] : (!fir.box<!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>) -> !fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ADDR]], x : (!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK: cuf.set_allocator_idx %[[COORD1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>}
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ADDR]], z : (!fir.ptr<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK: cuf.set_allocator_idx %[[COORD2]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>}
-
-  subroutine sub3()
-    type(ty_device), allocatable :: d1
-    allocate(d1)
-  end subroutine
-
-! CHECK-LABEL: func.func @_QMm1Psub3()
-! CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>> {bindc_name = "d1", data_attr = #cuf.cuda<managed>, uniq_name = "_QMm1Fsub3Ed1"} -> !fir.ref<!fir.box<!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMm1Fsub3Ed1"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>)
-! CHECK: cuf.allocate
-! CHECK: %[[LOAD:.*]] = fir.load %[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>
-! CHECK: %[[ADDR:.*]] = fir.box_addr %[[LOAD]] : (!fir.box<!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>) -> !fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ADDR]], x : (!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK: cuf.set_allocator_idx %[[COORD1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>}
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ADDR]], z : (!fir.heap<!fir.type<_QMm1Tty_device{x:!fir.box<!fir.heap<!fir.array<?xi32>>>,y:i32,z:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK: cuf.set_allocator_idx %[[COORD2]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>}
-
-  subroutine sub4()
-    type(ty_device), allocatable :: d1(:,:)
-    allocate(d1(10, 10))
-  end subroutine
-
-! CHECK-LABEL: func.func @_QMm1Psub4()
-! CHECK: cuf.allocate
-! CHECK-COUNT-2: fir.do_loop
-! CHECK-COUNT-2: cuf.set_allocator_idx
-
-end module
diff --git a/flang/test/Lower/CUDA/cuda-stream.cuf b/flang/test/Lower/CUDA/cuda-stream.cuf
new file mode 100644
index 0000000..a58ab4e
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-stream.cuf
@@ -0,0 +1,15 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+attributes(global) subroutine sharedmem()
+  real, shared :: s(*)
+  integer :: t
+  t = threadIdx%x
+  s(t) = t
+end subroutine
+
+program test
+  call sharedmem<<<1, 1, 1024, 0>>>()
+end
+
+! CHECK-LABEL: func.func @_QQmain()
+! CHECK: cuf.kernel_launch @_QPsharedmem<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c1024{{.*}}, %{{.*}} : !fir.ref<i64>>>>()
diff --git a/flang/test/Lower/Coarray/co_broadcast.f90 b/flang/test/Lower/Coarray/co_broadcast.f90
new file mode 100644
index 0000000..be7fdcb
--- /dev/null
+++ b/flang/test/Lower/Coarray/co_broadcast.f90
@@ -0,0 +1,92 @@
+! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s
+
+program test_co_broadcast
+  integer :: i, array_i(2), status
+  real :: r, array_r(2)
+  double precision :: d, array_d(2)
+  complex :: c, array_c(2)
+  character(len=1) :: message
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_I:.*]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<i32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_broadcast(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_broadcast(i,       source_image=1)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_C:.*]]#0 : (!fir.ref<complex<f32>>) -> !fir.box<complex<f32>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<complex<f32>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_broadcast(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS:.*]], %[[V2]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_broadcast(c,       source_image=1, stat=status)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_D:.*]]#0 : (!fir.ref<f64>) -> !fir.box<f64>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE:.*]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<f64>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_broadcast(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_broadcast(d,       source_image=1, stat=status, errmsg=message)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_R:.*]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<f32>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_broadcast(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_broadcast(r,       source_image=1, stat=status, errmsg=message)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xi32>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_broadcast(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_broadcast(array_i, source_image=1)
+  
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xcomplex<f32>>>, !fir.shape<1>) -> !fir.box<!fir.array<2xcomplex<f32>>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xcomplex<f32>>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_broadcast(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_broadcast(array_c, source_image=1)
+  
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xf64>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_broadcast(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V2]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_broadcast(array_d, source_image=1, stat=status)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_broadcast(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_broadcast(array_r, source_image=1, stat= status, errmsg=message)
+
+end program
diff --git a/flang/test/Lower/Coarray/co_max.f90 b/flang/test/Lower/Coarray/co_max.f90
new file mode 100644
index 0000000..56d8633
--- /dev/null
+++ b/flang/test/Lower/Coarray/co_max.f90
@@ -0,0 +1,112 @@
+! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s
+
+program test_co_max
+  integer :: i, array_i(2), status
+  real :: r, array_r(2)
+  double precision :: d, array_d(2)
+  character(len=1) :: c, array_c(2), message
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_I:.*]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<i32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(i)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_C:.*]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.char<1>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_max_character(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(c)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_D:.*]]#0 : (!fir.ref<f64>) -> !fir.box<f64>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<f64>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(d)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_R:.*]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<f32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(r)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_I]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<i32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(i,       result_image=1)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_D]]#0 : (!fir.ref<f64>) -> !fir.box<f64>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE:.*]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<f64>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS:.*]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(d,       result_image=1, stat=status, errmsg=message)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_R]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<f32>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(r,       result_image=1, stat=status, errmsg=message)
+
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xi32>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(array_i)
+   
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2x!fir.char<1>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1>>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2x!fir.char<1>>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_max_character(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(array_c, result_image=1) 
+   
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xf64>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V2]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(array_d, result_image=1, stat=status)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_max(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_max(array_r, result_image=1, stat= status, errmsg=message)
+
+end program
diff --git a/flang/test/Lower/Coarray/co_min.f90 b/flang/test/Lower/Coarray/co_min.f90
new file mode 100644
index 0000000..dde878b
--- /dev/null
+++ b/flang/test/Lower/Coarray/co_min.f90
@@ -0,0 +1,112 @@
+! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s
+
+program test_co_min
+  integer :: i, array_i(2), status
+  real :: r, array_r(2)
+  double precision :: d, array_d(2)
+  character(len=1) :: c, array_c(2), message
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_I:.*]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<i32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(i)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_C:.*]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.char<1>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_min_character(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(c)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_D:.*]]#0 : (!fir.ref<f64>) -> !fir.box<f64>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<f64>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(d)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_R:.*]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<f32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(r)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_I]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<i32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(i,       result_image=1)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_D]]#0 : (!fir.ref<f64>) -> !fir.box<f64>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE:.*]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<f64>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS:.*]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(d,       result_image=1, stat=status, errmsg=message)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_R]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<f32>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(r,       result_image=1, stat=status, errmsg=message)
+
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xi32>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(array_i)
+   
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2x!fir.char<1>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1>>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2x!fir.char<1>>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_min_character(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(array_c, result_image=1) 
+   
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xf64>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V2]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(array_d, result_image=1, stat=status)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_min(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_min(array_r, result_image=1, stat= status, errmsg=message)
+
+end program
diff --git a/flang/test/Lower/Coarray/co_sum.f90 b/flang/test/Lower/Coarray/co_sum.f90
new file mode 100644
index 0000000..2932b54
--- /dev/null
+++ b/flang/test/Lower/Coarray/co_sum.f90
@@ -0,0 +1,122 @@
+! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s
+
+program test_co_sum
+  integer :: i, array_i(2), status
+  real :: r, array_r(2)
+  double precision :: d, array_d(2)
+  complex :: c, array_c(2)
+  character(len=1) :: message
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_I:.*]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<i32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(i)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_C:.*]]#0 : (!fir.ref<complex<f32>>) -> !fir.box<complex<f32>>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<complex<f32>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(c)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_D:.*]]#0 : (!fir.ref<f64>) -> !fir.box<f64>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<f64>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(d)
+
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_R:.*]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<f32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(r)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_I]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<i32>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(i,       result_image=1)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_C]]#0 : (!fir.ref<complex<f32>>) -> !fir.box<complex<f32>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<complex<f32>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS:.*]], %[[V2]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(c,       result_image=1, stat=status)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_D]]#0 : (!fir.ref<f64>) -> !fir.box<f64>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE:.*]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<f64>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(d,       result_image=1, stat=status, errmsg=message)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[V1:.*]] = fir.embox %[[VAR_R]]#0 : (!fir.ref<f32>) -> !fir.box<f32>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<f32>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(r,       result_image=1, stat=status, errmsg=message)
+
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xi32>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V5]], %[[V2]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(array_i)
+  
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xcomplex<f32>>>, !fir.shape<1>) -> !fir.box<!fir.array<2xcomplex<f32>>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.ref<i32>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xcomplex<f32>>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V5]], %[[IMAGE_RESULT]], %[[V2]], %[[V3]], %[[V4]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(array_c, result_image=1)
+  
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xf64>>) -> !fir.box<none>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V2]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(array_d, result_image=1, stat=status)
+
+  ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1>
+  ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
+  ! CHECK: fir.store %[[C1_i32]] to %[[IMAGE_RESULT:.*]] : !fir.ref<i32>
+  ! CHECK: %[[V2:.*]] = fir.embox %[[MESSAGE]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.char<1>>
+  ! CHECK: %[[V3:.*]] = fir.absent !fir.box<!fir.char<1,?>>
+  ! CHECK: %[[V4:.*]] = fir.convert %[[V1]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
+  ! CHECK: %[[V5:.*]] = fir.convert %[[V2]] : (!fir.box<!fir.char<1>>) -> !fir.box<!fir.char<1,?>>
+  ! CHECK: fir.call @_QMprifPprif_co_sum(%[[V4]], %[[IMAGE_RESULT]], %[[STATUS]], %[[V5]], %[[V3]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> ()
+  call co_sum(array_r, result_image=1, stat= status, errmsg=message)
+
+end program
diff --git a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
index e2fd268..08492e9 100644
--- a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
+++ b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
@@ -58,12 +58,16 @@ subroutine pointer_remapping(p, ziel)
 ! CHECK:  %[[VAL_14:.*]] = fir.convert %[[VAL_9]] : (i64) -> index
 ! CHECK:  %[[VAL_15:.*]] = arith.subi %[[VAL_14]], %[[VAL_13]] : index
 ! CHECK:  %[[VAL_16:.*]] = arith.addi %[[VAL_15]], %[[VAL_12]] : index
+! CHECK:  %[[cmp0:.*]] = arith.cmpi sgt, %[[VAL_16]], %c0{{.*}} : index
+! CHECK:  %[[ext0:.*]] = arith.select %[[cmp0]], %[[VAL_16]], %c0{{.*}} : index
 ! CHECK:  %[[VAL_17:.*]] = fir.convert %[[VAL_10]] : (i64) -> index
 ! CHECK:  %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (i64) -> index
 ! CHECK:  %[[VAL_19:.*]] = arith.subi %[[VAL_18]], %[[VAL_17]] : index
 ! CHECK:  %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_12]] : index
+! CHECK:  %[[cmp1:.*]] = arith.cmpi sgt, %[[VAL_20]], %c0{{.*}} : index
+! CHECK:  %[[ext1:.*]] = arith.select %[[cmp1]], %[[VAL_20]], %c0{{.*}} : index
 ! CHECK:  %[[VAL_21:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.array<10x20x30xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
-! CHECK:  %[[VAL_22:.*]] = fir.shape_shift %[[VAL_8]], %[[VAL_16]], %[[VAL_10]], %[[VAL_20]] : (i64, index, i64, index) -> !fir.shapeshift<2>
+! CHECK:  %[[VAL_22:.*]] = fir.shape_shift %[[VAL_8]], %[[ext0]], %[[VAL_10]], %[[ext1]] : (i64, index, i64, index) -> !fir.shapeshift<2>
 ! CHECK:  %[[VAL_23:.*]] = fir.embox %[[VAL_21]](%[[VAL_22]]) : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
 ! CHECK:  fir.store %[[VAL_23]] to %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
 end subroutine
diff --git a/flang/test/Lower/HLFIR/index.f90 b/flang/test/Lower/HLFIR/index.f90
new file mode 100644
index 0000000..a36027f
--- /dev/null
+++ b/flang/test/Lower/HLFIR/index.f90
@@ -0,0 +1,162 @@
+! Test lowering of index intrinsic to HLFIR
+! RUN: bbc -emit-hlfir -o - %s 2>&1 | FileCheck %s
+
+subroutine t(s)
+  implicit none
+  character(len=*, kind=1):: s
+  integer :: n
+  n = index(s,'this')
+end subroutine t
+! CHECK-LABEL:   func.func @_QPt(
+! CHECK-SAME:                    %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFtEn"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFtEs"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_5:.*]] = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 4 : index
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+! CHECK:           %[[VAL_8:.*]] = hlfir.index %[[VAL_7]]#0 in %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,4>>, !fir.boxchar<1>) -> i32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_2]]#0 : i32, !fir.ref<i32>
+
+subroutine t1(s, b)
+  implicit none
+  character(len=*, kind=1):: s
+  logical :: b
+  integer :: n
+  n = index(s,'this', back = b)
+end subroutine t1
+! CHECK-LABEL:   func.func @_QPt1(
+! CHECK-SAME:                     %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"},
+! CHECK-SAME:                     %[[ARG1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFt1Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_6:.*]] = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 4 : index
+! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] typeparams %[[VAL_7]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+! CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_10:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_5]]#0 back %[[VAL_9]] : (!fir.ref<!fir.char<1,4>>, !fir.boxchar<1>, !fir.logical<4>) -> i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+
+
+subroutine t2(s, c)
+  implicit none
+  character(len=*, kind=2):: s, c
+  integer :: n
+  n = index(s,c,back=.false.)
+end subroutine t2
+! CHECK-LABEL:   func.func @_QPt2(
+! CHECK-SAME:                     %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"},
+! CHECK-SAME:                     %[[ARG1:.*]]: !fir.boxchar<2> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt2Ec"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+! CHECK:           %[[VAL_7:.*]] = arith.constant false
+! CHECK:           %[[VAL_8:.*]] = hlfir.index %[[VAL_2]]#0 in %[[VAL_6]]#0 back %[[VAL_7]] : (!fir.boxchar<2>, !fir.boxchar<2>, i1) -> i32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+
+subroutine t3(s, c)
+  implicit none
+  character(len=*, kind=4):: s, c
+  integer :: n
+  n = index(s,c,back=.true., kind=1)
+end subroutine t3
+! CHECK-LABEL:   func.func @_QPt3(
+! CHECK-SAME:                     %[[ARG0:.*]]: !fir.boxchar<4> {fir.bindc_name = "s"},
+! CHECK-SAME:                     %[[ARG1:.*]]: !fir.boxchar<4> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt3Ec"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+! CHECK:           %[[VAL_7:.*]] = arith.constant true
+! CHECK:           %[[VAL_8:.*]] = hlfir.index %[[VAL_2]]#0 in %[[VAL_6]]#0 back %[[VAL_7]] : (!fir.boxchar<4>, !fir.boxchar<4>, i1) -> i8
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i8) -> i32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+
+subroutine t4(c1, c2)
+  implicit none
+  character(*) :: c1(3)
+  character(*) :: c2(3)
+  integer(kind=1) :: n(3)
+  n = index(c1, c2, kind=1)
+end subroutine t4
+! CHECK-LABEL:   func.func @_QPt4(
+! CHECK-SAME:                     %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"},
+! CHECK-SAME:                     %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c2"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>>
+! CHECK:           %[[VAL_3:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt4Ec1"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>>
+! CHECK:           %[[VAL_8:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_9]]) typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt4Ec2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_11:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_12:.*]] = fir.alloca !fir.array<3xi8> {bindc_name = "n", uniq_name = "_QFt4En"}
+! CHECK:           %[[VAL_13:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_13]]) {uniq_name = "_QFt4En"} : (!fir.ref<!fir.array<3xi8>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi8>>, !fir.ref<!fir.array<3xi8>>)
+! CHECK:           %[[VAL_15:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<3xi8> {
+! CHECK:           ^bb0(%[[VAL_16:.*]]: index):
+! CHECK:             %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  typeparams %[[VAL_1]]#1 : (!fir.box<!fir.array<3x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
+! CHECK:             %[[VAL_18:.*]] = hlfir.designate %[[VAL_10]]#0 (%[[VAL_16]])  typeparams %[[VAL_6]]#1 : (!fir.box<!fir.array<3x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
+! CHECK:             %[[VAL_19:.*]] = hlfir.index %[[VAL_18]] in %[[VAL_17]] : (!fir.boxchar<1>, !fir.boxchar<1>) -> i8
+! CHECK:             hlfir.yield_element %[[VAL_19]] : i8
+! CHECK:           }
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_14]]#0 : !hlfir.expr<3xi8>, !fir.ref<!fir.array<3xi8>>
+
+! index is called as elemental with the 3d argument optional for 'sub' (^bb0 block)
+! Make sure that the argument is actually accessed (hlfir.designate) only
+! under fir.if that depends on fir.is_present check.
+program test
+  call sub('abcdefgc',(/'c','c'/))
+contains
+  subroutine sub(a,b,c)
+    character(*) a,b(:)
+    logical,optional :: c(:)
+    print *,index(a,b,c)
+  end subroutine
+end program test
+! CHECK-LABEL:   func.func private @_QFPsub(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "a"},
+! CHECK-SAME:      %[[ARG1:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "b"},
+! CHECK-SAME:      %[[ARG2:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "c", fir.optional}) attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFFsubEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFFsubEb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFFsubEc"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:           %[[VAL_10:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> i1
+! CHECK:           %[[VAL_11:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_11]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index) -> (index, index, index)
+! CHECK:           %[[VAL_13:.*]] = fir.shape %[[VAL_12]]#1 : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_14:.*]] = hlfir.elemental %[[VAL_13]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
+! CHECK:           ^bb0(%[[VAL_15:.*]]: index):
+! CHECK:             %[[VAL_16:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
+! CHECK:             %[[VAL_17:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_15]])  typeparams %[[VAL_16]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
+! CHECK:             %[[VAL_18:.*]] = fir.if %[[VAL_10]] -> (!fir.logical<4>) {
+! CHECK:               %[[VAL_19:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_15]])  : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.logical<4>>
+! CHECK:               fir.result %[[VAL_20]] : !fir.logical<4>
+! CHECK:             } else {
+! CHECK:               %[[VAL_21:.*]] = arith.constant false
+! CHECK:               %[[VAL_22:.*]] = fir.convert %[[VAL_21]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.result %[[VAL_22]] : !fir.logical<4>
+! CHECK:             }
+! CHECK:             %[[VAL_23:.*]] = hlfir.index %[[VAL_17]] in %[[VAL_2]]#0 back %[[VAL_18]] : (!fir.boxchar<1>, !fir.boxchar<1>, !fir.logical<4>) -> i32
+! CHECK:             hlfir.yield_element %[[VAL_23]] : i32
+! CHECK:           }
diff --git a/flang/test/Lower/HLFIR/issue80884.f90 b/flang/test/Lower/HLFIR/issue80884.f90
index 5c05a99..a5a5178 100644
--- a/flang/test/Lower/HLFIR/issue80884.f90
+++ b/flang/test/Lower/HLFIR/issue80884.f90
@@ -26,7 +26,9 @@ end subroutine
 ! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_5]] : (i64) -> index
 ! CHECK:           %[[VAL_14:.*]] = arith.subi %[[VAL_13]], %[[VAL_12]] : index
 ! CHECK:           %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_11]] : index
+! CHECK:           %[[cmp0:.*]] = arith.cmpi sgt, %[[VAL_15]], %c0{{.*}} : index
+! CHECK:           %[[ext0:.*]] = arith.select %[[cmp0]], %[[VAL_15]], %c0{{.*}} : index
 ! CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_10]] : (!fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<?xf32>>
-! CHECK:           %[[VAL_17:.*]] = fir.shape_shift %[[VAL_4]], %[[VAL_15]] : (i64, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_17:.*]] = fir.shape_shift %[[VAL_4]], %[[ext0]] : (i64, index) -> !fir.shapeshift<1>
 ! CHECK:           %[[VAL_18:.*]] = fir.embox %[[VAL_16]](%[[VAL_17]]) : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
 ! CHECK:           fir.store %[[VAL_18]] to %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
diff --git a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
index 5f8ea03..a75a022 100644
--- a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
+++ b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
@@ -2,6 +2,7 @@
 ! Tests the new functionality that converts Fortran iteration constructs to acc.loop with proper IV handling.
 
 ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -fopenacc -emit-hlfir --openacc-do-loop-to-acc-loop=false %s -o - | FileCheck %s --check-prefix=CHECK-NOACCLOOP
 
 ! CHECK-LABEL: func.func @_QPbasic_do_loop
 subroutine basic_do_loop()
@@ -17,10 +18,19 @@ subroutine basic_do_loop()
   !$acc end kernels
 
 ! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loopEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 
+! CHECK-NOACCLOOP-LABEL: func.func @_QPbasic_do_loop
+! CHECK-NOACCLOOP: acc.kernels {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPbasic_do_concurrent
@@ -37,10 +47,19 @@ subroutine basic_do_concurrent()
   !$acc end kernels
 
 ! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrentEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 
+! CHECK-NOACCLOOP-LABEL: func.func @_QPbasic_do_concurrent
+! CHECK-NOACCLOOP: acc.kernels {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPbasic_do_loop_parallel
@@ -57,10 +76,19 @@ subroutine basic_do_loop_parallel()
   !$acc end parallel
 
 ! CHECK: acc.parallel {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loop_parallelEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 
+! CHECK-NOACCLOOP-LABEL: func.func @_QPbasic_do_loop_parallel
+! CHECK-NOACCLOOP: acc.parallel {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPbasic_do_loop_serial
@@ -77,10 +105,19 @@ subroutine basic_do_loop_serial()
   !$acc end serial
 
 ! CHECK: acc.serial {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loop_serialEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
 
+! CHECK-NOACCLOOP-LABEL: func.func @_QPbasic_do_loop_serial
+! CHECK-NOACCLOOP: acc.serial {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPbasic_do_concurrent_parallel
@@ -97,10 +134,19 @@ subroutine basic_do_concurrent_parallel()
   !$acc end parallel
 
 ! CHECK: acc.parallel {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrent_parallelEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
+! CHECK-NOACCLOOP-LABEL: func.func @_QPbasic_do_concurrent_parallel
+! CHECK-NOACCLOOP: acc.parallel {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPbasic_do_concurrent_serial
@@ -117,10 +163,19 @@ subroutine basic_do_concurrent_serial()
   !$acc end serial
 
 ! CHECK: acc.serial {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrent_serialEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
 
+! CHECK-NOACCLOOP-LABEL: func.func @_QPbasic_do_concurrent_serial
+! CHECK-NOACCLOOP: acc.serial {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPmulti_dimension_do_concurrent
@@ -137,9 +192,29 @@ subroutine multi_dimension_do_concurrent()
   !$acc end kernels
 
 ! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
+! CHECK-DAG: %[[PRIVATE_I:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK-DAG: %[[PRIVATE_J:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "j"}
+! CHECK-DAG: %[[PRIVATE_K:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "k"}
+! CHECK-DAG: %[[PRIVATE_I_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I]] {uniq_name = "_QFmulti_dimension_do_concurrentEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-DAG: %[[PRIVATE_J_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_J]] {uniq_name = "_QFmulti_dimension_do_concurrentEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-DAG: %[[PRIVATE_K_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_K]] {uniq_name = "_QFmulti_dimension_do_concurrentEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I]] : !fir.ref<i32>, @privatization_ref_i32 -> %[[PRIVATE_J]] : !fir.ref<i32>, @privatization_ref_i32 -> %[[PRIVATE_K]] : !fir.ref<i32>) control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_K_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_K_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_K_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
+
+! CHECK-NOACCLOOP-LABEL: func.func @_QPmulti_dimension_do_concurrent
+! CHECK-NOACCLOOP: acc.kernels {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 
@@ -159,13 +234,27 @@ subroutine nested_do_loops()
   !$acc end kernels
 
 ! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK-DAG: %[[PRIVATE_I:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK-DAG: %[[PRIVATE_I_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I]] {uniq_name = "_QFnested_do_loopsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[PRIVATE_J:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "j"}
+! CHECK-DAG: %[[PRIVATE_J_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_J]] {uniq_name = "_QFnested_do_loopsEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_J]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK: acc.yield
 ! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 
+! CHECK-NOACCLOOP-LABEL: func.func @_QPnested_do_loops
+! CHECK-NOACCLOOP: acc.kernels {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPvariable_bounds_and_step
@@ -182,10 +271,19 @@ subroutine variable_bounds_and_step(n, start_val, step_val)
   !$acc end kernels
 
 ! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFvariable_bounds_and_stepEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.yield
 ! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 
+! CHECK-NOACCLOOP-LABEL: func.func @_QPvariable_bounds_and_step
+! CHECK-NOACCLOOP: acc.kernels {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPdifferent_iv_types
@@ -216,11 +314,76 @@ subroutine different_iv_types()
   !$acc end kernels
 
 ! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64)
+! CHECK: %[[PRIVATE_I8:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i64>) -> !fir.ref<i64> {implicit = true, name = "i8"}
+! CHECK: %[[PRIVATE_I8_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I8]] {uniq_name = "_QFdifferent_iv_typesEi8"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: acc.loop private(@privatization_ref_i64 -> %[[PRIVATE_I8]] : !fir.ref<i64>) control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_I8_DECLARE]]#0 : !fir.ref<i64>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I8_DECLARE]]#0 : !fir.ref<i64>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I8_DECLARE]]#0 : !fir.ref<i64>
 ! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_I4:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i4"}
+! CHECK: %[[PRIVATE_I4_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I4]] {uniq_name = "_QFdifferent_iv_typesEi4"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I4]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_I4_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I4_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I4_DECLARE]]#0 : !fir.ref<i32>
 ! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16)
+! CHECK: %[[PRIVATE_I2:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i16>) -> !fir.ref<i16> {implicit = true, name = "i2"}
+! CHECK: %[[PRIVATE_I2_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I2]] {uniq_name = "_QFdifferent_iv_typesEi2"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK: acc.loop private(@privatization_ref_i16 -> %[[PRIVATE_I2]] : !fir.ref<i16>) control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_I2_DECLARE]]#0 : !fir.ref<i16>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I2_DECLARE]]#0 : !fir.ref<i16>
+! CHECK: %{{.*}} = fir.load %[[PRIVATE_I2_DECLARE]]#0 : !fir.ref<i16>
+
+! CHECK-NOACCLOOP-LABEL: func.func @_QPdifferent_iv_types
+! CHECK-NOACCLOOP: acc.kernels {
+! CHECK-NOACCLOOP-NOT: acc.loop
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPnested_loop_with_reduction
+subroutine nested_loop_with_reduction(x, y)
+  integer :: x, y
+  integer :: i, j
+
+  ! Nested loop with reduction variables - check that reduction operations
+  ! are correctly scoped (outer loop reduction should not be inside inner loop)
+  !$acc parallel
+  !$acc loop reduction(+:x,y)
+  do i = 1, 10
+    do j = 1, 20
+      y = y + 1
+    end do
+    x = x + 1
+  end do
+  !$acc end parallel
+
+! CHECK: acc.parallel {
+! CHECK: %[[REDUCTION_X:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {name = "x"}
+! CHECK: %[[REDUCTION_Y:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {name = "y"}
+! CHECK: %[[PRIVATE_I:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: %[[PRIVATE_I_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I]] {uniq_name = "_QFnested_loop_with_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I]] : !fir.ref<i32>) reduction(@reduction_add_ref_i32 -> %[[REDUCTION_X]] : !fir.ref<i32>, @reduction_add_ref_i32 -> %[[REDUCTION_Y]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %[[PRIVATE_J:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "j"}
+! CHECK: %[[PRIVATE_J_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_J]] {uniq_name = "_QFnested_loop_with_reductionEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_J]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: fir.store %{{.*}} to %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : i32
+! CHECK: hlfir.assign %{{.*}} to %{{.*}} : i32, !fir.ref<i32>
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+! CHECK: %{{.*}} = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : i32
+! CHECK: hlfir.assign %{{.*}} to %{{.*}} : i32, !fir.ref<i32>
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+
+! CHECK-NOACCLOOP-LABEL: func.func @_QPnested_loop_with_reduction
+! CHECK-NOACCLOOP: acc.parallel {
+! CHECK-NOACCLOOP: acc.loop{{.*}}reduction{{.*}}control
+! CHECK-NOACCLOOP-NOT: acc.loop
 
 end subroutine
 
diff --git a/flang/test/Lower/OpenMP/nested-loop-transformation-construct01.f90 b/flang/test/Lower/OpenMP/nested-loop-transformation-construct01.f90
deleted file mode 100644
index 17eba93..0000000
--- a/flang/test/Lower/OpenMP/nested-loop-transformation-construct01.f90
+++ /dev/null
@@ -1,20 +0,0 @@
-! Test to ensure TODO message is emitted for tile OpenMP 5.1 Directives when they are nested.
-
-!RUN: not %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
-
-subroutine loop_transformation_construct
-  implicit none
-  integer :: I = 10
-  integer :: x
-  integer :: y(I)
-
-  !$omp do
-  !$omp tile
-  do i = 1, I
-    y(i) = y(i) * 5
-  end do
-  !$omp end tile
-  !$omp end do
-end subroutine
-
-!CHECK: not yet implemented: Unhandled loop directive (tile)
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
index 2890e78..faf8f71 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
@@ -108,7 +108,7 @@ subroutine omp_do_lastprivate_collapse2(a)
   ! CHECK-NEXT: %[[UB2:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<i32>
   ! CHECK-NEXT: %[[STEP2:.*]] = arith.constant 1 : i32
   ! CHECK-NEXT: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[I_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[J_PVT_REF:.*]] : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) {
-  ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]], %[[ARG2:.*]]) : i32 = (%[[LB1]], %[[LB2]]) to (%[[UB1]], %[[UB2]]) inclusive step (%[[STEP1]], %[[STEP2]]) {
+  ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]], %[[ARG2:.*]]) : i32 = (%[[LB1]], %[[LB2]]) to (%[[UB1]], %[[UB2]]) inclusive step (%[[STEP1]], %[[STEP2]]) collapse(2) {
   ! CHECK:      %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
   ! CHECK:      %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
   ! CHECK:      %[[J_PVT_DECL:.*]]:2 = hlfir.declare %[[J_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse2Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -174,7 +174,7 @@ subroutine omp_do_lastprivate_collapse3(a)
   ! CHECK-NEXT: %[[UB3:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<i32>
   ! CHECK-NEXT: %[[STEP3:.*]] = arith.constant 1 : i32
   ! CHECK-NEXT: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[A_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[I_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[J_PVT_REF:.*]], @{{.*}} %{{.*}}#0 -> %[[K_PVT_REF:.*]] : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) {
-  ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]], %[[ARG2:.*]], %[[ARG3:.*]]) : i32 = (%[[LB1]], %[[LB2]], %[[LB3]]) to (%[[UB1]], %[[UB2]], %[[UB3]]) inclusive step (%[[STEP1]], %[[STEP2]], %[[STEP3]]) {
+  ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]], %[[ARG2:.*]], %[[ARG3:.*]]) : i32 = (%[[LB1]], %[[LB2]], %[[LB3]]) to (%[[UB1]], %[[UB2]], %[[UB3]]) inclusive step (%[[STEP1]], %[[STEP2]], %[[STEP3]]) collapse(3) {
   ! CHECK:      %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
   ! CHECK:      %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
   ! CHECK:      %[[J_PVT_DECL:.*]]:2 = hlfir.declare %[[J_PVT_REF]] {uniq_name = "_QFomp_do_lastprivate_collapse3Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index 7655c78..369b5eb 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -175,7 +175,7 @@ subroutine simd_with_collapse_clause(n)
   ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = (
   ! CHECK-SAME:                %[[LOWER_I]], %[[LOWER_J]]) to (
   ! CHECK-SAME:                %[[UPPER_I]], %[[UPPER_J]]) inclusive step (
-  ! CHECK-SAME:                %[[STEP_I]], %[[STEP_J]]) {
+  ! CHECK-SAME:                %[[STEP_I]], %[[STEP_J]]) collapse(2) {
   !$OMP SIMD COLLAPSE(2)
   do i = 1, n
     do j = 1, n
diff --git a/flang/test/Lower/OpenMP/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/wsloop-collapse.f90
index 7ec40ab..677c780 100644
--- a/flang/test/Lower/OpenMP/wsloop-collapse.f90
+++ b/flang/test/Lower/OpenMP/wsloop-collapse.f90
@@ -57,7 +57,7 @@ program wsloop_collapse
 !CHECK:           %[[VAL_31:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
 !CHECK:           %[[VAL_32:.*]] = arith.constant 1 : i32
 !CHECK:           omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[VAL_4:.*]], @{{.*}} %{{.*}}#0 -> %[[VAL_2:.*]], @{{.*}} %{{.*}}#0 -> %[[VAL_0:.*]] : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) {
-!CHECK-NEXT:        omp.loop_nest (%[[VAL_33:.*]], %[[VAL_34:.*]], %[[VAL_35:.*]]) : i32 = (%[[VAL_24]], %[[VAL_27]], %[[VAL_30]]) to (%[[VAL_25]], %[[VAL_28]], %[[VAL_31]]) inclusive step (%[[VAL_26]], %[[VAL_29]], %[[VAL_32]]) {
+!CHECK-NEXT:        omp.loop_nest (%[[VAL_33:.*]], %[[VAL_34:.*]], %[[VAL_35:.*]]) : i32 = (%[[VAL_24]], %[[VAL_27]], %[[VAL_30]]) to (%[[VAL_25]], %[[VAL_28]], %[[VAL_31]]) inclusive step (%[[VAL_26]], %[[VAL_29]], %[[VAL_32]]) collapse(3) {
   !$omp do collapse(3)
   do i = 1, a
      do j= 1, b
diff --git a/flang/test/Lower/OpenMP/wsloop-variable.f90 b/flang/test/Lower/OpenMP/wsloop-variable.f90
index f998c84..0f4aafb 100644
--- a/flang/test/Lower/OpenMP/wsloop-variable.f90
+++ b/flang/test/Lower/OpenMP/wsloop-variable.f90
@@ -22,7 +22,7 @@ program wsloop_variable
 !CHECK:      %[[TMP6:.*]] = fir.convert %[[TMP1]] : (i32) -> i64
 !CHECK:      %[[TMP7:.*]] = fir.convert %{{.*}} : (i32) -> i64
 !CHECK:      omp.wsloop private({{.*}}) {
-!CHECK-NEXT:   omp.loop_nest (%[[ARG0:.*]], %[[ARG1:.*]]) : i64 = (%[[TMP2]], %[[TMP5]]) to (%[[TMP3]], %[[TMP6]]) inclusive step (%[[TMP4]], %[[TMP7]]) {
+!CHECK-NEXT:   omp.loop_nest (%[[ARG0:.*]], %[[ARG1:.*]]) : i64 = (%[[TMP2]], %[[TMP5]]) to (%[[TMP3]], %[[TMP6]]) inclusive step (%[[TMP4]], %[[TMP7]]) collapse(2) {
 !CHECK:          %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i64) -> i16
 !CHECK:          hlfir.assign %[[ARG0_I16]] to %[[STORE_IV0:.*]]#0 : i16, !fir.ref<i16>
 !CHECK:          hlfir.assign %[[ARG1]] to %[[STORE_IV1:.*]]#0 : i64, !fir.ref<i64>
diff --git a/flang/test/Lower/components.f90 b/flang/test/Lower/components.f90
index 5afde4b..f0caddb 100644
--- a/flang/test/Lower/components.f90
+++ b/flang/test/Lower/components.f90
@@ -136,7 +136,7 @@ end subroutine
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 5 : index
 ! CHECK:           %[[VAL_9:.*]] = hlfir.designate %[[VAL_4]]#0{"c"}   shape %[[VAL_3]] typeparams %[[VAL_8]] : (!fir.ref<!fir.array<10x!fir.type<_QFlhs_char_sectionTt
-! CHECK:           hlfir.assign %[[VAL_7]]#0 to %[[VAL_9]] : !fir.ref<!fir.char<1,5>>, !fir.box<!fir.array<10x!fir.char<1,5>>>
+! CHECK:           hlfir.assign %[[VAL_7]]#0 to %[[VAL_9]] : !fir.ref<!fir.char<1,5>>, !fir.ref<!fir.array<10x!fir.char<1,5>>>
 ! CHECK:           return
 ! CHECK:         }
 
@@ -163,7 +163,7 @@ end subroutine
 ! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_10]]) typeparams %[[VAL_8]] dummy_scope %[[VAL_2]] {uniq_name = "_QFrhs_char_sectionEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>)
 ! CHECK:           %[[VAL_12:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_13:.*]] = hlfir.designate %[[VAL_5]]#0{"c"}   shape %[[VAL_4]] typeparams %[[VAL_12]] : (!fir.ref<!fir.array<10x!fir.type<_QFrhs_char_sectionTt
-! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_11]]#0 : !fir.box<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_11]]#0 : !fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>
 ! CHECK:           return
 ! CHECK:         }
 
@@ -192,7 +192,7 @@ end subroutine
 ! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]] typeparams %[[VAL_12]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
 ! CHECK:           %[[VAL_14:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
 ! CHECK:           ^bb0(%[[VAL_15:.*]]: index):
-! CHECK:             %[[VAL_16:.*]] = hlfir.designate %[[VAL_10]] (%[[VAL_15]])  typeparams %[[VAL_9]] : (!fir.box<!fir.array<10x!fir.char<1,10>>>, index, index) -> !fir.ref<!fir.char<1,10>>
+! CHECK:             %[[VAL_16:.*]] = hlfir.designate %[[VAL_10]] (%[[VAL_15]])  typeparams %[[VAL_9]] : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, index, index) -> !fir.ref<!fir.char<1,10>>
 ! CHECK:             %[[VAL_17:.*]] = arith.constant false
 ! CHECK:             %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<i8>
 ! CHECK:             %[[VAL_19:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
diff --git a/flang/test/Lower/percent-val-actual-argument.f90 b/flang/test/Lower/percent-val-actual-argument.f90
new file mode 100644
index 0000000..890b197
--- /dev/null
+++ b/flang/test/Lower/percent-val-actual-argument.f90
@@ -0,0 +1,16 @@
+! RUN: flang -fc1 -emit-hlfir %s -o - | FileCheck %s
+
+program main
+  logical::a1
+  data a1/.true./
+  call sa(%val(a1))
+! CHECK: %[[A1_ADDR:.*]] = fir.address_of(@_QFEa1) : !fir.ref<!fir.logical<4>>
+! CHECK: %[[A1_DECL:.*]]:2 = hlfir.declare %[[A1_ADDR]] {uniq_name = "_QFEa1"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: fir.call @_QPsa(%[[A1_DECL]]#0) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> ()
+! CHECK: func.func @_QPsa(%[[SA_ARG:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "x1"}) {
+  write(6,*) "a1 = ", a1
+end program main
+
+subroutine sa(x1)
+  logical::x1
+end subroutine sa
diff --git a/flang/test/Lower/percent-val-value-argument.f90 b/flang/test/Lower/percent-val-value-argument.f90
new file mode 100644
index 0000000..e7d5c54
--- /dev/null
+++ b/flang/test/Lower/percent-val-value-argument.f90
@@ -0,0 +1,17 @@
+! RUN: flang -fc1 -emit-hlfir %s -o - | FileCheck %s
+
+program main
+  logical::a1
+  data a1/.true./
+  call sb(%val(a1))
+! CHECK: %[[A1_ADDR:.*]] = fir.address_of(@_QFEa1) : !fir.ref<!fir.logical<4>>
+! CHECK: %[[A1_DECL:.*]]:2 = hlfir.declare %[[A1_ADDR]] {uniq_name = "_QFEa1"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[A1_LOADED:.*]] = fir.load %[[A1_DECL]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK: fir.call @_QFPsb(%[[A1_LOADED]]) fastmath<contract> : (!fir.logical<4>) -> ()
+! CHECK: func.func private @_QFPsb(%[[SB_ARG:.*]]: !fir.logical<4> {fir.bindc_name = "x1"})
+  write(6,*) "a1 = ", a1
+contains
+  subroutine sb(x1)
+    logical, value :: x1
+  end subroutine sb
+end program main
diff --git a/flang/test/Lower/pointer-assignments.f90 b/flang/test/Lower/pointer-assignments.f90
index ac9c99c..98fd61d 100644
--- a/flang/test/Lower/pointer-assignments.f90
+++ b/flang/test/Lower/pointer-assignments.f90
@@ -113,11 +113,15 @@ subroutine test_array_remap(p, x)
   ! CHECK-DAG: %[[c2_idx:.*]] = fir.convert %c2{{.*}} : (i64) -> index
   ! CHECK-DAG: %[[c11_idx:.*]] = fir.convert %c11{{.*}} : (i64) -> index
   ! CHECK-DAG: %[[diff0:.*]] = arith.subi %[[c11_idx]], %[[c2_idx]] : index
-  ! CHECK-DAG: %[[ext0:.*]] = arith.addi %[[diff0:.*]], %c1{{.*}} : index
+  ! CHECK-DAG: %[[raw_ext0:.*]] = arith.addi %[[diff0:.*]], %c1{{.*}} : index
+  ! CHECK-DAG: %[[cmp0:.*]] = arith.cmpi sgt, %[[raw_ext0]], %c0{{.*}} : index
+  ! CHECK-DAG: %[[ext0:.*]] = arith.select %[[cmp0]], %[[raw_ext0]], %c0{{.*}} : index
   ! CHECK-DAG: %[[c3_idx:.*]] = fir.convert %c3{{.*}} : (i64) -> index
   ! CHECK-DAG: %[[c12_idx:.*]] = fir.convert %c12{{.*}} : (i64) -> index
   ! CHECK-DAG: %[[diff1:.*]] = arith.subi %[[c12_idx]], %[[c3_idx]] : index
-  ! CHECK-DAG: %[[ext1:.*]] = arith.addi %[[diff1]], %c1{{.*}} : index
+  ! CHECK-DAG: %[[raw_ext1:.*]] = arith.addi %[[diff1]], %c1{{.*}} : index
+  ! CHECK-DAG: %[[cmp1:.*]] = arith.cmpi sgt, %[[raw_ext1]], %c0{{.*}} : index
+  ! CHECK-DAG: %[[ext1:.*]] = arith.select %[[cmp1]], %[[raw_ext1]], %c0{{.*}} : index
   ! CHECK-DAG: %[[addrCast:.*]] = fir.convert %[[x]] : (!fir.ref<!fir.array<100xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
   ! CHECK: %[[shape:.*]] = fir.shape_shift %c2{{.*}}, %[[ext0]], %c3{{.*}}, %[[ext1]]
   ! CHECK: %[[box:.*]] = fir.embox %[[addrCast]](%[[shape]]) : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
@@ -132,9 +136,9 @@ subroutine test_array_char_remap(p, x)
   character(*), target :: x(100)
   character(:), pointer :: p(:, :)
   ! CHECK: subi
-  ! CHECK: %[[ext0:.*]] = arith.addi
+  ! CHECK: %[[ext0:.*]] = arith.select
   ! CHECK: subi
-  ! CHECK: %[[ext1:.*]] = arith.addi
+  ! CHECK: %[[ext1:.*]] = arith.select
   ! CHECK: %[[shape:.*]] = fir.shape_shift %c2{{.*}}, %[[ext0]], %c3{{.*}}, %[[ext1]]
   ! CHECK: %[[box:.*]] = fir.embox %{{.*}}(%[[shape]]) typeparams %[[unbox]]#1 : (!fir.ref<!fir.array<?x?x!fir.char<1,?>>>, !fir.shapeshift<2>, index) -> !fir.box<!fir.ptr<!fir.array<?x?x!fir.char<1,?>>>>
   ! CHECK: fir.store %[[box]] to %[[p]]
@@ -218,9 +222,9 @@ subroutine test_array_non_contig_remap(p, x)
   real, target :: x(:)
   real, pointer :: p(:, :)
   ! CHECK: subi
-  ! CHECK: %[[ext0:.*]] = arith.addi
+  ! CHECK: %[[ext0:.*]] = arith.select
   ! CHECK: subi
-  ! CHECK: %[[ext1:.*]] = arith.addi
+  ! CHECK: %[[ext1:.*]] = arith.select
   ! CHECK: %[[shape:.*]] = fir.shape_shift %{{.*}}, %[[ext0]], %{{.*}}, %[[ext1]]
   ! CHECK: %[[rebox:.*]] = fir.rebox %[[x]](%[[shape]]) : (!fir.box<!fir.array<?xf32>>, !fir.shapeshift<2>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
   ! CHECK: fir.store %[[rebox]] to %[[p]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
@@ -250,13 +254,17 @@ end subroutine
 ! CHECK:         %[[VAL_18:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
 ! CHECK:         %[[VAL_19:.*]] = arith.subi %[[VAL_18]], %[[VAL_17]] : index
 ! CHECK:         %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_16]] : index
+! CHECK:         %[[cmp0:.*]] = arith.cmpi sgt, %[[VAL_20]], %c0{{.*}} : index
+! CHECK:         %[[ext0:.*]] = arith.select %[[cmp0]], %[[VAL_20]], %c0{{.*}} : index
 ! CHECK:         %[[VAL_21:.*]] = fir.convert %[[VAL_5]] : (i64) -> index
 ! CHECK:         %[[VAL_22:.*]] = fir.convert %[[VAL_6]] : (i64) -> index
 ! CHECK:         %[[VAL_23:.*]] = arith.subi %[[VAL_22]], %[[VAL_21]] : index
 ! CHECK:         %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_16]] : index
+! CHECK:         %[[cmp1:.*]] = arith.cmpi sgt, %[[VAL_24]], %c0{{.*}} : index
+! CHECK:         %[[ext1:.*]] = arith.select %[[cmp1]], %[[VAL_24]], %c0{{.*}} : index
 ! CHECK:         %[[VAL_25:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
 ! CHECK:         %[[VAL_26:.*]] = fir.convert %[[VAL_5]] : (i64) -> index
-! CHECK:         %[[VAL_27:.*]] = fir.shape_shift %[[VAL_25]], %[[VAL_20]], %[[VAL_26]], %[[VAL_24]] : (index, index, index, index) -> !fir.shapeshift<2>
+! CHECK:         %[[VAL_27:.*]] = fir.shape_shift %[[VAL_25]], %[[ext0]], %[[VAL_26]], %[[ext1]] : (index, index, index, index) -> !fir.shapeshift<2>
 ! CHECK:         %[[VAL_28:.*]] = fir.rebox %[[VAL_15]](%[[VAL_27]]) : (!fir.box<!fir.array<100xf32>>, !fir.shapeshift<2>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
 ! CHECK:         fir.store %[[VAL_28]] to %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
 ! CHECK:         return
@@ -333,7 +341,9 @@ subroutine issue857_array_remap(rhs)
   ! CHECK: %[[c101:.*]] = fir.convert %c101_i64 : (i64) -> index
   ! CHECK: %[[c200:.*]] = fir.convert %c200_i64 : (i64) -> index
   ! CHECK: %[[sub:.*]] = arith.subi %[[c200]], %[[c101]] : index
-  ! CHECK: %[[extent:.*]] = arith.addi %[[sub]], %c1{{.*}} : index
+  ! CHECK: %[[raw_extent:.*]] = arith.addi %[[sub]], %c1{{.*}} : index
+  ! CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index
+  ! CHECK: %[[extent:.*]] = arith.select %[[cmp]], %[[raw_extent]], %c0{{.*}} : index
   ! CHECK: %[[addr:.*]] = fir.box_addr %{{.*}} : (!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QFissue857_array_remapTt{i:i32}>>>>) -> !fir.ptr<!fir.array<?x?x!fir.type<_QFissue857_array_remapTt{i:i32}>>>
   ! CHECK: %[[addr_cast:.*]] = fir.convert %[[addr]] : (!fir.ptr<!fir.array<?x?x!fir.type<_QFissue857_array_remapTt{i:i32}>>>) -> !fir.ptr<!fir.array<?x!fir.type<_QFissue857_array_remapTt{i:i32}>>>
   ! CHECK: fir.store %[[addr_cast]] to %[[lhs_addr]] : !fir.ref<!fir.ptr<!fir.array<?x!fir.type<_QFissue857_array_remapTt{i:i32}>>>>
diff --git a/flang/test/Lower/volatile-string.f90 b/flang/test/Lower/volatile-string.f90
index 38c29b4..54f22af 100644
--- a/flang/test/Lower/volatile-string.f90
+++ b/flang/test/Lower/volatile-string.f90
@@ -25,7 +25,6 @@ end program
 ! CHECK:           %[[VAL_0:.*]] = arith.constant true
 ! CHECK:           %[[VAL_1:.*]] = arith.constant 10 : i32
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 3 : i32
-! CHECK:           %[[VAL_3:.*]] = arith.constant false
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 3 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.heap<!fir.char<1,3>>>
@@ -43,13 +42,8 @@ end program
 ! CHECK:           fir.call @_QFPassign_different_length(%[[VAL_16]]) fastmath<contract> : (!fir.boxchar<1>) -> ()
 ! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QQclX6F) : !fir.ref<!fir.char<1>>
 ! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] typeparams %[[VAL_4]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX6F"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
-! CHECK:           %[[VAL_19:.*]] = fir.convert %[[VAL_15]] : (!fir.ref<!fir.char<1,3>>) -> !fir.ref<i8>
-! CHECK:           %[[VAL_20:.*]] = fir.convert %[[VAL_5]] : (index) -> i64
-! CHECK:           %[[VAL_21:.*]] = fir.convert %[[VAL_18]]#0 : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
-! CHECK:           %[[VAL_22:.*]] = fir.convert %[[VAL_4]] : (index) -> i64
-! CHECK:           %[[VAL_23:.*]] = fir.call @_FortranAIndex1(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]], %[[VAL_3]]) fastmath<contract> : (!fir.ref<i8>, i64, !fir.ref<i8>, i64, i1) -> i64
-! CHECK:           %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i64) -> i32
-! CHECK:           hlfir.assign %[[VAL_24]] to %[[VAL_9]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_21:.*]] = hlfir.index %[[VAL_18]]#0 in %[[VAL_14]]#0 : (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1,3>, volatile>) -> i32
+! CHECK:           hlfir.assign %[[VAL_21]] to %[[VAL_9]]#0 : i32, !fir.ref<i32>
 ! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_9]]#0 : i32, !fir.ref<i32>
 ! CHECK:           %[[VAL_25:.*]] = fir.embox %[[VAL_14]]#0 : (!fir.ref<!fir.char<1,3>, volatile>) -> !fir.box<!fir.char<1,3>, volatile>
 ! CHECK:           %[[VAL_26:.*]] = fir.zero_bits !fir.heap<!fir.char<1,3>>
diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90
new file mode 100644
index 0000000..fbcd5b6
--- /dev/null
+++ b/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90
@@ -0,0 +1,13 @@
+!RUN: %flang_fc1 -fdebug-unparse-with-symbols -fopenmp %s | FileCheck %s
+
+! This used to crash.
+
+subroutine f00
+  !$omp declare reduction(fred : integer, real : omp_out = omp_in + omp_out)
+end
+
+!CHECK: !DEF: /f00 (Subroutine) Subprogram
+!CHECK: subroutine f00
+!CHECK: !$omp declare reduction (fred:integer,real:omp_out = omp_in+omp_out)
+!CHECK: end subroutine
+
diff --git a/flang/test/Parser/OpenMP/do-tile-size.f90 b/flang/test/Parser/OpenMP/do-tile-size.f90
new file mode 100644
index 0000000..886ee4a
--- /dev/null
+++ b/flang/test/Parser/OpenMP/do-tile-size.f90
@@ -0,0 +1,29 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=51 %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=51 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine openmp_do_tiles(x)
+
+  integer, intent(inout)::x
+
+
+!CHECK: !$omp do
+!CHECK: !$omp tile sizes
+!$omp do
+!$omp  tile sizes(2)
+!CHECK: do
+  do x = 1, 100
+     call F1()
+!CHECK: end do
+  end do
+!CHECK: !$omp end tile
+!$omp end tile
+!$omp end do
+
+!PARSE-TREE:| | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE:| | | OmpBeginLoopDirective
+!PARSE-TREE:| | | OpenMPLoopConstruct
+!PARSE-TREE:| | | | OmpBeginLoopDirective
+!PARSE-TREE:| | | | | OmpLoopDirective -> llvm::omp::Directive = tile
+!PARSE-TREE:| | | | | OmpClauseList -> OmpClause -> Sizes -> Scalar -> Integer -> Expr = '2_4'
+!PARSE-TREE: | | | | DoConstruct
+END subroutine openmp_do_tiles
diff --git a/flang/test/Parser/OpenMP/taskgraph.f90 b/flang/test/Parser/OpenMP/taskgraph.f90
new file mode 100644
index 0000000..7fcbae4
--- /dev/null
+++ b/flang/test/Parser/OpenMP/taskgraph.f90
@@ -0,0 +1,95 @@
+!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s
+!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine f00
+  !$omp taskgraph
+  block
+  end block
+end
+
+!UNPARSE: SUBROUTINE f00
+!UNPARSE: !$OMP TASKGRAPH
+!UNPARSE:  BLOCK
+!UNPARSE:  END BLOCK
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct
+!PARSE-TREE: | OmpBeginDirective
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = taskgraph
+!PARSE-TREE: | | OmpClauseList ->
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> BlockConstruct
+!PARSE-TREE: | | | BlockStmt ->
+!PARSE-TREE: | | | BlockSpecificationPart -> SpecificationPart
+!PARSE-TREE: | | | | ImplicitPart ->
+!PARSE-TREE: | | | Block
+!PARSE-TREE: | | | EndBlockStmt ->
+
+
+subroutine f01(x, y)
+  integer :: x
+  logical :: y
+  !$omp taskgraph graph_id(x) graph_reset(y)
+  !$omp task
+    continue
+  !$omp end task
+  !$omp end taskgraph
+end
+
+!UNPARSE: SUBROUTINE f01 (x, y)
+!UNPARSE:  INTEGER x
+!UNPARSE:  LOGICAL y
+!UNPARSE: !$OMP TASKGRAPH GRAPH_ID(x) GRAPH_RESET(y)
+!UNPARSE: !$OMP TASK
+!UNPARSE:  CONTINUE
+!UNPARSE: !$OMP END TASK
+!UNPARSE: !$OMP END TASKGRAPH
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct
+!PARSE-TREE: | OmpBeginDirective
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = taskgraph
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> GraphId -> OmpGraphIdClause -> Expr = 'x'
+!PARSE-TREE: | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | OmpClause -> GraphReset -> OmpGraphResetClause -> Expr = 'y'
+!PARSE-TREE: | | | Designator -> DataRef -> Name = 'y'
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct
+!PARSE-TREE: | | | OmpBeginDirective
+!PARSE-TREE: | | | | OmpDirectiveName -> llvm::omp::Directive = task
+!PARSE-TREE: | | | | OmpClauseList ->
+!PARSE-TREE: | | | | Flags = None
+!PARSE-TREE: | | | Block
+!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> ContinueStmt
+!PARSE-TREE: | | | OmpEndDirective
+!PARSE-TREE: | | | | OmpDirectiveName -> llvm::omp::Directive = task
+!PARSE-TREE: | | | | OmpClauseList ->
+!PARSE-TREE: | | | | Flags = None
+!PARSE-TREE: | OmpEndDirective
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = taskgraph
+!PARSE-TREE: | | OmpClauseList ->
+!PARSE-TREE: | | Flags = None
+
+
+subroutine f02
+  !$omp taskgraph graph_reset
+  !$omp end taskgraph
+end
+
+!UNPARSE: SUBROUTINE f02
+!UNPARSE: !$OMP TASKGRAPH GRAPH_RESET
+!UNPARSE: !$OMP END TASKGRAPH
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpBlockConstruct
+!PARSE-TREE: | OmpBeginDirective
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = taskgraph
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> GraphReset ->
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | OmpEndDirective
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = taskgraph
+!PARSE-TREE: | | OmpClauseList ->
+!PARSE-TREE: | | Flags = None
diff --git a/flang/test/Semantics/OpenMP/do-collapse.f90 b/flang/test/Semantics/OpenMP/do-collapse.f90
index 480bd45..ec6a3bd 100644
--- a/flang/test/Semantics/OpenMP/do-collapse.f90
+++ b/flang/test/Semantics/OpenMP/do-collapse.f90
@@ -31,6 +31,7 @@ program omp_doCollapse
       end do
     end do
 
+  !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
   !ERROR: At most one COLLAPSE clause can appear on the SIMD directive
   !$omp simd collapse(2) collapse(1)
   do i = 1, 4
diff --git a/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90 b/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
index bb19292..355626f 100644
--- a/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
+++ b/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
@@ -1,6 +1,7 @@
 !RUN: %python %S/../test_errors.py %s %flang -fopenmp
 
 integer :: i, j
+! ERROR: DO CONCURRENT loops cannot be used with the COLLAPSE clause.
 !$omp parallel do collapse(2)
 do i = 1, 1
   ! ERROR: DO CONCURRENT loops cannot form part of a loop nest.
diff --git a/flang/test/Semantics/contiguous02.f90 b/flang/test/Semantics/contiguous02.f90
new file mode 100644
index 0000000..6543ea9
--- /dev/null
+++ b/flang/test/Semantics/contiguous02.f90
@@ -0,0 +1,27 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic
+subroutine s1
+  type :: d1
+    real :: x
+  end type
+  type :: d2
+     type(d1) :: x
+  end type
+  type(d1), target :: a(5)
+  type(d2), target :: b(5)
+  real, pointer, contiguous :: c(:)
+  c => a%x ! okay, type has single component
+  c => b%x%x ! okay, types have single components
+end
+
+subroutine s2
+  type :: d1
+    real :: x, y
+  end type
+  type(d1), target :: b(5)
+  real, pointer, contiguous :: c(:)
+  !ERROR: CONTIGUOUS pointer may not be associated with a discontiguous target
+  c => b%x
+  c => b(1:1)%x ! okay, one element
+  !ERROR: CONTIGUOUS pointer may not be associated with a discontiguous target
+  c => b(1:2)%x
+end
diff --git a/flang/test/Semantics/resolve20.f90 b/flang/test/Semantics/resolve20.f90
index 8b8d190..f1a1a30 100644
--- a/flang/test/Semantics/resolve20.f90
+++ b/flang/test/Semantics/resolve20.f90
@@ -89,4 +89,12 @@ contains
     !ERROR: Abstract procedure interface 'f' may not be referenced
     x = f()
   end subroutine
+  subroutine baz(foo)
+    external foo
+    interface
+      !WARNING: Dummy argument 'foo' was declared earlier as EXTERNAL [-Wredundant-attribute]
+      subroutine foo(x)
+      end
+    end interface
+  end
 end module
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90
new file mode 100644
index 0000000..fd13f9c
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic_device.f90
@@ -0,0 +1,83 @@
+! Tests mapping of a basic `do concurrent` loop to
+! `!$omp target teams distribute parallel do`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+
+program do_concurrent_basic
+    implicit none
+    integer :: a(10)
+    integer :: i
+
+    ! CHECK: %[[I_ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "i"}
+    ! CHECK: %[[I_ORIG_DECL:.*]]:2 = hlfir.declare %[[I_ORIG_ALLOC]]
+
+    ! CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa)
+    ! CHECK: %[[A_SHAPE:.*]] = fir.shape %[[A_EXTENT:.*]] : (index) -> !fir.shape<1>
+    ! CHECK: %[[A_ORIG_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]](%[[A_SHAPE]])
+
+    ! CHECK-NOT: fir.do_loop
+
+    ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
+    ! CHECK: %[[HOST_LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+    ! CHECK: %[[C10:.*]] = arith.constant 10 : i32
+    ! CHECK: %[[HOST_UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+    ! CHECK: %[[HOST_STEP:.*]] = arith.constant 1 : index
+
+    ! CHECK: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#1
+    ! CHECK: %[[C0:.*]] = arith.constant 0 : index
+    ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %{{c1.*}} : index
+
+    ! CHECK: %[[A_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C0]] : index)
+    ! CHECK-SAME: upper_bound(%[[UPPER_BOUND]] : index)
+    ! CHECK-SAME: extent(%[[A_EXTENT]] : index)
+
+    ! CHECK: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#1 : {{[^(]+}})
+    ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]])
+
+    ! CHECK: omp.target
+    ! CHECK-SAME: host_eval(%[[HOST_LB]] -> %[[LB:[[:alnum:]]+]], %[[HOST_UB]] -> %[[UB:[[:alnum:]]+]], %[[HOST_STEP]] -> %[[STEP:[[:alnum:]]+]] : index, index, index)
+    ! CHECK-SAME: map_entries(
+    ! CHECK-SAME:     %{{[[:alnum:]]+}} -> %{{[^,]+}},
+    ! CHECK-SAME:     %{{[[:alnum:]]+}} -> %{{[^,]+}},
+    ! CHECK-SAME:     %{{[[:alnum:]]+}} -> %{{[^,]+}},
+    ! CHECK-SAME:     %[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]],
+    ! CHECK-SAME:             %[[A_MAP_INFO]] -> %[[A_ARG:.[[:alnum:]]+]]
+
+    ! CHECK: %[[A_DEV_DECL:.*]]:2 = hlfir.declare %[[A_ARG]]
+    ! CHECK: omp.teams {
+    ! CHECK-NEXT: omp.parallel {
+
+    ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+    ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+    ! CHECK-NEXT: omp.distribute {
+    ! CHECK-NEXT: omp.wsloop {
+
+    ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+    ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+    ! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#0 : !fir.ref<i32>
+    ! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+    ! CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+    ! CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
+    ! CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[A_DEV_DECL]]#0 (%[[IV_VAL_I64]])  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+    ! CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
+    ! CHECK-NEXT: omp.yield
+    ! CHECK-NEXT: }
+
+    ! CHECK-NEXT: } {omp.composite}
+    ! CHECK-NEXT: } {omp.composite}
+    ! CHECK-NEXT: omp.terminator
+    ! CHECK-NEXT: } {omp.composite}
+    ! CHECK-NEXT: omp.terminator
+    ! CHECK-NEXT: }
+    ! CHECK-NEXT: omp.terminator
+    ! CHECK-NEXT: }
+    do concurrent (i=1:10)
+        a(i) = i
+    end do
+
+    ! CHECK-NOT: fir.do_loop
+end program do_concurrent_basic
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.mlir b/flang/test/Transforms/DoConcurrent/basic_device.mlir
index 0ca4894..fa511c3 100644
--- a/flang/test/Transforms/DoConcurrent/basic_device.mlir
+++ b/flang/test/Transforms/DoConcurrent/basic_device.mlir
@@ -1,4 +1,4 @@
-// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" -verify-diagnostics %s
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" %s -o - | FileCheck %s
 
 func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
     %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
@@ -11,8 +11,12 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas
     %8 = fir.convert %c10_i32 : (i32) -> index
     %c1 = arith.constant 1 : index
 
-    // expected-error@+2 {{not yet implemented: Mapping `do concurrent` loops to device}}
-    // expected-error@below {{failed to legalize operation 'fir.do_concurrent'}}
+    // CHECK: omp.target
+    // CHECK: omp.teams
+    // CHECK: omp.parallel
+    // CHECK: omp.distribute
+    // CHECK: omp.wsloop
+    // CHECK: omp.loop_nest
     fir.do_concurrent {
       %0 = fir.alloca i32 {bindc_name = "i"}
       %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 b/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90
new file mode 100644
index 0000000..b467747
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90
@@ -0,0 +1,40 @@
+! Tests that when a loop bound is used in the body, that the mapped version of
+! the loop bound (rather than the host-eval one) is the one used inside the loop.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+
+subroutine foo(a, n)
+  implicit none
+  integer :: i, n
+  real, dimension(n) :: a
+
+  do concurrent (i=1:n)
+    a(i) = n
+  end do
+end subroutine 
+
+! CHECK-LABEL: func.func @_QPfoo
+! CHECK: omp.target
+! CHECK-SAME: host_eval(%{{.*}} -> %{{.*}}, %{{.*}} -> %[[N_HOST_EVAL:.*]], %{{.*}} -> %{{.*}} : index, index, index)
+! CHECK-SAME: map_entries({{[^[:space:]]*}} -> {{[^[:space:]]*}},
+! CHECK-SAME:   {{[^[:space:]]*}} -> {{[^[:space:]]*}}, {{[^[:space:]]*}} -> {{[^[:space:]]*}},
+! CHECK-SAME:   {{[^[:space:]]*}} -> {{[^[:space:]]*}}, {{[^[:space:]]*}} -> %[[N_MAP_ARG:[^[:space:]]*]], {{.*}}) {
+! CHECK:   %[[N_MAPPED:.*]]:2 = hlfir.declare %[[N_MAP_ARG]] {uniq_name = "_QFfooEn"}
+! CHECK:   omp.teams {
+! CHECK:     omp.parallel {
+! CHECK:       omp.distribute {
+! CHECK:         omp.wsloop {
+! CHECK:           omp.loop_nest (%{{.*}}) : index = (%{{.*}}) to (%[[N_HOST_EVAL]]) inclusive step (%{{.*}}) {
+! CHECK:             %[[N_VAL:.*]] = fir.load %[[N_MAPPED]]#0 : !fir.ref<i32>
+! CHECK:             %[[N_VAL_CVT:.*]] = fir.convert %[[N_VAL]] : (i32) -> f32
+! CHECK:             hlfir.assign %[[N_VAL_CVT]] to {{.*}}
+! CHECK-NEXT:        omp.yield
+! CHECK:           }
+! CHECK:         }
+! CHECK:       }
+! CHECK:     }
+! CHECK:   }
+! CHECK: }