Diffstat (limited to 'flang')
-rw-r--r--  flang/docs/OpenMPSupport.md                                  |  20
-rw-r--r--  flang/include/flang/Frontend/TargetOptions.h                 |   5
-rw-r--r--  flang/include/flang/Lower/OpenACC.h                          |  22
-rw-r--r--  flang/include/flang/Optimizer/Dialect/Support/FIRContext.h   |  19
-rw-r--r--  flang/include/flang/Optimizer/Support/InitFIR.h              |   8
-rw-r--r--  flang/lib/Frontend/CompilerInvocation.cpp                    |  10
-rw-r--r--  flang/lib/Lower/Bridge.cpp                                   |  37
-rw-r--r--  flang/lib/Lower/OpenACC.cpp                                  | 395
-rw-r--r--  flang/lib/Lower/OpenMP/Atomic.cpp                            |   9
-rw-r--r--  flang/lib/Lower/OpenMP/OpenMP.cpp                            |  51
-rw-r--r--  flang/lib/Optimizer/Dialect/Support/FIRContext.cpp           |  51
-rw-r--r--  flang/lib/Optimizer/Support/CMakeLists.txt                   |   9
-rw-r--r--  flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 |  91
-rw-r--r--  flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90           | 267
-rw-r--r--  flang/test/Lower/OpenMP/atomic-control-options.f90           |  37
-rw-r--r--  flang/test/Lower/OpenMP/unroll-heuristic01.f90               |  63
-rw-r--r--  flang/test/Lower/OpenMP/unroll-heuristic02.f90               |  98
-rw-r--r--  flang/test/Lower/OpenMP/unroll-heuristic03.f90               |  61
18 files changed, 1015 insertions, 238 deletions
diff --git a/flang/docs/OpenMPSupport.md b/flang/docs/OpenMPSupport.md
index c9f19c3..81f5f9f 100644
--- a/flang/docs/OpenMPSupport.md
+++ b/flang/docs/OpenMPSupport.md
@@ -41,7 +41,7 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
| target construct | P | device clause not supported |
| target update construct | P | device clause not supported |
| declare target directive | P | |
-| teams construct | P | reduction clause not supported |
+| teams construct | Y | |
| distribute construct | P | dist_schedule clause not supported |
| distribute simd construct | P | dist_schedule and linear clauses are not supported |
| distribute parallel loop construct | P | dist_schedule clause not supported |
@@ -51,15 +51,15 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
| atomic construct extensions | Y | |
| cancel construct | Y | |
| cancellation point construct | Y | |
-| parallel do simd construct | P | linear clause is not supported |
-| target teams construct | P | device and reduction clauses are not supported |
-| teams distribute construct | P | reduction and dist_schedule clauses not supported |
-| teams distribute simd construct | P | reduction, dist_schedule and linear clauses are not supported |
-| target teams distribute construct | P | device, reduction and dist_schedule clauses are not supported |
-| teams distribute parallel loop construct | P | reduction and dist_schedule clauses are not supported |
-| target teams distribute parallel loop construct | P | device, reduction and dist_schedule clauses are not supported |
-| teams distribute parallel loop simd construct | P | reduction, dist_schedule, and linear clauses are not supported |
-| target teams distribute parallel loop simd construct | P | device, reduction, dist_schedule and linear clauses are not supported |
+| parallel do simd construct | P | linear clause not supported |
+| target teams construct | P | device clause not supported |
+| teams distribute construct | P | dist_schedule clause not supported |
+| teams distribute simd construct | P | dist_schedule and linear clauses are not supported |
+| target teams distribute construct | P | device and dist_schedule clauses are not supported |
+| teams distribute parallel loop construct | P | dist_schedule clause not supported |
+| target teams distribute parallel loop construct | P | device and dist_schedule clauses are not supported |
+| teams distribute parallel loop simd construct | P | dist_schedule and linear clauses are not supported |
+| target teams distribute parallel loop simd construct | P | device, dist_schedule and linear clauses are not supported |
## Extensions
### ATOMIC construct
diff --git a/flang/include/flang/Frontend/TargetOptions.h b/flang/include/flang/Frontend/TargetOptions.h
index 002d8d1..f6e5634 100644
--- a/flang/include/flang/Frontend/TargetOptions.h
+++ b/flang/include/flang/Frontend/TargetOptions.h
@@ -53,6 +53,11 @@ public:
/// Print verbose assembly
bool asmVerbose = false;
+
+ /// Atomic control options
+ bool atomicIgnoreDenormalMode = false;
+ bool atomicRemoteMemory = false;
+ bool atomicFineGrainedMemory = false;
};
} // end namespace Fortran::frontend
diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h
index af34510..e974f3d 100644
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@@ -43,6 +43,7 @@ struct ProcedureDesignator;
namespace parser {
struct AccClauseList;
+struct DoConstruct;
struct OpenACCConstruct;
struct OpenACCDeclarativeConstruct;
struct OpenACCRoutineConstruct;
@@ -58,6 +59,7 @@ namespace lower {
class AbstractConverter;
class StatementContext;
+class SymMap;
namespace pft {
struct Evaluation;
@@ -114,14 +116,32 @@ void attachDeclarePostDeallocAction(AbstractConverter &, fir::FirOpBuilder &,
void genOpenACCTerminator(fir::FirOpBuilder &, mlir::Operation *,
mlir::Location);
-int64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
+/// Used to obtain the number of contained loops to look for, since this
+/// depends on the number of tile operands and the collapse clause.
+uint64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
+/// Checks whether the current insertion point is inside OpenACC loop.
bool isInOpenACCLoop(fir::FirOpBuilder &);
+/// Checks whether the current insertion point is inside OpenACC compute
+/// construct.
+bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &);
+
void setInsertionPointAfterOpenACCLoopIfInside(fir::FirOpBuilder &);
void genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &, mlir::Location);
+/// Generates an OpenACC loop from a do construct in order to
+/// properly capture the loop bounds, parallelism determination mode,
+/// and to privatize the loop variables.
+/// When the conversion is rejected, nullptr is returned.
+mlir::Operation *genOpenACCLoopFromDoConstruct(
+ AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::SymMap &localSymbols,
+ const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval);
+
} // namespace lower
} // namespace Fortran
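
As a rough illustration of the counting rule documented above for getLoopCountForCollapseAndTile: the lowering must look for as many contained do loops as the larger of the collapse value and the number of tile operands requires. The sketch below is a standalone, simplified model with hypothetical names, not the flang API itself (which walks the parsed AccClauseList):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Simplified model: both counts default to 1 when the clause is absent,
    // and the larger of the two decides how many nested do loops to expect.
    std::uint64_t loopCountForCollapseAndTile(std::uint64_t collapseValue,
                                              std::uint64_t numTileOperands) {
      std::uint64_t collapseLoopCount = std::max<std::uint64_t>(collapseValue, 1);
      std::uint64_t tileLoopCount = std::max<std::uint64_t>(numTileOperands, 1);
      return std::max(collapseLoopCount, tileLoopCount);
    }

    int main() {
      // e.g. `!$acc loop collapse(2) tile(32, 32, 32)` needs three nested loops.
      std::cout << loopCountForCollapseAndTile(2, 3) << "\n"; // prints 3
    }
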
diff --git a/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h b/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h
index 2df14f8..c0c0b74 100644
--- a/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h
+++ b/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h
@@ -58,6 +58,25 @@ void setTargetCPU(mlir::ModuleOp mod, llvm::StringRef cpu);
/// Get the target CPU string from the Module or return a null reference.
llvm::StringRef getTargetCPU(mlir::ModuleOp mod);
+/// Sets whether Denormal Mode can be ignored or not for lowering of floating
+/// point atomic operations.
+void setAtomicIgnoreDenormalMode(mlir::ModuleOp mod, bool value);
+/// Gets whether Denormal Mode can be ignored or not for lowering of floating
+/// point atomic operations.
+bool getAtomicIgnoreDenormalMode(mlir::ModuleOp mod);
+/// Sets whether fine grained memory can be used or not for lowering of atomic
+/// operations.
+void setAtomicFineGrainedMemory(mlir::ModuleOp mod, bool value);
+/// Gets whether fine grained memory can be used or not for lowering of atomic
+/// operations.
+bool getAtomicFineGrainedMemory(mlir::ModuleOp mod);
+/// Sets whether remote memory can be used or not for lowering of atomic
+/// operations.
+void setAtomicRemoteMemory(mlir::ModuleOp mod, bool value);
+/// Gets whether remote memory can be used or not for lowering of atomic
+/// operations.
+bool getAtomicRemoteMemory(mlir::ModuleOp mod);
+
/// Set the tune CPU for the module. `cpu` must not be deallocated while
/// module `mod` is still live.
void setTuneCPU(mlir::ModuleOp mod, llvm::StringRef cpu);
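
A minimal usage sketch for the new module-level atomic flags declared above. The helper name here is hypothetical and assumes an existing mlir::ModuleOp; the attribute-storage behaviour is taken from the FIRContext.cpp hunk later in this diff, and LoweringBridge performs the equivalent setup in the Bridge.cpp hunk:

    #include "flang/Optimizer/Dialect/Support/FIRContext.h"
    #include "mlir/IR/BuiltinOps.h"

    // Illustrative only (not part of the patch): forward the atomic lowering
    // options onto the module, then query them later during lowering.
    void configureAtomicLowering(mlir::ModuleOp mod, bool ignoreDenormals,
                                 bool fineGrained, bool remote) {
      // Each setter attaches a unit attribute on the module (for example
      // "fir.atomic_ignore_denormal_mode") when the value is true and removes
      // it otherwise, so the getters simply test for the attribute.
      fir::setAtomicIgnoreDenormalMode(mod, ignoreDenormals);
      fir::setAtomicFineGrainedMemory(mod, fineGrained);
      fir::setAtomicRemoteMemory(mod, remote);

      if (fir::getAtomicIgnoreDenormalMode(mod)) {
        // Floating-point atomic lowering may skip denormal-mode handling here.
      }
    }
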
diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h
index aacba23..3e42ffd 100644
--- a/flang/include/flang/Optimizer/Support/InitFIR.h
+++ b/flang/include/flang/Optimizer/Support/InitFIR.h
@@ -20,12 +20,20 @@
#include "flang/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.h"
#include "flang/Optimizer/OpenMP/Support/RegisterOpenMPExtensions.h"
#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/Extensions/InlinerExtension.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Passes.h"
#include "mlir/InitAllDialects.h"
#include "mlir/Pass/Pass.h"
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index f55d866..111c5aa4 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -512,6 +512,16 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) {
args.getLastArg(clang::driver::options::OPT_triple))
opts.triple = a->getValue();
+ opts.atomicIgnoreDenormalMode = args.hasFlag(
+ clang::driver::options::OPT_fatomic_ignore_denormal_mode,
+ clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false);
+ opts.atomicFineGrainedMemory = args.hasFlag(
+ clang::driver::options::OPT_fatomic_fine_grained_memory,
+ clang::driver::options::OPT_fno_atomic_fine_grained_memory, false);
+ opts.atomicRemoteMemory =
+ args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory,
+ clang::driver::options::OPT_fno_atomic_remote_memory, false);
+
if (const llvm::opt::Arg *a =
args.getLastArg(clang::driver::options::OPT_target_cpu))
opts.cpu = a->getValue();
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 92aae79..1adfb96 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2167,10 +2167,35 @@ private:
/// - structured and unstructured concurrent loops
void genFIR(const Fortran::parser::DoConstruct &doConstruct) {
setCurrentPositionAt(doConstruct);
- // Collect loop nest information.
- // Generate begin loop code directly for infinite and while loops.
Fortran::lower::pft::Evaluation &eval = getEval();
bool unstructuredContext = eval.lowerAsUnstructured();
+
+ // Loops with induction variables inside OpenACC compute constructs
+ // need special handling to ensure that the IVs are privatized.
+ if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) {
+ mlir::Operation *loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct(
+ *this, bridge.getSemanticsContext(), localSymbols, doConstruct, eval);
+ bool success = loopOp != nullptr;
+ if (success) {
+ // Sanity check that the builder insertion point is inside the newly
+ // generated loop.
+ assert(
+ loopOp->getRegion(0).isAncestor(
+ builder->getInsertionPoint()->getBlock()->getParent()) &&
+ "builder insertion point is not inside the newly generated loop");
+
+ // Loop body code.
+ auto iter = eval.getNestedEvaluations().begin();
+ for (auto end = --eval.getNestedEvaluations().end(); iter != end;
+ ++iter)
+ genFIR(*iter, unstructuredContext);
+ return;
+ }
+ // Fall back to normal loop handling.
+ }
+
+ // Collect loop nest information.
+ // Generate begin loop code directly for infinite and while loops.
Fortran::lower::pft::Evaluation &doStmtEval =
eval.getFirstNestedEvaluation();
auto *doStmt = doStmtEval.getIf<Fortran::parser::NonLabelDoStmt>();
@@ -3124,7 +3149,7 @@ private:
Fortran::lower::pft::Evaluation *curEval = &getEval();
if (accLoop || accCombined) {
- int64_t loopCount;
+ uint64_t loopCount;
if (accLoop) {
const Fortran::parser::AccBeginLoopDirective &beginLoopDir =
std::get<Fortran::parser::AccBeginLoopDirective>(accLoop->t);
@@ -3142,7 +3167,7 @@ private:
if (curEval->lowerAsStructured()) {
curEval = &curEval->getFirstNestedEvaluation();
- for (int64_t i = 1; i < loopCount; i++)
+ for (uint64_t i = 1; i < loopCount; i++)
curEval = &*std::next(curEval->getNestedEvaluations().begin());
}
}
@@ -6733,6 +6758,10 @@ Fortran::lower::LoweringBridge::LoweringBridge(
fir::setKindMapping(*module, kindMap);
fir::setTargetCPU(*module, targetMachine.getTargetCPU());
fir::setTuneCPU(*module, targetOpts.cpuToTuneFor);
+ fir::setAtomicIgnoreDenormalMode(*module,
+ targetOpts.atomicIgnoreDenormalMode);
+ fir::setAtomicFineGrainedMemory(*module, targetOpts.atomicFineGrainedMemory);
+ fir::setAtomicRemoteMemory(*module, targetOpts.atomicRemoteMemory);
fir::setTargetFeatures(*module, targetMachine.getTargetFeatureString());
fir::support::setMLIRDataLayout(*module, targetMachine.createDataLayout());
fir::setIdent(*module, Fortran::common::getFlangFullVersion());
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 471f368..57ce1d3 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -36,6 +36,7 @@
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/Frontend/OpenACC/ACC.h.inc"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -2142,6 +2143,168 @@ static void determineDefaultLoopParMode(
}
}
+// Extract loop bounds, steps, induction variables, and privatization info
+// for both DO CONCURRENT and regular do loops
+static void processDoLoopBounds(
+ Fortran::lower::AbstractConverter &converter,
+ mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx,
+ fir::FirOpBuilder &builder,
+ const Fortran::parser::DoConstruct &outerDoConstruct,
+ Fortran::lower::pft::Evaluation &eval,
+ llvm::SmallVector<mlir::Value> &lowerbounds,
+ llvm::SmallVector<mlir::Value> &upperbounds,
+ llvm::SmallVector<mlir::Value> &steps,
+ llvm::SmallVector<mlir::Value> &privateOperands,
+ llvm::SmallVector<mlir::Value> &ivPrivate,
+ llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+ llvm::SmallVector<mlir::Type> &ivTypes,
+ llvm::SmallVector<mlir::Location> &ivLocs,
+ llvm::SmallVector<bool> &inclusiveBounds,
+ llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) {
+ assert(loopsToProcess > 0 && "expect at least one loop");
+ locs.push_back(currentLocation); // Location of the directive
+ Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
+ bool isDoConcurrent = outerDoConstruct.IsDoConcurrent();
+
+ if (isDoConcurrent) {
+ locs.push_back(converter.genLocation(
+ Fortran::parser::FindSourceLocation(outerDoConstruct)));
+ const Fortran::parser::LoopControl *loopControl =
+ &*outerDoConstruct.GetLoopControl();
+ const auto &concurrent =
+ std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u);
+ if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t)
+ .empty())
+ TODO(currentLocation, "DO CONCURRENT with locality spec inside ACC");
+
+ const auto &concurrentHeader =
+ std::get<Fortran::parser::ConcurrentHeader>(concurrent.t);
+ const auto &controls =
+ std::get<std::list<Fortran::parser::ConcurrentControl>>(
+ concurrentHeader.t);
+ for (const auto &control : controls) {
+ lowerbounds.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx)));
+ upperbounds.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx)));
+ if (const auto &expr =
+ std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
+ control.t))
+ steps.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(*expr), stmtCtx)));
+ else // If `step` is not present, assume it is `1`.
+ steps.push_back(builder.createIntegerConstant(
+ currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
+
+ const auto &name = std::get<Fortran::parser::Name>(control.t);
+ privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs,
+ privateOperands, ivPrivate, privatizationRecipes,
+ isDoConcurrent);
+
+ inclusiveBounds.push_back(true);
+ }
+ } else {
+ for (uint64_t i = 0; i < loopsToProcess; ++i) {
+ const Fortran::parser::LoopControl *loopControl;
+ if (i == 0) {
+ loopControl = &*outerDoConstruct.GetLoopControl();
+ locs.push_back(converter.genLocation(
+ Fortran::parser::FindSourceLocation(outerDoConstruct)));
+ } else {
+ auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
+ assert(doCons && "expect do construct");
+ loopControl = &*doCons->GetLoopControl();
+ locs.push_back(converter.genLocation(
+ Fortran::parser::FindSourceLocation(*doCons)));
+ }
+
+ const Fortran::parser::LoopControl::Bounds *bounds =
+ std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
+ assert(bounds && "Expected bounds on the loop construct");
+ lowerbounds.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
+ upperbounds.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
+ if (bounds->step)
+ steps.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
+ else // If `step` is not present, assume it is `1`.
+ steps.push_back(builder.createIntegerConstant(
+ currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
+
+ Fortran::semantics::Symbol &ivSym =
+ bounds->name.thing.symbol->GetUltimate();
+ privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
+ privateOperands, ivPrivate, privatizationRecipes);
+
+ inclusiveBounds.push_back(true);
+
+ if (i < loopsToProcess - 1)
+ crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
+ }
+ }
+}
+
+static mlir::acc::LoopOp
+buildACCLoopOp(Fortran::lower::AbstractConverter &converter,
+ mlir::Location currentLocation,
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::StatementContext &stmtCtx,
+ const Fortran::parser::DoConstruct &outerDoConstruct,
+ Fortran::lower::pft::Evaluation &eval,
+ llvm::SmallVector<mlir::Value> &privateOperands,
+ llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+ llvm::SmallVector<mlir::Value> &gangOperands,
+ llvm::SmallVector<mlir::Value> &workerNumOperands,
+ llvm::SmallVector<mlir::Value> &vectorOperands,
+ llvm::SmallVector<mlir::Value> &tileOperands,
+ llvm::SmallVector<mlir::Value> &cacheOperands,
+ llvm::SmallVector<mlir::Value> &reductionOperands,
+ llvm::SmallVector<mlir::Type> &retTy, mlir::Value yieldValue,
+ uint64_t loopsToProcess) {
+ fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+
+ llvm::SmallVector<mlir::Value> ivPrivate;
+ llvm::SmallVector<mlir::Type> ivTypes;
+ llvm::SmallVector<mlir::Location> ivLocs;
+ llvm::SmallVector<bool> inclusiveBounds;
+ llvm::SmallVector<mlir::Location> locs;
+ llvm::SmallVector<mlir::Value> lowerbounds, upperbounds, steps;
+
+ // Look at the do/do concurrent loops to extract bounds information.
+ processDoLoopBounds(converter, currentLocation, stmtCtx, builder,
+ outerDoConstruct, eval, lowerbounds, upperbounds, steps,
+ privateOperands, ivPrivate, privatizationRecipes, ivTypes,
+ ivLocs, inclusiveBounds, locs, loopsToProcess);
+
+ // Prepare the operand segment size attribute and the operands value range.
+ llvm::SmallVector<mlir::Value> operands;
+ llvm::SmallVector<int32_t> operandSegments;
+ addOperands(operands, operandSegments, lowerbounds);
+ addOperands(operands, operandSegments, upperbounds);
+ addOperands(operands, operandSegments, steps);
+ addOperands(operands, operandSegments, gangOperands);
+ addOperands(operands, operandSegments, workerNumOperands);
+ addOperands(operands, operandSegments, vectorOperands);
+ addOperands(operands, operandSegments, tileOperands);
+ addOperands(operands, operandSegments, cacheOperands);
+ addOperands(operands, operandSegments, privateOperands);
+ addOperands(operands, operandSegments, reductionOperands);
+
+ auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>(
+ builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
+ operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
+ ivLocs);
+
+ for (auto [arg, value] : llvm::zip(
+ loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate))
+ fir::StoreOp::create(builder, currentLocation, arg, value);
+
+ loopOp.setInclusiveUpperbound(inclusiveBounds);
+
+ return loopOp;
+}
+
static mlir::acc::LoopOp createLoopOp(
Fortran::lower::AbstractConverter &converter,
mlir::Location currentLocation,
@@ -2154,9 +2317,9 @@ static mlir::acc::LoopOp createLoopOp(
std::nullopt,
bool needEarlyReturnHandling = false) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
- llvm::SmallVector<mlir::Value> tileOperands, privateOperands, ivPrivate,
+ llvm::SmallVector<mlir::Value> tileOperands, privateOperands,
reductionOperands, cacheOperands, vectorOperands, workerNumOperands,
- gangOperands, lowerbounds, upperbounds, steps;
+ gangOperands;
llvm::SmallVector<mlir::Attribute> privatizationRecipes, reductionRecipes;
llvm::SmallVector<int32_t> tileOperandsSegments, gangOperandsSegments;
llvm::SmallVector<int64_t> collapseValues;
@@ -2325,107 +2488,6 @@ static mlir::acc::LoopOp createLoopOp(
}
}
- llvm::SmallVector<mlir::Type> ivTypes;
- llvm::SmallVector<mlir::Location> ivLocs;
- llvm::SmallVector<bool> inclusiveBounds;
- llvm::SmallVector<mlir::Location> locs;
- locs.push_back(currentLocation); // Location of the directive
- Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
- bool isDoConcurrent = outerDoConstruct.IsDoConcurrent();
- if (isDoConcurrent) {
- locs.push_back(converter.genLocation(
- Fortran::parser::FindSourceLocation(outerDoConstruct)));
- const Fortran::parser::LoopControl *loopControl =
- &*outerDoConstruct.GetLoopControl();
- const auto &concurrent =
- std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u);
- if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t)
- .empty())
- TODO(currentLocation, "DO CONCURRENT with locality spec");
-
- const auto &concurrentHeader =
- std::get<Fortran::parser::ConcurrentHeader>(concurrent.t);
- const auto &controls =
- std::get<std::list<Fortran::parser::ConcurrentControl>>(
- concurrentHeader.t);
- for (const auto &control : controls) {
- lowerbounds.push_back(fir::getBase(converter.genExprValue(
- *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx)));
- upperbounds.push_back(fir::getBase(converter.genExprValue(
- *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx)));
- if (const auto &expr =
- std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
- control.t))
- steps.push_back(fir::getBase(converter.genExprValue(
- *Fortran::semantics::GetExpr(*expr), stmtCtx)));
- else // If `step` is not present, assume it is `1`.
- steps.push_back(builder.createIntegerConstant(
- currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
-
- const auto &name = std::get<Fortran::parser::Name>(control.t);
- privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs,
- privateOperands, ivPrivate, privatizationRecipes,
- isDoConcurrent);
-
- inclusiveBounds.push_back(true);
- }
- } else {
- int64_t loopCount =
- Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
- for (unsigned i = 0; i < loopCount; ++i) {
- const Fortran::parser::LoopControl *loopControl;
- if (i == 0) {
- loopControl = &*outerDoConstruct.GetLoopControl();
- locs.push_back(converter.genLocation(
- Fortran::parser::FindSourceLocation(outerDoConstruct)));
- } else {
- auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
- assert(doCons && "expect do construct");
- loopControl = &*doCons->GetLoopControl();
- locs.push_back(converter.genLocation(
- Fortran::parser::FindSourceLocation(*doCons)));
- }
-
- const Fortran::parser::LoopControl::Bounds *bounds =
- std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
- assert(bounds && "Expected bounds on the loop construct");
- lowerbounds.push_back(fir::getBase(converter.genExprValue(
- *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
- upperbounds.push_back(fir::getBase(converter.genExprValue(
- *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
- if (bounds->step)
- steps.push_back(fir::getBase(converter.genExprValue(
- *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
- else // If `step` is not present, assume it is `1`.
- steps.push_back(builder.createIntegerConstant(
- currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
-
- Fortran::semantics::Symbol &ivSym =
- bounds->name.thing.symbol->GetUltimate();
- privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
- privateOperands, ivPrivate, privatizationRecipes);
-
- inclusiveBounds.push_back(true);
-
- if (i < loopCount - 1)
- crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
- }
- }
-
- // Prepare the operand segment size attribute and the operands value range.
- llvm::SmallVector<mlir::Value> operands;
- llvm::SmallVector<int32_t> operandSegments;
- addOperands(operands, operandSegments, lowerbounds);
- addOperands(operands, operandSegments, upperbounds);
- addOperands(operands, operandSegments, steps);
- addOperands(operands, operandSegments, gangOperands);
- addOperands(operands, operandSegments, workerNumOperands);
- addOperands(operands, operandSegments, vectorOperands);
- addOperands(operands, operandSegments, tileOperands);
- addOperands(operands, operandSegments, cacheOperands);
- addOperands(operands, operandSegments, privateOperands);
- addOperands(operands, operandSegments, reductionOperands);
-
llvm::SmallVector<mlir::Type> retTy;
mlir::Value yieldValue;
if (needEarlyReturnHandling) {
@@ -2434,16 +2496,13 @@ static mlir::acc::LoopOp createLoopOp(
retTy.push_back(i1Ty);
}
- auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>(
- builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
- operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
- ivLocs);
-
- for (auto [arg, value] : llvm::zip(
- loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate))
- fir::StoreOp::create(builder, currentLocation, arg, value);
-
- loopOp.setInclusiveUpperbound(inclusiveBounds);
+ uint64_t loopsToProcess =
+ Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
+ auto loopOp = buildACCLoopOp(
+ converter, currentLocation, semanticsContext, stmtCtx, outerDoConstruct,
+ eval, privateOperands, privatizationRecipes, gangOperands,
+ workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+ reductionOperands, retTy, yieldValue, loopsToProcess);
if (!gangDeviceTypes.empty())
loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes));
@@ -4899,6 +4958,12 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) {
return false;
}
+bool Fortran::lower::isInsideOpenACCComputeConstruct(
+ fir::FirOpBuilder &builder) {
+ return mlir::isa_and_nonnull<ACC_COMPUTE_CONSTRUCT_OPS>(
+ mlir::acc::getEnclosingComputeOp(builder.getRegion()));
+}
+
void Fortran::lower::setInsertionPointAfterOpenACCLoopIfInside(
fir::FirOpBuilder &builder) {
if (auto loopOp =
@@ -4913,10 +4978,10 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder,
mlir::acc::YieldOp::create(builder, loc, yieldValue);
}
-int64_t Fortran::lower::getLoopCountForCollapseAndTile(
+uint64_t Fortran::lower::getLoopCountForCollapseAndTile(
const Fortran::parser::AccClauseList &clauseList) {
- int64_t collapseLoopCount = 1;
- int64_t tileLoopCount = 1;
+ uint64_t collapseLoopCount = 1;
+ uint64_t tileLoopCount = 1;
for (const Fortran::parser::AccClause &clause : clauseList.v) {
if (const auto *collapseClause =
std::get_if<Fortran::parser::AccClause::Collapse>(&clause.u)) {
@@ -4935,3 +5000,101 @@ int64_t Fortran::lower::getLoopCountForCollapseAndTile(
return tileLoopCount;
return collapseLoopCount;
}
+
+/// Create an ACC loop operation for a DO construct when inside ACC compute
+/// constructs. This serves as a bridge between regular DO construct handling
+/// and ACC loop creation.
+mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
+ AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::SymMap &localSymbols,
+ const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval) {
+  // Only convert loops whose induction variables need to be privatized.
+ if (!doConstruct.IsDoNormal() && !doConstruct.IsDoConcurrent())
+ return nullptr;
+
+ // If the evaluation is unstructured, then we cannot convert the loop
+ // because acc loop does not have an unstructured form.
+ // TODO: There may be other strategies that can be employed such
+ // as generating acc.private for the loop variables without attaching
+ // them to acc.loop.
+ // For now - generate a not-yet-implemented message because without
+ // privatizing the induction variable, the loop may not execute correctly.
+ // Only do this for `acc kernels` because in `acc parallel`, scalars end
+ // up as implicitly firstprivate.
+ if (eval.lowerAsUnstructured()) {
+ if (mlir::isa_and_present<mlir::acc::KernelsOp>(
+ mlir::acc::getEnclosingComputeOp(
+ converter.getFirOpBuilder().getRegion())))
+ TODO(converter.getCurrentLocation(),
+ "unstructured do loop in acc kernels");
+ return nullptr;
+ }
+
+ // Open up a new scope for the loop variables.
+ localSymbols.pushScope();
+ auto scopeGuard = llvm::make_scope_exit([&]() { localSymbols.popScope(); });
+
+ // Prepare empty operand vectors since there are no associated `acc loop`
+ // clauses with the Fortran do loops being handled here.
+ llvm::SmallVector<mlir::Value> privateOperands, gangOperands,
+ workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+ reductionOperands;
+ llvm::SmallVector<mlir::Attribute> privatizationRecipes;
+ llvm::SmallVector<mlir::Type> retTy;
+ mlir::Value yieldValue;
+ uint64_t loopsToProcess = 1; // Single loop construct
+
+ // Use same mechanism that handles `acc loop` contained do loops to handle
+ // the implicit loop case.
+ Fortran::lower::StatementContext stmtCtx;
+ auto loopOp = buildACCLoopOp(
+ converter, converter.getCurrentLocation(), semanticsContext, stmtCtx,
+ doConstruct, eval, privateOperands, privatizationRecipes, gangOperands,
+ workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+ reductionOperands, retTy, yieldValue, loopsToProcess);
+
+ fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+ if (!privatizationRecipes.empty())
+ loopOp.setPrivatizationRecipesAttr(mlir::ArrayAttr::get(
+ converter.getFirOpBuilder().getContext(), privatizationRecipes));
+
+  // Normal do loops which are not annotated with `acc loop` should be
+  // left for analysis by marking with `auto`. This holds even inside an
+  // `acc parallel` region because the normal rule of applying `independent`
+  // only covers loops marked with `acc loop`.
+ // For do concurrent loops, the spec says in section 2.17.2:
+ // "When do concurrent appears without a loop construct in a kernels construct
+ // it is treated as if it is annotated with loop auto. If it appears in a
+ // parallel construct or an accelerator routine then it is treated as if it is
+ // annotated with loop independent."
+ // So this means that in all cases we mark with `auto` unless it is a
+ // `do concurrent` in an `acc parallel` construct or it must be `seq` because
+ // it is in an `acc serial` construct.
+ mlir::Operation *accRegionOp =
+ mlir::acc::getEnclosingComputeOp(converter.getFirOpBuilder().getRegion());
+ mlir::acc::LoopParMode parMode =
+ mlir::isa_and_present<mlir::acc::ParallelOp>(accRegionOp) &&
+ doConstruct.IsDoConcurrent()
+ ? mlir::acc::LoopParMode::loop_independent
+ : mlir::isa_and_present<mlir::acc::SerialOp>(accRegionOp)
+ ? mlir::acc::LoopParMode::loop_seq
+ : mlir::acc::LoopParMode::loop_auto;
+
+ // Set the parallel mode based on the computed parMode
+ auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get(
+ builder.getContext(), mlir::acc::DeviceType::None);
+ auto arrOfDeviceNone =
+ mlir::ArrayAttr::get(builder.getContext(), deviceNoneAttr);
+ if (parMode == mlir::acc::LoopParMode::loop_independent) {
+ loopOp.setIndependentAttr(arrOfDeviceNone);
+ } else if (parMode == mlir::acc::LoopParMode::loop_seq) {
+ loopOp.setSeqAttr(arrOfDeviceNone);
+ } else if (parMode == mlir::acc::LoopParMode::loop_auto) {
+ loopOp.setAuto_Attr(arrOfDeviceNone);
+ } else {
+ llvm_unreachable("Unexpected loop par mode");
+ }
+
+ return loopOp;
+}
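
To make the parallelism-determination comments in genOpenACCLoopFromDoConstruct concrete, here is a simplified, self-contained model of the same decision (hypothetical enum names, not the actual mlir::acc types):

    #include <iostream>

    // Simplified model (not the flang API) of the default mode applied to DO
    // loops that sit inside an OpenACC compute construct without `!$acc loop`.
    enum class ComputeConstruct { Parallel, Kernels, Serial };
    enum class ParMode { Auto, Independent, Seq };

    ParMode defaultParMode(ComputeConstruct parent, bool isDoConcurrent) {
      if (parent == ComputeConstruct::Parallel && isDoConcurrent)
        return ParMode::Independent; // per the spec quote in the comments above
      if (parent == ComputeConstruct::Serial)
        return ParMode::Seq;         // serial regions run loops sequentially
      return ParMode::Auto;          // leave the decision to later analysis
    }

    int main() {
      // A plain do loop inside `!$acc kernels` is marked `auto`.
      std::cout << (defaultParMode(ComputeConstruct::Kernels, false) ==
                    ParMode::Auto)
                << "\n"; // prints 1
    }
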
diff --git a/flang/lib/Lower/OpenMP/Atomic.cpp b/flang/lib/Lower/OpenMP/Atomic.cpp
index 9a233d2..d4f83f5 100644
--- a/flang/lib/Lower/OpenMP/Atomic.cpp
+++ b/flang/lib/Lower/OpenMP/Atomic.cpp
@@ -635,9 +635,16 @@ genAtomicUpdate(lower::AbstractConverter &converter,
}
}
+ mlir::ModuleOp module = builder.getModule();
+ mlir::omp::AtomicControlAttr atomicControlAttr =
+ mlir::omp::AtomicControlAttr::get(
+ builder.getContext(), fir::getAtomicIgnoreDenormalMode(module),
+ fir::getAtomicFineGrainedMemory(module),
+ fir::getAtomicRemoteMemory(module));
builder.restoreInsertionPoint(atomicAt);
auto updateOp = mlir::omp::AtomicUpdateOp::create(
- builder, loc, atomAddr, hint, makeMemOrderAttr(converter, memOrder));
+ builder, loc, atomAddr, atomicControlAttr, hint,
+ makeMemOrderAttr(converter, memOrder));
mlir::Region &region = updateOp->getRegion(0);
mlir::Block *block = builder.createBlock(&region, {}, {atomType}, {loc});
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 12089d6..6a4ec77 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -697,20 +697,16 @@ static void threadPrivatizeVars(lower::AbstractConverter &converter,
}
}
-static mlir::Operation *
-createAndSetPrivatizedLoopVar(lower::AbstractConverter &converter,
- mlir::Location loc, mlir::Value indexVal,
- const semantics::Symbol *sym) {
+static mlir::Operation *setLoopVar(lower::AbstractConverter &converter,
+ mlir::Location loc, mlir::Value indexVal,
+ const semantics::Symbol *sym) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+
mlir::OpBuilder::InsertPoint insPt = firOpBuilder.saveInsertionPoint();
firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock());
-
mlir::Type tempTy = converter.genType(*sym);
-
- assert(converter.isPresentShallowLookup(*sym) &&
- "Expected symbol to be in symbol table.");
-
firOpBuilder.restoreInsertionPoint(insPt);
+
mlir::Value cvtVal = firOpBuilder.createConvert(loc, tempTy, indexVal);
hlfir::Entity lhs{converter.getSymbolAddress(*sym)};
@@ -721,6 +717,15 @@ createAndSetPrivatizedLoopVar(lower::AbstractConverter &converter,
return storeOp;
}
+static mlir::Operation *
+createAndSetPrivatizedLoopVar(lower::AbstractConverter &converter,
+ mlir::Location loc, mlir::Value indexVal,
+ const semantics::Symbol *sym) {
+ assert(converter.isPresentShallowLookup(*sym) &&
+ "Expected symbol to be in symbol table.");
+ return setLoopVar(converter, loc, indexVal, sym);
+}
+
// This helper function implements the functionality of "promoting" non-CPTR
// arguments of use_device_ptr to use_device_addr arguments (automagic
// conversion of use_device_ptr -> use_device_addr in these cases). The way we
@@ -1123,6 +1128,11 @@ struct OpWithBodyGenInfo {
return *this;
}
+ OpWithBodyGenInfo &setPrivatize(bool value) {
+ privatize = value;
+ return *this;
+ }
+
/// [inout] converter to use for the clauses.
lower::AbstractConverter &converter;
/// [in] Symbol table
@@ -1149,6 +1159,8 @@ struct OpWithBodyGenInfo {
/// [in] if set to `true`, skip generating nested evaluations and dispatching
/// any further leaf constructs.
bool genSkeletonOnly = false;
+  /// [in] enables handling of privatized variables unless set to `false`.
+ bool privatize = true;
};
/// Create the body (block) for an OpenMP Operation.
@@ -1209,7 +1221,7 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info,
// code will use the right symbols.
bool isLoop = llvm::omp::getDirectiveAssociation(info.dir) ==
llvm::omp::Association::Loop;
- bool privatize = info.clauses;
+ bool privatize = info.clauses && info.privatize;
firOpBuilder.setInsertionPoint(marker);
std::optional<DataSharingProcessor> tempDsp;
@@ -2083,7 +2095,7 @@ genCanonicalLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
const ConstructQueue &queue,
ConstructQueue::const_iterator item,
llvm::ArrayRef<const semantics::Symbol *> ivs,
- llvm::omp::Directive directive, DataSharingProcessor &dsp) {
+ llvm::omp::Directive directive) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
assert(ivs.size() == 1 && "Nested loops not yet implemented");
@@ -2176,10 +2188,8 @@ genCanonicalLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
mlir::Value userVal =
firOpBuilder.create<mlir::arith::AddIOp>(loc, loopLBVar, scaled);
- // The argument is not currently in memory, so make a temporary for the
- // argument, and store it there, then bind that location to the argument.
- mlir::Operation *storeOp =
- createAndSetPrivatizedLoopVar(converter, loc, userVal, iv);
+ // Write loop value to loop variable
+ mlir::Operation *storeOp = setLoopVar(converter, loc, userVal, iv);
firOpBuilder.setInsertionPointAfter(storeOp);
return {iv};
@@ -2190,7 +2200,7 @@ genCanonicalLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
OpWithBodyGenInfo(converter, symTable, semaCtx, loc, nestedEval,
directive)
.setClauses(&item->clauses)
- .setDataSharingProcessor(&dsp)
+ .setPrivatize(false)
.setGenRegionEntryCb(ivCallback),
queue, item, tripcount, cli);
@@ -2216,17 +2226,10 @@ static void genUnrollOp(Fortran::lower::AbstractConverter &converter,
cp.processTODO<clause::Partial, clause::Full>(
loc, llvm::omp::Directive::OMPD_unroll);
- // Even though unroll does not support data-sharing clauses, but this is
- // required to fill the symbol table.
- DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
- /*shouldCollectPreDeterminedSymbols=*/true,
- /*useDelayedPrivatization=*/false, symTable);
- dsp.processStep1();
-
// Emit the associated loop
auto canonLoop =
genCanonicalLoopOp(converter, symTable, semaCtx, eval, loc, queue, item,
- iv, llvm::omp::Directive::OMPD_unroll, dsp);
+ iv, llvm::omp::Directive::OMPD_unroll);
// Apply unrolling to it
auto cli = canonLoop.getCli();
diff --git a/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp b/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp
index 01c0be6..c2e0afe1 100644
--- a/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp
+++ b/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp
@@ -88,6 +88,57 @@ void fir::setTuneCPU(mlir::ModuleOp mod, llvm::StringRef cpu) {
mod->setAttr(tuneCpuName, mlir::StringAttr::get(ctx, cpu));
}
+static constexpr const char *atomicIgnoreDenormalModeName =
+ "fir.atomic_ignore_denormal_mode";
+
+void fir::setAtomicIgnoreDenormalMode(mlir::ModuleOp mod, bool value) {
+ if (value) {
+ auto *ctx = mod.getContext();
+ mod->setAttr(atomicIgnoreDenormalModeName, mlir::UnitAttr::get(ctx));
+ } else {
+ if (mod->hasAttr(atomicIgnoreDenormalModeName))
+ mod->removeAttr(atomicIgnoreDenormalModeName);
+ }
+}
+
+bool fir::getAtomicIgnoreDenormalMode(mlir::ModuleOp mod) {
+ return mod->hasAttr(atomicIgnoreDenormalModeName);
+}
+
+static constexpr const char *atomicFineGrainedMemoryName =
+ "fir.atomic_fine_grained_memory";
+
+void fir::setAtomicFineGrainedMemory(mlir::ModuleOp mod, bool value) {
+ if (value) {
+ auto *ctx = mod.getContext();
+ mod->setAttr(atomicFineGrainedMemoryName, mlir::UnitAttr::get(ctx));
+ } else {
+ if (mod->hasAttr(atomicFineGrainedMemoryName))
+ mod->removeAttr(atomicFineGrainedMemoryName);
+ }
+}
+
+bool fir::getAtomicFineGrainedMemory(mlir::ModuleOp mod) {
+ return mod->hasAttr(atomicFineGrainedMemoryName);
+}
+
+static constexpr const char *atomicRemoteMemoryName =
+ "fir.atomic_remote_memory";
+
+void fir::setAtomicRemoteMemory(mlir::ModuleOp mod, bool value) {
+ if (value) {
+ auto *ctx = mod.getContext();
+ mod->setAttr(atomicRemoteMemoryName, mlir::UnitAttr::get(ctx));
+ } else {
+ if (mod->hasAttr(atomicRemoteMemoryName))
+ mod->removeAttr(atomicRemoteMemoryName);
+ }
+}
+
+bool fir::getAtomicRemoteMemory(mlir::ModuleOp mod) {
+ return mod->hasAttr(atomicRemoteMemoryName);
+}
+
llvm::StringRef fir::getTuneCPU(mlir::ModuleOp mod) {
if (auto attr = mod->getAttrOfType<mlir::StringAttr>(tuneCpuName))
return attr.getValue();
diff --git a/flang/lib/Optimizer/Support/CMakeLists.txt b/flang/lib/Optimizer/Support/CMakeLists.txt
index 7ccdd4f..38038e1 100644
--- a/flang/lib/Optimizer/Support/CMakeLists.txt
+++ b/flang/lib/Optimizer/Support/CMakeLists.txt
@@ -1,6 +1,3 @@
-get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
-get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
-
add_flang_library(FIRSupport
DataLayout.cpp
InitFIR.cpp
@@ -23,12 +20,12 @@ add_flang_library(FIRSupport
${extension_libs}
MLIR_LIBS
- ${dialect_libs}
- ${extension_libs}
MLIRBuiltinToLLVMIRTranslation
+ MLIRLLVMToLLVMIRTranslation
MLIROpenACCToLLVMIRTranslation
MLIROpenMPToLLVMIRTranslation
- MLIRLLVMToLLVMIRTranslation
+ MLIRRegisterAllDialects
+ MLIRRegisterAllExtensions
MLIRTargetLLVMIRExport
MLIRTargetLLVMIRImport
)
diff --git a/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90
new file mode 100644
index 0000000..aa1d443
--- /dev/null
+++ b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90
@@ -0,0 +1,91 @@
+! RUN: split-file %s %t
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_stop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK1
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_cycle_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK2
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_goto_loop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK3
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_loop_with_inner_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK4
+
+//--- do_loop_with_stop.f90
+
+subroutine do_loop_with_stop()
+ integer :: i
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b
+
+ !$acc kernels
+ do i = 1, n
+ a(i) = b(i) + 1.0
+ if (i == 5) stop
+ end do
+ !$acc end kernels
+
+! CHECK1: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- do_loop_with_cycle_goto.f90
+
+subroutine do_loop_with_cycle_goto()
+ integer :: i
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b
+
+ ! Do loop with cycle and goto - unstructured control flow is not converted.
+ !$acc kernels
+ do i = 1, n
+ if (i == 3) cycle
+ a(i) = b(i) + 1.0
+ if (i == 7) goto 200
+ a(i) = a(i) * 2.0
+ end do
+200 continue
+ !$acc end kernels
+
+! CHECK2: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- nested_goto_loop.f90
+
+subroutine nested_goto_loop()
+ integer :: i, j
+ integer, parameter :: n = 10, m = 5
+ real, dimension(n,m) :: a, b
+
+ ! Nested loop with goto from inner to outer - should NOT convert to acc.loop
+ !$acc kernels
+ do i = 1, n
+ do j = 1, m
+ a(i,j) = b(i,j) + 1.0
+ if (i * j > 20) goto 300 ! Exit both loops
+ end do
+ end do
+300 continue
+ !$acc end kernels
+
+! CHECK3: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- nested_loop_with_inner_goto.f90
+
+subroutine nested_loop_with_inner_goto()
+ integer :: ii = 0, jj = 0
+ integer, parameter :: nn = 3
+ real, dimension(nn, nn) :: aa
+
+ aa = -1
+
+ ! Nested loop with goto from inner loop - unstructured control flow is not converted.
+ !$acc kernels
+ do ii = 1, nn
+ do jj = 1, nn
+ if (jj > 1) goto 300
+ aa(jj, ii) = 1337
+ end do
+ 300 continue
+ end do
+ !$acc end kernels
+
+! CHECK4: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
\ No newline at end of file
diff --git a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
new file mode 100644
index 0000000..5f8ea03
--- /dev/null
+++ b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
@@ -0,0 +1,267 @@
+! This test checks lowering of Fortran do loops and do concurrent loops to OpenACC loop constructs.
+! Tests the new functionality that converts Fortran iteration constructs to acc.loop with proper IV handling.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop
+subroutine basic_do_loop()
+ integer :: i
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b
+
+ ! Basic do loop that should be converted to acc.loop
+ !$acc kernels
+ do i = 1, n
+ a(i) = b(i) + 1.0
+ end do
+ !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent
+subroutine basic_do_concurrent()
+ integer :: i
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b
+
+ ! Basic do concurrent loop
+ !$acc kernels
+ do concurrent (i = 1:n)
+ a(i) = b(i) + 1.0
+ end do
+ !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop_parallel
+subroutine basic_do_loop_parallel()
+ integer :: i
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b
+
+ ! Basic do loop with acc parallel that should be converted to acc.loop
+ !$acc parallel
+ do i = 1, n
+ a(i) = b(i) + 1.0
+ end do
+ !$acc end parallel
+
+! CHECK: acc.parallel {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop_serial
+subroutine basic_do_loop_serial()
+ integer :: i
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b
+
+ ! Basic do loop with acc serial that should be converted to acc.loop
+ !$acc serial
+ do i = 1, n
+ a(i) = b(i) + 1.0
+ end do
+ !$acc end serial
+
+! CHECK: acc.serial {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent_parallel
+subroutine basic_do_concurrent_parallel()
+ integer :: i
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b
+
+ ! Basic do concurrent loop with acc parallel
+ !$acc parallel
+ do concurrent (i = 1:n)
+ a(i) = b(i) + 1.0
+ end do
+ !$acc end parallel
+
+! CHECK: acc.parallel {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent_serial
+subroutine basic_do_concurrent_serial()
+ integer :: i
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b
+
+ ! Basic do concurrent loop with acc serial
+ !$acc serial
+ do concurrent (i = 1:n)
+ a(i) = b(i) + 1.0
+ end do
+ !$acc end serial
+
+! CHECK: acc.serial {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPmulti_dimension_do_concurrent
+subroutine multi_dimension_do_concurrent()
+ integer :: i, j, k
+ integer, parameter :: n = 10, m = 20, l = 5
+ real, dimension(n,m,l) :: a, b
+
+ ! Multi-dimensional do concurrent with multiple iteration variables
+ !$acc kernels
+ do concurrent (i = 1:n, j = 1:m, k = 1:l)
+ a(i,j,k) = b(i,j,k) * 2.0
+ end do
+ !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
+end subroutine
+
+
+! CHECK-LABEL: func.func @_QPnested_do_loops
+subroutine nested_do_loops()
+ integer :: i, j
+ integer, parameter :: n = 10, m = 20
+ real, dimension(n,m) :: a, b
+
+ ! Nested do loops
+ !$acc kernels
+ do i = 1, n
+ do j = 1, m
+ a(i,j) = b(i,j) + i + j
+ end do
+ end do
+ !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPvariable_bounds_and_step
+subroutine variable_bounds_and_step(n, start_val, step_val)
+ integer, intent(in) :: n, start_val, step_val
+ integer :: i
+ real, dimension(n) :: a, b
+
+ ! Do loop with variable bounds and step
+ !$acc kernels
+ do i = start_val, n, step_val
+ a(i) = b(i) * 2.0
+ end do
+ !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPdifferent_iv_types
+subroutine different_iv_types()
+ integer(kind=8) :: i8
+ integer(kind=4) :: i4
+ integer(kind=2) :: i2
+ integer, parameter :: n = 10
+ real, dimension(n) :: a, b, c, d
+
+ ! Test different iteration variable types
+ !$acc kernels
+ do i8 = 1_8, int(n,8)
+ a(i8) = b(i8) + 1.0
+ end do
+ !$acc end kernels
+
+ !$acc kernels
+ do i4 = 1, n
+ b(i4) = c(i4) + 1.0
+ end do
+ !$acc end kernels
+
+ !$acc kernels
+ do i2 = 1_2, int(n,2)
+ c(i2) = d(i2) + 1.0
+ end do
+ !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64)
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16)
+
+end subroutine
+
+! -----------------------------------------------------------------------------------------
+! Tests for loops that should NOT be converted to acc.loop due to unstructured control flow
+
+! CHECK-LABEL: func.func @_QPinfinite_loop_no_iv
+subroutine infinite_loop_no_iv()
+ integer :: i
+ logical :: condition
+
+ ! Infinite loop with no induction variable - should NOT convert to acc.loop
+ !$acc kernels
+ do
+ i = i + 1
+ if (i > 100) exit
+ end do
+ !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPwhile_like_loop
+subroutine while_like_loop()
+ integer :: i
+ logical :: condition
+
+ i = 1
+ condition = .true.
+
+ ! While-like infinite loop - should NOT convert to acc.loop
+ !$acc kernels
+ do while (condition)
+ i = i + 1
+ if (i > 100) condition = .false.
+ end do
+ !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
diff --git a/flang/test/Lower/OpenMP/atomic-control-options.f90 b/flang/test/Lower/OpenMP/atomic-control-options.f90
new file mode 100644
index 0000000..407f83b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-control-options.f90
@@ -0,0 +1,37 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-device -munsafe-fp-atomics %s -o - | FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-device -fatomic-ignore-denormal-mode %s -o - | FileCheck -check-prefix=IGNORE-DENORMAL %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-device -fatomic-fine-grained-memory %s -o - | FileCheck -check-prefix=FINE-GRAINED-MEMORY %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-device -fatomic-remote-memory %s -o - | FileCheck -check-prefix=REMOTE-MEMORY %s
+program test
+ implicit none
+ integer :: A, B, threads
+ threads = 128
+ A = 0
+ B = 0
+ !UNSAFE-FP-ATOMICS: omp.atomic.update %{{.*}} : !fir.ref<i32> {
+ !UNSAFE-FP-ATOMICS: } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>}
+ !IGNORE-DENORMAL: omp.atomic.update %{{.*}} : !fir.ref<i32> {
+ !IGNORE-DENORMAL: } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>}
+ !FINE-GRAINED-MEMORY: omp.atomic.update %{{.*}} : !fir.ref<i32> {
+ !FINE-GRAINED-MEMORY: } {atomic_control = #omp.atomic_control<fine_grained_memory = true>}
+ !REMOTE-MEMORY: omp.atomic.update %{{.*}} : !fir.ref<i32> {
+ !REMOTE-MEMORY: } {atomic_control = #omp.atomic_control<remote_memory = true>}
+ !$omp target parallel num_threads(threads)
+ !$omp atomic
+ A = A + 1
+ !$omp end target parallel
+ !UNSAFE-FP-ATOMICS: omp.atomic.update %{{.*}} : !fir.ref<i32> {
+ !UNSAFE-FP-ATOMICS: } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>}
+ !IGNORE-DENORMAL: omp.atomic.update %{{.*}} : !fir.ref<i32> {
+ !IGNORE-DENORMAL: } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>}
+ !FINE-GRAINED-MEMORY: omp.atomic.update %{{.*}} : !fir.ref<i32> {
+ !FINE-GRAINED-MEMORY: } {atomic_control = #omp.atomic_control<fine_grained_memory = true>}
+ !REMOTE-MEMORY: omp.atomic.update %{{.*}} : !fir.ref<i32> {
+ !REMOTE-MEMORY: } {atomic_control = #omp.atomic_control<remote_memory = true>}
+ !$omp target parallel num_threads(threads)
+ !$omp atomic capture
+ A = A + B
+ B = A
+ !$omp end atomic
+ !$omp end target parallel
+end program test
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic01.f90 b/flang/test/Lower/OpenMP/unroll-heuristic01.f90
index a5f5c00..34020eb 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic01.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic01.f90
@@ -13,27 +13,42 @@ subroutine omp_unroll_heuristic01(lb, ub, inc)
end subroutine omp_unroll_heuristic01
-!CHECK-LABEL: func.func @_QPomp_unroll_heuristic01(
-!CHECK: %c0_i32 = arith.constant 0 : i32
-!CHECK-NEXT: %c1_i32 = arith.constant 1 : i32
-!CHECK-NEXT: %13 = arith.cmpi slt, %12, %c0_i32 : i32
-!CHECK-NEXT: %14 = arith.subi %c0_i32, %12 : i32
-!CHECK-NEXT: %15 = arith.select %13, %14, %12 : i32
-!CHECK-NEXT: %16 = arith.select %13, %11, %10 : i32
-!CHECK-NEXT: %17 = arith.select %13, %10, %11 : i32
-!CHECK-NEXT: %18 = arith.subi %17, %16 overflow<nuw> : i32
-!CHECK-NEXT: %19 = arith.divui %18, %15 : i32
-!CHECK-NEXT: %20 = arith.addi %19, %c1_i32 overflow<nuw> : i32
-!CHECK-NEXT: %21 = arith.cmpi slt, %17, %16 : i32
-!CHECK-NEXT: %22 = arith.select %21, %c0_i32, %20 : i32
-!CHECK-NEXT: %canonloop_s0 = omp.new_cli
-!CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%22) {
-!CHECK-NEXT: %23 = arith.muli %iv, %12 : i32
-!CHECK-NEXT: %24 = arith.addi %10, %23 : i32
-!CHECK-NEXT: hlfir.assign %24 to %9#0 : i32, !fir.ref<i32>
-!CHECK-NEXT: %25 = fir.load %9#0 : !fir.ref<i32>
-!CHECK-NEXT: hlfir.assign %25 to %6#0 : i32, !fir.ref<i32>
-!CHECK-NEXT: omp.terminator
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.unroll_heuristic(%canonloop_s0)
-!CHECK-NEXT: return
+! CHECK-LABEL: func.func @_QPomp_unroll_heuristic01(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "lb"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "ub"},
+! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "inc"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic01Ei"}
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic01Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic01Eres"}
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_unroll_heuristic01Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_10]], %[[VAL_11]] : i32
+! CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_11]], %[[VAL_10]] : i32
+! CHECK: %[[VAL_15:.*]] = arith.select %[[VAL_13]], %[[VAL_14]], %[[VAL_10]] : i32
+! CHECK: %[[VAL_16:.*]] = arith.select %[[VAL_13]], %[[VAL_9]], %[[VAL_8]] : i32
+! CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_13]], %[[VAL_8]], %[[VAL_9]] : i32
+! CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_17]], %[[VAL_16]] overflow<nuw> : i32
+! CHECK: %[[VAL_19:.*]] = arith.divui %[[VAL_18]], %[[VAL_15]] : i32
+! CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_12]] overflow<nuw> : i32
+! CHECK: %[[VAL_21:.*]] = arith.cmpi slt, %[[VAL_17]], %[[VAL_16]] : i32
+! CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_21]], %[[VAL_11]], %[[VAL_20]] : i32
+! CHECK: %[[VAL_23:.*]] = omp.new_cli
+! CHECK: omp.canonical_loop(%[[VAL_23]]) %[[VAL_24:.*]] : i32 in range(%[[VAL_22]]) {
+! CHECK: %[[VAL_25:.*]] = arith.muli %[[VAL_24]], %[[VAL_10]] : i32
+! CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_8]], %[[VAL_25]] : i32
+! CHECK: hlfir.assign %[[VAL_26]] to %[[VAL_2]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
+! CHECK: hlfir.assign %[[VAL_27]] to %[[VAL_6]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: omp.unroll_heuristic(%[[VAL_23]])
+! CHECK: return
+! CHECK: }
\ No newline at end of file
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic02.f90 b/flang/test/Lower/OpenMP/unroll-heuristic02.f90
index 14f694d..fdb1366 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic02.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic02.f90
@@ -37,61 +37,55 @@ end subroutine omp_unroll_heuristic_nested02
!CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic_nested02Eres"}
!CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ei"}
-!CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_15:.*]] = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ej"}
-!CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_20:.*]] = arith.constant 0 : i32
-!CHECK: %[[VAL_21:.*]] = arith.constant 1 : i32
-!CHECK: %[[VAL_22:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_20]] : i32
-!CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_20]], %[[VAL_19]] : i32
-!CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_19]] : i32
-!CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_17]] : i32
-!CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_22]], %[[VAL_17]], %[[VAL_18]] : i32
-!CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_26]], %[[VAL_25]] overflow<nuw> : i32
-!CHECK: %[[VAL_28:.*]] = arith.divui %[[VAL_27]], %[[VAL_24]] : i32
-!CHECK: %[[VAL_29:.*]] = arith.addi %[[VAL_28]], %[[VAL_21]] overflow<nuw> : i32
-!CHECK: %[[VAL_30:.*]] = arith.cmpi slt, %[[VAL_26]], %[[VAL_25]] : i32
-!CHECK: %[[VAL_31:.*]] = arith.select %[[VAL_30]], %[[VAL_20]], %[[VAL_29]] : i32
-!CHECK: %[[VAL_32:.*]] = omp.new_cli
-!CHECK: omp.canonical_loop(%[[VAL_32]]) %[[VAL_33:.*]] : i32 in range(%[[VAL_31]]) {
-!CHECK: %[[VAL_34:.*]] = arith.muli %[[VAL_33]], %[[VAL_19]] : i32
-!CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_17]], %[[VAL_34]] : i32
-!CHECK: hlfir.assign %[[VAL_35]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
-!CHECK: %[[VAL_36:.*]] = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ej"}
-!CHECK: %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_36]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_41:.*]] = arith.constant 0 : i32
-!CHECK: %[[VAL_42:.*]] = arith.constant 1 : i32
-!CHECK: %[[VAL_43:.*]] = arith.cmpi slt, %[[VAL_40]], %[[VAL_41]] : i32
-!CHECK: %[[VAL_44:.*]] = arith.subi %[[VAL_41]], %[[VAL_40]] : i32
-!CHECK: %[[VAL_45:.*]] = arith.select %[[VAL_43]], %[[VAL_44]], %[[VAL_40]] : i32
-!CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_43]], %[[VAL_39]], %[[VAL_38]] : i32
-!CHECK: %[[VAL_47:.*]] = arith.select %[[VAL_43]], %[[VAL_38]], %[[VAL_39]] : i32
-!CHECK: %[[VAL_48:.*]] = arith.subi %[[VAL_47]], %[[VAL_46]] overflow<nuw> : i32
-!CHECK: %[[VAL_49:.*]] = arith.divui %[[VAL_48]], %[[VAL_45]] : i32
-!CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_49]], %[[VAL_42]] overflow<nuw> : i32
-!CHECK: %[[VAL_51:.*]] = arith.cmpi slt, %[[VAL_47]], %[[VAL_46]] : i32
-!CHECK: %[[VAL_52:.*]] = arith.select %[[VAL_51]], %[[VAL_41]], %[[VAL_50]] : i32
-!CHECK: %[[VAL_53:.*]] = omp.new_cli
-!CHECK: omp.canonical_loop(%[[VAL_53]]) %[[VAL_54:.*]] : i32 in range(%[[VAL_52]]) {
-!CHECK: %[[VAL_55:.*]] = arith.muli %[[VAL_54]], %[[VAL_40]] : i32
-!CHECK: %[[VAL_56:.*]] = arith.addi %[[VAL_38]], %[[VAL_55]] : i32
-!CHECK: hlfir.assign %[[VAL_56]] to %[[VAL_37]]#0 : i32, !fir.ref<i32>
-!CHECK: %[[VAL_57:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_58:.*]] = fir.load %[[VAL_37]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_59:.*]] = arith.addi %[[VAL_57]], %[[VAL_58]] : i32
-!CHECK: hlfir.assign %[[VAL_59]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_16:.*]] = arith.constant 0 : i32
+!CHECK: %[[VAL_17:.*]] = arith.constant 1 : i32
+!CHECK: %[[VAL_18:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : i32
+!CHECK: %[[VAL_19:.*]] = arith.subi %[[VAL_16]], %[[VAL_15]] : i32
+!CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_18]], %[[VAL_19]], %[[VAL_15]] : i32
+!CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_18]], %[[VAL_14]], %[[VAL_13]] : i32
+!CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_18]], %[[VAL_13]], %[[VAL_14]] : i32
+!CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_22]], %[[VAL_21]] overflow<nuw> : i32
+!CHECK: %[[VAL_24:.*]] = arith.divui %[[VAL_23]], %[[VAL_20]] : i32
+!CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_17]] overflow<nuw> : i32
+!CHECK: %[[VAL_26:.*]] = arith.cmpi slt, %[[VAL_22]], %[[VAL_21]] : i32
+!CHECK: %[[VAL_27:.*]] = arith.select %[[VAL_26]], %[[VAL_16]], %[[VAL_25]] : i32
+!CHECK: %[[VAL_28:.*]] = omp.new_cli
+!CHECK: omp.canonical_loop(%[[VAL_28]]) %[[VAL_29:.*]] : i32 in range(%[[VAL_27]]) {
+!CHECK: %[[VAL_30:.*]] = arith.muli %[[VAL_29]], %[[VAL_15]] : i32
+!CHECK: %[[VAL_31:.*]] = arith.addi %[[VAL_13]], %[[VAL_30]] : i32
+!CHECK: hlfir.assign %[[VAL_31]] to %[[VAL_2]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[VAL_32:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_33:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_34:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_35:.*]] = arith.constant 0 : i32
+!CHECK: %[[VAL_36:.*]] = arith.constant 1 : i32
+!CHECK: %[[VAL_37:.*]] = arith.cmpi slt, %[[VAL_34]], %[[VAL_35]] : i32
+!CHECK: %[[VAL_38:.*]] = arith.subi %[[VAL_35]], %[[VAL_34]] : i32
+!CHECK: %[[VAL_39:.*]] = arith.select %[[VAL_37]], %[[VAL_38]], %[[VAL_34]] : i32
+!CHECK: %[[VAL_40:.*]] = arith.select %[[VAL_37]], %[[VAL_33]], %[[VAL_32]] : i32
+!CHECK: %[[VAL_41:.*]] = arith.select %[[VAL_37]], %[[VAL_32]], %[[VAL_33]] : i32
+!CHECK: %[[VAL_42:.*]] = arith.subi %[[VAL_41]], %[[VAL_40]] overflow<nuw> : i32
+!CHECK: %[[VAL_43:.*]] = arith.divui %[[VAL_42]], %[[VAL_39]] : i32
+!CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_43]], %[[VAL_36]] overflow<nuw> : i32
+!CHECK: %[[VAL_45:.*]] = arith.cmpi slt, %[[VAL_41]], %[[VAL_40]] : i32
+!CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_45]], %[[VAL_35]], %[[VAL_44]] : i32
+!CHECK: %[[VAL_47:.*]] = omp.new_cli
+!CHECK: omp.canonical_loop(%[[VAL_47]]) %[[VAL_48:.*]] : i32 in range(%[[VAL_46]]) {
+!CHECK: %[[VAL_49:.*]] = arith.muli %[[VAL_48]], %[[VAL_34]] : i32
+!CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_32]], %[[VAL_49]] : i32
+!CHECK: hlfir.assign %[[VAL_50]] to %[[VAL_7]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[VAL_51:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_52:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_53:.*]] = arith.addi %[[VAL_51]], %[[VAL_52]] : i32
+!CHECK: hlfir.assign %[[VAL_53]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
!CHECK: omp.terminator
!CHECK: }
-!CHECK: omp.unroll_heuristic(%[[VAL_53]])
+!CHECK: omp.unroll_heuristic(%[[VAL_47]])
!CHECK: omp.terminator
!CHECK: }
-!CHECK: omp.unroll_heuristic(%[[VAL_32]])
+!CHECK: omp.unroll_heuristic(%[[VAL_28]])
!CHECK: return
!CHECK: }
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic03.f90 b/flang/test/Lower/OpenMP/unroll-heuristic03.f90
new file mode 100644
index 0000000..308c149
--- /dev/null
+++ b/flang/test/Lower/OpenMP/unroll-heuristic03.f90
@@ -0,0 +1,61 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
+
+! Test an implicitly privatized loop variable that is affected by unrolling.
+
+subroutine omp_unroll_heuristic03(lb, ub, inc)
+ integer res, i, lb, ub, inc
+
+ !$omp parallel
+ !$omp unroll
+ do i = lb, ub, inc
+ res = i
+ end do
+ !$omp end unroll
+ !$omp end parallel
+
+end subroutine omp_unroll_heuristic03
+
+
+! CHECK-LABEL: func.func @_QPomp_unroll_heuristic03(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "lb"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "ub"},
+! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "inc"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic03Ei"}
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic03Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic03Eres"}
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_unroll_heuristic03Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: omp.parallel private(@_QFomp_unroll_heuristic03Ei_private_i32 %[[VAL_2]]#0 -> %[[VAL_8:.*]] : !fir.ref<i32>) {
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFomp_unroll_heuristic03Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_13:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_15:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_13]] : i32
+! CHECK: %[[VAL_16:.*]] = arith.subi %[[VAL_13]], %[[VAL_12]] : i32
+! CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_15]], %[[VAL_16]], %[[VAL_12]] : i32
+! CHECK: %[[VAL_18:.*]] = arith.select %[[VAL_15]], %[[VAL_11]], %[[VAL_10]] : i32
+! CHECK: %[[VAL_19:.*]] = arith.select %[[VAL_15]], %[[VAL_10]], %[[VAL_11]] : i32
+! CHECK: %[[VAL_20:.*]] = arith.subi %[[VAL_19]], %[[VAL_18]] overflow<nuw> : i32
+! CHECK: %[[VAL_21:.*]] = arith.divui %[[VAL_20]], %[[VAL_17]] : i32
+! CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_14]] overflow<nuw> : i32
+! CHECK: %[[VAL_23:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_18]] : i32
+! CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_23]], %[[VAL_13]], %[[VAL_22]] : i32
+! CHECK: %[[VAL_25:.*]] = omp.new_cli
+! CHECK: omp.canonical_loop(%[[VAL_25]]) %[[VAL_26:.*]] : i32 in range(%[[VAL_24]]) {
+! CHECK: %[[VAL_27:.*]] = arith.muli %[[VAL_26]], %[[VAL_12]] : i32
+! CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_10]], %[[VAL_27]] : i32
+! CHECK: hlfir.assign %[[VAL_28]] to %[[VAL_9]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[VAL_29:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
+! CHECK: hlfir.assign %[[VAL_29]] to %[[VAL_6]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: omp.unroll_heuristic(%[[VAL_25]])
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
\ No newline at end of file