Diffstat (limited to 'flang')
-rw-r--r-- | flang/docs/OpenMPSupport.md | 20
-rw-r--r-- | flang/include/flang/Frontend/TargetOptions.h | 5
-rw-r--r-- | flang/include/flang/Lower/OpenACC.h | 22
-rw-r--r-- | flang/include/flang/Optimizer/Dialect/Support/FIRContext.h | 19
-rw-r--r-- | flang/include/flang/Optimizer/Support/InitFIR.h | 8
-rw-r--r-- | flang/lib/Frontend/CompilerInvocation.cpp | 10
-rw-r--r-- | flang/lib/Lower/Bridge.cpp | 37
-rw-r--r-- | flang/lib/Lower/OpenACC.cpp | 395
-rw-r--r-- | flang/lib/Lower/OpenMP/Atomic.cpp | 9
-rw-r--r-- | flang/lib/Lower/OpenMP/OpenMP.cpp | 51
-rw-r--r-- | flang/lib/Optimizer/Dialect/Support/FIRContext.cpp | 51
-rw-r--r-- | flang/lib/Optimizer/Support/CMakeLists.txt | 9
-rw-r--r-- | flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 | 91
-rw-r--r-- | flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 | 267
-rw-r--r-- | flang/test/Lower/OpenMP/atomic-control-options.f90 | 37
-rw-r--r-- | flang/test/Lower/OpenMP/unroll-heuristic01.f90 | 63
-rw-r--r-- | flang/test/Lower/OpenMP/unroll-heuristic02.f90 | 98
-rw-r--r-- | flang/test/Lower/OpenMP/unroll-heuristic03.f90 | 61
18 files changed, 1015 insertions, 238 deletions
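The FIRContext changes in this patch record the new atomic-control flags as unit attributes on the MLIR module (present means true, absent means false), and Bridge.cpp forwards them from TargetOptions so the OpenMP atomic lowering can query them later. Below is a minimal sketch of that flow, using only the fir:: helpers added in the diff; the AtomicOpts struct is a hypothetical stand-in for the new TargetOptions fields, not part of the patch.

```cpp
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "mlir/IR/BuiltinOps.h"

// Hypothetical option bag mirroring the new TargetOptions fields.
struct AtomicOpts {
  bool ignoreDenormalMode = false;
  bool fineGrainedMemory = false;
  bool remoteMemory = false;
};

// Tag the module the way Bridge.cpp does in this patch, then read a flag
// back the way the OpenMP atomic-update lowering does.
void tagModuleWithAtomicControl(mlir::ModuleOp mod, const AtomicOpts &opts) {
  fir::setAtomicIgnoreDenormalMode(mod, opts.ignoreDenormalMode);
  fir::setAtomicFineGrainedMemory(mod, opts.fineGrainedMemory);
  fir::setAtomicRemoteMemory(mod, opts.remoteMemory);

  // Each getter reports whether the corresponding unit attribute
  // (e.g. "fir.atomic_fine_grained_memory") is present on the module.
  bool fineGrained = fir::getAtomicFineGrainedMemory(mod);
  (void)fineGrained;
}
```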
diff --git a/flang/docs/OpenMPSupport.md b/flang/docs/OpenMPSupport.md index c9f19c3..81f5f9f 100644 --- a/flang/docs/OpenMPSupport.md +++ b/flang/docs/OpenMPSupport.md @@ -41,7 +41,7 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low | target construct | P | device clause not supported | | target update construct | P | device clause not supported | | declare target directive | P | | -| teams construct | P | reduction clause not supported | +| teams construct | Y | | | distribute construct | P | dist_schedule clause not supported | | distribute simd construct | P | dist_schedule and linear clauses are not supported | | distribute parallel loop construct | P | dist_schedule clause not supported | @@ -51,15 +51,15 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low | atomic construct extensions | Y | | | cancel construct | Y | | | cancellation point construct | Y | | -| parallel do simd construct | P | linear clause is not supported | -| target teams construct | P | device and reduction clauses are not supported | -| teams distribute construct | P | reduction and dist_schedule clauses not supported | -| teams distribute simd construct | P | reduction, dist_schedule and linear clauses are not supported | -| target teams distribute construct | P | device, reduction and dist_schedule clauses are not supported | -| teams distribute parallel loop construct | P | reduction and dist_schedule clauses are not supported | -| target teams distribute parallel loop construct | P | device, reduction and dist_schedule clauses are not supported | -| teams distribute parallel loop simd construct | P | reduction, dist_schedule, and linear clauses are not supported | -| target teams distribute parallel loop simd construct | P | device, reduction, dist_schedule and linear clauses are not supported | +| parallel do simd construct | P | linear clause not supported | +| target teams construct | P | device clause not supported | +| teams distribute construct | P | dist_schedule clause not supported | +| teams distribute simd construct | P | dist_schedule and linear clauses are not supported | +| target teams distribute construct | P | device and dist_schedule clauses are not supported | +| teams distribute parallel loop construct | P | dist_schedule clause not supported | +| target teams distribute parallel loop construct | P | device and dist_schedule clauses are not supported | +| teams distribute parallel loop simd construct | P | dist_schedule and linear clauses are not supported | +| target teams distribute parallel loop simd construct | P | device, dist_schedule and linear clauses are not supported | ## Extensions ### ATOMIC construct diff --git a/flang/include/flang/Frontend/TargetOptions.h b/flang/include/flang/Frontend/TargetOptions.h index 002d8d1..f6e5634 100644 --- a/flang/include/flang/Frontend/TargetOptions.h +++ b/flang/include/flang/Frontend/TargetOptions.h @@ -53,6 +53,11 @@ public: /// Print verbose assembly bool asmVerbose = false; + + /// Atomic control options + bool atomicIgnoreDenormalMode = false; + bool atomicRemoteMemory = false; + bool atomicFineGrainedMemory = false; }; } // end namespace Fortran::frontend diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h index af34510..e974f3d 100644 --- a/flang/include/flang/Lower/OpenACC.h +++ b/flang/include/flang/Lower/OpenACC.h @@ -43,6 +43,7 @@ struct ProcedureDesignator; namespace parser { struct AccClauseList; +struct DoConstruct; struct 
OpenACCConstruct; struct OpenACCDeclarativeConstruct; struct OpenACCRoutineConstruct; @@ -58,6 +59,7 @@ namespace lower { class AbstractConverter; class StatementContext; +class SymMap; namespace pft { struct Evaluation; @@ -114,14 +116,32 @@ void attachDeclarePostDeallocAction(AbstractConverter &, fir::FirOpBuilder &, void genOpenACCTerminator(fir::FirOpBuilder &, mlir::Operation *, mlir::Location); -int64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &); +/// Used to obtain the number of contained loops to look for +/// since this is dependent on number of tile operands and collapse +/// clause. +uint64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &); +/// Checks whether the current insertion point is inside OpenACC loop. bool isInOpenACCLoop(fir::FirOpBuilder &); +/// Checks whether the current insertion point is inside OpenACC compute +/// construct. +bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &); + void setInsertionPointAfterOpenACCLoopIfInside(fir::FirOpBuilder &); void genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &, mlir::Location); +/// Generates an OpenACC loop from a do construct in order to +/// properly capture the loop bounds, parallelism determination mode, +/// and to privatize the loop variables. +/// When the conversion is rejected, nullptr is returned. +mlir::Operation *genOpenACCLoopFromDoConstruct( + AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::SymMap &localSymbols, + const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval); + } // namespace lower } // namespace Fortran diff --git a/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h b/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h index 2df14f8..c0c0b74 100644 --- a/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h +++ b/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h @@ -58,6 +58,25 @@ void setTargetCPU(mlir::ModuleOp mod, llvm::StringRef cpu); /// Get the target CPU string from the Module or return a null reference. llvm::StringRef getTargetCPU(mlir::ModuleOp mod); +/// Sets whether Denormal Mode can be ignored or not for lowering of floating +/// point atomic operations. +void setAtomicIgnoreDenormalMode(mlir::ModuleOp mod, bool value); +/// Gets whether Denormal Mode can be ignored or not for lowering of floating +/// point atomic operations. +bool getAtomicIgnoreDenormalMode(mlir::ModuleOp mod); +/// Sets whether fine grained memory can be used or not for lowering of atomic +/// operations. +void setAtomicFineGrainedMemory(mlir::ModuleOp mod, bool value); +/// Gets whether fine grained memory can be used or not for lowering of atomic +/// operations. +bool getAtomicFineGrainedMemory(mlir::ModuleOp mod); +/// Sets whether remote memory can be used or not for lowering of atomic +/// operations. +void setAtomicRemoteMemory(mlir::ModuleOp mod, bool value); +/// Gets whether remote memory can be used or not for lowering of atomic +/// operations. +bool getAtomicRemoteMemory(mlir::ModuleOp mod); + /// Set the tune CPU for the module. `cpu` must not be deallocated while /// module `mod` is still live. 
void setTuneCPU(mlir::ModuleOp mod, llvm::StringRef cpu); diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h index aacba23..3e42ffd 100644 --- a/flang/include/flang/Optimizer/Support/InitFIR.h +++ b/flang/include/flang/Optimizer/Support/InitFIR.h @@ -20,12 +20,20 @@ #include "flang/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.h" #include "flang/Optimizer/OpenMP/Support/RegisterOpenMPExtensions.h" #include "mlir/Conversion/Passes.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/Func/Extensions/InlinerExtension.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenACC/Transforms/Passes.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/Passes.h" #include "mlir/InitAllDialects.h" #include "mlir/Pass/Pass.h" diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index f55d866..111c5aa4 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -512,6 +512,16 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { args.getLastArg(clang::driver::options::OPT_triple)) opts.triple = a->getValue(); + opts.atomicIgnoreDenormalMode = args.hasFlag( + clang::driver::options::OPT_fatomic_ignore_denormal_mode, + clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false); + opts.atomicFineGrainedMemory = args.hasFlag( + clang::driver::options::OPT_fatomic_fine_grained_memory, + clang::driver::options::OPT_fno_atomic_fine_grained_memory, false); + opts.atomicRemoteMemory = + args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory, + clang::driver::options::OPT_fno_atomic_remote_memory, false); + if (const llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_target_cpu)) opts.cpu = a->getValue(); diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 92aae79..1adfb96 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2167,10 +2167,35 @@ private: /// - structured and unstructured concurrent loops void genFIR(const Fortran::parser::DoConstruct &doConstruct) { setCurrentPositionAt(doConstruct); - // Collect loop nest information. - // Generate begin loop code directly for infinite and while loops. Fortran::lower::pft::Evaluation &eval = getEval(); bool unstructuredContext = eval.lowerAsUnstructured(); + + // Loops with induction variables inside OpenACC compute constructs + // need special handling to ensure that the IVs are privatized. + if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) { + mlir::Operation *loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct( + *this, bridge.getSemanticsContext(), localSymbols, doConstruct, eval); + bool success = loopOp != nullptr; + if (success) { + // Sanity check that the builder insertion point is inside the newly + // generated loop. 
+ assert( + loopOp->getRegion(0).isAncestor( + builder->getInsertionPoint()->getBlock()->getParent()) && + "builder insertion point is not inside the newly generated loop"); + + // Loop body code. + auto iter = eval.getNestedEvaluations().begin(); + for (auto end = --eval.getNestedEvaluations().end(); iter != end; + ++iter) + genFIR(*iter, unstructuredContext); + return; + } + // Fall back to normal loop handling. + } + + // Collect loop nest information. + // Generate begin loop code directly for infinite and while loops. Fortran::lower::pft::Evaluation &doStmtEval = eval.getFirstNestedEvaluation(); auto *doStmt = doStmtEval.getIf<Fortran::parser::NonLabelDoStmt>(); @@ -3124,7 +3149,7 @@ private: Fortran::lower::pft::Evaluation *curEval = &getEval(); if (accLoop || accCombined) { - int64_t loopCount; + uint64_t loopCount; if (accLoop) { const Fortran::parser::AccBeginLoopDirective &beginLoopDir = std::get<Fortran::parser::AccBeginLoopDirective>(accLoop->t); @@ -3142,7 +3167,7 @@ private: if (curEval->lowerAsStructured()) { curEval = &curEval->getFirstNestedEvaluation(); - for (int64_t i = 1; i < loopCount; i++) + for (uint64_t i = 1; i < loopCount; i++) curEval = &*std::next(curEval->getNestedEvaluations().begin()); } } @@ -6733,6 +6758,10 @@ Fortran::lower::LoweringBridge::LoweringBridge( fir::setKindMapping(*module, kindMap); fir::setTargetCPU(*module, targetMachine.getTargetCPU()); fir::setTuneCPU(*module, targetOpts.cpuToTuneFor); + fir::setAtomicIgnoreDenormalMode(*module, + targetOpts.atomicIgnoreDenormalMode); + fir::setAtomicFineGrainedMemory(*module, targetOpts.atomicFineGrainedMemory); + fir::setAtomicRemoteMemory(*module, targetOpts.atomicRemoteMemory); fir::setTargetFeatures(*module, targetMachine.getTargetFeatureString()); fir::support::setMLIRDataLayout(*module, targetMachine.createDataLayout()); fir::setIdent(*module, Fortran::common::getFlangFullVersion()); diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 471f368..57ce1d3 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -36,6 +36,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Frontend/OpenACC/ACC.h.inc" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -2142,6 +2143,168 @@ static void determineDefaultLoopParMode( } } +// Extract loop bounds, steps, induction variables, and privatization info +// for both DO CONCURRENT and regular do loops +static void processDoLoopBounds( + Fortran::lower::AbstractConverter &converter, + mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx, + fir::FirOpBuilder &builder, + const Fortran::parser::DoConstruct &outerDoConstruct, + Fortran::lower::pft::Evaluation &eval, + llvm::SmallVector<mlir::Value> &lowerbounds, + llvm::SmallVector<mlir::Value> &upperbounds, + llvm::SmallVector<mlir::Value> &steps, + llvm::SmallVector<mlir::Value> &privateOperands, + llvm::SmallVector<mlir::Value> &ivPrivate, + llvm::SmallVector<mlir::Attribute> &privatizationRecipes, + llvm::SmallVector<mlir::Type> &ivTypes, + llvm::SmallVector<mlir::Location> &ivLocs, + llvm::SmallVector<bool> &inclusiveBounds, + llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) { + assert(loopsToProcess > 0 && "expect at least one loop"); + locs.push_back(currentLocation); // Location of the directive + Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); + bool isDoConcurrent = 
outerDoConstruct.IsDoConcurrent(); + + if (isDoConcurrent) { + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(outerDoConstruct))); + const Fortran::parser::LoopControl *loopControl = + &*outerDoConstruct.GetLoopControl(); + const auto &concurrent = + std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u); + if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t) + .empty()) + TODO(currentLocation, "DO CONCURRENT with locality spec inside ACC"); + + const auto &concurrentHeader = + std::get<Fortran::parser::ConcurrentHeader>(concurrent.t); + const auto &controls = + std::get<std::list<Fortran::parser::ConcurrentControl>>( + concurrentHeader.t); + for (const auto &control : controls) { + lowerbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx))); + upperbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx))); + if (const auto &expr = + std::get<std::optional<Fortran::parser::ScalarIntExpr>>( + control.t)) + steps.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*expr), stmtCtx))); + else // If `step` is not present, assume it is `1`. + steps.push_back(builder.createIntegerConstant( + currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); + + const auto &name = std::get<Fortran::parser::Name>(control.t); + privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs, + privateOperands, ivPrivate, privatizationRecipes, + isDoConcurrent); + + inclusiveBounds.push_back(true); + } + } else { + for (uint64_t i = 0; i < loopsToProcess; ++i) { + const Fortran::parser::LoopControl *loopControl; + if (i == 0) { + loopControl = &*outerDoConstruct.GetLoopControl(); + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(outerDoConstruct))); + } else { + auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>(); + assert(doCons && "expect do construct"); + loopControl = &*doCons->GetLoopControl(); + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(*doCons))); + } + + const Fortran::parser::LoopControl::Bounds *bounds = + std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u); + assert(bounds && "Expected bounds on the loop construct"); + lowerbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->lower), stmtCtx))); + upperbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->upper), stmtCtx))); + if (bounds->step) + steps.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->step), stmtCtx))); + else // If `step` is not present, assume it is `1`. 
+ steps.push_back(builder.createIntegerConstant( + currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); + + Fortran::semantics::Symbol &ivSym = + bounds->name.thing.symbol->GetUltimate(); + privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs, + privateOperands, ivPrivate, privatizationRecipes); + + inclusiveBounds.push_back(true); + + if (i < loopsToProcess - 1) + crtEval = &*std::next(crtEval->getNestedEvaluations().begin()); + } + } +} + +static mlir::acc::LoopOp +buildACCLoopOp(Fortran::lower::AbstractConverter &converter, + mlir::Location currentLocation, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::DoConstruct &outerDoConstruct, + Fortran::lower::pft::Evaluation &eval, + llvm::SmallVector<mlir::Value> &privateOperands, + llvm::SmallVector<mlir::Attribute> &privatizationRecipes, + llvm::SmallVector<mlir::Value> &gangOperands, + llvm::SmallVector<mlir::Value> &workerNumOperands, + llvm::SmallVector<mlir::Value> &vectorOperands, + llvm::SmallVector<mlir::Value> &tileOperands, + llvm::SmallVector<mlir::Value> &cacheOperands, + llvm::SmallVector<mlir::Value> &reductionOperands, + llvm::SmallVector<mlir::Type> &retTy, mlir::Value yieldValue, + uint64_t loopsToProcess) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + llvm::SmallVector<mlir::Value> ivPrivate; + llvm::SmallVector<mlir::Type> ivTypes; + llvm::SmallVector<mlir::Location> ivLocs; + llvm::SmallVector<bool> inclusiveBounds; + llvm::SmallVector<mlir::Location> locs; + llvm::SmallVector<mlir::Value> lowerbounds, upperbounds, steps; + + // Look at the do/do concurrent loops to extract bounds information. + processDoLoopBounds(converter, currentLocation, stmtCtx, builder, + outerDoConstruct, eval, lowerbounds, upperbounds, steps, + privateOperands, ivPrivate, privatizationRecipes, ivTypes, + ivLocs, inclusiveBounds, locs, loopsToProcess); + + // Prepare the operand segment size attribute and the operands value range. 
+ llvm::SmallVector<mlir::Value> operands; + llvm::SmallVector<int32_t> operandSegments; + addOperands(operands, operandSegments, lowerbounds); + addOperands(operands, operandSegments, upperbounds); + addOperands(operands, operandSegments, steps); + addOperands(operands, operandSegments, gangOperands); + addOperands(operands, operandSegments, workerNumOperands); + addOperands(operands, operandSegments, vectorOperands); + addOperands(operands, operandSegments, tileOperands); + addOperands(operands, operandSegments, cacheOperands); + addOperands(operands, operandSegments, privateOperands); + addOperands(operands, operandSegments, reductionOperands); + + auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>( + builder, builder.getFusedLoc(locs), currentLocation, eval, operands, + operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes, + ivLocs); + + for (auto [arg, value] : llvm::zip( + loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate)) + fir::StoreOp::create(builder, currentLocation, arg, value); + + loopOp.setInclusiveUpperbound(inclusiveBounds); + + return loopOp; +} + static mlir::acc::LoopOp createLoopOp( Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation, @@ -2154,9 +2317,9 @@ static mlir::acc::LoopOp createLoopOp( std::nullopt, bool needEarlyReturnHandling = false) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - llvm::SmallVector<mlir::Value> tileOperands, privateOperands, ivPrivate, + llvm::SmallVector<mlir::Value> tileOperands, privateOperands, reductionOperands, cacheOperands, vectorOperands, workerNumOperands, - gangOperands, lowerbounds, upperbounds, steps; + gangOperands; llvm::SmallVector<mlir::Attribute> privatizationRecipes, reductionRecipes; llvm::SmallVector<int32_t> tileOperandsSegments, gangOperandsSegments; llvm::SmallVector<int64_t> collapseValues; @@ -2325,107 +2488,6 @@ static mlir::acc::LoopOp createLoopOp( } } - llvm::SmallVector<mlir::Type> ivTypes; - llvm::SmallVector<mlir::Location> ivLocs; - llvm::SmallVector<bool> inclusiveBounds; - llvm::SmallVector<mlir::Location> locs; - locs.push_back(currentLocation); // Location of the directive - Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); - bool isDoConcurrent = outerDoConstruct.IsDoConcurrent(); - if (isDoConcurrent) { - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(outerDoConstruct))); - const Fortran::parser::LoopControl *loopControl = - &*outerDoConstruct.GetLoopControl(); - const auto &concurrent = - std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u); - if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t) - .empty()) - TODO(currentLocation, "DO CONCURRENT with locality spec"); - - const auto &concurrentHeader = - std::get<Fortran::parser::ConcurrentHeader>(concurrent.t); - const auto &controls = - std::get<std::list<Fortran::parser::ConcurrentControl>>( - concurrentHeader.t); - for (const auto &control : controls) { - lowerbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx))); - upperbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx))); - if (const auto &expr = - std::get<std::optional<Fortran::parser::ScalarIntExpr>>( - control.t)) - steps.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*expr), stmtCtx))); - else // If `step` is not present, assume it is `1`. 
- steps.push_back(builder.createIntegerConstant( - currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); - - const auto &name = std::get<Fortran::parser::Name>(control.t); - privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs, - privateOperands, ivPrivate, privatizationRecipes, - isDoConcurrent); - - inclusiveBounds.push_back(true); - } - } else { - int64_t loopCount = - Fortran::lower::getLoopCountForCollapseAndTile(accClauseList); - for (unsigned i = 0; i < loopCount; ++i) { - const Fortran::parser::LoopControl *loopControl; - if (i == 0) { - loopControl = &*outerDoConstruct.GetLoopControl(); - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(outerDoConstruct))); - } else { - auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>(); - assert(doCons && "expect do construct"); - loopControl = &*doCons->GetLoopControl(); - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(*doCons))); - } - - const Fortran::parser::LoopControl::Bounds *bounds = - std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u); - assert(bounds && "Expected bounds on the loop construct"); - lowerbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->lower), stmtCtx))); - upperbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->upper), stmtCtx))); - if (bounds->step) - steps.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->step), stmtCtx))); - else // If `step` is not present, assume it is `1`. - steps.push_back(builder.createIntegerConstant( - currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); - - Fortran::semantics::Symbol &ivSym = - bounds->name.thing.symbol->GetUltimate(); - privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs, - privateOperands, ivPrivate, privatizationRecipes); - - inclusiveBounds.push_back(true); - - if (i < loopCount - 1) - crtEval = &*std::next(crtEval->getNestedEvaluations().begin()); - } - } - - // Prepare the operand segment size attribute and the operands value range. 
- llvm::SmallVector<mlir::Value> operands; - llvm::SmallVector<int32_t> operandSegments; - addOperands(operands, operandSegments, lowerbounds); - addOperands(operands, operandSegments, upperbounds); - addOperands(operands, operandSegments, steps); - addOperands(operands, operandSegments, gangOperands); - addOperands(operands, operandSegments, workerNumOperands); - addOperands(operands, operandSegments, vectorOperands); - addOperands(operands, operandSegments, tileOperands); - addOperands(operands, operandSegments, cacheOperands); - addOperands(operands, operandSegments, privateOperands); - addOperands(operands, operandSegments, reductionOperands); - llvm::SmallVector<mlir::Type> retTy; mlir::Value yieldValue; if (needEarlyReturnHandling) { @@ -2434,16 +2496,13 @@ static mlir::acc::LoopOp createLoopOp( retTy.push_back(i1Ty); } - auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>( - builder, builder.getFusedLoc(locs), currentLocation, eval, operands, - operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes, - ivLocs); - - for (auto [arg, value] : llvm::zip( - loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate)) - fir::StoreOp::create(builder, currentLocation, arg, value); - - loopOp.setInclusiveUpperbound(inclusiveBounds); + uint64_t loopsToProcess = + Fortran::lower::getLoopCountForCollapseAndTile(accClauseList); + auto loopOp = buildACCLoopOp( + converter, currentLocation, semanticsContext, stmtCtx, outerDoConstruct, + eval, privateOperands, privatizationRecipes, gangOperands, + workerNumOperands, vectorOperands, tileOperands, cacheOperands, + reductionOperands, retTy, yieldValue, loopsToProcess); if (!gangDeviceTypes.empty()) loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes)); @@ -4899,6 +4958,12 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) { return false; } +bool Fortran::lower::isInsideOpenACCComputeConstruct( + fir::FirOpBuilder &builder) { + return mlir::isa_and_nonnull<ACC_COMPUTE_CONSTRUCT_OPS>( + mlir::acc::getEnclosingComputeOp(builder.getRegion())); +} + void Fortran::lower::setInsertionPointAfterOpenACCLoopIfInside( fir::FirOpBuilder &builder) { if (auto loopOp = @@ -4913,10 +4978,10 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder, mlir::acc::YieldOp::create(builder, loc, yieldValue); } -int64_t Fortran::lower::getLoopCountForCollapseAndTile( +uint64_t Fortran::lower::getLoopCountForCollapseAndTile( const Fortran::parser::AccClauseList &clauseList) { - int64_t collapseLoopCount = 1; - int64_t tileLoopCount = 1; + uint64_t collapseLoopCount = 1; + uint64_t tileLoopCount = 1; for (const Fortran::parser::AccClause &clause : clauseList.v) { if (const auto *collapseClause = std::get_if<Fortran::parser::AccClause::Collapse>(&clause.u)) { @@ -4935,3 +5000,101 @@ int64_t Fortran::lower::getLoopCountForCollapseAndTile( return tileLoopCount; return collapseLoopCount; } + +/// Create an ACC loop operation for a DO construct when inside ACC compute +/// constructs This serves as a bridge between regular DO construct handling and +/// ACC loop creation +mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct( + AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::SymMap &localSymbols, + const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval) { + // Only convert loops which have induction variables that need privatized. 
+ if (!doConstruct.IsDoNormal() && !doConstruct.IsDoConcurrent()) + return nullptr; + + // If the evaluation is unstructured, then we cannot convert the loop + // because acc loop does not have an unstructured form. + // TODO: There may be other strategies that can be employed such + // as generating acc.private for the loop variables without attaching + // them to acc.loop. + // For now - generate a not-yet-implemented message because without + // privatizing the induction variable, the loop may not execute correctly. + // Only do this for `acc kernels` because in `acc parallel`, scalars end + // up as implicitly firstprivate. + if (eval.lowerAsUnstructured()) { + if (mlir::isa_and_present<mlir::acc::KernelsOp>( + mlir::acc::getEnclosingComputeOp( + converter.getFirOpBuilder().getRegion()))) + TODO(converter.getCurrentLocation(), + "unstructured do loop in acc kernels"); + return nullptr; + } + + // Open up a new scope for the loop variables. + localSymbols.pushScope(); + auto scopeGuard = llvm::make_scope_exit([&]() { localSymbols.popScope(); }); + + // Prepare empty operand vectors since there are no associated `acc loop` + // clauses with the Fortran do loops being handled here. + llvm::SmallVector<mlir::Value> privateOperands, gangOperands, + workerNumOperands, vectorOperands, tileOperands, cacheOperands, + reductionOperands; + llvm::SmallVector<mlir::Attribute> privatizationRecipes; + llvm::SmallVector<mlir::Type> retTy; + mlir::Value yieldValue; + uint64_t loopsToProcess = 1; // Single loop construct + + // Use same mechanism that handles `acc loop` contained do loops to handle + // the implicit loop case. + Fortran::lower::StatementContext stmtCtx; + auto loopOp = buildACCLoopOp( + converter, converter.getCurrentLocation(), semanticsContext, stmtCtx, + doConstruct, eval, privateOperands, privatizationRecipes, gangOperands, + workerNumOperands, vectorOperands, tileOperands, cacheOperands, + reductionOperands, retTy, yieldValue, loopsToProcess); + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + if (!privatizationRecipes.empty()) + loopOp.setPrivatizationRecipesAttr(mlir::ArrayAttr::get( + converter.getFirOpBuilder().getContext(), privatizationRecipes)); + + // Normal do loops which are not annotated with `acc loop` should be + // left for analysis by marking with `auto`. This is the case even in the case + // of `acc parallel` region because the normal rules of applying `independent` + // is only for loops marked with `acc loop`. + // For do concurrent loops, the spec says in section 2.17.2: + // "When do concurrent appears without a loop construct in a kernels construct + // it is treated as if it is annotated with loop auto. If it appears in a + // parallel construct or an accelerator routine then it is treated as if it is + // annotated with loop independent." + // So this means that in all cases we mark with `auto` unless it is a + // `do concurrent` in an `acc parallel` construct or it must be `seq` because + // it is in an `acc serial` construct. + mlir::Operation *accRegionOp = + mlir::acc::getEnclosingComputeOp(converter.getFirOpBuilder().getRegion()); + mlir::acc::LoopParMode parMode = + mlir::isa_and_present<mlir::acc::ParallelOp>(accRegionOp) && + doConstruct.IsDoConcurrent() + ? mlir::acc::LoopParMode::loop_independent + : mlir::isa_and_present<mlir::acc::SerialOp>(accRegionOp) + ? 
mlir::acc::LoopParMode::loop_seq + : mlir::acc::LoopParMode::loop_auto; + + // Set the parallel mode based on the computed parMode + auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None); + auto arrOfDeviceNone = + mlir::ArrayAttr::get(builder.getContext(), deviceNoneAttr); + if (parMode == mlir::acc::LoopParMode::loop_independent) { + loopOp.setIndependentAttr(arrOfDeviceNone); + } else if (parMode == mlir::acc::LoopParMode::loop_seq) { + loopOp.setSeqAttr(arrOfDeviceNone); + } else if (parMode == mlir::acc::LoopParMode::loop_auto) { + loopOp.setAuto_Attr(arrOfDeviceNone); + } else { + llvm_unreachable("Unexpected loop par mode"); + } + + return loopOp; +} diff --git a/flang/lib/Lower/OpenMP/Atomic.cpp b/flang/lib/Lower/OpenMP/Atomic.cpp index 9a233d2..d4f83f5 100644 --- a/flang/lib/Lower/OpenMP/Atomic.cpp +++ b/flang/lib/Lower/OpenMP/Atomic.cpp @@ -635,9 +635,16 @@ genAtomicUpdate(lower::AbstractConverter &converter, } } + mlir::ModuleOp module = builder.getModule(); + mlir::omp::AtomicControlAttr atomicControlAttr = + mlir::omp::AtomicControlAttr::get( + builder.getContext(), fir::getAtomicIgnoreDenormalMode(module), + fir::getAtomicFineGrainedMemory(module), + fir::getAtomicRemoteMemory(module)); builder.restoreInsertionPoint(atomicAt); auto updateOp = mlir::omp::AtomicUpdateOp::create( - builder, loc, atomAddr, hint, makeMemOrderAttr(converter, memOrder)); + builder, loc, atomAddr, atomicControlAttr, hint, + makeMemOrderAttr(converter, memOrder)); mlir::Region ®ion = updateOp->getRegion(0); mlir::Block *block = builder.createBlock(®ion, {}, {atomType}, {loc}); diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 12089d6..6a4ec77 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -697,20 +697,16 @@ static void threadPrivatizeVars(lower::AbstractConverter &converter, } } -static mlir::Operation * -createAndSetPrivatizedLoopVar(lower::AbstractConverter &converter, - mlir::Location loc, mlir::Value indexVal, - const semantics::Symbol *sym) { +static mlir::Operation *setLoopVar(lower::AbstractConverter &converter, + mlir::Location loc, mlir::Value indexVal, + const semantics::Symbol *sym) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::OpBuilder::InsertPoint insPt = firOpBuilder.saveInsertionPoint(); firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); - mlir::Type tempTy = converter.genType(*sym); - - assert(converter.isPresentShallowLookup(*sym) && - "Expected symbol to be in symbol table."); - firOpBuilder.restoreInsertionPoint(insPt); + mlir::Value cvtVal = firOpBuilder.createConvert(loc, tempTy, indexVal); hlfir::Entity lhs{converter.getSymbolAddress(*sym)}; @@ -721,6 +717,15 @@ createAndSetPrivatizedLoopVar(lower::AbstractConverter &converter, return storeOp; } +static mlir::Operation * +createAndSetPrivatizedLoopVar(lower::AbstractConverter &converter, + mlir::Location loc, mlir::Value indexVal, + const semantics::Symbol *sym) { + assert(converter.isPresentShallowLookup(*sym) && + "Expected symbol to be in symbol table."); + return setLoopVar(converter, loc, indexVal, sym); +} + // This helper function implements the functionality of "promoting" non-CPTR // arguments of use_device_ptr to use_device_addr arguments (automagic // conversion of use_device_ptr -> use_device_addr in these cases). 
The way we @@ -1123,6 +1128,11 @@ struct OpWithBodyGenInfo { return *this; } + OpWithBodyGenInfo &setPrivatize(bool value) { + privatize = value; + return *this; + } + /// [inout] converter to use for the clauses. lower::AbstractConverter &converter; /// [in] Symbol table @@ -1149,6 +1159,8 @@ struct OpWithBodyGenInfo { /// [in] if set to `true`, skip generating nested evaluations and dispatching /// any further leaf constructs. bool genSkeletonOnly = false; + /// [in] enables handling of privatized variable unless set to `false`. + bool privatize = true; }; /// Create the body (block) for an OpenMP Operation. @@ -1209,7 +1221,7 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info, // code will use the right symbols. bool isLoop = llvm::omp::getDirectiveAssociation(info.dir) == llvm::omp::Association::Loop; - bool privatize = info.clauses; + bool privatize = info.clauses && info.privatize; firOpBuilder.setInsertionPoint(marker); std::optional<DataSharingProcessor> tempDsp; @@ -2083,7 +2095,7 @@ genCanonicalLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable, const ConstructQueue &queue, ConstructQueue::const_iterator item, llvm::ArrayRef<const semantics::Symbol *> ivs, - llvm::omp::Directive directive, DataSharingProcessor &dsp) { + llvm::omp::Directive directive) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); assert(ivs.size() == 1 && "Nested loops not yet implemented"); @@ -2176,10 +2188,8 @@ genCanonicalLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::Value userVal = firOpBuilder.create<mlir::arith::AddIOp>(loc, loopLBVar, scaled); - // The argument is not currently in memory, so make a temporary for the - // argument, and store it there, then bind that location to the argument. - mlir::Operation *storeOp = - createAndSetPrivatizedLoopVar(converter, loc, userVal, iv); + // Write loop value to loop variable + mlir::Operation *storeOp = setLoopVar(converter, loc, userVal, iv); firOpBuilder.setInsertionPointAfter(storeOp); return {iv}; @@ -2190,7 +2200,7 @@ genCanonicalLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable, OpWithBodyGenInfo(converter, symTable, semaCtx, loc, nestedEval, directive) .setClauses(&item->clauses) - .setDataSharingProcessor(&dsp) + .setPrivatize(false) .setGenRegionEntryCb(ivCallback), queue, item, tripcount, cli); @@ -2216,17 +2226,10 @@ static void genUnrollOp(Fortran::lower::AbstractConverter &converter, cp.processTODO<clause::Partial, clause::Full>( loc, llvm::omp::Directive::OMPD_unroll); - // Even though unroll does not support data-sharing clauses, but this is - // required to fill the symbol table. 
- DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, - /*shouldCollectPreDeterminedSymbols=*/true, - /*useDelayedPrivatization=*/false, symTable); - dsp.processStep1(); - // Emit the associated loop auto canonLoop = genCanonicalLoopOp(converter, symTable, semaCtx, eval, loc, queue, item, - iv, llvm::omp::Directive::OMPD_unroll, dsp); + iv, llvm::omp::Directive::OMPD_unroll); // Apply unrolling to it auto cli = canonLoop.getCli(); diff --git a/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp b/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp index 01c0be6..c2e0afe1 100644 --- a/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp +++ b/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp @@ -88,6 +88,57 @@ void fir::setTuneCPU(mlir::ModuleOp mod, llvm::StringRef cpu) { mod->setAttr(tuneCpuName, mlir::StringAttr::get(ctx, cpu)); } +static constexpr const char *atomicIgnoreDenormalModeName = + "fir.atomic_ignore_denormal_mode"; + +void fir::setAtomicIgnoreDenormalMode(mlir::ModuleOp mod, bool value) { + if (value) { + auto *ctx = mod.getContext(); + mod->setAttr(atomicIgnoreDenormalModeName, mlir::UnitAttr::get(ctx)); + } else { + if (mod->hasAttr(atomicIgnoreDenormalModeName)) + mod->removeAttr(atomicIgnoreDenormalModeName); + } +} + +bool fir::getAtomicIgnoreDenormalMode(mlir::ModuleOp mod) { + return mod->hasAttr(atomicIgnoreDenormalModeName); +} + +static constexpr const char *atomicFineGrainedMemoryName = + "fir.atomic_fine_grained_memory"; + +void fir::setAtomicFineGrainedMemory(mlir::ModuleOp mod, bool value) { + if (value) { + auto *ctx = mod.getContext(); + mod->setAttr(atomicFineGrainedMemoryName, mlir::UnitAttr::get(ctx)); + } else { + if (mod->hasAttr(atomicFineGrainedMemoryName)) + mod->removeAttr(atomicFineGrainedMemoryName); + } +} + +bool fir::getAtomicFineGrainedMemory(mlir::ModuleOp mod) { + return mod->hasAttr(atomicFineGrainedMemoryName); +} + +static constexpr const char *atomicRemoteMemoryName = + "fir.atomic_remote_memory"; + +void fir::setAtomicRemoteMemory(mlir::ModuleOp mod, bool value) { + if (value) { + auto *ctx = mod.getContext(); + mod->setAttr(atomicRemoteMemoryName, mlir::UnitAttr::get(ctx)); + } else { + if (mod->hasAttr(atomicRemoteMemoryName)) + mod->removeAttr(atomicRemoteMemoryName); + } +} + +bool fir::getAtomicRemoteMemory(mlir::ModuleOp mod) { + return mod->hasAttr(atomicRemoteMemoryName); +} + llvm::StringRef fir::getTuneCPU(mlir::ModuleOp mod) { if (auto attr = mod->getAttrOfType<mlir::StringAttr>(tuneCpuName)) return attr.getValue(); diff --git a/flang/lib/Optimizer/Support/CMakeLists.txt b/flang/lib/Optimizer/Support/CMakeLists.txt index 7ccdd4f..38038e1 100644 --- a/flang/lib/Optimizer/Support/CMakeLists.txt +++ b/flang/lib/Optimizer/Support/CMakeLists.txt @@ -1,6 +1,3 @@ -get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) -get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) - add_flang_library(FIRSupport DataLayout.cpp InitFIR.cpp @@ -23,12 +20,12 @@ add_flang_library(FIRSupport ${extension_libs} MLIR_LIBS - ${dialect_libs} - ${extension_libs} MLIRBuiltinToLLVMIRTranslation + MLIRLLVMToLLVMIRTranslation MLIROpenACCToLLVMIRTranslation MLIROpenMPToLLVMIRTranslation - MLIRLLVMToLLVMIRTranslation + MLIRRegisterAllDialects + MLIRRegisterAllExtensions MLIRTargetLLVMIRExport MLIRTargetLLVMIRImport ) diff --git a/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 new file mode 100644 index 0000000..aa1d443 --- /dev/null +++ 
b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 @@ -0,0 +1,91 @@ +! RUN: split-file %s %t +! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_stop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK1 +! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_cycle_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK2 +! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_goto_loop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK3 +! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_loop_with_inner_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK4 + +//--- do_loop_with_stop.f90 + +subroutine do_loop_with_stop() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + !$acc kernels + do i = 1, n + a(i) = b(i) + 1.0 + if (i == 5) stop + end do + !$acc end kernels + +! CHECK1: not yet implemented: unstructured do loop in acc kernels + +end subroutine + +//--- do_loop_with_cycle_goto.f90 + +subroutine do_loop_with_cycle_goto() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Do loop with cycle and goto - unstructured control flow is not converted. + !$acc kernels + do i = 1, n + if (i == 3) cycle + a(i) = b(i) + 1.0 + if (i == 7) goto 200 + a(i) = a(i) * 2.0 + end do +200 continue + !$acc end kernels + +! CHECK2: not yet implemented: unstructured do loop in acc kernels + +end subroutine + +//--- nested_goto_loop.f90 + +subroutine nested_goto_loop() + integer :: i, j + integer, parameter :: n = 10, m = 5 + real, dimension(n,m) :: a, b + + ! Nested loop with goto from inner to outer - should NOT convert to acc.loop + !$acc kernels + do i = 1, n + do j = 1, m + a(i,j) = b(i,j) + 1.0 + if (i * j > 20) goto 300 ! Exit both loops + end do + end do +300 continue + !$acc end kernels + +! CHECK3: not yet implemented: unstructured do loop in acc kernels + +end subroutine + +//--- nested_loop_with_inner_goto.f90 + +subroutine nested_loop_with_inner_goto() + integer :: ii = 0, jj = 0 + integer, parameter :: nn = 3 + real, dimension(nn, nn) :: aa + + aa = -1 + + ! Nested loop with goto from inner loop - unstructured control flow is not converted. + !$acc kernels + do ii = 1, nn + do jj = 1, nn + if (jj > 1) goto 300 + aa(jj, ii) = 1337 + end do + 300 continue + end do + !$acc end kernels + +! CHECK4: not yet implemented: unstructured do loop in acc kernels + +end subroutine
\ No newline at end of file diff --git a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 new file mode 100644 index 0000000..5f8ea03 --- /dev/null +++ b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 @@ -0,0 +1,267 @@ +! This test checks lowering of Fortran do loops and do concurrent loops to OpenACC loop constructs. +! Tests the new functionality that converts Fortran iteration constructs to acc.loop with proper IV handling. + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s + +! CHECK-LABEL: func.func @_QPbasic_do_loop +subroutine basic_do_loop() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do loop that should be converted to acc.loop + !$acc kernels + do i = 1, n + a(i) = b(i) + 1.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_concurrent +subroutine basic_do_concurrent() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do concurrent loop + !$acc kernels + do concurrent (i = 1:n) + a(i) = b(i) + 1.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_loop_parallel +subroutine basic_do_loop_parallel() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do loop with acc parallel that should be converted to acc.loop + !$acc parallel + do i = 1, n + a(i) = b(i) + 1.0 + end do + !$acc end parallel + +! CHECK: acc.parallel { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_loop_serial +subroutine basic_do_loop_serial() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do loop with acc serial that should be converted to acc.loop + !$acc serial + do i = 1, n + a(i) = b(i) + 1.0 + end do + !$acc end serial + +! CHECK: acc.serial { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_concurrent_parallel +subroutine basic_do_concurrent_parallel() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do concurrent loop with acc parallel + !$acc parallel + do concurrent (i = 1:n) + a(i) = b(i) + 1.0 + end do + !$acc end parallel + +! CHECK: acc.parallel { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + +end subroutine + +! 
CHECK-LABEL: func.func @_QPbasic_do_concurrent_serial +subroutine basic_do_concurrent_serial() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do concurrent loop with acc serial + !$acc serial + do concurrent (i = 1:n) + a(i) = b(i) + 1.0 + end do + !$acc end serial + +! CHECK: acc.serial { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]} + +end subroutine + +! CHECK-LABEL: func.func @_QPmulti_dimension_do_concurrent +subroutine multi_dimension_do_concurrent() + integer :: i, j, k + integer, parameter :: n = 10, m = 20, l = 5 + real, dimension(n,m,l) :: a, b + + ! Multi-dimensional do concurrent with multiple iteration variables + !$acc kernels + do concurrent (i = 1:n, j = 1:m, k = 1:l) + a(i,j,k) = b(i,j,k) * 2.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>} +end subroutine + + +! CHECK-LABEL: func.func @_QPnested_do_loops +subroutine nested_do_loops() + integer :: i, j + integer, parameter :: n = 10, m = 20 + real, dimension(n,m) :: a, b + + ! Nested do loops + !$acc kernels + do i = 1, n + do j = 1, m + a(i,j) = b(i,j) + i + j + end do + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>} +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>} + +end subroutine + +! CHECK-LABEL: func.func @_QPvariable_bounds_and_step +subroutine variable_bounds_and_step(n, start_val, step_val) + integer, intent(in) :: n, start_val, step_val + integer :: i + real, dimension(n) :: a, b + + ! Do loop with variable bounds and step + !$acc kernels + do i = start_val, n, step_val + a(i) = b(i) * 2.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>} + +end subroutine + +! CHECK-LABEL: func.func @_QPdifferent_iv_types +subroutine different_iv_types() + integer(kind=8) :: i8 + integer(kind=4) :: i4 + integer(kind=2) :: i2 + integer, parameter :: n = 10 + real, dimension(n) :: a, b, c, d + + ! Test different iteration variable types + !$acc kernels + do i8 = 1_8, int(n,8) + a(i8) = b(i8) + 1.0 + end do + !$acc end kernels + + !$acc kernels + do i4 = 1, n + b(i4) = c(i4) + 1.0 + end do + !$acc end kernels + + !$acc kernels + do i2 = 1_2, int(n,2) + c(i2) = d(i2) + 1.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64) +! CHECK: acc.kernels { +! 
CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16) + +end subroutine + +! ----------------------------------------------------------------------------------------- +! Tests for loops that should NOT be converted to acc.loop due to unstructured control flow + +! CHECK-LABEL: func.func @_QPinfinite_loop_no_iv +subroutine infinite_loop_no_iv() + integer :: i + logical :: condition + + ! Infinite loop with no induction variable - should NOT convert to acc.loop + !$acc kernels + do + i = i + 1 + if (i > 100) exit + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK-NOT: acc.loop + +end subroutine + +! CHECK-LABEL: func.func @_QPwhile_like_loop +subroutine while_like_loop() + integer :: i + logical :: condition + + i = 1 + condition = .true. + + ! While-like infinite loop - should NOT convert to acc.loop + !$acc kernels + do while (condition) + i = i + 1 + if (i > 100) condition = .false. + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK-NOT: acc.loop + +end subroutine diff --git a/flang/test/Lower/OpenMP/atomic-control-options.f90 b/flang/test/Lower/OpenMP/atomic-control-options.f90 new file mode 100644 index 0000000..407f83b --- /dev/null +++ b/flang/test/Lower/OpenMP/atomic-control-options.f90 @@ -0,0 +1,37 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-device -munsafe-fp-atomics %s -o - | FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-device -fatomic-ignore-denormal-mode %s -o - | FileCheck -check-prefix=IGNORE-DENORMAL %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-device -fatomic-fine-grained-memory %s -o - | FileCheck -check-prefix=FINE-GRAINED-MEMORY %s +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-device -fatomic-remote-memory %s -o - | FileCheck -check-prefix=REMOTE-MEMORY %s +program test + implicit none + integer :: A, B, threads + threads = 128 + A = 0 + B = 0 + !UNSAFE-FP-ATOMICS: omp.atomic.update %{{.*}} : !fir.ref<i32> { + !UNSAFE-FP-ATOMICS: } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>} + !IGNORE-DENORMAL: omp.atomic.update %{{.*}} : !fir.ref<i32> { + !IGNORE-DENORMAL: } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>} + !FINE-GRAINED-MEMORY: omp.atomic.update %{{.*}} : !fir.ref<i32> { + !FINE-GRAINED-MEMORY: } {atomic_control = #omp.atomic_control<fine_grained_memory = true>} + !REMOTE-MEMORY: omp.atomic.update %{{.*}} : !fir.ref<i32> { + !REMOTE-MEMORY: } {atomic_control = #omp.atomic_control<remote_memory = true>} + !$omp target parallel num_threads(threads) + !$omp atomic + A = A + 1 + !$omp end target parallel + !UNSAFE-FP-ATOMICS: omp.atomic.update %{{.*}} : !fir.ref<i32> { + !UNSAFE-FP-ATOMICS: } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>} + !IGNORE-DENORMAL: omp.atomic.update %{{.*}} : !fir.ref<i32> { + !IGNORE-DENORMAL: } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>} + !FINE-GRAINED-MEMORY: omp.atomic.update %{{.*}} : !fir.ref<i32> { + !FINE-GRAINED-MEMORY: } {atomic_control = #omp.atomic_control<fine_grained_memory = true>} + !REMOTE-MEMORY: omp.atomic.update %{{.*}} : !fir.ref<i32> { + !REMOTE-MEMORY: } {atomic_control = #omp.atomic_control<remote_memory = true>} + !$omp target parallel num_threads(threads) + !$omp atomic capture + A = A + B + B = A + !$omp end atomic + !$omp end target parallel +end program test diff --git a/flang/test/Lower/OpenMP/unroll-heuristic01.f90 b/flang/test/Lower/OpenMP/unroll-heuristic01.f90 index a5f5c00..34020eb 100644 --- a/flang/test/Lower/OpenMP/unroll-heuristic01.f90 +++ b/flang/test/Lower/OpenMP/unroll-heuristic01.f90 @@ -13,27 +13,42 @@ subroutine omp_unroll_heuristic01(lb, ub, inc) end subroutine omp_unroll_heuristic01 -!CHECK-LABEL: func.func @_QPomp_unroll_heuristic01( -!CHECK: %c0_i32 = arith.constant 0 : i32 -!CHECK-NEXT: %c1_i32 = arith.constant 1 : i32 -!CHECK-NEXT: %13 = arith.cmpi slt, %12, %c0_i32 : i32 -!CHECK-NEXT: %14 = arith.subi %c0_i32, %12 : i32 -!CHECK-NEXT: %15 = arith.select %13, %14, %12 : i32 -!CHECK-NEXT: %16 = arith.select %13, %11, %10 : i32 -!CHECK-NEXT: %17 = arith.select %13, %10, %11 : i32 -!CHECK-NEXT: %18 = arith.subi %17, %16 overflow<nuw> : i32 -!CHECK-NEXT: %19 = arith.divui %18, %15 : i32 -!CHECK-NEXT: %20 = arith.addi %19, %c1_i32 overflow<nuw> : i32 -!CHECK-NEXT: %21 = arith.cmpi slt, %17, %16 : i32 -!CHECK-NEXT: %22 = arith.select %21, %c0_i32, %20 : i32 -!CHECK-NEXT: %canonloop_s0 = omp.new_cli -!CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%22) { -!CHECK-NEXT: %23 = arith.muli %iv, %12 : i32 -!CHECK-NEXT: %24 = arith.addi %10, %23 : i32 -!CHECK-NEXT: hlfir.assign %24 to %9#0 : i32, !fir.ref<i32> -!CHECK-NEXT: %25 = fir.load %9#0 : !fir.ref<i32> -!CHECK-NEXT: hlfir.assign %25 to %6#0 : i32, !fir.ref<i32> -!CHECK-NEXT: omp.terminator -!CHECK-NEXT: } -!CHECK-NEXT: omp.unroll_heuristic(%canonloop_s0) -!CHECK-NEXT: return +! CHECK-LABEL: func.func @_QPomp_unroll_heuristic01( +! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "lb"}, +! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "ub"}, +! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "inc"}) { +! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope +! 
CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic01Ei"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic01Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic01Eres"} +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_unroll_heuristic01Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32> +! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32> +! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> +! CHECK: %[[VAL_11:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_10]], %[[VAL_11]] : i32 +! CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_11]], %[[VAL_10]] : i32 +! CHECK: %[[VAL_15:.*]] = arith.select %[[VAL_13]], %[[VAL_14]], %[[VAL_10]] : i32 +! CHECK: %[[VAL_16:.*]] = arith.select %[[VAL_13]], %[[VAL_9]], %[[VAL_8]] : i32 +! CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_13]], %[[VAL_8]], %[[VAL_9]] : i32 +! CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_17]], %[[VAL_16]] overflow<nuw> : i32 +! CHECK: %[[VAL_19:.*]] = arith.divui %[[VAL_18]], %[[VAL_15]] : i32 +! CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_12]] overflow<nuw> : i32 +! CHECK: %[[VAL_21:.*]] = arith.cmpi slt, %[[VAL_17]], %[[VAL_16]] : i32 +! CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_21]], %[[VAL_11]], %[[VAL_20]] : i32 +! CHECK: %[[VAL_23:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[VAL_23]]) %[[VAL_24:.*]] : i32 in range(%[[VAL_22]]) { +! CHECK: %[[VAL_25:.*]] = arith.muli %[[VAL_24]], %[[VAL_10]] : i32 +! CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_8]], %[[VAL_25]] : i32 +! CHECK: hlfir.assign %[[VAL_26]] to %[[VAL_2]]#0 : i32, !fir.ref<i32> +! CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> +! CHECK: hlfir.assign %[[VAL_27]] to %[[VAL_6]]#0 : i32, !fir.ref<i32> +! CHECK: omp.terminator +! CHECK: } +! CHECK: omp.unroll_heuristic(%[[VAL_23]]) +! CHECK: return +! CHECK: }
\ No newline at end of file
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic02.f90 b/flang/test/Lower/OpenMP/unroll-heuristic02.f90
index 14f694d..fdb1366 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic02.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic02.f90
@@ -37,61 +37,55 @@ end subroutine omp_unroll_heuristic_nested02
!CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic_nested02Eres"}
!CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ei"}
-!CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_15:.*]] = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ej"}
-!CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_20:.*]] = arith.constant 0 : i32
-!CHECK: %[[VAL_21:.*]] = arith.constant 1 : i32
-!CHECK: %[[VAL_22:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_20]] : i32
-!CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_20]], %[[VAL_19]] : i32
-!CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_19]] : i32
-!CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_17]] : i32
-!CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_22]], %[[VAL_17]], %[[VAL_18]] : i32
-!CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_26]], %[[VAL_25]] overflow<nuw> : i32
-!CHECK: %[[VAL_28:.*]] = arith.divui %[[VAL_27]], %[[VAL_24]] : i32
-!CHECK: %[[VAL_29:.*]] = arith.addi %[[VAL_28]], %[[VAL_21]] overflow<nuw> : i32
-!CHECK: %[[VAL_30:.*]] = arith.cmpi slt, %[[VAL_26]], %[[VAL_25]] : i32
-!CHECK: %[[VAL_31:.*]] = arith.select %[[VAL_30]], %[[VAL_20]], %[[VAL_29]] : i32
-!CHECK: %[[VAL_32:.*]] = omp.new_cli
-!CHECK: omp.canonical_loop(%[[VAL_32]]) %[[VAL_33:.*]] : i32 in range(%[[VAL_31]]) {
-!CHECK: %[[VAL_34:.*]] = arith.muli %[[VAL_33]], %[[VAL_19]] : i32
-!CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_17]], %[[VAL_34]] : i32
-!CHECK: hlfir.assign %[[VAL_35]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
-!CHECK: %[[VAL_36:.*]] = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ej"}
-!CHECK: %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_36]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_41:.*]] = arith.constant 0 : i32
-!CHECK: %[[VAL_42:.*]] = arith.constant 1 : i32
-!CHECK: %[[VAL_43:.*]] = arith.cmpi slt, %[[VAL_40]], %[[VAL_41]] : i32
-!CHECK: %[[VAL_44:.*]] = arith.subi %[[VAL_41]], %[[VAL_40]] : i32
-!CHECK: %[[VAL_45:.*]] = arith.select %[[VAL_43]], %[[VAL_44]], %[[VAL_40]] : i32
-!CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_43]], %[[VAL_39]], %[[VAL_38]] : i32
-!CHECK: %[[VAL_47:.*]] = arith.select %[[VAL_43]], %[[VAL_38]], %[[VAL_39]] : i32
-!CHECK: %[[VAL_48:.*]] = arith.subi %[[VAL_47]], %[[VAL_46]] overflow<nuw> : i32
-!CHECK: %[[VAL_49:.*]] = arith.divui %[[VAL_48]], %[[VAL_45]] : i32
-!CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_49]], %[[VAL_42]] overflow<nuw> : i32
-!CHECK: %[[VAL_51:.*]] = arith.cmpi slt, %[[VAL_47]], %[[VAL_46]] : i32
-!CHECK: %[[VAL_52:.*]] = arith.select %[[VAL_51]], %[[VAL_41]], %[[VAL_50]] : i32
-!CHECK: %[[VAL_53:.*]] = omp.new_cli
-!CHECK: omp.canonical_loop(%[[VAL_53]]) %[[VAL_54:.*]] : i32 in range(%[[VAL_52]]) {
-!CHECK: %[[VAL_55:.*]] = arith.muli %[[VAL_54]], %[[VAL_40]] : i32
-!CHECK: %[[VAL_56:.*]] = arith.addi %[[VAL_38]], %[[VAL_55]] : i32
-!CHECK: hlfir.assign %[[VAL_56]] to %[[VAL_37]]#0 : i32, !fir.ref<i32>
-!CHECK: %[[VAL_57:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_58:.*]] = fir.load %[[VAL_37]]#0 : !fir.ref<i32>
-!CHECK: %[[VAL_59:.*]] = arith.addi %[[VAL_57]], %[[VAL_58]] : i32
-!CHECK: hlfir.assign %[[VAL_59]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_16:.*]] = arith.constant 0 : i32
+!CHECK: %[[VAL_17:.*]] = arith.constant 1 : i32
+!CHECK: %[[VAL_18:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : i32
+!CHECK: %[[VAL_19:.*]] = arith.subi %[[VAL_16]], %[[VAL_15]] : i32
+!CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_18]], %[[VAL_19]], %[[VAL_15]] : i32
+!CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_18]], %[[VAL_14]], %[[VAL_13]] : i32
+!CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_18]], %[[VAL_13]], %[[VAL_14]] : i32
+!CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_22]], %[[VAL_21]] overflow<nuw> : i32
+!CHECK: %[[VAL_24:.*]] = arith.divui %[[VAL_23]], %[[VAL_20]] : i32
+!CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_17]] overflow<nuw> : i32
+!CHECK: %[[VAL_26:.*]] = arith.cmpi slt, %[[VAL_22]], %[[VAL_21]] : i32
+!CHECK: %[[VAL_27:.*]] = arith.select %[[VAL_26]], %[[VAL_16]], %[[VAL_25]] : i32
+!CHECK: %[[VAL_28:.*]] = omp.new_cli
+!CHECK: omp.canonical_loop(%[[VAL_28]]) %[[VAL_29:.*]] : i32 in range(%[[VAL_27]]) {
+!CHECK: %[[VAL_30:.*]] = arith.muli %[[VAL_29]], %[[VAL_15]] : i32
+!CHECK: %[[VAL_31:.*]] = arith.addi %[[VAL_13]], %[[VAL_30]] : i32
+!CHECK: hlfir.assign %[[VAL_31]] to %[[VAL_2]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[VAL_32:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_33:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_34:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_35:.*]] = arith.constant 0 : i32
+!CHECK: %[[VAL_36:.*]] = arith.constant 1 : i32
+!CHECK: %[[VAL_37:.*]] = arith.cmpi slt, %[[VAL_34]], %[[VAL_35]] : i32
+!CHECK: %[[VAL_38:.*]] = arith.subi %[[VAL_35]], %[[VAL_34]] : i32
+!CHECK: %[[VAL_39:.*]] = arith.select %[[VAL_37]], %[[VAL_38]], %[[VAL_34]] : i32
+!CHECK: %[[VAL_40:.*]] = arith.select %[[VAL_37]], %[[VAL_33]], %[[VAL_32]] : i32
+!CHECK: %[[VAL_41:.*]] = arith.select %[[VAL_37]], %[[VAL_32]], %[[VAL_33]] : i32
+!CHECK: %[[VAL_42:.*]] = arith.subi %[[VAL_41]], %[[VAL_40]] overflow<nuw> : i32
+!CHECK: %[[VAL_43:.*]] = arith.divui %[[VAL_42]], %[[VAL_39]] : i32
+!CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_43]], %[[VAL_36]] overflow<nuw> : i32
+!CHECK: %[[VAL_45:.*]] = arith.cmpi slt, %[[VAL_41]], %[[VAL_40]] : i32
+!CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_45]], %[[VAL_35]], %[[VAL_44]] : i32
+!CHECK: %[[VAL_47:.*]] = omp.new_cli
+!CHECK: omp.canonical_loop(%[[VAL_47]]) %[[VAL_48:.*]] : i32 in range(%[[VAL_46]]) {
+!CHECK: %[[VAL_49:.*]] = arith.muli %[[VAL_48]], %[[VAL_34]] : i32
+!CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_32]], %[[VAL_49]] : i32
+!CHECK: hlfir.assign %[[VAL_50]] to %[[VAL_7]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[VAL_51:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_52:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+!CHECK: %[[VAL_53:.*]] = arith.addi %[[VAL_51]], %[[VAL_52]] : i32
+!CHECK: hlfir.assign %[[VAL_53]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
!CHECK: omp.terminator
!CHECK: }
-!CHECK: omp.unroll_heuristic(%[[VAL_53]])
+!CHECK: omp.unroll_heuristic(%[[VAL_47]])
!CHECK: omp.terminator
!CHECK: }
-!CHECK: omp.unroll_heuristic(%[[VAL_32]])
+!CHECK: omp.unroll_heuristic(%[[VAL_28]])
!CHECK: return
!CHECK: }
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic03.f90 b/flang/test/Lower/OpenMP/unroll-heuristic03.f90
new file mode 100644
index 0000000..308c149
--- /dev/null
+++ b/flang/test/Lower/OpenMP/unroll-heuristic03.f90
@@ -0,0 +1,61 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
+
+! Test implicitly privatized loop variable that is affected by unrolling.
+
+subroutine omp_unroll_heuristic03(lb, ub, inc)
+ integer res, i, lb, ub, inc
+
+ !$omp parallel
+ !$omp unroll
+ do i = lb, ub, inc
+ res = i
+ end do
+ !$omp end unroll
+ !$omp end parallel
+
+end subroutine omp_unroll_heuristic03
+
+
+! CHECK-LABEL: func.func @_QPomp_unroll_heuristic03(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "lb"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "ub"},
+! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "inc"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic03Ei"}
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic03Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic03Eres"}
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_unroll_heuristic03Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: omp.parallel private(@_QFomp_unroll_heuristic03Ei_private_i32 %[[VAL_2]]#0 -> %[[VAL_8:.*]] : !fir.ref<i32>) {
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFomp_unroll_heuristic03Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_13:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_15:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_13]] : i32
+! CHECK: %[[VAL_16:.*]] = arith.subi %[[VAL_13]], %[[VAL_12]] : i32
+! CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_15]], %[[VAL_16]], %[[VAL_12]] : i32
+! CHECK: %[[VAL_18:.*]] = arith.select %[[VAL_15]], %[[VAL_11]], %[[VAL_10]] : i32
+! CHECK: %[[VAL_19:.*]] = arith.select %[[VAL_15]], %[[VAL_10]], %[[VAL_11]] : i32
+! CHECK: %[[VAL_20:.*]] = arith.subi %[[VAL_19]], %[[VAL_18]] overflow<nuw> : i32
+! CHECK: %[[VAL_21:.*]] = arith.divui %[[VAL_20]], %[[VAL_17]] : i32
+! CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_14]] overflow<nuw> : i32
+! CHECK: %[[VAL_23:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_18]] : i32
+! CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_23]], %[[VAL_13]], %[[VAL_22]] : i32
+! CHECK: %[[VAL_25:.*]] = omp.new_cli
+! CHECK: omp.canonical_loop(%[[VAL_25]]) %[[VAL_26:.*]] : i32 in range(%[[VAL_24]]) {
+! CHECK: %[[VAL_27:.*]] = arith.muli %[[VAL_26]], %[[VAL_12]] : i32
+! CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_10]], %[[VAL_27]] : i32
+! CHECK: hlfir.assign %[[VAL_28]] to %[[VAL_9]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[VAL_29:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
+! CHECK: hlfir.assign %[[VAL_29]] to %[[VAL_6]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: omp.unroll_heuristic(%[[VAL_25]])
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
\ No newline at end of file