//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/OpenMP/Utils.h" #include "flang/Support/OpenMP-utils.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/IRMapping.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/RegionUtils.h" namespace flangomp { #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS #include "flang/Optimizer/OpenMP/Passes.h.inc" } // namespace flangomp #define DEBUG_TYPE "do-concurrent-conversion" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ") namespace { namespace looputils { /// Stores info needed about the induction/iteration variable for each `do /// concurrent` in a loop nest. struct InductionVariableInfo { InductionVariableInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) { populateInfo(loop, inductionVar); } /// The operation allocating memory for iteration variable. mlir::Operation *iterVarMemDef; /// the operation(s) updating the iteration variable with the current /// iteration number. llvm::SmallVector indVarUpdateOps; private: /// For the \p doLoop parameter, find the following: /// /// 1. The operation that declares its iteration variable or allocates memory /// for it. For example, give the following loop: /// ``` /// ... /// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ... /// ... /// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) { /// %ind_var_conv = fir.convert %ind_var : (index) -> i32 /// fir.store %ind_var_conv to %i#1 : !fir.ref /// ... /// } /// ``` /// /// This function sets the `iterVarMemDef` member to the `hlfir.declare` op /// for `%i`. /// /// 2. The operation(s) that update the loop's iteration variable from its /// induction variable. For the above example, the `indVarUpdateOps` is /// populated with the first 2 ops in the loop's body. /// /// Note: The current implementation is dependent on how flang emits loop /// bodies; which is sufficient for the current simple test/use cases. If this /// proves to be insufficient, this should be made more generic. void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) { mlir::Value result = nullptr; // Checks if a StoreOp is updating the memref of the loop's iteration // variable. auto isStoringIV = [&](fir::StoreOp storeOp) { // Direct store into the IV memref. if (storeOp.getValue() == inductionVar) { indVarUpdateOps.push_back(storeOp); return true; } // Indirect store into the IV memref. if (auto convertOp = mlir::dyn_cast( storeOp.getValue().getDefiningOp())) { if (convertOp.getOperand() == inductionVar) { indVarUpdateOps.push_back(convertOp); indVarUpdateOps.push_back(storeOp); return true; } } return false; }; for (mlir::Operation &op : loop) { if (auto storeOp = mlir::dyn_cast(op)) if (isStoringIV(storeOp)) { result = storeOp.getMemref(); break; } } assert(result != nullptr && result.getDefiningOp() != nullptr); iterVarMemDef = result.getDefiningOp(); } }; using InductionVariableInfos = llvm::SmallVector; /// Collects values that are local to a loop: "loop-local values". A loop-local /// value is one that is used exclusively inside the loop but allocated outside /// of it. This usually corresponds to temporary values that are used inside the /// loop body for initialzing other variables for example. /// /// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an /// example of why we need this. /// /// \param [in] doLoop - the loop within which the function searches for values /// used exclusively inside. /// /// \param [out] locals - the list of loop-local values detected for \p doLoop. void collectLoopLocalValues(fir::DoConcurrentLoopOp loop, llvm::SetVector &locals) { loop.walk([&](mlir::Operation *op) { for (mlir::Value operand : op->getOperands()) { if (locals.contains(operand)) continue; bool isLocal = true; if (!mlir::isa_and_present(operand.getDefiningOp())) continue; // Values defined inside the loop are not interesting since they do not // need to be localized. if (loop->isAncestor(operand.getDefiningOp())) continue; for (auto *user : operand.getUsers()) { if (!loop->isAncestor(user)) { isLocal = false; break; } } if (isLocal) locals.insert(operand); } }); } /// For a "loop-local" value \p local within a loop's scope, localizes that /// value within the scope of the parallel region the loop maps to. Towards that /// end, this function moves the allocation of \p local within \p allocRegion. /// /// \param local - the value used exclusively within a loop's scope (see /// collectLoopLocalValues). /// /// \param allocRegion - the parallel region where \p local's allocation will be /// privatized. /// /// \param rewriter - builder used for updating \p allocRegion. static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion, mlir::ConversionPatternRewriter &rewriter) { rewriter.moveOpBefore(local.getDefiningOp(), &allocRegion.front().front()); } } // namespace looputils class DoConcurrentConversion : public mlir::OpConversionPattern { public: using mlir::OpConversionPattern::OpConversionPattern; DoConcurrentConversion( mlir::MLIRContext *context, bool mapToDevice, llvm::DenseSet &concurrentLoopsToSkip) : OpConversionPattern(context), mapToDevice(mapToDevice), concurrentLoopsToSkip(concurrentLoopsToSkip) {} mlir::LogicalResult matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { if (mapToDevice) return doLoop.emitError( "not yet implemented: Mapping `do concurrent` loops to device"); looputils::InductionVariableInfos ivInfos; auto loop = mlir::cast( doLoop.getRegion().back().getTerminator()); auto indVars = loop.getLoopInductionVars(); assert(indVars.has_value()); for (mlir::Value indVar : *indVars) ivInfos.emplace_back(loop, indVar); llvm::SetVector locals; looputils::collectLoopLocalValues(loop, locals); mlir::IRMapping mapper; mlir::omp::ParallelOp parallelOp = genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper); mlir::omp::LoopNestOperands loopNestClauseOps; genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper, loopNestClauseOps); for (mlir::Value local : locals) looputils::localizeLoopLocalValue(local, parallelOp.getRegion(), rewriter); mlir::omp::LoopNestOp ompLoopNest = genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps, /*isComposite=*/mapToDevice); rewriter.setInsertionPoint(doLoop); fir::FirOpBuilder builder( rewriter, fir::getKindMapping(doLoop->getParentOfType())); // Collect iteration variable(s) allocations so that we can move them // outside the `fir.do_concurrent` wrapper (before erasing it). llvm::SmallVector opsToMove; for (mlir::Operation &op : llvm::drop_end(doLoop)) opsToMove.push_back(&op); mlir::Block *allocBlock = builder.getAllocaBlock(); for (mlir::Operation *op : llvm::reverse(opsToMove)) { rewriter.moveOpBefore(op, allocBlock, allocBlock->begin()); } // Mark `unordered` loops that are not perfectly nested to be skipped from // the legality check of the `ConversionTarget` since we are not interested // in mapping them to OpenMP. ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) { concurrentLoopsToSkip.insert(doLoop); }); rewriter.eraseOp(doLoop); return mlir::success(); } private: mlir::omp::ParallelOp genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, looputils::InductionVariableInfos &ivInfos, mlir::IRMapping &mapper) const { auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc); rewriter.createBlock(¶llelOp.getRegion()); rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc)); genLoopNestIndVarAllocs(rewriter, ivInfos, mapper); return parallelOp; } void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter, looputils::InductionVariableInfos &ivInfos, mlir::IRMapping &mapper) const { for (auto &indVarInfo : ivInfos) genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper); } mlir::Operation * genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter, mlir::Operation *indVarMemDef, mlir::IRMapping &mapper) const { assert( indVarMemDef != nullptr && "Induction variable memdef is expected to have a defining operation."); llvm::SmallSetVector indVarDeclareAndAlloc; for (auto operand : indVarMemDef->getOperands()) indVarDeclareAndAlloc.insert(operand.getDefiningOp()); indVarDeclareAndAlloc.insert(indVarMemDef); mlir::Operation *result; for (mlir::Operation *opToClone : indVarDeclareAndAlloc) result = rewriter.clone(*opToClone, mapper); return result; } void genLoopNestClauseOps(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper, mlir::omp::LoopNestOperands &loopNestClauseOps) const { assert(loopNestClauseOps.loopLowerBounds.empty() && "Loop nest bounds were already emitted!"); auto populateBounds = [](mlir::Value var, llvm::SmallVectorImpl &bounds) { bounds.push_back(var.getDefiningOp()->getResult(0)); }; for (auto [lb, ub, st] : llvm::zip_equal( loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) { populateBounds(lb, loopNestClauseOps.loopLowerBounds); populateBounds(ub, loopNestClauseOps.loopUpperBounds); populateBounds(st, loopNestClauseOps.loopSteps); } loopNestClauseOps.loopInclusive = rewriter.getUnitAttr(); } mlir::omp::LoopNestOp genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper, const mlir::omp::LoopNestOperands &clauseOps, bool isComposite) const { mlir::omp::WsloopOperands wsloopClauseOps; auto cloneFIRRegionToOMP = [&rewriter](mlir::Region &firRegion, mlir::Region &ompRegion) { if (!firRegion.empty()) { rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin()); auto firYield = mlir::cast(ompRegion.back().getTerminator()); rewriter.setInsertionPoint(firYield); mlir::omp::YieldOp::create(rewriter, firYield.getLoc(), firYield.getOperands()); rewriter.eraseOp(firYield); } }; // For `local` (and `local_init`) opernads, emit corresponding `private` // clauses and attach these clauses to the workshare loop. if (!loop.getLocalVars().empty()) for (auto [op, sym, arg] : llvm::zip_equal( loop.getLocalVars(), loop.getLocalSymsAttr().getAsRange(), loop.getRegionLocalArgs())) { auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom< fir::LocalitySpecifierOp>(loop, sym); if (localizer.getLocalitySpecifierType() == fir::LocalitySpecifierType::LocalInit) TODO(localizer.getLoc(), "local_init conversion is not supported yet"); mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointAfter(localizer); auto privatizer = mlir::omp::PrivateClauseOp::create( rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp", localizer.getTypeAttr().getValue(), mlir::omp::DataSharingClauseType::Private); cloneFIRRegionToOMP(localizer.getInitRegion(), privatizer.getInitRegion()); cloneFIRRegionToOMP(localizer.getDeallocRegion(), privatizer.getDeallocRegion()); wsloopClauseOps.privateVars.push_back(op); wsloopClauseOps.privateSyms.push_back( mlir::SymbolRefAttr::get(privatizer)); } if (!loop.getReduceVars().empty()) { for (auto [op, byRef, sym, arg] : llvm::zip_equal( loop.getReduceVars(), loop.getReduceByrefAttr().asArrayRef(), loop.getReduceSymsAttr().getAsRange(), loop.getRegionReduceArgs())) { auto firReducer = mlir::SymbolTable::lookupNearestSymbolFrom( loop, sym); mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointAfter(firReducer); auto ompReducer = mlir::omp::DeclareReductionOp::create( rewriter, firReducer.getLoc(), sym.getLeafReference().str() + ".omp", firReducer.getTypeAttr().getValue()); cloneFIRRegionToOMP(firReducer.getAllocRegion(), ompReducer.getAllocRegion()); cloneFIRRegionToOMP(firReducer.getInitializerRegion(), ompReducer.getInitializerRegion()); cloneFIRRegionToOMP(firReducer.getReductionRegion(), ompReducer.getReductionRegion()); cloneFIRRegionToOMP(firReducer.getAtomicReductionRegion(), ompReducer.getAtomicReductionRegion()); cloneFIRRegionToOMP(firReducer.getCleanupRegion(), ompReducer.getCleanupRegion()); wsloopClauseOps.reductionVars.push_back(op); wsloopClauseOps.reductionByref.push_back(byRef); wsloopClauseOps.reductionSyms.push_back( mlir::SymbolRefAttr::get(ompReducer)); } } auto wsloopOp = mlir::omp::WsloopOp::create(rewriter, loop.getLoc(), wsloopClauseOps); wsloopOp.setComposite(isComposite); Fortran::common::openmp::EntryBlockArgs wsloopArgs; wsloopArgs.priv.vars = wsloopClauseOps.privateVars; wsloopArgs.reduction.vars = wsloopClauseOps.reductionVars; Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs, wsloopOp.getRegion()); auto loopNestOp = mlir::omp::LoopNestOp::create(rewriter, loop.getLoc(), clauseOps); // Clone the loop's body inside the loop nest construct using the // mapped values. rewriter.cloneRegionBefore(loop.getRegion(), loopNestOp.getRegion(), loopNestOp.getRegion().begin(), mapper); rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back()); mlir::omp::YieldOp::create(rewriter, loop->getLoc()); // `local` region arguments are transferred/cloned from the `do concurrent` // loop to the loopnest op when the region is cloned above. Instead, these // region arguments should be on the workshare loop's region. for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(wsloopOp.getRegion().getArguments(), loopNestOp.getRegion().getArguments().drop_front( clauseOps.loopLowerBounds.size()))) rewriter.replaceAllUsesWith(loopNestArg, wsloopArg); for (unsigned i = 0; i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i) loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size()); return loopNestOp; } bool mapToDevice; llvm::DenseSet &concurrentLoopsToSkip; }; class DoConcurrentConversionPass : public flangomp::impl::DoConcurrentConversionPassBase< DoConcurrentConversionPass> { public: DoConcurrentConversionPass() = default; DoConcurrentConversionPass( const flangomp::DoConcurrentConversionPassOptions &options) : DoConcurrentConversionPassBase(options) {} void runOnOperation() override { mlir::func::FuncOp func = getOperation(); if (func.isDeclaration()) return; mlir::MLIRContext *context = &getContext(); if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host && mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) { mlir::emitWarning(mlir::UnknownLoc::get(context), "DoConcurrentConversionPass: invalid `map-to` value. " "Valid values are: `host` or `device`"); return; } llvm::DenseSet concurrentLoopsToSkip; mlir::RewritePatternSet patterns(context); patterns.insert( context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device, concurrentLoopsToSkip); mlir::ConversionTarget target(*context); target.addDynamicallyLegalOp( [&](fir::DoConcurrentOp op) { return concurrentLoopsToSkip.contains(op); }); target.markUnknownOpDynamicallyLegal( [](mlir::Operation *) { return true; }); if (mlir::failed(mlir::applyFullConversion(getOperation(), target, std::move(patterns)))) { signalPassFailure(); } } }; } // namespace std::unique_ptr flangomp::createDoConcurrentConversionPass(bool mapToDevice) { DoConcurrentConversionPassOptions options; options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device : flangomp::DoConcurrentMappingKind::DCMK_Host; return std::make_unique(options); }