//===- LowerWorkshare.cpp - lower omp.workshare to other omp constructs --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the lowering of omp.workshare to other omp constructs.
//
// This pass is tasked with parallelizing the loops nested in
// workshare.loop_wrapper while both the Fortran to mlir lowering and the hlfir
// to fir lowering pipelines are responsible for emitting the
// workshare.loop_wrapper ops where appropriate according to the
// `shouldUseWorkshareLowering` function.
//
//===----------------------------------------------------------------------===//

#include <flang/Optimizer/Builder/FIRBuilder.h>
#include <flang/Optimizer/Dialect/FIROps.h>
#include <flang/Optimizer/Dialect/FIRType.h>
#include <flang/Optimizer/HLFIR/HLFIROps.h>
#include <flang/Optimizer/OpenMP/Passes.h>
#include <llvm/ADT/BreadthFirstIterator.h>
#include <llvm/ADT/STLExtras.h>
#include <llvm/ADT/SmallVectorExtras.h>
#include <llvm/ADT/iterator_range.h>
#include <llvm/Support/ErrorHandling.h>
#include <mlir/Dialect/Arith/IR/Arith.h>
#include <mlir/Dialect/Func/IR/FuncOps.h>
#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
#include <mlir/Dialect/OpenMP/OpenMPClauseOperands.h>
#include <mlir/Dialect/OpenMP/OpenMPDialect.h>
#include <mlir/Dialect/SCF/IR/SCF.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/Dominance.h>
#include <mlir/IR/IRMapping.h>
#include <mlir/IR/OpDefinition.h>
#include <mlir/IR/PatternMatch.h>
#include <mlir/IR/Value.h>
#include <mlir/IR/Visitors.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include <mlir/Support/LLVM.h>

#include <variant>

namespace flangomp {
#define GEN_PASS_DEF_LOWERWORKSHARE
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp

#define DEBUG_TYPE "lower-workshare"

using namespace mlir;

namespace flangomp {

// Checks for the nesting pattern below, as we need to avoid sharing the work
// of statements which are nested in some constructs such as omp.critical or
// another omp.parallel.
//
// omp.workshare {  // `wsOp`
//   ...
//   omp.T {  // `parent`
//     ...
//     `op`
//
template <typename T>
static bool isNestedIn(omp::WorkshareOp wsOp, Operation *op) {
  T parent = op->getParentOfType<T>();
  if (!parent)
    return false;
  return wsOp->isProperAncestor(parent);
}

bool shouldUseWorkshareLowering(Operation *op) {
  auto parentWorkshare = op->getParentOfType<omp::WorkshareOp>();

  if (!parentWorkshare)
    return false;

  if (isNestedIn<omp::CriticalOp>(parentWorkshare, op))
    return false;

  // 2.8.3 workshare Construct
  // For a parallel construct, the construct is a unit of work with respect to
  // the workshare construct. The statements contained in the parallel
  // construct are executed by a new thread team.
  if (isNestedIn<omp::ParallelOp>(parentWorkshare, op))
    return false;

  // 2.8.2 single Construct
  // Binding: The binding thread set for a single region is the current team.
  // A single region binds to the innermost enclosing parallel region.
  // Description: Only one of the encountering threads will execute the
  // structured block associated with the single construct.
  if (isNestedIn<omp::SingleOp>(parentWorkshare, op))
    return false;

  // Do not use workshare lowering until we support CFG in omp.workshare.
  if (parentWorkshare.getRegion().getBlocks().size() != 1)
    return false;

  return true;
}

} // namespace flangomp

namespace {
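// A contiguous, non-empty range [begin, end) of operations in a block that do
// not need to be parallelized and are therefore executed by a single thread
// (wrapped in an omp.single where required). As an illustration (hypothetical
// IR, mirroring the example in the `lowerWorkshare` comment below):
//
//   %a = fir.allocmem ...             // begins a single region
//   fir.call @Assign %b %a            // same single region
//   omp.workshare.loop_wrapper {...}  // must be parallelized; ends the region
//   fir.freemem %a                    // begins another single region
//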
struct SingleRegion {
  Block::iterator begin, end;
};

static bool mustParallelizeOp(Operation *op) {
  return op
      ->walk([&](Operation *nested) {
        // We need to be careful not to pick up workshare.loop_wrapper in
        // nested omp.parallel{omp.workshare} regions, i.e. make sure that
        // `nested` binds to the workshare region we are currently handling.
        //
        // For example:
        //
        // omp.parallel {
        //   omp.workshare { // currently handling this
        //     omp.parallel {
        //       omp.workshare { // nested workshare
        //         omp.workshare.loop_wrapper {}
        //
        // Therefore, we skip if we encounter a nested omp.workshare.
        if (isa<omp::WorkshareOp>(nested))
          return WalkResult::skip();
        if (isa<omp::WorkshareLoopWrapperOp>(nested))
          return WalkResult::interrupt();
        return WalkResult::advance();
      })
      .wasInterrupted();
}

static bool isSafeToParallelize(Operation *op) {
  return isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
         isMemoryEffectFree(op);
}

/// Simple shallow copies suffice for our purposes in this pass, so we
/// implement this simpler alternative to the full-fledged `createCopyFunc` in
/// the frontend.
static mlir::func::FuncOp createCopyFunc(mlir::Location loc, mlir::Type varType,
                                         fir::FirOpBuilder builder) {
  mlir::ModuleOp module = builder.getModule();
  auto rt = cast<fir::ReferenceType>(varType);
  mlir::Type eleTy = rt.getEleTy();
  std::string copyFuncName =
      fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy");

  if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
    return decl;
  // Create the function.
  mlir::OpBuilder::InsertionGuard guard(builder);
  mlir::OpBuilder modBuilder(module.getBodyRegion());
  llvm::SmallVector<mlir::Type> argsTy = {varType, varType};
  auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {});
  mlir::func::FuncOp funcOp =
      mlir::func::FuncOp::create(modBuilder, loc, copyFuncName, funcType);
  funcOp.setVisibility(mlir::SymbolTable::Visibility::Private);
  fir::factory::setInternalLinkage(funcOp);
  builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy,
                      {loc, loc});
  builder.setInsertionPointToStart(&funcOp.getRegion().back());

  Value loaded = fir::LoadOp::create(builder, loc, funcOp.getArgument(1));
  fir::StoreOp::create(builder, loc, loaded, funcOp.getArgument(0));

  mlir::func::ReturnOp::create(builder, loc);
  return funcOp;
}

static bool isUserOutsideSR(Operation *user, Operation *parentOp,
                            SingleRegion sr) {
  while (user->getParentOp() != parentOp)
    user = user->getParentOp();
  return sr.begin->getBlock() != user->getBlock() ||
         !(user->isBeforeInBlock(&*sr.end) && sr.begin->isBeforeInBlock(user));
}
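/// Determines whether `v`, which is defined inside the single region `sr`, is
/// used by an operation outside of `sr`, possibly transitively through the
/// results of safe-to-parallelize operations. As an illustration (hypothetical
/// IR), assuming `sr` spans the first two operations:
///
///   %0 = fir.allocmem ...    // `v`, defined inside `sr`
///   %1 = fir.declare %0 ...  // safe to parallelize, inside `sr`
///   // --- end of `sr` ---
///   fir.freemem %1           // user outside `sr`
///
/// Here %0 is transitively used outside `sr` through %1.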
static bool isTransitivelyUsedOutside(Value v, SingleRegion sr) {
  Block *srBlock = sr.begin->getBlock();
  Operation *parentOp = srBlock->getParentOp();

  for (auto &use : v.getUses()) {
    Operation *user = use.getOwner();
    if (isUserOutsideSR(user, parentOp, sr))
      return true;

    // Now we know user is inside `sr`.

    // Results of nested users cannot be used outside of `sr`.
    if (user->getBlock() != srBlock)
      continue;

    // A non-safe-to-parallelize operation will be checked for uses outside
    // separately.
    if (!isSafeToParallelize(user))
      continue;

    // For safe-to-parallelize operations, we need to check if there is a
    // transitive use of `v` through them.
    for (auto res : user->getResults())
      if (isTransitivelyUsedOutside(res, sr))
        return true;
  }
  return false;
}

/// We clone pure operations in both the parallel and single blocks. This
/// function cleans them up if they end up with no uses.
static void cleanupBlock(Block *block) {
  for (Operation &op : llvm::make_early_inc_range(
           llvm::make_range(block->rbegin(), block->rend())))
    if (isOpTriviallyDead(&op))
      op.erase();
}
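/// Clones the contents of `sourceRegion` into `targetRegion`, partitioning
/// each block into an alternating sequence of single regions and operations
/// that must be parallelized (see `mustParallelizeOp`). Single regions whose
/// operations could not all be safely replicated across threads are wrapped in
/// omp.single, with values needed by the other threads broadcast via
/// copyprivate, while omp.workshare.loop_wrapper ops are rewritten to
/// omp.wsloop. See the comment on `lowerWorkshare` below for the overall shape
/// of the generated IR.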
static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
                              IRMapping &rootMapping, Location loc,
                              mlir::DominanceInfo &di) {
  OpBuilder rootBuilder(sourceRegion.getContext());
  ModuleOp m = sourceRegion.getParentOfType<ModuleOp>();
  OpBuilder copyFuncBuilder(m.getBodyRegion());
  fir::FirOpBuilder firCopyFuncBuilder(copyFuncBuilder, m);

  auto mapReloadedValue =
      [&](Value v, OpBuilder allocaBuilder, OpBuilder singleBuilder,
          OpBuilder parallelBuilder, IRMapping singleMapping) -> Value {
    if (auto reloaded = rootMapping.lookupOrNull(v))
      return nullptr;
    Type ty = v.getType();
    Value alloc = fir::AllocaOp::create(allocaBuilder, loc, ty);
    fir::StoreOp::create(singleBuilder, loc, singleMapping.lookup(v), alloc);
    Value reloaded = fir::LoadOp::create(parallelBuilder, loc, ty, alloc);
    rootMapping.map(v, reloaded);
    return alloc;
  };

  auto moveToSingle =
      [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder,
          OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> {
    IRMapping singleMapping = rootMapping;
    SmallVector<Value> copyPrivate;
    bool allParallelized = true;

    for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
      if (isSafeToParallelize(&op)) {
        singleBuilder.clone(op, singleMapping);
        if (llvm::all_of(op.getOperands(), [&](Value opr) {
              // Either we have already remapped it,
              bool remapped = rootMapping.contains(opr);
              // or it is available because it dominates `sr`.
              bool dominates = di.properlyDominates(opr, &*sr.begin);
              return remapped || dominates;
            })) {
          // Safe-to-parallelize operations which have all operands available
          // in the root parallel block can be executed there.
          parallelBuilder.clone(op, rootMapping);
        } else {
          // If any operand was not available, it means that there was no
          // transitive use of a non-safe-to-parallelize operation outside
          // `sr`. This means that there should be no transitive uses outside
          // `sr` of `op`.
          assert(llvm::all_of(op.getResults(), [&](Value v) {
            return !isTransitivelyUsedOutside(v, sr);
          }));
          allParallelized = false;
        }
      } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
        auto hoisted =
            cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
        rootMapping.map(&*alloca, &*hoisted);
        rootMapping.map(alloca.getResult(), hoisted.getResult());
        copyPrivate.push_back(hoisted);
        allParallelized = false;
      } else {
        singleBuilder.clone(op, singleMapping);
        // Prepare reloaded values for results of operations that cannot be
        // safely parallelized and which are used after the region `sr`.
        for (auto res : op.getResults()) {
          if (isTransitivelyUsedOutside(res, sr)) {
            auto alloc = mapReloadedValue(res, allocaBuilder, singleBuilder,
                                          parallelBuilder, singleMapping);
            if (alloc)
              copyPrivate.push_back(alloc);
          }
        }
        allParallelized = false;
      }
    }
    omp::TerminatorOp::create(singleBuilder, loc);
    return {allParallelized, copyPrivate};
  };

  for (Block &block : sourceRegion) {
    Block *targetBlock = rootBuilder.createBlock(
        &targetRegion, {}, block.getArgumentTypes(),
        llvm::map_to_vector(block.getArguments(),
                            [](BlockArgument arg) { return arg.getLoc(); }));
    rootMapping.map(&block, targetBlock);
    rootMapping.map(block.getArguments(), targetBlock->getArguments());
  }

  auto handleOneBlock = [&](Block &block) {
    Block &targetBlock = *rootMapping.lookup(&block);
    rootBuilder.setInsertionPointToStart(&targetBlock);
    Operation *terminator = block.getTerminator();
    SmallVector<std::variant<SingleRegion, Operation *>> regions;

    auto it = block.begin();
    auto getOneRegion = [&]() {
      if (&*it == terminator)
        return false;
      if (mustParallelizeOp(&*it)) {
        regions.push_back(&*it);
        it++;
        return true;
      }
      SingleRegion sr;
      sr.begin = it;
      while (&*it != terminator && !mustParallelizeOp(&*it))
        it++;
      sr.end = it;
      assert(sr.begin != sr.end);
      regions.push_back(sr);
      return true;
    };
    while (getOneRegion())
      ;

    for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
      bool isLast = i + 1 == regions.size();
      if (std::holds_alternative<SingleRegion>(opOrSingle)) {
        OpBuilder singleBuilder(sourceRegion.getContext());
        Block *singleBlock = new Block();
        singleBuilder.setInsertionPointToStart(singleBlock);

        OpBuilder allocaBuilder(sourceRegion.getContext());
        Block *allocaBlock = new Block();
        allocaBuilder.setInsertionPointToStart(allocaBlock);

        OpBuilder parallelBuilder(sourceRegion.getContext());
        Block *parallelBlock = new Block();
        parallelBuilder.setInsertionPointToStart(parallelBlock);

        auto [allParallelized, copyprivateVars] =
            moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder,
                         singleBuilder, parallelBuilder);
        if (allParallelized) {
          // The single region was not required as all operations were safe to
          // parallelize.
          assert(copyprivateVars.empty());
          assert(allocaBlock->empty());
          delete singleBlock;
        } else {
          omp::SingleOperands singleOperands;
          if (isLast)
            singleOperands.nowait = rootBuilder.getUnitAttr();
          singleOperands.copyprivateVars = copyprivateVars;
          cleanupBlock(singleBlock);
          for (auto var : singleOperands.copyprivateVars) {
            mlir::func::FuncOp funcOp =
                createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
            singleOperands.copyprivateSyms.push_back(
                SymbolRefAttr::get(funcOp));
          }
          omp::SingleOp singleOp =
              omp::SingleOp::create(rootBuilder, loc, singleOperands);
          singleOp.getRegion().push_back(singleBlock);
          targetRegion.front().getOperations().splice(
              singleOp->getIterator(), allocaBlock->getOperations());
        }
        rootBuilder.getInsertionBlock()->getOperations().splice(
            rootBuilder.getInsertionPoint(), parallelBlock->getOperations());
        delete allocaBlock;
        delete parallelBlock;
      } else {
        auto op = std::get<Operation *>(opOrSingle);
        if (auto wslw = dyn_cast<omp::WorkshareLoopWrapperOp>(op)) {
          omp::WsloopOperands wsloopOperands;
          if (isLast)
            wsloopOperands.nowait = rootBuilder.getUnitAttr();
          auto wsloop =
              mlir::omp::WsloopOp::create(rootBuilder, loc, wsloopOperands);
          auto clonedWslw = cast<omp::WorkshareLoopWrapperOp>(
              rootBuilder.clone(*wslw, rootMapping));
          wsloop.getRegion().takeBody(clonedWslw.getRegion());
          clonedWslw->erase();
        } else {
          assert(mustParallelizeOp(op));
          Operation *cloned =
              rootBuilder.cloneWithoutRegions(*op, rootMapping);
          for (auto [region, clonedRegion] :
               llvm::zip(op->getRegions(), cloned->getRegions()))
            parallelizeRegion(region, clonedRegion, rootMapping, loc, di);
        }
      }
    }

    rootBuilder.clone(*block.getTerminator(), rootMapping);
  };

  if (sourceRegion.hasOneBlock()) {
    handleOneBlock(sourceRegion.front());
  } else if (!sourceRegion.empty()) {
    auto &domTree = di.getDomTree(&sourceRegion);
    for (auto node : llvm::breadth_first(domTree.getRootNode())) {
      handleOneBlock(*node->getBlock());
    }
  }

  for (Block &targetBlock : targetRegion)
    cleanupBlock(&targetBlock);
}
/// Lowers workshare to a sequence of single-thread regions and parallel loops.
///
/// For example:
///
/// omp.workshare {
///   %a = fir.allocmem
///   omp.workshare.loop_wrapper {}
///   fir.call Assign %b %a
///   fir.freemem %a
/// }
///
/// becomes
///
/// %tmp = fir.alloca
/// omp.single copyprivate(%tmp) {
///   %a = fir.allocmem
///   fir.store %a %tmp
/// }
/// %a_reloaded = fir.load %tmp
/// omp.workshare.loop_wrapper {}
/// omp.single {
///   fir.call Assign %b %a_reloaded
///   fir.freemem %a_reloaded
/// }
///
/// Note that we allocate temporary memory for values in omp.single's which
/// need to be accessed by all threads and broadcast them using single's
/// copyprivate.
LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, DominanceInfo &di) {
  Location loc = wsOp->getLoc();
  IRMapping rootMapping;

  OpBuilder rootBuilder(wsOp);

  // FIXME Currently, we only support workshare constructs with structured
  // control flow. The transformation itself supports CFG, however, once we
  // transform the MLIR region in the omp.workshare, we need to inline that
  // region in the parent block. We have no guarantees at this point of the
  // pipeline that the parent op supports CFG (e.g. fir.if), thus this is not
  // generally possible. The alternative is to put the lowered region in an
  // operation akin to scf.execute_region, which will get lowered at the same
  // time when fir ops get lowered to CFG. However, SCF is not registered in
  // flang so we cannot use it. Remove this requirement once we have
  // scf.execute_region or an alternative operation available.
  if (wsOp.getRegion().getBlocks().size() == 1) {
    // This operation is just a placeholder which will be erased later. We need
    // it because our `parallelizeRegion` function works on regions and not
    // blocks.
    omp::WorkshareOp newOp =
        omp::WorkshareOp::create(rootBuilder, loc, omp::WorkshareOperands());
    if (!wsOp.getNowait())
      omp::BarrierOp::create(rootBuilder, loc);

    parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc,
                      di);

    // Inline the contents of the placeholder workshare op into its parent
    // block.
    Block *theBlock = &newOp.getRegion().front();
    Operation *term = theBlock->getTerminator();
    Block *parentBlock = wsOp->getBlock();
    parentBlock->getOperations().splice(newOp->getIterator(),
                                        theBlock->getOperations());
    assert(term->getNumOperands() == 0);
    term->erase();
    newOp->erase();
    wsOp->erase();
  } else {
    // Otherwise just change the operation to an omp.single.
    wsOp->emitWarning("omp workshare with unstructured control flow is "
                      "currently unsupported and will be serialized.");

    // `shouldUseWorkshareLowering` should have guaranteed that there are no
    // omp.workshare.loop_wrapper ops that bind to this omp.workshare.
    assert(!wsOp
                ->walk([&](Operation *op) {
                  // Nested omp.workshare ops can have their own
                  // omp.workshare.loop_wrapper ops.
                  if (isa<omp::WorkshareOp>(op))
                    return WalkResult::skip();
                  if (isa<omp::WorkshareLoopWrapperOp>(op))
                    return WalkResult::interrupt();
                  return WalkResult::advance();
                })
                .wasInterrupted());

    omp::SingleOperands operands;
    operands.nowait = wsOp.getNowaitAttr();
    omp::SingleOp newOp = omp::SingleOp::create(rootBuilder, loc, operands);

    newOp.getRegion().getBlocks().splice(newOp.getRegion().getBlocks().begin(),
                                         wsOp.getRegion().getBlocks());
    wsOp->erase();
  }
  return success();
}

class LowerWorksharePass
    : public flangomp::impl::LowerWorkshareBase<LowerWorksharePass> {
public:
  void runOnOperation() override {
    mlir::DominanceInfo &di = getAnalysis<mlir::DominanceInfo>();
    getOperation()->walk([&](mlir::omp::WorkshareOp wsOp) {
      if (failed(lowerWorkshare(wsOp, di)))
        signalPassFailure();
    });
  }
};
} // namespace