//===- OpenMPOffloadPrivatizationPrepare.cpp - Prepare OMP privatization --===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/IRMapping.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" #include "llvm/Support/DebugLog.h" #include "llvm/Support/FormatVariadic.h" #include #include #include //===----------------------------------------------------------------------===// // A pass that prepares OpenMP code for translation of delayed privatization // in the context of deferred target tasks. Deferred target tasks are created // when the nowait clause is used on the target directive. //===----------------------------------------------------------------------===// #define DEBUG_TYPE "omp-prepare-for-offload-privatization" namespace mlir { namespace omp { #define GEN_PASS_DEF_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS #include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc" } // namespace omp } // namespace mlir using namespace mlir; namespace { //===----------------------------------------------------------------------===// // PrepareForOMPOffloadPrivatizationPass //===----------------------------------------------------------------------===// class PrepareForOMPOffloadPrivatizationPass : public omp::impl::PrepareForOMPOffloadPrivatizationPassBase< PrepareForOMPOffloadPrivatizationPass> { void runOnOperation() override { ModuleOp mod = getOperation(); // In this pass, we make host-allocated privatized variables persist for // deferred target tasks by copying them to the heap. Once the target task // is done, this heap memory is freed. Since all of this happens on the host // we can skip device modules. auto offloadModuleInterface = dyn_cast(mod.getOperation()); if (offloadModuleInterface && offloadModuleInterface.getIsTargetDevice()) return; getOperation()->walk([&](omp::TargetOp targetOp) { if (!hasPrivateVars(targetOp) || !isTargetTaskDeferred(targetOp)) return; IRRewriter rewriter(&getContext()); OperandRange privateVars = targetOp.getPrivateVars(); SmallVector newPrivVars; Value fakeDependVar; omp::TaskOp cleanupTaskOp; newPrivVars.reserve(privateVars.size()); std::optional privateSyms = targetOp.getPrivateSyms(); for (auto [privVarIdx, privVarSymPair] : llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) { Value privVar = std::get<0>(privVarSymPair); Attribute privSym = std::get<1>(privVarSymPair); omp::PrivateClauseOp privatizer = findPrivatizer(targetOp, privSym); if (!privatizer.needsMap()) { newPrivVars.push_back(privVar); continue; } bool isFirstPrivate = privatizer.getDataSharingType() == omp::DataSharingClauseType::FirstPrivate; Value mappedValue = targetOp.getMappedValueForPrivateVar(privVarIdx); auto mapInfoOp = cast(mappedValue.getDefiningOp()); if (mapInfoOp.getMapCaptureType() == omp::VariableCaptureKind::ByCopy) { newPrivVars.push_back(privVar); continue; } // For deferred target tasks (!$omp target nowait), we need to keep // a copy of the original, i.e. host variable being privatized so // that it is available when the target task is eventually executed. // We do this by first allocating as much heap memory as is needed by // the original variable. Then, we use the init and copy regions of the // privatizer, an instance of omp::PrivateClauseOp to set up the heap- // allocated copy. // After the target task is done, we need to use the dealloc region // of the privatizer to clean up everything. We also need to free // the heap memory we allocated. But due to the deferred nature // of the target task, we cannot simply deallocate right after the // omp.target operation else we may end up freeing memory before // its eventual use by the target task. So, we create a dummy // dependence between the target task and new omp.task. In the omp.task, // we do all the cleanup. So, we end up with the following structure // // omp.target map_entries(..) ... nowait depend(out:fakeDependVar) { // ... // omp.terminator // } // omp.task depend(in: fakeDependVar) { // /*cleanup_code*/ // omp.terminator // } // fakeDependVar is the address of the first heap-allocated copy of the // host variable being privatized. bool needsCleanupTask = !privatizer.getDeallocRegion().empty(); // Allocate heap memory that corresponds to the type of memory // pointed to by varPtr // For boxchars this won't be a pointer. But, MapsForPrivatizedSymbols // should have mapped the pointer to the boxchar so use that as varPtr. Value varPtr = mapInfoOp.getVarPtr(); Type varType = mapInfoOp.getVarType(); bool isPrivatizedByValue = !isa(privVar.getType()); assert(isa(varPtr.getType())); Value heapMem = allocateHeapMem(targetOp, varPtr, varType, mod, rewriter); if (!heapMem) targetOp.emitError( "Unable to allocate heap memory when trying to move " "a private variable out of the stack and into the " "heap for use by a deferred target task"); if (needsCleanupTask && !fakeDependVar) fakeDependVar = heapMem; // The types of private vars should match before and after the // transformation. In particular, if the type is a pointer, // simply record the newly allocated malloc location as the // new private variable. If, however, the type is not a pointer // then, we need to load the value from the newly allocated // location. We'll insert that load later after we have updated // the malloc'd location with the contents of the original // variable. if (!isPrivatizedByValue) newPrivVars.push_back(heapMem); // We now need to copy the original private variable into the newly // allocated location in the heap. // Find the earliest insertion point for the copy. This will be before // the first in the list of omp::MapInfoOp instances that use varPtr. // After the copy these omp::MapInfoOp instances will refer to heapMem // instead. Operation *varPtrDefiningOp = varPtr.getDefiningOp(); DenseSet users; if (varPtrDefiningOp) { users.insert(varPtrDefiningOp->user_begin(), varPtrDefiningOp->user_end()); } else { auto blockArg = cast(varPtr); users.insert(blockArg.user_begin(), blockArg.user_end()); } auto usesVarPtr = [&users](Operation *op) -> bool { return users.count(op); }; SmallVector chainOfOps; chainOfOps.push_back(mapInfoOp); for (auto member : mapInfoOp.getMembers()) { omp::MapInfoOp memberMap = cast(member.getDefiningOp()); if (usesVarPtr(memberMap)) chainOfOps.push_back(memberMap); if (memberMap.getVarPtrPtr()) { Operation *defOp = memberMap.getVarPtrPtr().getDefiningOp(); if (defOp && usesVarPtr(defOp)) chainOfOps.push_back(defOp); } } DominanceInfo dom; llvm::sort(chainOfOps, [&](Operation *l, Operation *r) { if (l == r) return false; return dom.properlyDominates(l, r); }); rewriter.setInsertionPoint(chainOfOps.front()); Operation *firstOp = chainOfOps.front(); Location loc = firstOp->getLoc(); // Create a llvm.func for 'region' that is marked always_inline and call // it. auto createAlwaysInlineFuncAndCallIt = [&](Region ®ion, llvm::StringRef funcName, llvm::ArrayRef args, bool returnsValue) -> Value { assert(!region.empty() && "region cannot be empty"); LLVM::LLVMFuncOp func = createFuncOpForRegion( loc, mod, region, funcName, rewriter, returnsValue); auto call = LLVM::CallOp::create(rewriter, loc, func, args); return call.getResult(); }; Value moldArg, newArg; if (isPrivatizedByValue) { moldArg = LLVM::LoadOp::create(rewriter, loc, varType, varPtr); newArg = LLVM::LoadOp::create(rewriter, loc, varType, heapMem); } else { moldArg = varPtr; newArg = heapMem; } Value initializedVal; if (!privatizer.getInitRegion().empty()) initializedVal = createAlwaysInlineFuncAndCallIt( privatizer.getInitRegion(), llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(), {moldArg, newArg}, /*returnsValue=*/true); else initializedVal = newArg; if (isFirstPrivate && !privatizer.getCopyRegion().empty()) initializedVal = createAlwaysInlineFuncAndCallIt( privatizer.getCopyRegion(), llvm::formatv("{0}_{1}", privatizer.getSymName(), "copy").str(), {moldArg, initializedVal}, /*returnsValue=*/true); if (isPrivatizedByValue) (void)LLVM::StoreOp::create(rewriter, loc, initializedVal, heapMem); // clone origOp, replace all uses of varPtr with heapMem and // erase origOp. auto cloneModifyAndErase = [&](Operation *origOp) -> Operation * { Operation *clonedOp = rewriter.clone(*origOp); rewriter.replaceAllOpUsesWith(origOp, clonedOp); rewriter.modifyOpInPlace(clonedOp, [&]() { clonedOp->replaceUsesOfWith(varPtr, heapMem); }); rewriter.eraseOp(origOp); return clonedOp; }; // Now that we have set up the heap-allocated copy of the private // variable, rewrite all the uses of the original variable with // the heap-allocated variable. rewriter.setInsertionPoint(targetOp); mapInfoOp = cast(cloneModifyAndErase(mapInfoOp)); rewriter.setInsertionPoint(mapInfoOp); // Fix any members that may use varPtr to now use heapMem for (auto member : mapInfoOp.getMembers()) { auto memberMapInfoOp = cast(member.getDefiningOp()); if (!usesVarPtr(memberMapInfoOp)) continue; memberMapInfoOp = cast(cloneModifyAndErase(memberMapInfoOp)); rewriter.setInsertionPoint(memberMapInfoOp); if (memberMapInfoOp.getVarPtrPtr()) { Operation *varPtrPtrdefOp = memberMapInfoOp.getVarPtrPtr().getDefiningOp(); rewriter.setInsertionPoint(cloneModifyAndErase(varPtrPtrdefOp)); } } // If the type of the private variable is not a pointer, // which is typically the case with !fir.boxchar types, then // we need to ensure that the new private variable is also // not a pointer. Insert a load from heapMem right before // targetOp. if (isPrivatizedByValue) { rewriter.setInsertionPoint(targetOp); auto newPrivVar = LLVM::LoadOp::create(rewriter, mapInfoOp.getLoc(), varType, heapMem); newPrivVars.push_back(newPrivVar); } // Deallocate if (needsCleanupTask) { if (!cleanupTaskOp) { assert(fakeDependVar && "Need a valid value to set up a dependency"); rewriter.setInsertionPointAfter(targetOp); omp::TaskOperands taskOperands; auto inDepend = omp::ClauseTaskDependAttr::get( rewriter.getContext(), omp::ClauseTaskDepend::taskdependin); taskOperands.dependKinds.push_back(inDepend); taskOperands.dependVars.push_back(fakeDependVar); cleanupTaskOp = omp::TaskOp::create(rewriter, loc, taskOperands); Block *taskBlock = rewriter.createBlock(&cleanupTaskOp.getRegion()); rewriter.setInsertionPointToEnd(taskBlock); omp::TerminatorOp::create(rewriter, cleanupTaskOp.getLoc()); } rewriter.setInsertionPointToStart( &*cleanupTaskOp.getRegion().getBlocks().begin()); (void)createAlwaysInlineFuncAndCallIt( privatizer.getDeallocRegion(), llvm::formatv("{0}_{1}", privatizer.getSymName(), "dealloc") .str(), {initializedVal}, /*returnsValue=*/false); llvm::FailureOr freeFunc = LLVM::lookupOrCreateFreeFn(rewriter, mod); assert(llvm::succeeded(freeFunc) && "Could not find free in the module"); (void)LLVM::CallOp::create(rewriter, loc, freeFunc.value(), ValueRange{heapMem}); } } assert(newPrivVars.size() == privateVars.size() && "The number of private variables must match before and after " "transformation"); if (fakeDependVar) { omp::ClauseTaskDependAttr outDepend = omp::ClauseTaskDependAttr::get( rewriter.getContext(), omp::ClauseTaskDepend::taskdependout); SmallVector newDependKinds; if (!targetOp.getDependVars().empty()) { std::optional dependKinds = targetOp.getDependKinds(); assert(dependKinds && "bad depend clause in omp::TargetOp"); llvm::copy(*dependKinds, std::back_inserter(newDependKinds)); } newDependKinds.push_back(outDepend); ArrayAttr newDependKindsAttr = ArrayAttr::get(rewriter.getContext(), newDependKinds); targetOp.getDependVarsMutable().append(fakeDependVar); targetOp.setDependKindsAttr(newDependKindsAttr); } rewriter.setInsertionPoint(targetOp); targetOp.getPrivateVarsMutable().clear(); targetOp.getPrivateVarsMutable().assign(newPrivVars); }); } private: bool hasPrivateVars(omp::TargetOp targetOp) const { return !targetOp.getPrivateVars().empty(); } bool isTargetTaskDeferred(omp::TargetOp targetOp) const { return targetOp.getNowait(); } template omp::PrivateClauseOp findPrivatizer(OpTy op, Attribute privSym) const { SymbolRefAttr privatizerName = llvm::cast(privSym); omp::PrivateClauseOp privatizer = SymbolTable::lookupNearestSymbolFrom( op, privatizerName); return privatizer; } // Get the (compile-time constant) size of varType as per the // given DataLayout dl. std::int64_t getSizeInBytes(const DataLayout &dl, Type varType) const { llvm::TypeSize size = dl.getTypeSize(varType); unsigned short alignment = dl.getTypeABIAlignment(varType); return llvm::alignTo(size, alignment); } LLVM::LLVMFuncOp getMalloc(ModuleOp mod, IRRewriter &rewriter) const { llvm::FailureOr mallocCall = LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type()); assert(llvm::succeeded(mallocCall) && "Could not find malloc in the module"); return mallocCall.value(); } Value allocateHeapMem(omp::TargetOp targetOp, Value privVar, Type varType, ModuleOp mod, IRRewriter &rewriter) const { OpBuilder::InsertionGuard guard(rewriter); Value varPtr = privVar; Operation *definingOp = varPtr.getDefiningOp(); BlockArgument blockArg; if (!definingOp) { blockArg = mlir::dyn_cast(varPtr); rewriter.setInsertionPointToStart(blockArg.getParentBlock()); } else { rewriter.setInsertionPoint(definingOp); } Location loc = definingOp ? definingOp->getLoc() : blockArg.getLoc(); LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter); assert(mod.getDataLayoutSpec() && "MLIR module with no datalayout spec not handled yet"); const DataLayout &dl = DataLayout(mod); std::int64_t distance = getSizeInBytes(dl, varType); Value sizeBytes = LLVM::ConstantOp::create( rewriter, loc, mallocFn.getFunctionType().getParamType(0), distance); auto mallocCallOp = LLVM::CallOp::create(rewriter, loc, mallocFn, ValueRange{sizeBytes}); return mallocCallOp.getResult(); } // Create a function for srcRegion and attribute it to be always_inline. // The big assumption here is that srcRegion is one of init, copy or dealloc // regions of a omp::PrivateClauseop. Accordingly, the return type is assumed // to either be the same as the types of the two arguments of the region (for // init and copy regions) or void as would be the case for dealloc regions. LLVM::LLVMFuncOp createFuncOpForRegion(Location loc, ModuleOp mod, Region &srcRegion, llvm::StringRef funcName, IRRewriter &rewriter, bool returnsValue = false) { OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(mod.getBody(), mod.getBody()->end()); Region clonedRegion; IRMapping mapper; srcRegion.cloneInto(&clonedRegion, mapper); SmallVector paramTypes; llvm::copy(srcRegion.getArgumentTypes(), std::back_inserter(paramTypes)); Type resultType = returnsValue ? srcRegion.getArgument(0).getType() : LLVM::LLVMVoidType::get(rewriter.getContext()); LLVM::LLVMFunctionType funcType = LLVM::LLVMFunctionType::get(resultType, paramTypes); LLVM::LLVMFuncOp func = LLVM::LLVMFuncOp::create(rewriter, loc, funcName, funcType); func.setAlwaysInline(true); rewriter.inlineRegionBefore(clonedRegion, func.getRegion(), func.getRegion().end()); for (auto &block : func.getRegion().getBlocks()) { if (isa(block.getTerminator())) { omp::YieldOp yieldOp = cast(block.getTerminator()); rewriter.setInsertionPoint(yieldOp); rewriter.replaceOpWithNewOp(yieldOp, TypeRange(), yieldOp.getOperands()); } } return func; } }; } // namespace