//===- DistributionUtils.cpp - Distribution tools for GPUOps --------------===// // // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements distribution utility methods. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/Utils/DistributionUtils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/IR/Value.h" #include "llvm/ADT/DenseMap.h" #include using namespace mlir; using namespace mlir::gpu; WarpExecuteOnLane0Op WarpDistributionPattern::moveRegionToNewWarpOpAndReplaceReturns( RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp, ValueRange newYieldedValues, TypeRange newReturnTypes) const { // Create a new op before the existing one, with the extra operands. OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPoint(warpOp); auto newWarpOp = WarpExecuteOnLane0Op::create( rewriter, warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(), warpOp.getArgs(), warpOp.getBody()->getArgumentTypes()); Region &opBody = warpOp.getBodyRegion(); Region &newOpBody = newWarpOp.getBodyRegion(); Block &newOpFirstBlock = newOpBody.front(); rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin()); rewriter.eraseBlock(&newOpFirstBlock); assert(newWarpOp.getWarpRegion().hasOneBlock() && "expected WarpOp with single block"); auto yield = cast(newOpBody.getBlocks().begin()->getTerminator()); rewriter.modifyOpInPlace( yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); }); return newWarpOp; } WarpExecuteOnLane0Op WarpDistributionPattern::moveRegionToNewWarpOpAndAppendReturns( RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp, ValueRange newYieldedValues, TypeRange newReturnTypes, SmallVector &indices) const { SmallVector types(warpOp.getResultTypes().begin(), warpOp.getResultTypes().end()); gpu::YieldOp yield = warpOp.getTerminator(); SmallVector yieldValues(yield.getOperands().begin(), yield.getOperands().end()); llvm::SmallDenseMap indexLookup; // Record the value -> first index mapping for faster lookup. for (auto [i, v] : llvm::enumerate(yieldValues)) { if (!indexLookup.count(v)) indexLookup[v] = i; } for (auto [value, type] : llvm::zip_equal(newYieldedValues, newReturnTypes)) { // If the value already exists in the yield, don't create a new output. if (indexLookup.count(value)) { indices.push_back(indexLookup[value]); } else { // If the value is new, add it to the yield and to the types. yieldValues.push_back(value); types.push_back(type); indices.push_back(yieldValues.size() - 1); } } WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns( rewriter, warpOp, yieldValues, types); rewriter.replaceOp(warpOp, newWarpOp.getResults().take_front(warpOp.getNumResults())); return newWarpOp; } OpOperand *WarpDistributionPattern::getWarpResult( WarpExecuteOnLane0Op warpOp, llvm::function_ref fn) const { gpu::YieldOp yield = warpOp.getTerminator(); for (OpOperand &yieldOperand : yield->getOpOperands()) { Value yieldValues = yieldOperand.get(); Operation *definedOp = yieldValues.getDefiningOp(); if (definedOp && fn(definedOp)) { if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty()) return &yieldOperand; } } return nullptr; } bool WarpDistributionPattern::delinearizeLaneId( OpBuilder &builder, Location loc, ArrayRef originalShape, ArrayRef distributedShape, int64_t warpSize, Value laneId, SmallVectorImpl &delinearizedIds) const { // If the original shape and the distributed shape is the same, we don't // distribute at all--every thread is handling the whole. For such case, we // should not rely on lane IDs later. So just return an empty lane ID vector. if (originalShape == distributedShape) { delinearizedIds.clear(); return true; } SmallVector sizes; for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) { if (large % small != 0) return false; sizes.push_back(large / small); } if (std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies()) != warpSize) return false; AffineExpr s0, s1; bindSymbols(builder.getContext(), s0, s1); int64_t usedThreads = 1; Value zero = arith::ConstantIndexOp::create(builder, loc, 0); delinearizedIds.assign(sizes.size(), zero); for (int i = sizes.size() - 1; i >= 0; --i) { usedThreads *= sizes[i]; if (usedThreads == warpSize) { // We've used up all available threads. Don't need to perform modulo // anymore. And we can stop the calculation for further dimensions. delinearizedIds[i] = laneId; break; } delinearizedIds[i] = affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId}); laneId = affine::makeComposedAffineApply( builder, loc, s0.floorDiv(usedThreads), {laneId}); } return true; }