diff options
Diffstat (limited to 'mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp')
-rw-r--r-- | mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp | 74 |
1 files changed, 36 insertions, 38 deletions
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp index 844e66e..badd2f6 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp @@ -25,9 +25,7 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/IRMapping.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -#include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/Passes.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/Support/Debug.h" #include <optional> @@ -84,8 +82,8 @@ static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) { // Get a Value that corresponds to the loop step. If the step is an attribute, // materialize a corresponding constant using builder. static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { - return builder.create<arith::ConstantIndexOp>(forOp.getLoc(), - forOp.getStepAsInt()); + return arith::ConstantIndexOp::create(builder, forOp.getLoc(), + forOp.getStepAsInt()); } // Get a Value for the loop lower bound. If the value requires computation, @@ -190,12 +188,12 @@ AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) { return std::nullopt; } - Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(), - upperBound, lowerBound); + Value range = arith::SubIOp::create(builder, currentLoop.getLoc(), + upperBound, lowerBound); Value step = getOrCreateStep(currentLoop, builder); if (getConstantIntValue(step) != static_cast<int64_t>(1)) - range = - builder.create<arith::CeilDivSIOp>(currentLoop.getLoc(), range, step); + range = arith::CeilDivSIOp::create(builder, currentLoop.getLoc(), range, + step); dims.push_back(range); lbs.push_back(lowerBound); @@ -221,7 +219,7 @@ void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, // no loop mapped to a specific dimension, use constant "1" as its size. Value constOne = (numBlockDims < 3 || numThreadDims < 3) - ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1) + ? arith::ConstantIndexOp::create(builder, rootForOp.getLoc(), 1) : nullptr; Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne; Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne; @@ -232,9 +230,9 @@ void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, // Create a launch op and move the body region of the innermost loop to the // launch op. - auto launchOp = builder.create<gpu::LaunchOp>( - rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX, - blockSizeY, blockSizeZ); + auto launchOp = + gpu::LaunchOp::create(builder, rootForOp.getLoc(), gridSizeX, gridSizeY, + gridSizeZ, blockSizeX, blockSizeY, blockSizeZ); // Replace the loop terminator (loops contain only a single block) with the // gpu terminator and move the operations from the loop body block to the gpu @@ -244,7 +242,7 @@ void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, Location terminatorLoc = terminator.getLoc(); terminator.erase(); builder.setInsertionPointToEnd(innermostForOp.getBody()); - builder.create<gpu::TerminatorOp>(terminatorLoc, TypeRange()); + gpu::TerminatorOp::create(builder, terminatorLoc, TypeRange()); launchOp.getBody().front().getOperations().splice( launchOp.getBody().front().begin(), innermostForOp.getBody()->getOperations()); @@ -263,10 +261,10 @@ void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp, : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims); Value step = steps[en.index()]; if (getConstantIntValue(step) != static_cast<int64_t>(1)) - id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id); + id = arith::MulIOp::create(builder, rootForOp.getLoc(), step, id); Value ivReplacement = - builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id); + arith::AddIOp::create(builder, rootForOp.getLoc(), *lbArgumentIt, id); en.value().replaceAllUsesWith(ivReplacement); std::advance(lbArgumentIt, 1); std::advance(stepArgumentIt, 1); @@ -319,8 +317,8 @@ static Value deriveStaticUpperBound(Value upperBound, if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) { for (const AffineExpr &result : minOp.getMap().getResults()) { if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) { - return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(), - constExpr.getValue()); + return arith::ConstantIndexOp::create(rewriter, minOp.getLoc(), + constExpr.getValue()); } } } @@ -344,8 +342,8 @@ static Value deriveStaticUpperBound(Value upperBound, if ((lhs.value() < 0) != (rhs.value() < 0)) return {}; - return rewriter.create<arith::ConstantIndexOp>( - multiplyOp.getLoc(), lhs.value() * rhs.value()); + return arith::ConstantIndexOp::create(rewriter, multiplyOp.getLoc(), + lhs.value() * rhs.value()); } } @@ -422,8 +420,8 @@ static LogicalResult processParallelLoop( if (launchIndependent(val)) return val; if (auto constOp = val.getDefiningOp<arith::ConstantOp>()) - return rewriter.create<arith::ConstantOp>(constOp.getLoc(), - constOp.getValue()); + return arith::ConstantOp::create(rewriter, constOp.getLoc(), + constOp.getValue()); return {}; }; @@ -453,8 +451,8 @@ static LogicalResult processParallelLoop( 1, 2, rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + rewriter.getAffineSymbolExpr(1)); - newIndex = rewriter.create<AffineApplyOp>( - loc, annotation.getMap().compose(lowerAndStep), + newIndex = AffineApplyOp::create( + rewriter, loc, annotation.getMap().compose(lowerAndStep), ValueRange{operand, ensureLaunchIndependent(step), ensureLaunchIndependent(lowerBound)}); // If there was also a bound, insert that, too. @@ -498,8 +496,8 @@ static LogicalResult processParallelLoop( 1, 2, ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0)) .ceilDiv(rewriter.getAffineSymbolExpr(1)))); - Value launchBound = rewriter.create<AffineApplyOp>( - loc, annotation.getBound().compose(stepMap), + Value launchBound = AffineApplyOp::create( + rewriter, loc, annotation.getBound().compose(stepMap), ValueRange{ ensureLaunchIndependent( cloningMap.lookupOrDefault(upperBound)), @@ -517,10 +515,10 @@ static LogicalResult processParallelLoop( if (!boundIsPrecise) { // We are using an approximation, create a surrounding conditional. Value originalBound = std::get<3>(config); - arith::CmpIOp pred = rewriter.create<arith::CmpIOp>( - loc, arith::CmpIPredicate::slt, newIndex, + arith::CmpIOp pred = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::slt, newIndex, cloningMap.lookupOrDefault(originalBound)); - scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false); + scf::IfOp ifOp = scf::IfOp::create(rewriter, loc, pred, false); rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); // Put a sentinel into the worklist so we know when to pop out of the // if body again. We use the launchOp here, as that cannot be part of @@ -530,10 +528,10 @@ static LogicalResult processParallelLoop( } } else { // Create a sequential for loop. - auto loopOp = rewriter.create<scf::ForOp>( - loc, cloningMap.lookupOrDefault(lowerBound), - cloningMap.lookupOrDefault(upperBound), - cloningMap.lookupOrDefault(step)); + auto loopOp = scf::ForOp::create(rewriter, loc, + cloningMap.lookupOrDefault(lowerBound), + cloningMap.lookupOrDefault(upperBound), + cloningMap.lookupOrDefault(step)); newIndex = loopOp.getInductionVar(); rewriter.setInsertionPointToStart(loopOp.getBody()); // Put a sentinel into the worklist so we know when to pop out of the loop @@ -608,12 +606,12 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, // sizes. Those will be refined later as we discover them from mappings. Location loc = parallelOp.getLoc(); Value constantOne = - rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1); - gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>( - parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne, - constantOne, constantOne); + arith::ConstantIndexOp::create(rewriter, parallelOp.getLoc(), 1); + gpu::LaunchOp launchOp = gpu::LaunchOp::create( + rewriter, parallelOp.getLoc(), constantOne, constantOne, constantOne, + constantOne, constantOne, constantOne); rewriter.setInsertionPointToEnd(&launchOp.getBody().front()); - rewriter.create<gpu::TerminatorOp>(loc); + gpu::TerminatorOp::create(rewriter, loc); rewriter.setInsertionPointToStart(&launchOp.getBody().front()); IRMapping cloningMap; @@ -667,7 +665,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, if (externalValues.size()) return failure(); // Replace by gpu.all_reduce. - auto gpuRedOp = rewriter.create<gpu::AllReduceOp>(loc, newValue); + auto gpuRedOp = gpu::AllReduceOp::create(rewriter, loc, newValue); cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult()); // Copy region. rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(), |