Diffstat (limited to 'mlir/lib')
29 files changed, 1498 insertions, 480 deletions
diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp index 5ddb3fb..0f0ed22 100644 --- a/mlir/lib/Bindings/Python/Rewrite.cpp +++ b/mlir/lib/Bindings/Python/Rewrite.cpp @@ -205,7 +205,7 @@ public: nb::object res = f(opView, PyPatternRewriter(rewriter)); return logicalResultFromObject(res); }; - MlirRewritePattern pattern = mlirOpRewritePattenCreate( + MlirRewritePattern pattern = mlirOpRewritePatternCreate( rootName, benefit, ctx, callbacks, matchAndRewrite.ptr(), /* nGeneratedNames */ 0, /* generatedNames */ nullptr); diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp index 46c329d..41ceb15 100644 --- a/mlir/lib/CAPI/Transforms/Rewrite.cpp +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -341,7 +341,7 @@ private: } // namespace mlir -MlirRewritePattern mlirOpRewritePattenCreate( +MlirRewritePattern mlirOpRewritePatternCreate( MlirStringRef rootName, unsigned benefit, MlirContext context, MlirRewritePatternCallbacks callbacks, void *userData, size_t nGeneratedNames, MlirStringRef *generatedNames) { diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index b215211..c03f3a5 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -484,5 +484,5 @@ void mlir::populateGpuToROCDLConversionPatterns( GPUSubgroupBroadcastOpToROCDL>(converter); patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset); - populateMathToROCDLConversionPatterns(converter, patterns); + populateMathToROCDLConversionPatterns(converter, patterns, chipset); } diff --git a/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt b/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt index 2771955a..8cc3fde 100644 --- a/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt +++ b/mlir/lib/Conversion/MathToROCDL/CMakeLists.txt @@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRMathToROCDL Core LINK_LIBS PUBLIC + MLIRAMDGPUUtils MLIRDialectUtils MLIRFuncDialect MLIRGPUToGPURuntimeTransforms diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp index df219f3..a2dfc12 100644 --- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp +++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp @@ -10,6 +10,8 @@ #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/LLVMCommon/VectorPattern.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" @@ -19,6 +21,7 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/DebugLog.h" #include "../GPUCommon/GPUOpsLowering.h" #include "../GPUCommon/OpToFuncCallLowering.h" @@ -42,8 +45,46 @@ static void populateOpPatterns(const LLVMTypeConverter &converter, f32ApproxFunc, f16Func); } +struct ClampFOpConversion final + : public ConvertOpToLLVMPattern<math::ClampFOp> { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(math::ClampFOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Only f16 and f32 types are supported by fmed3 + Type opTy = op.getType(); + Type resultType = getTypeConverter()->convertType(opTy); + + if (auto vectorType = 
dyn_cast<VectorType>(opTy)) + opTy = vectorType.getElementType(); + + if (!isa<Float16Type, Float32Type>(opTy)) + return rewriter.notifyMatchFailure( + op, "fmed3 only supports f16 and f32 types"); + + // Handle multi-dimensional vectors (converted to LLVM arrays) + if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(resultType)) + return LLVM::detail::handleMultidimensionalVectors( + op.getOperation(), adaptor.getOperands(), *getTypeConverter(), + [&](Type llvm1DVectorTy, ValueRange operands) -> Value { + typename math::ClampFOp::Adaptor adaptor(operands); + return ROCDL::FMed3Op::create(rewriter, op.getLoc(), llvm1DVectorTy, + adaptor.getValue(), adaptor.getMin(), + adaptor.getMax()); + }, + rewriter); + + // Handle 1D vectors and scalars directly + rewriter.replaceOpWithNewOp<ROCDL::FMed3Op>(op, op.getType(), op.getValue(), + op.getMin(), op.getMax()); + return success(); + } +}; + void mlir::populateMathToROCDLConversionPatterns( - const LLVMTypeConverter &converter, RewritePatternSet &patterns) { + const LLVMTypeConverter &converter, RewritePatternSet &patterns, + std::optional<amdgpu::Chipset> chipset) { // Handled by mathToLLVM: math::AbsIOp // Handled by mathToLLVM: math::AbsFOp // Handled by mathToLLVM: math::CopySignOp @@ -118,15 +159,21 @@ void mlir::populateMathToROCDLConversionPatterns( // worth creating a separate pass for it. populateOpPatterns<arith::RemFOp>(converter, patterns, "__ocml_fmod_f32", "__ocml_fmod_f64", "__ocml_fmod_f16"); + + if (chipset.has_value() && chipset->majorVersion >= 9) { + patterns.add<ClampFOpConversion>(converter); + } else { + LDBG() << "Chipset dependent patterns were not added"; + } } -namespace { -struct ConvertMathToROCDLPass - : public impl::ConvertMathToROCDLBase<ConvertMathToROCDLPass> { - ConvertMathToROCDLPass() = default; +struct ConvertMathToROCDLPass final + : impl::ConvertMathToROCDLBase<ConvertMathToROCDLPass> { + using impl::ConvertMathToROCDLBase< + ConvertMathToROCDLPass>::ConvertMathToROCDLBase; + void runOnOperation() override; }; -} // namespace void ConvertMathToROCDLPass::runOnOperation() { auto m = getOperation(); @@ -135,10 +182,21 @@ void ConvertMathToROCDLPass::runOnOperation() { RewritePatternSet patterns(&getContext()); LowerToLLVMOptions options(ctx, DataLayout(m)); LLVMTypeConverter converter(ctx, options); - populateMathToROCDLConversionPatterns(converter, patterns); + + FailureOr<amdgpu::Chipset> maybeChipset; + if (!chipset.empty()) { + maybeChipset = amdgpu::Chipset::parse(chipset); + if (failed(maybeChipset)) + return signalPassFailure(); + } + populateMathToROCDLConversionPatterns( + converter, patterns, + succeeded(maybeChipset) ? 
std::optional(*maybeChipset) : std::nullopt); + ConversionTarget target(getContext()); - target.addLegalDialect<BuiltinDialect, func::FuncDialect, - vector::VectorDialect, LLVM::LLVMDialect>(); + target + .addLegalDialect<BuiltinDialect, func::FuncDialect, vector::VectorDialect, + LLVM::LLVMDialect, ROCDL::ROCDLDialect>(); target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp, diff --git a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt index 84b2580..dd9edc4 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt +++ b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt @@ -21,6 +21,7 @@ add_mlir_conversion_library(MLIRXeGPUToXeVM MLIRIndexDialect MLIRSCFDialect MLIRXeGPUDialect + MLIRXeGPUUtils MLIRPass MLIRTransforms MLIRSCFTransforms diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp index ddcbc44..fcbf66d 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -22,6 +22,7 @@ #include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/STLExtras.h" @@ -63,6 +64,7 @@ static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) { case xegpu::MemorySpace::SLM: return static_cast<int>(xevm::AddrSpace::SHARED); } + llvm_unreachable("Unknown XeGPU memory space"); } // Get same bitwidth flat vector type of new element type. @@ -186,6 +188,7 @@ class CreateNdDescToXeVMPattern int64_t rank = mixedSizes.size(); if (rank != 2) return rewriter.notifyMatchFailure(op, "Expected 2D shape."); + auto sourceTy = source.getType(); auto sourceMemrefTy = dyn_cast<MemRefType>(sourceTy); // If source is a memref, we need to extract the aligned pointer as index. @@ -364,10 +367,11 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> { // Add a builder that creates // offset * elemByteSize + baseAddr -static Value addOffset(ConversionPatternRewriter &rewriter, Location loc, - Value baseAddr, Value offset, int64_t elemByteSize) { +static Value addOffsetToBaseAddr(ConversionPatternRewriter &rewriter, + Location loc, Value baseAddr, Value offset, + int64_t elemByteSize) { Value byteSize = arith::ConstantIntOp::create( - rewriter, loc, rewriter.getI64Type(), elemByteSize); + rewriter, loc, baseAddr.getType(), elemByteSize); Value byteOffset = arith::MulIOp::create(rewriter, loc, offset, byteSize); Value newAddr = arith::AddIOp::create(rewriter, loc, baseAddr, byteOffset); return newAddr; @@ -443,7 +447,8 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> { // If offset is provided, we add them to the base pointer. // Offset is in number of elements, we need to multiply by // element byte size. - basePtrI64 = addOffset(rewriter, loc, basePtrI64, offset, elemByteSize); + basePtrI64 = + addOffsetToBaseAddr(rewriter, loc, basePtrI64, offset, elemByteSize); } // Convert base pointer (i64) to LLVM pointer type. Value basePtrLLVM = @@ -506,6 +511,147 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> { } }; +// Lower xegpu::CreateMemDescOp to memref::ViewOp. 
Since SLM access instructions +// on Xe2 and Xe3 operate on 32-bit or 64-bit units, all data types smaller than +// 32 bits will be converted to 32 bits. +class CreateMemDescOpPattern final + : public OpConversionPattern<xegpu::CreateMemDescOp> { +public: + using OpConversionPattern<xegpu::CreateMemDescOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::CreateMemDescOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + auto resTy = op.getMemDesc(); + + // Create the result MemRefType with the same shape, element type, and + // memory space + auto newResTy = getTypeConverter()->convertType<MemRefType>(resTy); + + Value zero = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 0); + auto viewOp = memref::ViewOp::create(rewriter, op.getLoc(), newResTy, + op.getSource(), zero, ValueRange()); + rewriter.replaceOp(op, viewOp); + return success(); + } +}; + +template <typename OpType, + typename = std::enable_if_t<llvm::is_one_of< + OpType, xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>::value>> +class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> { + using OpConversionPattern<OpType>::OpConversionPattern; + LogicalResult + matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + SmallVector<OpFoldResult> offsets = op.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(op, "Expected offset to be provided."); + + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + Value basePtrStruct = adaptor.getMemDesc(); + Value mdescVal = op.getMemDesc(); + // Load result or Store value Type can be vector or scalar. + Value data; + if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) + data = op.getResult(); + else + data = adaptor.getData(); + VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType()); + if (!valOrResVecTy) + valOrResVecTy = VectorType::get(1, data.getType()); + + int64_t elemBitWidth = + valOrResVecTy.getElementType().getIntOrFloatBitWidth(); + // Element type must be multiple of 8 bits. + if (elemBitWidth % 8 != 0) + return rewriter.notifyMatchFailure( + op, "Expected element type bit width to be multiple of 8."); + int64_t elemByteSize = elemBitWidth / 8; + + // Default memory space is SLM. 
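// (Note: per getNumericXeVMAddrSpace above, xegpu::MemorySpace::SLM maps to
// xevm::AddrSpace::SHARED, so this pointer lives in the shared address space.)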
+ LLVM::LLVMPointerType ptrTypeLLVM = LLVM::LLVMPointerType::get( + ctxt, getNumericXeVMAddrSpace(xegpu::MemorySpace::SLM)); + + auto mdescTy = cast<xegpu::MemDescType>(mdescVal.getType()); + + Value basePtrLLVM = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, loc, basePtrStruct); + + // Convert base pointer (ptr) to i32 + Value basePtrI32 = arith::IndexCastUIOp::create( + rewriter, loc, rewriter.getI32Type(), basePtrLLVM); + + Value linearOffset = mdescTy.getLinearOffsets(rewriter, loc, offsets); + linearOffset = arith::IndexCastUIOp::create( + rewriter, loc, rewriter.getI32Type(), linearOffset); + basePtrI32 = addOffsetToBaseAddr(rewriter, loc, basePtrI32, linearOffset, + elemByteSize); + + // convert base pointer (i32) to LLVM pointer type + basePtrLLVM = + LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI32); + + if (op.getSubgroupBlockIoAttr()) { + // if the attribute 'subgroup_block_io' is set to true, it lowers to + // xevm.blockload + + Type intElemTy = rewriter.getIntegerType(elemBitWidth); + VectorType intVecTy = + VectorType::get(valOrResVecTy.getShape(), intElemTy); + + if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) { + Value loadOp = + xevm::BlockLoadOp::create(rewriter, loc, intVecTy, basePtrLLVM); + if (intVecTy != valOrResVecTy) { + loadOp = + vector::BitCastOp::create(rewriter, loc, valOrResVecTy, loadOp); + } + rewriter.replaceOp(op, loadOp); + } else { + Value dataToStore = adaptor.getData(); + if (valOrResVecTy != intVecTy) { + dataToStore = + vector::BitCastOp::create(rewriter, loc, intVecTy, dataToStore); + } + xevm::BlockStoreOp::create(rewriter, loc, basePtrLLVM, dataToStore, + nullptr); + rewriter.eraseOp(op); + } + return success(); + } + + if (valOrResVecTy.getNumElements() >= 1) { + auto chipOpt = xegpu::getChipStr(op); + if (!chipOpt || (*chipOpt != "pvc" && *chipOpt != "bmg")) { + // the lowering for chunk load only works for pvc and bmg + return rewriter.notifyMatchFailure( + op, "The lowering is specific to pvc or bmg."); + } + } + + if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) { + // if the size of valOrResVecTy is 1, it lowers to a scalar load/store + // operation. LLVM load/store does not support vector of size 1, so we + // need to handle this case separately. + auto scalarTy = valOrResVecTy.getElementType(); + LLVM::LoadOp loadOp; + if (valOrResVecTy.getNumElements() == 1) + loadOp = LLVM::LoadOp::create(rewriter, loc, scalarTy, basePtrLLVM); + else + loadOp = + LLVM::LoadOp::create(rewriter, loc, valOrResVecTy, basePtrLLVM); + rewriter.replaceOp(op, loadOp); + } else { + LLVM::StoreOp::create(rewriter, loc, adaptor.getData(), basePtrLLVM); + rewriter.eraseOp(op); + } + return success(); + } +}; + class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> { using OpConversionPattern::OpConversionPattern; LogicalResult @@ -548,8 +694,8 @@ class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> { op, "Expected element type bit width to be multiple of 8."); elemByteSize = elemBitWidth / 8; } - basePtrI64 = - addOffset(rewriter, loc, basePtrI64, offsets, elemByteSize); + basePtrI64 = addOffsetToBaseAddr(rewriter, loc, basePtrI64, offsets, + elemByteSize); } } // Default memory space is global. 
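A minimal standalone sketch of the address arithmetic the matrix load/store lowering above emits, assuming a hypothetical row-major linearization (the actual offset comes from MemDescType::getLinearOffsets, and the byte-size constant is now created in the same type as the base pointer, i32 for SLM and i64 for the global paths):

#include <cstdint>

// base + linearOffset * elemByteSize, as built by addOffsetToBaseAddr.
static uint32_t slmElementAddress(uint32_t base, uint32_t row, uint32_t col,
                                  uint32_t numCols, uint32_t elemByteSize) {
  uint32_t linearOffset = row * numCols + col; // hypothetical row-major layout
  return base + linearOffset * elemByteSize;
}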
@@ -786,6 +932,13 @@ struct ConvertXeGPUToXeVMPass auto i32Type = IntegerType::get(&getContext(), 32); return VectorType::get(8, i32Type); }); + // Convert MemDescType into flattened MemRefType for SLM + typeConverter.addConversion([&](xegpu::MemDescType type) -> Type { + Type elemTy = type.getElementType(); + int numElems = type.getNumElements(); + return MemRefType::get(numElems, elemTy, AffineMap(), 3); + }); + typeConverter.addConversion([&](MemRefType type) -> Type { // Convert MemRefType to i64 type. return IntegerType::get(&getContext(), 64); @@ -940,6 +1093,9 @@ void mlir::populateXeGPUToXeVMConversionPatterns( LoadStoreToXeVMPattern<xegpu::LoadGatherOp>, LoadStoreToXeVMPattern<xegpu::StoreScatterOp>>( typeConverter, patterns.getContext()); + patterns.add<LoadStoreMatrixToXeVMPattern<xegpu::LoadMatrixOp>, + LoadStoreMatrixToXeVMPattern<xegpu::StoreMatrixOp>, + CreateMemDescOpPattern>(typeConverter, patterns.getContext()); patterns.add<FenceToXeVMPattern, DpasToXeVMPattern>(typeConverter, patterns.getContext()); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index c798adb..61166db 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -339,6 +339,25 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns( } //===----------------------------------------------------------------------===// +// ScaledExtPacked816Op +//===----------------------------------------------------------------------===// +LogicalResult ScaledExtPacked816Op::verify() { + int blockSize = getBlockSize(); + assert((blockSize == 16 || blockSize == 32) && "invalid block size"); + int firstScaleByte = getFirstScaleByte(); + if (blockSize == 16 && !llvm::is_contained({0, 1}, firstScaleByte)) { + return emitOpError( + "blockSize of 16 can only have firstScaleByte be 0 or 1."); + } + if (blockSize == 32 && !llvm::is_contained({0, 2}, firstScaleByte)) { + return emitOpError( + "blockSize of 32 can only have firstScaleByte be 0 or 2."); + } + + return success(); +} + +//===----------------------------------------------------------------------===// // WMMAOp //===----------------------------------------------------------------------===// LogicalResult WMMAOp::verify() { diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 749e2ba..e0a53cd 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -2600,6 +2600,65 @@ static LogicalResult foldLoopBounds(AffineForOp forOp) { return success(folded); } +/// Returns constant trip count in trivial cases. +static std::optional<uint64_t> getTrivialConstantTripCount(AffineForOp forOp) { + int64_t step = forOp.getStepAsInt(); + if (!forOp.hasConstantBounds() || step <= 0) + return std::nullopt; + int64_t lb = forOp.getConstantLowerBound(); + int64_t ub = forOp.getConstantUpperBound(); + return ub - lb <= 0 ? 0 : (ub - lb + step - 1) / step; +} + +/// Fold the empty loop. +static SmallVector<OpFoldResult> AffineForEmptyLoopFolder(AffineForOp forOp) { + if (!llvm::hasSingleElement(*forOp.getBody())) + return {}; + if (forOp.getNumResults() == 0) + return {}; + std::optional<uint64_t> tripCount = getTrivialConstantTripCount(forOp); + if (tripCount == 0) { + // The initial values of the iteration arguments would be the op's + // results. 
+ return forOp.getInits(); + } + SmallVector<Value, 4> replacements; + auto yieldOp = cast<AffineYieldOp>(forOp.getBody()->getTerminator()); + auto iterArgs = forOp.getRegionIterArgs(); + bool hasValDefinedOutsideLoop = false; + bool iterArgsNotInOrder = false; + for (unsigned i = 0, e = yieldOp->getNumOperands(); i < e; ++i) { + Value val = yieldOp.getOperand(i); + BlockArgument *iterArgIt = llvm::find(iterArgs, val); + // TODO: It should be possible to perform a replacement by computing the + // last value of the IV based on the bounds and the step. + if (val == forOp.getInductionVar()) + return {}; + if (iterArgIt == iterArgs.end()) { + // `val` is defined outside of the loop. + assert(forOp.isDefinedOutsideOfLoop(val) && + "must be defined outside of the loop"); + hasValDefinedOutsideLoop = true; + replacements.push_back(val); + } else { + unsigned pos = std::distance(iterArgs.begin(), iterArgIt); + if (pos != i) + iterArgsNotInOrder = true; + replacements.push_back(forOp.getInits()[pos]); + } + } + // Bail out when the trip count is unknown and the loop returns any value + // defined outside of the loop or any iterArg out of order. + if (!tripCount.has_value() && + (hasValDefinedOutsideLoop || iterArgsNotInOrder)) + return {}; + // Bail out when the loop iterates more than once and it returns any iterArg + // out of order. + if (tripCount.has_value() && tripCount.value() >= 2 && iterArgsNotInOrder) + return {}; + return llvm::to_vector_of<OpFoldResult>(replacements); +} + /// Canonicalize the bounds of the given loop. static LogicalResult canonicalizeLoopBounds(AffineForOp forOp) { SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands()); @@ -2631,79 +2690,30 @@ static LogicalResult canonicalizeLoopBounds(AffineForOp forOp) { return success(); } -namespace { -/// Returns constant trip count in trivial cases. -static std::optional<uint64_t> getTrivialConstantTripCount(AffineForOp forOp) { - int64_t step = forOp.getStepAsInt(); - if (!forOp.hasConstantBounds() || step <= 0) - return std::nullopt; - int64_t lb = forOp.getConstantLowerBound(); - int64_t ub = forOp.getConstantUpperBound(); - return ub - lb <= 0 ? 0 : (ub - lb + step - 1) / step; +/// Returns true if the affine.for has zero iterations in trivial cases. +static bool hasTrivialZeroTripCount(AffineForOp op) { + return getTrivialConstantTripCount(op) == 0; } -/// This is a pattern to fold trivially empty loop bodies. -/// TODO: This should be moved into the folding hook. -struct AffineForEmptyLoopFolder : public OpRewritePattern<AffineForOp> { - using OpRewritePattern<AffineForOp>::OpRewritePattern; - - LogicalResult matchAndRewrite(AffineForOp forOp, - PatternRewriter &rewriter) const override { - // Check that the body only contains a yield. - if (!llvm::hasSingleElement(*forOp.getBody())) - return failure(); - if (forOp.getNumResults() == 0) - return success(); - std::optional<uint64_t> tripCount = getTrivialConstantTripCount(forOp); - if (tripCount == 0) { - // The initial values of the iteration arguments would be the op's - // results. 
- rewriter.replaceOp(forOp, forOp.getInits()); - return success(); - } - SmallVector<Value, 4> replacements; - auto yieldOp = cast<AffineYieldOp>(forOp.getBody()->getTerminator()); - auto iterArgs = forOp.getRegionIterArgs(); - bool hasValDefinedOutsideLoop = false; - bool iterArgsNotInOrder = false; - for (unsigned i = 0, e = yieldOp->getNumOperands(); i < e; ++i) { - Value val = yieldOp.getOperand(i); - auto *iterArgIt = llvm::find(iterArgs, val); - // TODO: It should be possible to perform a replacement by computing the - // last value of the IV based on the bounds and the step. - if (val == forOp.getInductionVar()) - return failure(); - if (iterArgIt == iterArgs.end()) { - // `val` is defined outside of the loop. - assert(forOp.isDefinedOutsideOfLoop(val) && - "must be defined outside of the loop"); - hasValDefinedOutsideLoop = true; - replacements.push_back(val); - } else { - unsigned pos = std::distance(iterArgs.begin(), iterArgIt); - if (pos != i) - iterArgsNotInOrder = true; - replacements.push_back(forOp.getInits()[pos]); - } - } - // Bail out when the trip count is unknown and the loop returns any value - // defined outside of the loop or any iterArg out of order. - if (!tripCount.has_value() && - (hasValDefinedOutsideLoop || iterArgsNotInOrder)) - return failure(); - // Bail out when the loop iterates more than once and it returns any iterArg - // out of order. - if (tripCount.has_value() && tripCount.value() >= 2 && iterArgsNotInOrder) - return failure(); - rewriter.replaceOp(forOp, replacements); - return success(); +LogicalResult AffineForOp::fold(FoldAdaptor adaptor, + SmallVectorImpl<OpFoldResult> &results) { + bool folded = succeeded(foldLoopBounds(*this)); + folded |= succeeded(canonicalizeLoopBounds(*this)); + if (hasTrivialZeroTripCount(*this) && getNumResults() != 0) { + // The initial values of the loop-carried variables (iter_args) are the + // results of the op. But this must be avoided for an affine.for op that + // does not return any results. Since ops that do not return results cannot + // be folded away, we would enter an infinite loop of folds on the same + // affine.for op. + results.assign(getInits().begin(), getInits().end()); + folded = true; } -}; -} // namespace - -void AffineForOp::getCanonicalizationPatterns(RewritePatternSet &results, - MLIRContext *context) { - results.add<AffineForEmptyLoopFolder>(context); + SmallVector<OpFoldResult> foldResults = AffineForEmptyLoopFolder(*this); + if (!foldResults.empty()) { + results.assign(foldResults); + folded = true; + } + return success(folded); } OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) { @@ -2746,27 +2756,6 @@ void AffineForOp::getSuccessorRegions( regions.push_back(RegionSuccessor(getResults())); } -/// Returns true if the affine.for has zero iterations in trivial cases. -static bool hasTrivialZeroTripCount(AffineForOp op) { - return getTrivialConstantTripCount(op) == 0; -} - -LogicalResult AffineForOp::fold(FoldAdaptor adaptor, - SmallVectorImpl<OpFoldResult> &results) { - bool folded = succeeded(foldLoopBounds(*this)); - folded |= succeeded(canonicalizeLoopBounds(*this)); - if (hasTrivialZeroTripCount(*this) && getNumResults() != 0) { - // The initial values of the loop-carried variables (iter_args) are the - // results of the op. But this must be avoided for an affine.for op that - // does not return any results. Since ops that do not return results cannot - // be folded away, we would enter an infinite loop of folds on the same - // affine.for op. 
- results.assign(getInits().begin(), getInits().end()); - folded = true; - } - return success(folded); -} - AffineBound AffineForOp::getLowerBound() { return AffineBound(*this, getLowerBoundOperands(), getLowerBoundMap()); } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp b/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp index 70faa71..bc17990 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/DropEquivalentBufferResults.cpp @@ -41,18 +41,37 @@ namespace bufferization { using namespace mlir; -/// Return the unique ReturnOp that terminates `funcOp`. -/// Return nullptr if there is no such unique ReturnOp. -static func::ReturnOp getAssumedUniqueReturnOp(func::FuncOp funcOp) { - func::ReturnOp returnOp; +/// Get all the ReturnOp in the funcOp. +static SmallVector<func::ReturnOp> getReturnOps(func::FuncOp funcOp) { + SmallVector<func::ReturnOp> returnOps; for (Block &b : funcOp.getBody()) { if (auto candidateOp = dyn_cast<func::ReturnOp>(b.getTerminator())) { - if (returnOp) - return nullptr; - returnOp = candidateOp; + returnOps.push_back(candidateOp); } } - return returnOp; + return returnOps; +} + +/// Get the operands at the specified position for all returnOps. +static SmallVector<Value> +getReturnOpsOperandInPos(ArrayRef<func::ReturnOp> returnOps, size_t pos) { + return llvm::map_to_vector(returnOps, [&](func::ReturnOp returnOp) { + return returnOp.getOperand(pos); + }); +} + +/// Check if all given values are the same buffer as the block argument (modulo +/// cast ops). +static bool operandsEqualFuncArgument(ArrayRef<Value> operands, + BlockArgument argument) { + for (Value val : operands) { + while (auto castOp = val.getDefiningOp<memref::CastOp>()) + val = castOp.getSource(); + + if (val != argument) + return false; + } + return true; } LogicalResult @@ -72,40 +91,45 @@ mlir::bufferization::dropEquivalentBufferResults(ModuleOp module) { for (auto funcOp : module.getOps<func::FuncOp>()) { if (funcOp.isExternal() || funcOp.isPublic()) continue; - func::ReturnOp returnOp = getAssumedUniqueReturnOp(funcOp); - // TODO: Support functions with multiple blocks. - if (!returnOp) + SmallVector<func::ReturnOp> returnOps = getReturnOps(funcOp); + if (returnOps.empty()) continue; // Compute erased results. 
- SmallVector<Value> newReturnValues; - BitVector erasedResultIndices(funcOp.getFunctionType().getNumResults()); + size_t numReturnOps = returnOps.size(); + size_t numReturnValues = funcOp.getFunctionType().getNumResults(); + SmallVector<SmallVector<Value>> newReturnValues(numReturnOps); + BitVector erasedResultIndices(numReturnValues); DenseMap<int64_t, int64_t> resultToArgs; - for (const auto &it : llvm::enumerate(returnOp.getOperands())) { + for (size_t i = 0; i < numReturnValues; ++i) { bool erased = false; + SmallVector<Value> returnOperands = + getReturnOpsOperandInPos(returnOps, i); for (BlockArgument bbArg : funcOp.getArguments()) { - Value val = it.value(); - while (auto castOp = val.getDefiningOp<memref::CastOp>()) - val = castOp.getSource(); - - if (val == bbArg) { - resultToArgs[it.index()] = bbArg.getArgNumber(); + if (operandsEqualFuncArgument(returnOperands, bbArg)) { + resultToArgs[i] = bbArg.getArgNumber(); erased = true; break; } } if (erased) { - erasedResultIndices.set(it.index()); + erasedResultIndices.set(i); } else { - newReturnValues.push_back(it.value()); + for (auto [newReturnValue, operand] : + llvm::zip(newReturnValues, returnOperands)) { + newReturnValue.push_back(operand); + } } } // Update function. if (failed(funcOp.eraseResults(erasedResultIndices))) return failure(); - returnOp.getOperandsMutable().assign(newReturnValues); + + for (auto [returnOp, newReturnValue] : + llvm::zip(returnOps, newReturnValues)) + returnOp.getOperandsMutable().assign(newReturnValue); // Update function calls. for (func::CallOp callOp : callerMap[funcOp]) { diff --git a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt index 70a9c77..ec68acf 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRGPUPipelines GPUToNVVMPipeline.cpp + GPUToXeVMPipeline.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU @@ -11,12 +12,17 @@ add_mlir_dialect_library(MLIRGPUPipelines MLIRTransforms MLIRLinalgTransforms MLIRAffineToStandard + MLIRGPUToLLVMSPV MLIRGPUToNVVMTransforms MLIRIndexToLLVM MLIRMathToLLVM + MLIRMathToXeVM MLIRNVGPUToNVVM MLIRNVVMToLLVM MLIRReconcileUnrealizedCasts MLIRSCFToControlFlow MLIRVectorToSCF + MLIRXeGPUTransforms + MLIRXeGPUToXeVM + MLIRXeVMToLLVM ) diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp new file mode 100644 index 0000000..1a1485b --- /dev/null +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp @@ -0,0 +1,139 @@ +//===- GPUToXeVMPipeline.cpp - Lowering pipeline to XeVM/LLVM -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass for testing the lowering to XeVM as a generally +// usable sink pass. If XeGPU ops are used, it expects the MLIR code to have +// XeGPU ops already embedded in gpu code. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/MathToXeVM/MathToXeVM.h" +#include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" +#include "mlir/Conversion/XeGPUToXeVM/XeGPUToXeVM.h" +#include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Pipelines/Passes.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/LLVMIR/Transforms/RequestCWrappers.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Target/LLVM/XeVM/Target.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { +//===----------------------------------------------------------------------===// +// Pre-GPU common pipeline for both Host and GPU. +//===----------------------------------------------------------------------===// +void buildPreGPUCommonPassPipeline( + OpPassManager &pm, const mlir::gpu::GPUToXeVMPipelineOptions &options) { + // builtin.module scope passes. + pm.addPass(createCSEPass()); + pm.addPass(createConvertVectorToSCFPass()); + { + GpuXeVMAttachTargetOptions xevmTargetOptions; + xevmTargetOptions.moduleMatcher = options.xevmModuleMatcher; + xevmTargetOptions.triple = options.zebinTriple; + xevmTargetOptions.chip = options.zebinChip; + xevmTargetOptions.optLevel = options.optLevel; + xevmTargetOptions.cmdOptions = options.cmdOptions; + pm.addPass(createGpuXeVMAttachTarget(xevmTargetOptions)); + } + pm.addPass(createLowerAffinePass()); + pm.addNestedPass<func::FuncOp>(createGpuAsyncRegionPass()); +} + +//===----------------------------------------------------------------------===// +// GPUModule-specific stuff. 
+//===----------------------------------------------------------------------===// +void buildGPUPassPipeline(OpPassManager &pm, + const mlir::gpu::GPUToXeVMPipelineOptions &options) { + if (options.xegpuOpLevel == "workgroup") { + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUWgToSgDistribute()); + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUBlocking()); + pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + } + if (options.xegpuOpLevel == "subgroup" || + options.xegpuOpLevel == "workgroup") { + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUPropagateLayout()); + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute()); + pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createLoopInvariantCodeMotionPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUVectorLinearize()); + } + pm.addNestedPass<gpu::GPUModuleOp>(createConvertMathToXeVM()); + pm.addNestedPass<gpu::GPUModuleOp>(createConvertXeGPUToXeVMPass()); + { + ConvertGpuOpsToLLVMSPVOpsOptions gpuToLLVMSPVOptions; + gpuToLLVMSPVOptions.use64bitIndex = options.use64bitIndex; + pm.addNestedPass<gpu::GPUModuleOp>( + createConvertGpuOpsToLLVMSPVOps(gpuToLLVMSPVOptions)); + } + pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass()); + pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass()); +} + +//===----------------------------------------------------------------------===// +// Post-GPU pipeline for both Host and GPU. +//===----------------------------------------------------------------------===// +void buildPostGPUCommonPassPipeline( + OpPassManager &pm, const mlir::gpu::GPUToXeVMPipelineOptions &options) { + // builtin.module scope passes. + pm.addPass(createSCFToControlFlowPass()); + pm.addPass(memref::createExpandStridedMetadataPass()); + { + GpuToLLVMConversionPassOptions gpuToLLVMOptions; + gpuToLLVMOptions.hostBarePtrCallConv = options.hostBarePtrCallConv; + gpuToLLVMOptions.kernelBarePtrCallConv = options.kernelBarePtrCallConv; + pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMOptions)); + } + pm.addPass(createLowerAffinePass()); + pm.addPass(createConvertToLLVMPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); + // gpu-module-to-binary + { + GpuModuleToBinaryPassOptions gpuToModuleBinOptions; + gpuToModuleBinOptions.compilationTarget = options.binaryFormat; + gpuToModuleBinOptions.cmdOptions = options.cmdOptions; + pm.addPass(createGpuModuleToBinaryPass(gpuToModuleBinOptions)); + } +} +} // namespace + +void mlir::gpu::buildLowerToXeVMPassPipeline( + OpPassManager &pm, const GPUToXeVMPipelineOptions &options) { + // Pre-GPU common pipelines. + buildPreGPUCommonPassPipeline(pm, options); + + // GPUModule-specific stuff. + buildGPUPassPipeline(pm, options); + + // Post-GPU pipeline for both Host and GPU. + buildPostGPUCommonPassPipeline(pm, options); +} + +void mlir::gpu::registerGPUToXeVMPipeline() { + PassPipelineRegistration<GPUToXeVMPipelineOptions>( + "gpu-lower-to-xevm-pipeline", + "The default GPU to XeVM lowering pipeline. 
It starts by lowering GPU " + "code to the " + "specified compilation target (default is fatbin) then lowers the host " + "code.", + buildLowerToXeVMPassPipeline); +} diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 6192d79..9a8a63e 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2457,26 +2457,24 @@ transform::PadTilingInterfaceOp::apply(transform::TransformRewriter &rewriter, } // Set options. - TilingInterface paddedOp; PadTilingInterfaceOptions options; options.setPaddingValues(paddingValues) .setPaddingSizes(getMixedPaddingSizes()) .setPadToMultipleOf(getPadToMultipleOf()); - // Apply padding. - SmallVector<tensor::PadOp> newPadOps; - FailureOr<TilingInterface> maybePaddedOp = rewriteAsPaddedOp( - rewriter, cast<TilingInterface>(targetOp.getOperation()), options, - newPadOps); - if (failed(maybePaddedOp)) { + auto maybePadOps = rewriteAsPaddedOp( + rewriter, cast<TilingInterface>(targetOp.getOperation()), options); + if (failed(maybePadOps)) { auto diag = emitSilenceableError() << "failed to pad op"; diag.attachNote(target->getLoc()) << "target op"; return diag; } + const auto &[paddedOperands, paddedOp, slicedResults] = maybePadOps.value(); // Set transform results. - paddedOps.push_back(cast<TilingInterface>(maybePaddedOp->getOperation())); - padOps.append(newPadOps.begin(), newPadOps.end()); + paddedOps.push_back(paddedOp); + padOps.append(paddedOperands.begin(), paddedOperands.end()); + rewriter.replaceOp(targetOp.getOperation(), slicedResults); } results.set(cast<OpResult>(getPadded()), paddedOps); diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp index 0956c5d..3e787a2 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp @@ -95,10 +95,11 @@ static int64_t extractConstantMultiplier(AffineExpr expr) { /// - affine_map<(d0, d1) -> (d0 * 3 + d1)> /// In the future, more general interfaces can be devised to encode similar /// shape evolutions and map between an op and its operands. -SmallVector<OpFoldResult> linalg::computePaddedShape( - RewriterBase &rewriter, TypedValue<RankedTensorType> v, - AffineMap indexingMap, ArrayRef<OpFoldResult> indexingSizes, - const PadTilingInterfaceOptions &options) { +SmallVector<OpFoldResult> +linalg::computePaddedShape(OpBuilder &builder, TypedValue<RankedTensorType> v, + AffineMap indexingMap, + ArrayRef<OpFoldResult> indexingSizes, + const PadTilingInterfaceOptions &options) { Location loc = v.getLoc(); SmallVector<OpFoldResult> paddedShape; auto tensorType = cast<RankedTensorType>(v.getType()); @@ -109,7 +110,7 @@ SmallVector<OpFoldResult> linalg::computePaddedShape( // "Full-rank" padding specification. SmallVector<OpFoldResult> paddingSizes = - getFullRankPaddingSizes(rewriter, indexingSizes, options); + getFullRankPaddingSizes(builder, indexingSizes, options); // For each dimension in the operand's shape, iterate over indexingSizes and // add the various term contributions. 
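The next hunk's padToMultipleOf path builds ceilMap as d0.ceilDiv(s0) * s0, the usual round-up-to-a-multiple computation. A plain-integer sketch (assuming a non-negative size and a positive multiple):

#include <cstdint>

// Mirrors AffineMap::get(1, 1, d0.ceilDiv(s0) * s0).
static int64_t roundUpToMultiple(int64_t size, int64_t multiple) {
  return ((size + multiple - 1) / multiple) * multiple;
}

// E.g. with a multiple of 16 and an identity indexing map (constant
// multiplier 1), an indexing size of 20 rounds up to 32, the maximum
// accessed index term is 32 - 1 = 31, and the final padded size is
// 31 + 1 = 32.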
@@ -147,28 +148,27 @@ SmallVector<OpFoldResult> linalg::computePaddedShape( OpFoldResult paddingDimOfr; if (options.padToMultipleOf) { AffineExpr d0, s0; - bindDims(rewriter.getContext(), d0); - bindSymbols(rewriter.getContext(), s0); + bindDims(builder.getContext(), d0); + bindSymbols(builder.getContext(), s0); AffineMap ceilMap = AffineMap::get(1, 1, d0.ceilDiv(s0) * s0); AffineMap composedMap = projectedMap.compose(ceilMap); paddingDimOfr = affine::makeComposedFoldedAffineApply( - rewriter, loc, composedMap, - {indexingSizes[paddingDim], paddingSize}, + builder, loc, composedMap, {indexingSizes[paddingDim], paddingSize}, /*composeAffineMin=*/true); } else { // Otherwise just set to paddingSize. paddingDimOfr = affine::makeComposedFoldedAffineApply( - rewriter, loc, projectedMap, paddingSize); + builder, loc, projectedMap, paddingSize); } // Adjust for the maximum accessed index, which is (paddingSize - 1) * // multiplier. AffineExpr d0; - bindDims(rewriter.getContext(), d0); + bindDims(builder.getContext(), d0); int64_t multiplier = extractConstantMultiplier(projectedMap.getResult(0)); AffineMap subtractMap = AffineMap::get(1, 0, d0 - multiplier); OpFoldResult maxAccessIdx = affine::makeComposedFoldedAffineApply( - rewriter, loc, subtractMap, {paddingDimOfr}); + builder, loc, subtractMap, {paddingDimOfr}); terms.push_back(maxAccessIdx); LLVM_DEBUG(DBGS() << "------new term: " << terms.back() << "\n"); @@ -177,19 +177,19 @@ SmallVector<OpFoldResult> linalg::computePaddedShape( // If there are no terms, just return the dim. if (terms.empty()) { paddedShape[resultIndex] = - createFoldedDimOp(rewriter, loc, v, resultIndex); + createFoldedDimOp(builder, loc, v, resultIndex); continue; } // Sum individual terms' contributions. SmallVector<AffineExpr> dims(terms.size()); - bindDimsList(rewriter.getContext(), MutableArrayRef{dims}); + bindDimsList(builder.getContext(), MutableArrayRef{dims}); AffineExpr sumExpr = dims.front(); for (unsigned i = 1; i < dims.size(); ++i) sumExpr = sumExpr + dims[i]; // Add 1 to the maximum accessed index and get the final padded size. 
- OpFoldResult paddedDimOfr = affine::makeComposedFoldedAffineApply( - rewriter, loc, sumExpr + 1, terms); + OpFoldResult paddedDimOfr = + affine::makeComposedFoldedAffineApply(builder, loc, sumExpr + 1, terms); paddedShape[resultIndex] = paddedDimOfr; } @@ -198,7 +198,7 @@ SmallVector<OpFoldResult> linalg::computePaddedShape( FailureOr<SmallVector<OpFoldResult>> linalg::computeIndexingMapOpInterfacePaddedShape( - RewriterBase &rewriter, OpOperand &operandToPad, + OpBuilder &builder, OpOperand &operandToPad, ArrayRef<Range> iterationDomain, const PadTilingInterfaceOptions &options) { auto transferOp = llvm::dyn_cast<IndexingMapOpInterface>(operandToPad.getOwner()); @@ -206,9 +206,9 @@ linalg::computeIndexingMapOpInterfacePaddedShape( return failure(); // clang-format off - assert(llvm::all_of(iterationDomain, [&rewriter](Range r) { - return r.offset == OpFoldResult(rewriter.getIndexAttr(0)) && - r.stride == OpFoldResult(rewriter.getIndexAttr(1)); + assert(llvm::all_of(iterationDomain, [&builder](Range r) { + return r.offset == OpFoldResult(builder.getIndexAttr(0)) && + r.stride == OpFoldResult(builder.getIndexAttr(1)); }) && "expected 0-offset 1-stride loop ranges"); // clang-format on SmallVector<OpFoldResult> loopUpperBounds; @@ -218,13 +218,13 @@ linalg::computeIndexingMapOpInterfacePaddedShape( AffineMap indexingMap = transferOp.getMatchingIndexingMap(&operandToPad); return computePaddedShape( - rewriter, cast<TypedValue<RankedTensorType>>(operandToPad.get()), + builder, cast<TypedValue<RankedTensorType>>(operandToPad.get()), indexingMap, loopUpperBounds, options); } /// Pad a single operand to `paddedShape` using `paddingValueAttr` as padding /// Value. -static Value padOperand(RewriterBase &rewriter, TilingInterface opToPad, +static Value padOperand(OpBuilder &builder, TilingInterface opToPad, TypedValue<RankedTensorType> v, ArrayRef<OpFoldResult> paddedShape, Attribute paddingValueAttr) { @@ -232,15 +232,15 @@ static Value padOperand(RewriterBase &rewriter, TilingInterface opToPad, if (auto complexTy = dyn_cast<ComplexType>(getElementTypeOrSelf(v.getType()))) { if (auto complexAttr = dyn_cast<ArrayAttr>(paddingValueAttr)) { - paddingValue = complex::ConstantOp::create(rewriter, opToPad.getLoc(), + paddingValue = complex::ConstantOp::create(builder, opToPad.getLoc(), complexTy, complexAttr); } } else if (isa<ub::PoisonAttr>(paddingValueAttr)) { - paddingValue = ub::PoisonOp::create(rewriter, opToPad.getLoc(), + paddingValue = ub::PoisonOp::create(builder, opToPad.getLoc(), getElementTypeOrSelf(v.getType())); } else if (auto typedAttr = dyn_cast<TypedAttr>(paddingValueAttr)) { paddingValue = - arith::ConstantOp::create(rewriter, opToPad.getLoc(), typedAttr); + arith::ConstantOp::create(builder, opToPad.getLoc(), typedAttr); } assert(paddingValue && "failed to create value from padding attribute"); @@ -259,49 +259,48 @@ static Value padOperand(RewriterBase &rewriter, TilingInterface opToPad, RankedTensorType::get(tensorShape, getElementTypeOrSelf(v)); LLVM_DEBUG(DBGS() << "--SUCCESS, makeComposedPadHighOp with type: " << paddedTensorType); - return makeComposedPadHighOp(rewriter, opToPad.getLoc(), paddedTensorType, v, + return makeComposedPadHighOp(builder, opToPad.getLoc(), paddedTensorType, v, paddingValue, /*nofold=*/false, dynDims); } -FailureOr<TilingInterface> linalg::rewriteAsPaddedOp( - RewriterBase &rewriter, TilingInterface opToPad, - const PadTilingInterfaceOptions &constOptions, - SmallVector<tensor::PadOp> &padOps, +FailureOr<PadTilingInterfaceResult> 
linalg::rewriteAsPaddedOp( + OpBuilder &builder, TilingInterface toPad, + PadTilingInterfaceOptions options, const PadSizeComputationFunction &computePaddingSizeFun) { - LLVM_DEBUG(DBGS() << "Start rewriteAsPaddedOp : " << opToPad << "\n"); + LLVM_DEBUG(DBGS() << "Start rewriteAsPaddedOp : " << toPad << "\n"); + SmallVector<tensor::PadOp> padOps; + Location loc = toPad.getLoc(); - Location loc = opToPad.getLoc(); - PadTilingInterfaceOptions options(constOptions); // Allow inference of pad values if they are not explicitly specified. // TODO: be mindful about the value depending on the actual operation. if (options.paddingValues.empty()) { - SmallVector<Type> types(opToPad->getOperandTypes()); - llvm::append_range(types, opToPad->getResultTypes()); + SmallVector<Type> types(toPad->getOperandTypes()); + llvm::append_range(types, toPad->getResultTypes()); for (Type t : types) { options.paddingValues.push_back( - rewriter.getZeroAttr(getElementTypeOrSelf(t))); + builder.getZeroAttr(getElementTypeOrSelf(t))); } } - if (llvm::any_of(opToPad->getOperands(), + if (llvm::any_of(toPad->getOperands(), [](Value v) { return isa<MemRefType>(v.getType()); })) { - return rewriter.notifyMatchFailure(opToPad, - "expected operation on tensors"); + LLVM_DEBUG(DBGS() << "Not an operation on tensors: FAIL\n"); + return failure(); } - OpBuilder::InsertionGuard g(rewriter); - // Set IP after opToPad because we also take the dims of opToPad's output. - rewriter.setInsertionPointAfter(opToPad); + OpBuilder::InsertionGuard g(builder); + // Set IP after toPad because we also take the dims of toPad's output. + builder.setInsertionPointAfter(toPad); // 1. Get the loopUpperBounds from the TilingInterface. - SmallVector<Range> iterationDomain = opToPad.getIterationDomain(rewriter); + SmallVector<Range> iterationDomain = toPad.getIterationDomain(builder); // 2. For each operand. SmallVector<Value> newOperands; - newOperands.reserve(opToPad->getNumOperands()); - for (OpOperand &opOperand : opToPad->getOpOperands()) { + newOperands.reserve(toPad->getNumOperands()); + for (OpOperand &opOperand : toPad->getOpOperands()) { Value operand = opOperand.get(); - LLVM_DEBUG(DBGS() << "--start padding oprd: " << operand << "\n"); + LLVM_DEBUG(DBGS() << "--start padding operand: " << operand << "\n"); // 2.a. Skip scalar-like operands. Type operandType = operand.getType(); @@ -311,30 +310,31 @@ FailureOr<TilingInterface> linalg::rewriteAsPaddedOp( newOperands.push_back(operand); continue; } + // 2.a. Compute padded shape. FailureOr<SmallVector<OpFoldResult>> maybePaddedShape = - computePaddingSizeFun(rewriter, opOperand, iterationDomain, options); + computePaddingSizeFun(builder, opOperand, iterationDomain, options); if (failed(maybePaddedShape)) { - return rewriter.notifyMatchFailure(opToPad, "could not pad op"); + LLVM_DEBUG(DBGS() << "Could not get padded shape of operand: FAIL\n"); + return failure(); } // 2.b. Expect proper `paddingValues`. // TODO: we may want to allow garbage padding in the future, in which case // we would just not assert. if (opOperand.getOperandNumber() >= options.paddingValues.size()) { - return rewriter.notifyMatchFailure(opToPad, - "--no padding value specified"); + LLVM_DEBUG(DBGS() << "Too few padding values specified: FAIL\n"); + return failure(); } Attribute paddingValueAttr = options.paddingValues[opOperand.getOperandNumber()]; // 2.c. Perform actual padding. 
- Value paddedOperand = padOperand( - rewriter, opToPad, cast<TypedValue<RankedTensorType>>(operand), - *maybePaddedShape, paddingValueAttr); + Value paddedOperand = + padOperand(builder, toPad, cast<TypedValue<RankedTensorType>>(operand), + *maybePaddedShape, paddingValueAttr); LLVM_DEBUG(DBGS() << "--done padding operand: " << paddedOperand << "\n"); - // 2.d. Perform actual padding. newOperands.push_back(paddedOperand); if (auto padOp = paddedOperand.getDefiningOp<tensor::PadOp>()) padOps.push_back(padOp); @@ -342,38 +342,34 @@ FailureOr<TilingInterface> linalg::rewriteAsPaddedOp( // 3. Form the resulting tensor::ExtractSliceOp. ReifiedRankedShapedTypeDims reifiedResultShapes; - if (failed(reifyResultShapes(rewriter, opToPad, reifiedResultShapes))) { - LLVM_DEBUG(DBGS() << "--failed to reify result shapes -> FAIL\n"); - return rewriter.notifyMatchFailure(opToPad, - "failed to reify result shapes"); + if (failed(reifyResultShapes(builder, toPad, reifiedResultShapes))) { + LLVM_DEBUG(DBGS() << "Failed to reify result shapes: FAIL\n"); + return failure(); } - assert(reifiedResultShapes.size() == opToPad->getNumResults() && + assert(reifiedResultShapes.size() == toPad->getNumResults() && "expected same number of results"); - // Clone `opToPad` to operate on the statically padded shapes. + // Clone `toPad` to operate on the statically padded shapes. auto resultTensorTypes = - ValueRange(newOperands).take_back(opToPad->getNumResults()).getTypes(); - // clone **should** properly notify the rewriter. + ValueRange(newOperands).take_back(toPad->getNumResults()).getTypes(); + // clone **should** properly notify the builder. TilingInterface paddedOp = - clone(rewriter, opToPad, resultTensorTypes, newOperands); + clone(builder, toPad, resultTensorTypes, newOperands); LLVM_DEBUG(DBGS() << "--cloned padded op: " << paddedOp << "\n"); - // Recover the slice out of the new static results. This keeps the original - // opToPad around because it uses the dims of the original results. + // Recover the slice out of the new static results. 
SmallVector<Value> paddedSubtensorResults; - paddedSubtensorResults.reserve(opToPad->getNumResults()); + paddedSubtensorResults.reserve(toPad->getNumResults()); for (const auto &en : llvm::enumerate(paddedOp->getResults())) { Value paddedResult = en.value(); int64_t resultNumber = en.index(); int64_t rank = cast<RankedTensorType>(paddedResult.getType()).getRank(); - SmallVector<OpFoldResult> offsets(rank, rewriter.getIndexAttr(0)); - SmallVector<OpFoldResult> strides(rank, rewriter.getIndexAttr(1)); + SmallVector<OpFoldResult> offsets(rank, builder.getIndexAttr(0)); + SmallVector<OpFoldResult> strides(rank, builder.getIndexAttr(1)); paddedSubtensorResults.push_back(tensor::ExtractSliceOp::create( - rewriter, loc, paddedResult, offsets, reifiedResultShapes[resultNumber], + builder, loc, paddedResult, offsets, reifiedResultShapes[resultNumber], strides)); } - rewriter.replaceOp(opToPad, paddedSubtensorResults); - - return paddedOp; + return PadTilingInterfaceResult{padOps, paddedOp, paddedSubtensorResults}; } diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 507597b..94947b7 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -2158,11 +2158,45 @@ public: return success(); } }; + +struct ReinterpretCastOpConstantFolder + : public OpRewritePattern<ReinterpretCastOp> { +public: + using OpRewritePattern<ReinterpretCastOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(ReinterpretCastOp op, + PatternRewriter &rewriter) const override { + unsigned srcStaticCount = llvm::count_if( + llvm::concat<OpFoldResult>(op.getMixedOffsets(), op.getMixedSizes(), + op.getMixedStrides()), + [](OpFoldResult ofr) { return isa<Attribute>(ofr); }); + + SmallVector<OpFoldResult> offsets = {op.getConstifiedMixedOffset()}; + SmallVector<OpFoldResult> sizes = op.getConstifiedMixedSizes(); + SmallVector<OpFoldResult> strides = op.getConstifiedMixedStrides(); + + // TODO: Using counting comparison instead of direct comparison because + // getMixedValues (and therefore ReinterpretCastOp::getMixed...) returns + // IntegerAttrs, while constifyIndexValues (and therefore + // ReinterpretCastOp::getConstifiedMixed...) returns IndexAttrs. + if (srcStaticCount == + llvm::count_if(llvm::concat<OpFoldResult>(offsets, sizes, strides), + [](OpFoldResult ofr) { return isa<Attribute>(ofr); })) + return failure(); + + auto newReinterpretCast = ReinterpretCastOp::create( + rewriter, op->getLoc(), op.getSource(), offsets[0], sizes, strides); + + rewriter.replaceOpWithNewOp<CastOp>(op, op.getType(), newReinterpretCast); + return success(); + } +}; } // namespace void ReinterpretCastOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add<ReinterpretCastOpExtractStridedMetadataFolder>(context); + results.add<ReinterpretCastOpExtractStridedMetadataFolder, + ReinterpretCastOpConstantFolder>(context); } FailureOr<std::optional<SmallVector<Value>>> diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp index 49b7162..6f815ae 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp @@ -121,7 +121,7 @@ struct EmulateWideIntPass final [&typeConverter](Operation *op) { return typeConverter.isLegal(op); }); RewritePatternSet patterns(ctx); - // Add common pattenrs to support contants, functions, etc. + // Add common patterns to support contants, functions, etc. 
arith::populateArithWideIntEmulationPatterns(typeConverter, patterns); memref::populateMemRefWideIntEmulationPatterns(typeConverter, patterns); diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 642ced9..dcfe2c7 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -40,6 +40,16 @@ static bool isScalarLikeType(Type type) { return type.isIntOrIndexOrFloat() || isa<ComplexType>(type); } +/// Helper function to attach the `VarName` attribute to an operation +/// if a variable name is provided. +static void attachVarNameAttr(Operation *op, OpBuilder &builder, + StringRef varName) { + if (!varName.empty()) { + auto varNameAttr = acc::VarNameAttr::get(builder.getContext(), varName); + op->setAttr(acc::getVarNameAttrName(), varNameAttr); + } +} + struct MemRefPointerLikeModel : public PointerLikeType::ExternalModel<MemRefPointerLikeModel, MemRefType> { @@ -83,7 +93,9 @@ struct MemRefPointerLikeModel // then we can generate an alloca operation. if (memrefTy.hasStaticShape()) { needsFree = false; // alloca doesn't need deallocation - return memref::AllocaOp::create(builder, loc, memrefTy).getResult(); + auto allocaOp = memref::AllocaOp::create(builder, loc, memrefTy); + attachVarNameAttr(allocaOp, builder, varName); + return allocaOp.getResult(); } // For dynamic memrefs, extract sizes from the original variable if @@ -103,8 +115,10 @@ struct MemRefPointerLikeModel // Static dimensions are handled automatically by AllocOp } needsFree = true; // alloc needs deallocation - return memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes) - .getResult(); + auto allocOp = + memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes); + attachVarNameAttr(allocOp, builder, varName); + return allocOp.getResult(); } // TODO: Unranked not yet supported. @@ -1016,12 +1030,12 @@ struct RemoveConstantIfConditionWithRegion : public OpRewritePattern<OpTy> { //===----------------------------------------------------------------------===// /// Create and populate an init region for privatization recipes. -/// Returns the init block on success, or nullptr on failure. +/// Returns success if the region is populated, failure otherwise. /// Sets needsFree to indicate if the allocated memory requires deallocation. 
-static std::unique_ptr<Block> createInitRegion(OpBuilder &builder, Location loc, - Type varType, StringRef varName, - ValueRange bounds, - bool &needsFree) { +static LogicalResult createInitRegion(OpBuilder &builder, Location loc, + Region &initRegion, Type varType, + StringRef varName, ValueRange bounds, + bool &needsFree) { // Create init block with arguments: original value + bounds SmallVector<Type> argTypes{varType}; SmallVector<Location> argLocs{loc}; @@ -1030,9 +1044,9 @@ static std::unique_ptr<Block> createInitRegion(OpBuilder &builder, Location loc, argLocs.push_back(loc); } - auto initBlock = std::make_unique<Block>(); + Block *initBlock = builder.createBlock(&initRegion); initBlock->addArguments(argTypes, argLocs); - builder.setInsertionPointToStart(initBlock.get()); + builder.setInsertionPointToStart(initBlock); Value privatizedValue; @@ -1046,7 +1060,7 @@ static std::unique_ptr<Block> createInitRegion(OpBuilder &builder, Location loc, privatizedValue = mappableTy.generatePrivateInit( builder, loc, typedVar, varName, bounds, {}, needsFree); if (!privatizedValue) - return nullptr; + return failure(); } else { assert(isa<PointerLikeType>(varType) && "Expected PointerLikeType"); auto pointerLikeTy = cast<PointerLikeType>(varType); @@ -1054,21 +1068,21 @@ static std::unique_ptr<Block> createInitRegion(OpBuilder &builder, Location loc, privatizedValue = pointerLikeTy.genAllocate(builder, loc, varName, varType, blockArgVar, needsFree); if (!privatizedValue) - return nullptr; + return failure(); } // Add yield operation to init block acc::YieldOp::create(builder, loc, privatizedValue); - return initBlock; + return success(); } /// Create and populate a copy region for firstprivate recipes. -/// Returns the copy block on success, or nullptr on failure. +/// Returns success if the region is populated, failure otherwise. /// TODO: Handle MappableType - it does not yet have a copy API. -static std::unique_ptr<Block> createCopyRegion(OpBuilder &builder, Location loc, - Type varType, - ValueRange bounds) { +static LogicalResult createCopyRegion(OpBuilder &builder, Location loc, + Region ©Region, Type varType, + ValueRange bounds) { // Create copy block with arguments: original value + privatized value + // bounds SmallVector<Type> copyArgTypes{varType, varType}; @@ -1078,16 +1092,16 @@ static std::unique_ptr<Block> createCopyRegion(OpBuilder &builder, Location loc, copyArgLocs.push_back(loc); } - auto copyBlock = std::make_unique<Block>(); + Block *copyBlock = builder.createBlock(©Region); copyBlock->addArguments(copyArgTypes, copyArgLocs); - builder.setInsertionPointToStart(copyBlock.get()); + builder.setInsertionPointToStart(copyBlock); bool isMappable = isa<MappableType>(varType); bool isPointerLike = isa<PointerLikeType>(varType); // TODO: Handle MappableType - it does not yet have a copy API. // Otherwise, for now just fallback to pointer-like behavior. if (isMappable && !isPointerLike) - return nullptr; + return failure(); // Generate copy region body based on variable type if (isPointerLike) { @@ -1099,21 +1113,20 @@ static std::unique_ptr<Block> createCopyRegion(OpBuilder &builder, Location loc, if (!pointerLikeTy.genCopy( builder, loc, cast<TypedValue<PointerLikeType>>(privatizedArg), cast<TypedValue<PointerLikeType>>(originalArg), varType)) - return nullptr; + return failure(); } // Add terminator to copy block acc::TerminatorOp::create(builder, loc); - return copyBlock; + return success(); } /// Create and populate a destroy region for privatization recipes. 
-/// Returns the destroy block on success, or nullptr if not needed. -static std::unique_ptr<Block> createDestroyRegion(OpBuilder &builder, - Location loc, Type varType, - Value allocRes, - ValueRange bounds) { +/// Returns success if the region is populated, failure otherwise. +static LogicalResult createDestroyRegion(OpBuilder &builder, Location loc, + Region &destroyRegion, Type varType, + Value allocRes, ValueRange bounds) { // Create destroy block with arguments: original value + privatized value + // bounds SmallVector<Type> destroyArgTypes{varType, varType}; @@ -1123,28 +1136,25 @@ static std::unique_ptr<Block> createDestroyRegion(OpBuilder &builder, destroyArgLocs.push_back(loc); } - auto destroyBlock = std::make_unique<Block>(); + Block *destroyBlock = builder.createBlock(&destroyRegion); destroyBlock->addArguments(destroyArgTypes, destroyArgLocs); - builder.setInsertionPointToStart(destroyBlock.get()); + builder.setInsertionPointToStart(destroyBlock); - bool isMappable = isa<MappableType>(varType); - bool isPointerLike = isa<PointerLikeType>(varType); - // TODO: Handle MappableType - it does not yet have a deallocation API. - // Otherwise, for now just fallback to pointer-like behavior. - if (isMappable && !isPointerLike) - return nullptr; - - assert(isa<PointerLikeType>(varType) && "Expected PointerLikeType"); - auto pointerLikeTy = cast<PointerLikeType>(varType); - auto privatizedArg = + auto varToFree = cast<TypedValue<PointerLikeType>>(destroyBlock->getArgument(1)); - // Pass allocRes to help determine the allocation type - if (!pointerLikeTy.genFree(builder, loc, privatizedArg, allocRes, varType)) - return nullptr; + if (isa<MappableType>(varType)) { + auto mappableTy = cast<MappableType>(varType); + if (!mappableTy.generatePrivateDestroy(builder, loc, varToFree)) + return failure(); + } else { + assert(isa<PointerLikeType>(varType) && "Expected PointerLikeType"); + auto pointerLikeTy = cast<PointerLikeType>(varType); + if (!pointerLikeTy.genFree(builder, loc, varToFree, allocRes, varType)) + return failure(); + } acc::TerminatorOp::create(builder, loc); - - return destroyBlock; + return success(); } } // namespace @@ -1206,40 +1216,33 @@ PrivateRecipeOp::createAndPopulate(OpBuilder &builder, Location loc, if (!isMappable && !isPointerLike) return std::nullopt; - // Create init and destroy blocks using shared helpers OpBuilder::InsertionGuard guard(builder); - // Save the original insertion point for creating the recipe operation later - auto originalInsertionPoint = builder.saveInsertionPoint(); + // Create the recipe operation first so regions have proper parent context + auto recipe = PrivateRecipeOp::create(builder, loc, recipeName, varType); + // Populate the init region bool needsFree = false; - auto initBlock = - createInitRegion(builder, loc, varType, varName, bounds, needsFree); - if (!initBlock) + if (failed(createInitRegion(builder, loc, recipe.getInitRegion(), varType, + varName, bounds, needsFree))) { + recipe.erase(); return std::nullopt; + } // Only create destroy region if the allocation needs deallocation - std::unique_ptr<Block> destroyBlock; if (needsFree) { // Extract the allocated value from the init block's yield operation - auto yieldOp = cast<acc::YieldOp>(initBlock->getTerminator()); + auto yieldOp = + cast<acc::YieldOp>(recipe.getInitRegion().front().getTerminator()); Value allocRes = yieldOp.getOperand(0); - destroyBlock = createDestroyRegion(builder, loc, varType, allocRes, bounds); - if (!destroyBlock) + if (failed(createDestroyRegion(builder, 
loc, recipe.getDestroyRegion(), + varType, allocRes, bounds))) { + recipe.erase(); return std::nullopt; + } } - // Now create the recipe operation at the original insertion point and attach - // the blocks - builder.restoreInsertionPoint(originalInsertionPoint); - auto recipe = PrivateRecipeOp::create(builder, loc, recipeName, varType); - - // Move the blocks into the recipe's regions - recipe.getInitRegion().push_back(initBlock.release()); - if (destroyBlock) - recipe.getDestroyRegion().push_back(destroyBlock.release()); - return recipe; } @@ -1285,45 +1288,40 @@ FirstprivateRecipeOp::createAndPopulate(OpBuilder &builder, Location loc, if (!isMappable && !isPointerLike) return std::nullopt; - // Create init, copy, and destroy blocks using shared helpers OpBuilder::InsertionGuard guard(builder); - // Save the original insertion point for creating the recipe operation later - auto originalInsertionPoint = builder.saveInsertionPoint(); + // Create the recipe operation first so regions have proper parent context + auto recipe = FirstprivateRecipeOp::create(builder, loc, recipeName, varType); + // Populate the init region bool needsFree = false; - auto initBlock = - createInitRegion(builder, loc, varType, varName, bounds, needsFree); - if (!initBlock) + if (failed(createInitRegion(builder, loc, recipe.getInitRegion(), varType, + varName, bounds, needsFree))) { + recipe.erase(); return std::nullopt; + } - auto copyBlock = createCopyRegion(builder, loc, varType, bounds); - if (!copyBlock) + // Populate the copy region + if (failed(createCopyRegion(builder, loc, recipe.getCopyRegion(), varType, + bounds))) { + recipe.erase(); return std::nullopt; + } // Only create destroy region if the allocation needs deallocation - std::unique_ptr<Block> destroyBlock; if (needsFree) { // Extract the allocated value from the init block's yield operation - auto yieldOp = cast<acc::YieldOp>(initBlock->getTerminator()); + auto yieldOp = + cast<acc::YieldOp>(recipe.getInitRegion().front().getTerminator()); Value allocRes = yieldOp.getOperand(0); - destroyBlock = createDestroyRegion(builder, loc, varType, allocRes, bounds); - if (!destroyBlock) + if (failed(createDestroyRegion(builder, loc, recipe.getDestroyRegion(), + varType, allocRes, bounds))) { + recipe.erase(); return std::nullopt; + } } - // Now create the recipe operation at the original insertion point and attach - // the blocks - builder.restoreInsertionPoint(originalInsertionPoint); - auto recipe = FirstprivateRecipeOp::create(builder, loc, recipeName, varType); - - // Move the blocks into the recipe's regions - recipe.getInitRegion().push_back(initBlock.release()); - recipe.getCopyRegion().push_back(copyBlock.release()); - if (destroyBlock) - recipe.getDestroyRegion().push_back(destroyBlock.release()); - return recipe; } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 12e6475..7c019e7 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -2032,11 +2032,19 @@ struct WarpOpScfForOp : public WarpDistributionPattern { } // Newly created `WarpOp` will yield values in following order: - // 1. All init args of the `ForOp`. - // 2. All escaping values. - // 3. All non-`ForOp` yielded values. + // 1. Loop bounds. + // 2. All init args of the `ForOp`. + // 3. All escaping values. + // 4. All non-`ForOp` yielded values. 
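+    // Editor's illustration (hypothetical values, not from the patch): with
+    // init args %a0, %a1, one escaping value %e0 and one extra yielded value
+    // %v0, the new WarpOp yields [%lb, %ub, %step, %a0, %a1, %e0, %v0], so
+    // the three loop bounds always occupy result indices 0-2.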
SmallVector<Value> newWarpOpYieldValues; SmallVector<Type> newWarpOpDistTypes; + newWarpOpYieldValues.insert( + newWarpOpYieldValues.end(), + {forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep()}); + newWarpOpDistTypes.insert(newWarpOpDistTypes.end(), + {forOp.getLowerBound().getType(), + forOp.getUpperBound().getType(), + forOp.getStep().getType()}); for (auto [i, initArg] : llvm::enumerate(forOp.getInitArgs())) { newWarpOpYieldValues.push_back(initArg); // Compute the distributed type for this init arg. @@ -2072,20 +2080,24 @@ struct WarpOpScfForOp : public WarpDistributionPattern { // Next, we create a new `ForOp` with the init args yielded by the new // `WarpOp`. + const unsigned initArgsStartIdx = 3; // After loop bounds. const unsigned escapingValuesStartIdx = + initArgsStartIdx + forOp.getInitArgs().size(); // `ForOp` init args are positioned before // escaping values in the new `WarpOp`. SmallVector<Value> newForOpOperands; - for (size_t i = 0; i < escapingValuesStartIdx; ++i) + for (size_t i = initArgsStartIdx; i < escapingValuesStartIdx; ++i) newForOpOperands.push_back(newWarpOp.getResult(newIndices[i])); // Create a new `ForOp` outside the new `WarpOp` region. OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPointAfter(newWarpOp); auto newForOp = scf::ForOp::create( - rewriter, forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), - forOp.getStep(), newForOpOperands, /*bodyBuilder=*/nullptr, - forOp.getUnsignedCmp()); + rewriter, forOp.getLoc(), + /*lowerBound=*/newWarpOp.getResult(newIndices[0]), + /*upperBound=*/newWarpOp.getResult(newIndices[1]), + /*step=*/newWarpOp.getResult(newIndices[2]), newForOpOperands, + /*bodyBuilder=*/nullptr, forOp.getUnsignedCmp()); // Next, we insert a new `WarpOp` (called inner `WarpOp`) inside the // newly created `ForOp`. This `WarpOp` will contain all ops that were // contained within the original `ForOp` body. diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 1b656d8..ea93085 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -817,6 +817,50 @@ struct LinearizeVectorToElements final } }; +/// Convert broadcasts from scalars or 1-element vectors, such as +/// +/// ```mlir +/// vector.broadcast %value : f32 to vector<4x4xf32> +/// ``` +/// +/// to broadcasts to rank-1 vectors, with shape_casts before/after as needed.
+/// The above becomes, +/// +/// ```mlir +/// %out_1d = vector.broadcast %value : f32 to vector<16xf32> +/// %out_nd = vector.shape_cast %out_1d : vector<16xf32> to vector<4x4xf32> +/// ``` +struct LinearizeVectorBroadcast final + : public OpConversionPattern<vector::BroadcastOp> { + using Base::Base; + + LinearizeVectorBroadcast(const TypeConverter &typeConverter, + MLIRContext *context, PatternBenefit benefit = 1) + : OpConversionPattern(typeConverter, context, benefit) {} + + LogicalResult + matchAndRewrite(vector::BroadcastOp broadcastOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + int numElements = 1; + Type sourceType = broadcastOp.getSourceType(); + if (auto vecType = dyn_cast<VectorType>(sourceType)) { + numElements = vecType.getNumElements(); + } + + if (numElements != 1) { + return rewriter.notifyMatchFailure( + broadcastOp, "only broadcasts of single elements can be linearized."); + } + + auto dstTy = getTypeConverter()->convertType(broadcastOp.getType()); + rewriter.replaceOpWithNewOp<vector::BroadcastOp>(broadcastOp, dstTy, + adaptor.getSource()); + + return success(); + } +}; + } // namespace /// This method defines the set of operations that are linearizable, and hence @@ -909,8 +953,8 @@ void mlir::vector::populateVectorLinearizeBasePatterns( patterns .add<LinearizeConstantLike, LinearizeVectorizable, LinearizeVectorBitCast, LinearizeVectorCreateMask, LinearizeVectorLoad, LinearizeVectorStore, - LinearizeVectorFromElements, LinearizeVectorToElements>( - typeConverter, patterns.getContext()); + LinearizeVectorBroadcast, LinearizeVectorFromElements, + LinearizeVectorToElements>(typeConverter, patterns.getContext()); } void mlir::vector::populateVectorLinearizeShuffleLikeOpsPatterns( diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp index 025ee9a..c809c502 100644 --- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp +++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp @@ -91,7 +91,7 @@ mlir::vector::isTranspose2DSlice(vector::TransposeOp op) { // Check whether the two source vector dimensions that are greater than one // must be transposed with each other so that we can apply one of the 2-D - // transpose pattens. Otherwise, these patterns are not applicable. + // transpose patterns. Otherwise, these patterns are not applicable. 
if (!areDimsTransposedIn2DSlice(srcGtOneDims[0], srcGtOneDims[1], op.getPermutation())) return failure(); diff --git a/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp b/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp index 89b62a2..a514ea9 100644 --- a/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp +++ b/mlir/lib/Dialect/WasmSSA/IR/WasmSSAOps.cpp @@ -12,6 +12,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Diagnostics.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/Region.h" #include "mlir/IR/SymbolTable.h" @@ -39,28 +40,6 @@ void printElseRegion(OpAsmPrinter &opPrinter, Operation *op, opPrinter.printKeywordOrString("else "); opPrinter.printRegion(elseRegion); } - -ParseResult parseWasmVisibility(OpAsmParser &opParser, StringAttr &visibility) { - std::string keyword; - auto initLocation = opParser.getCurrentLocation(); - std::ignore = opParser.parseOptionalKeywordOrString(&keyword); - if (keyword == "nested" or keyword == "") { - visibility = StringAttr::get(opParser.getContext(), "nested"); - return ParseResult::success(); - } - - if (keyword == "public" || keyword == "private") { - visibility = StringAttr::get(opParser.getContext(), keyword); - return ParseResult::success(); - } - opParser.emitError(initLocation, "expecting symbol visibility"); - return ParseResult::failure(); -} - -void printWasmVisibility(OpAsmPrinter &opPrinter, Operation *op, - Attribute visibility) { - opPrinter.printKeywordOrString(cast<StringAttr>(visibility).strref()); -} } // namespace #define GET_OP_CLASSES @@ -167,10 +146,23 @@ Block *FuncOp::addEntryBlock() { void FuncOp::build(OpBuilder &odsBuilder, OperationState &odsState, StringRef symbol, FunctionType funcType) { - FuncOp::build(odsBuilder, odsState, symbol, funcType, {}, {}, "nested"); + FuncOp::build(odsBuilder, odsState, symbol, funcType, {}, {}); } ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) { + auto *ctx = parser.getContext(); + std::string visibilityString; + auto loc = parser.getNameLoc(); + ParseResult res = parser.parseOptionalKeywordOrString(&visibilityString); + bool exported{false}; + if (res.succeeded()) { + if (visibilityString != "exported") + return parser.emitError( + loc, "expecting either `exported` or symbol name, got ") + << visibilityString; + exported = true; + } + auto buildFuncType = [&parser](Builder &builder, ArrayRef<Type> argTypes, ArrayRef<Type> results, function_interface_impl::VariadicFlag, @@ -191,11 +183,13 @@ ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) { return builder.getFunctionType(argTypesWithoutLocal, results); }; - - return function_interface_impl::parseFunctionOp( + auto funcParseRes = function_interface_impl::parseFunctionOp( parser, result, /*allowVariadic=*/false, getFunctionTypeAttrName(result.name), buildFuncType, getArgAttrsAttrName(result.name), getResAttrsAttrName(result.name)); + if (exported) + result.addAttribute(getExportedAttrName(result.name), UnitAttr::get(ctx)); + return funcParseRes; } LogicalResult FuncOp::verifyBody() { @@ -224,9 +218,18 @@ LogicalResult FuncOp::verifyBody() { } void FuncOp::print(OpAsmPrinter &p) { + /// If exported, print the `exported` keyword first and mask the attribute /// while printing through the generic function interface.
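+  /// Editor's illustration (hypothetical assembly): an exported function now
+  /// round-trips as `wasmssa.func exported @my_func ...` instead of carrying
+  /// a `sym_visibility` string attribute as before.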
+ auto exported = getExported(); + if (exported) { + p << " exported"; + removeExportedAttr(); + } function_interface_impl::printFunctionOp( p, *this, /*isVariadic=*/false, getFunctionTypeAttrName(), getArgAttrsAttrName(), getResAttrsAttrName()); + if (exported) + setExported(true); } //===----------------------------------------------------------------------===// @@ -237,38 +240,37 @@ void FuncImportOp::build(OpBuilder &odsBuilder, OperationState &odsState, StringRef symbol, StringRef moduleName, StringRef importName, FunctionType type) { FuncImportOp::build(odsBuilder, odsState, symbol, moduleName, importName, - type, {}, {}, odsBuilder.getStringAttr("nested")); + type, {}, {}); } //===----------------------------------------------------------------------===// // GlobalOp //===----------------------------------------------------------------------===// - -void GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, Type type, bool isMutable) { - GlobalOp::build(odsBuilder, odsState, symbol, type, isMutable, - odsBuilder.getStringAttr("nested")); -} - // Custom formats ParseResult GlobalOp::parse(OpAsmParser &parser, OperationState &result) { StringAttr symbolName; Type globalType; auto *ctx = parser.getContext(); - ParseResult res = parser.parseSymbolName( - symbolName, SymbolTable::getSymbolAttrName(), result.attributes); + std::string visibilityString; + auto loc = parser.getNameLoc(); + ParseResult res = parser.parseOptionalKeywordOrString(&visibilityString); + if (res.succeeded()) { + if (visibilityString != "exported") + return parser.emitError( + loc, "expecting either `exported` or symbol name, got ") + << visibilityString; + result.addAttribute(getExportedAttrName(result.name), UnitAttr::get(ctx)); + } + res = parser.parseSymbolName(symbolName, SymbolTable::getSymbolAttrName(), + result.attributes); res = parser.parseType(globalType); result.addAttribute(getTypeAttrName(result.name), TypeAttr::get(globalType)); std::string mutableString; res = parser.parseOptionalKeywordOrString(&mutableString); if (res.succeeded() && mutableString == "mutable") result.addAttribute("isMutable", UnitAttr::get(ctx)); - std::string visibilityString; - res = parser.parseOptionalKeywordOrString(&visibilityString); - if (res.succeeded()) - result.addAttribute("sym_visibility", - StringAttr::get(ctx, visibilityString)); + res = parser.parseColon(); Region *globalInitRegion = result.addRegion(); res = parser.parseRegion(*globalInitRegion); @@ -276,11 +278,11 @@ ParseResult GlobalOp::parse(OpAsmParser &parser, OperationState &result) { } void GlobalOp::print(OpAsmPrinter &printer) { + if (getExported()) + printer << " exported"; printer << " @" << getSymName().str() << " " << getType(); if (getIsMutable()) printer << " mutable"; - if (auto vis = getSymVisibility()) - printer << " " << *vis; printer << " :"; Region &body = getRegion(); if (!body.empty()) { @@ -319,13 +321,6 @@ GlobalGetOp::verifySymbolUses(SymbolTableCollection &symbolTable) { // GlobalImportOp //===----------------------------------------------------------------------===// -void GlobalImportOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, StringRef moduleName, - StringRef importName, Type type, bool isMutable) { - GlobalImportOp::build(odsBuilder, odsState, symbol, moduleName, importName, - type, isMutable, odsBuilder.getStringAttr("nested")); -} - ParseResult GlobalImportOp::parse(OpAsmParser &parser, OperationState &result) { auto *ctx = parser.getContext(); ParseResult res =
parseImportOp(parser, result); @@ -335,12 +330,8 @@ ParseResult GlobalImportOp::parse(OpAsmParser &parser, OperationState &result) { res = parser.parseOptionalKeywordOrString(&mutableOrSymVisString); if (res.succeeded() && mutableOrSymVisString == "mutable") { result.addAttribute("isMutable", UnitAttr::get(ctx)); - res = parser.parseOptionalKeywordOrString(&mutableOrSymVisString); } - if (res.succeeded()) - result.addAttribute("sym_visibility", - StringAttr::get(ctx, mutableOrSymVisString)); res = parser.parseColon(); Type importedType; @@ -356,8 +347,6 @@ void GlobalImportOp::print(OpAsmPrinter &printer) { << "\" as @" << getSymName(); if (getIsMutable()) printer << " mutable"; - if (auto vis = getSymVisibility()) - printer << " " << *vis; printer << " : " << getType(); } @@ -431,27 +420,6 @@ LogicalResult LocalTeeOp::verify() { Block *LoopOp::getLabelTarget() { return &getBody().front(); } //===----------------------------------------------------------------------===// -// MemOp -//===----------------------------------------------------------------------===// - -void MemOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, LimitType limit) { - MemOp::build(odsBuilder, odsState, symbol, limit, - odsBuilder.getStringAttr("nested")); -} - -//===----------------------------------------------------------------------===// -// MemImportOp -//===----------------------------------------------------------------------===// - -void MemImportOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, StringRef moduleName, - StringRef importName, LimitType limits) { - MemImportOp::build(odsBuilder, odsState, symbol, moduleName, importName, - limits, odsBuilder.getStringAttr("nested")); -} - -//===----------------------------------------------------------------------===// // ReinterpretOp //===----------------------------------------------------------------------===// @@ -471,24 +439,3 @@ LogicalResult ReinterpretOp::verify() { //===----------------------------------------------------------------------===// void ReturnOp::build(OpBuilder &odsBuilder, OperationState &odsState) {} - -//===----------------------------------------------------------------------===// -// TableOp -//===----------------------------------------------------------------------===// - -void TableOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, TableType type) { - TableOp::build(odsBuilder, odsState, symbol, type, - odsBuilder.getStringAttr("nested")); -} - -//===----------------------------------------------------------------------===// -// TableImportOp -//===----------------------------------------------------------------------===// - -void TableImportOp::build(OpBuilder &odsBuilder, OperationState &odsState, - StringRef symbol, StringRef moduleName, - StringRef importName, TableType type) { - TableImportOp::build(odsBuilder, odsState, symbol, moduleName, importName, - type, odsBuilder.getStringAttr("nested")); -} diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 9beb22d..1599ae9 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -727,6 +727,152 @@ void MemLayoutAttr::print(AsmPrinter &printer) const { } printer << ">"; } +// a helper utility to perform binary operation on OpFoldResult. +// If both a and b are attributes, it will simply return the result. 
+// Otherwise, the corresponding arith op will be generated, and a +// constant op will be created if one of them is an attribute. +template <typename ArithOp> +OpFoldResult genBinOp(OpFoldResult a, OpFoldResult b, Location loc, + OpBuilder &builder) { + auto aVal = getValueOrCreateConstantIndexOp(builder, loc, a); + auto bVal = getValueOrCreateConstantIndexOp(builder, loc, b); + return builder.create<ArithOp>(loc, aVal, bVal).getResult(); +} + +// a helper utility to perform a division operation on OpFoldResult and int64_t. +#define div(a, b) \ + genBinOp<arith::DivSIOp>(a, builder.getIndexAttr(b), loc, builder) + +// a helper utility to perform a remainder operation on OpFoldResult and int64_t. +#define rem(a, b) \ + genBinOp<arith::RemSIOp>(a, builder.getIndexAttr(b), loc, builder) + +// a helper utility to perform a multiplication operation on OpFoldResult and int64_t. +#define mul(a, b) \ + genBinOp<arith::MulIOp>(a, builder.getIndexAttr(b), loc, builder) + +// a helper utility to perform an addition operation on two OpFoldResults. +#define add(a, b) genBinOp<arith::AddIOp>(a, b, loc, builder) + +// block the given offsets according to the block shape +// say the original offset is [y, x], and the block shape is [By, Bx], +// then the blocked offset is [y/By, x/Bx, y%By, x%Bx] +SmallVector<OpFoldResult> getBlockedOffsets(OpBuilder &builder, Location loc, + ArrayRef<OpFoldResult> offsets, + ArrayRef<int64_t> blockShape) { + + assert(offsets.size() == blockShape.size() && + "offsets and blockShape must have the same size"); + SmallVector<OpFoldResult> blockedOffsets; + SmallVector<OpFoldResult> divs, rems; + + for (auto [offset, block] : llvm::zip(offsets, blockShape)) { + divs.push_back(div(offset, block)); + rems.push_back(rem(offset, block)); + } + blockedOffsets.append(divs.begin(), divs.end()); + blockedOffsets.append(rems.begin(), rems.end()); + + return blockedOffsets; +} + +// Get strides as a vector of integers for MemDesc. +SmallVector<int64_t> MemDescType::getStrideShape() { + + SmallVector<int64_t> matrixShape(getShape().begin(), getShape().end()); + + ArrayAttr strideAttr = getStrideAttr(); + SmallVector<int64_t> strides; + for (Attribute attr : strideAttr.getValue()) { + strides.push_back(cast<IntegerAttr>(attr).getInt()); + } + + SmallVector<int64_t> innerBlkShape = getBlockShape(); + + // get perm from FCD to LCD + // perm[i] = the dim with i-th smallest stride + SmallVector<int, 4> perm = + llvm::to_vector<4>(llvm::seq<int>(0, strides.size())); + llvm::sort(perm, [&](int a, int b) { return strides[a] < strides[b]; }); + + assert(strides[perm[0]] == 1 && "innermost dim must have stride 1"); + + SmallVector<int64_t> innerBlkStride(innerBlkShape.size()); + innerBlkStride[perm[0]] = 1; + for (size_t i = 1; i < perm.size(); ++i) + innerBlkStride[perm[i]] = + innerBlkStride[perm[i - 1]] * innerBlkShape[perm[i - 1]]; + + // compute the original matrix shape using the stride info + // and compute the number of blocks in each dimension + // The shape of the highest dim can't be derived from stride info, + // but doesn't impact the stride computation for blocked layout.
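+  // Worked example (editor's illustration, values invented): a 32x64
+  // mem_desc with row-major strides [64, 1] and block shape [8, 16] gives
+  // perm = [1, 0] and innerBlkStride = [16, 1]; dim 1 holds 4 blocks of
+  // 128 elements each, so outerBlkStride = [512, 128] and the returned
+  // blocked strides are [512, 128, 16, 1].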
+ SmallVector<int64_t> matrixShapeOrig(matrixShape.size()); + SmallVector<int64_t> BlkShapeOrig(matrixShape.size()); + for (size_t i = 0; i < perm.size() - 1; ++i) { + matrixShapeOrig[perm[i]] = strides[perm[i + 1]] / strides[perm[i]]; + BlkShapeOrig[perm[i]] = matrixShapeOrig[perm[i]] / innerBlkShape[perm[i]]; + } + + int64_t innerBlkSize = 1; + for (auto s : innerBlkShape) + innerBlkSize *= s; + + SmallVector<int64_t> outerBlkStride(matrixShape.size()); + outerBlkStride[perm[0]] = innerBlkSize; + for (size_t i = 0; i < perm.size() - 1; ++i) { + outerBlkStride[perm[i + 1]] = + outerBlkStride[perm[i]] * BlkShapeOrig[perm[i]]; + } + + // combine the inner and outer strides + SmallVector<int64_t> blockedStrides; + blockedStrides.append(outerBlkStride.begin(), outerBlkStride.end()); + blockedStrides.append(innerBlkStride.begin(), innerBlkStride.end()); + + return blockedStrides; +} + +// Calculate the linear offset using the blocked offsets and stride +Value MemDescType::getLinearOffsets(OpBuilder &builder, Location loc, + ArrayRef<OpFoldResult> offsets) { + + SmallVector<int64_t> matrixShape(getShape().begin(), getShape().end()); + SmallVector<int64_t> blockShape = getBlockShape(); + SmallVector<int64_t> strides = getStrideShape(); + SmallVector<OpFoldResult> blockedOffsets; + + // blockshape equal to matrixshape means no blocking + if (llvm::equal(blockShape, matrixShape)) { + // remove the outer dims from strides + strides.erase(strides.begin(), strides.begin() + matrixShape.size()); + } else { + assert(offsets.size() == blockShape.size() && + "offsets and blockShape must have the same size"); + // say the original offset is [y, x], and the block shape is [By, Bx], + // then the blocked offset is [y/By, x/Bx, y%By, x%Bx] + + SmallVector<OpFoldResult> divs, rems; + + for (auto [offset, block] : llvm::zip(offsets, blockShape)) { + divs.push_back(div(offset, block)); + rems.push_back(rem(offset, block)); + } + blockedOffsets.append(divs.begin(), divs.end()); + blockedOffsets.append(rems.begin(), rems.end()); + offsets = blockedOffsets; + } + + // Start with initial value as matrix descriptor's base offset. 
+ Value linearOffset = arith::ConstantIndexOp::create(builder, loc, 0); + for (size_t i = 0; i < offsets.size(); ++i) { + OpFoldResult mulResult = mul(offsets[i], strides[i]); + Value mulVal = getValueOrCreateConstantIndexOp(builder, loc, mulResult); + linearOffset = arith::AddIOp::create(builder, loc, mulVal, linearOffset); + } + + return linearOffset; +} } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index e0a8ac4..abd12e2 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -173,6 +173,49 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, return success(); } +LogicalResult +isValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, + UnitAttr subgroup_block_io, + function_ref<InFlightDiagnostic()> emitError) { + + if (!dataTy) { + if (subgroup_block_io) + return emitError() << "subgroup_block_io " + "is only allowed when the result is a 1D VectorType."; + else + return success(); + } + + if (mdescTy.getRank() != 2) + return emitError() << "mem_desc must be 2D."; + + ArrayRef<int64_t> dataShape = dataTy.getShape(); + ArrayRef<int64_t> mdescShape = mdescTy.getShape(); + + if (dataShape.size() == 2) { + if (subgroup_block_io) + return emitError() << "subgroup_block_io " + "is only allowed when the result is a 1D VectorType."; + if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), + [](auto p) { return std::get<0>(p) > std::get<1>(p); })) + return emitError() << "data shape must not exceed mem_desc shape."; + } else { + SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); + // if the subgroup_block_io attribute is set, mdescTy must have a block + // attribute + if (subgroup_block_io && !blockShape.size()) + return emitError() << "mem_desc must have a block attribute when " + "subgroup_block_io is set."; + // if the subgroup_block_io attribute is set, the memdesc should be row + // major + if (subgroup_block_io && mdescTy.isColMajor()) + return emitError() << "mem_desc should be row major when " + "subgroup_block_io is set."; + } + + return success(); +} + //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// @@ -1049,23 +1092,20 @@ void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res, llvm::SmallVector<int64_t> staticOffsets; dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); + // Call the generated builder with all parameters (including optional ones as + // nullptr/empty) build(builder, state, res, memDesc, dynamicOffsets, staticOffsetsAttr, - layout); + /*subgroup_block_io=*/nullptr, layout); } LogicalResult LoadMatrixOp::verify() { - VectorType resTy = getRes().getType(); - MemDescType mdescTy = getMemDesc().getType(); - if (mdescTy.getRank() != 2) - return emitOpError("mem_desc must be 2D."); + auto resTy = dyn_cast<VectorType>(getRes().getType()); + UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); + MemDescType mdescTy = getMemDesc().getType(); - ArrayRef<int64_t> valueShape = resTy.getShape(); - ArrayRef<int64_t> mdescShape = mdescTy.getShape(); - if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape), - [](auto p) { return std::get<0>(p) > std::get<1>(p); })) - return emitOpError("result shape must not exceed mem_desc shape."); - return success(); + return isValidMatrixOpParams(resTy,
mdescTy, subgroup_block_io, + [&]() { return emitError(); }); } //===----------------------------------------------------------------------===// @@ -1080,57 +1120,16 @@ void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data, dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); build(builder, state, data, memDesc, dynamicOffsets, staticOffsetsAttr, - layout); + /*subgroup_block_io=*/nullptr, layout); } LogicalResult StoreMatrixOp::verify() { - VectorType dataTy = getData().getType(); - MemDescType mdescTy = getMemDesc().getType(); - - if (mdescTy.getRank() != 2) - return emitOpError("mem_desc must be 2D."); - - ArrayRef<int64_t> dataShape = dataTy.getShape(); - ArrayRef<int64_t> mdescShape = mdescTy.getShape(); - if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), - [](auto p) { return std::get<0>(p) > std::get<1>(p); })) - return emitOpError("data shape must not exceed mem_desc shape."); - - return success(); -} - -//===----------------------------------------------------------------------===// -// XeGPU_MemDescSubviewOp -//===----------------------------------------------------------------------===// - -void MemDescSubviewOp::build(OpBuilder &builder, OperationState &state, - Type resTy, Value src, - llvm::ArrayRef<OpFoldResult> offsets) { - llvm::SmallVector<Value> dynamicOffsets; - llvm::SmallVector<int64_t> staticOffsets; - dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); - build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr); -} - -LogicalResult MemDescSubviewOp::verify() { - MemDescType srcTy = getSrc().getType(); - MemDescType resTy = getRes().getType(); - ArrayRef<int64_t> srcShape = srcTy.getShape(); - ArrayRef<int64_t> resShape = resTy.getShape(); - if (srcTy.getRank() < resTy.getRank()) - return emitOpError("result rank must not exceed source rank."); - - if (llvm::any_of( - llvm::zip_equal(resShape, srcShape.take_back(resShape.size())), - [](auto p) { return std::get<0>(p) > std::get<1>(p); })) - return emitOpError("result shape must not exceed source shape."); - - if (srcTy.getStrides() != resTy.getStrides()) - return emitOpError("result must inherit the source strides."); - - return success(); + auto dataTy = dyn_cast<VectorType>(getData().getType()); + UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); + MemDescType mdescTy = getMemDesc().getType(); + return isValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io, + [&]() { return emitError(); }); } namespace mlir { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index a178d0f..aafa1b7 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -941,7 +941,9 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> { LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op, PatternRewriter &rewriter) const override { Location loc = op.getLoc(); - VectorType valueTy = op.getType(); + VectorType valueTy = llvm::dyn_cast<VectorType>(op.getType()); + assert(valueTy && "the value type must be a vector type!"); + std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op); if (!targetShape || targetShape->size() != (size_t)valueTy.getRank()) return failure(); @@ -984,7 +986,8 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> { return
failure(); Location loc = op.getLoc(); - VectorType valueTy = op.getData().getType(); + VectorType valueTy = llvm::dyn_cast<VectorType>(op.getData().getType()); + assert(valueTy && "the value type must be a vector type!"); ArrayRef<int64_t> shape = valueTy.getShape(); auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr()); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index c28d2fc..31a967d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -991,7 +991,8 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern<xegpu::LoadMatrixOp> { return failure(); ArrayRef<int64_t> wgShape = op.getDataShape(); - VectorType valueTy = op.getRes().getType(); + VectorType valueTy = llvm::dyn_cast<VectorType>(op.getRes().getType()); + assert(valueTy && "the value type must be a vector type!"); Type elemTy = valueTy.getElementType(); xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp index c67b242..dd413d2de 100644 --- a/mlir/lib/RegisterAllPasses.cpp +++ b/mlir/lib/RegisterAllPasses.cpp @@ -98,4 +98,5 @@ void mlir::registerAllPasses() { sparse_tensor::registerSparseTensorPipelines(); tosa::registerTosaToLinalgPipelines(); gpu::registerGPUToNVVMPipeline(); + gpu::registerGPUToXeVMPipeline(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 1e2099d..8de49dd 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -246,7 +246,7 @@ public: // Rewrite all uses of the original variable in `BBName` // with the linear variable in-place - void rewriteInPlace(llvm::IRBuilderBase &builder, std::string BBName, + void rewriteInPlace(llvm::IRBuilderBase &builder, const std::string &BBName, size_t varIndex) { llvm::SmallVector<llvm::User *> users; for (llvm::User *user : linearOrigVal[varIndex]->users()) diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index 0c3e87a..d9ad8fb 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -2619,6 +2619,11 @@ LogicalResult ControlFlowStructurizer::structurize() { // region. We cannot handle such cases given that once a value is sinked into // the SelectionOp/LoopOp's region, there is no escape for it.
for (auto *block : constructBlocks) { + if (!block->use_empty()) + return emitError(block->getParent()->getLoc(), + "failed control flow structurization: " + "block has uses outside of the " + "enclosing selection/loop construct"); for (Operation &op : *block) if (!op.use_empty()) return op.emitOpError("failed control flow structurization: value has " diff --git a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp index 51c6077..366ba8f 100644 --- a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp +++ b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp @@ -14,6 +14,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Location.h" #include "mlir/Support/LLVM.h" @@ -138,6 +139,10 @@ using ImportDesc = using parsed_inst_t = FailureOr<SmallVector<Value>>; +struct EmptyBlockMarker {}; +using BlockTypeParseResult = + std::variant<EmptyBlockMarker, TypeIdxRecord, Type>; + struct WasmModuleSymbolTables { SmallVector<FunctionSymbolRefContainer> funcSymbols; SmallVector<GlobalSymbolRefContainer> globalSymbols; @@ -175,6 +180,9 @@ class ParserHead; /// Wrapper around SmallVector to only allow access as push and pop on the /// stack. Makes sure that there are no "free accesses" on the stack to preserve /// its state. +/// This class also keeps track of the Wasm labels defined by different ops, +/// which can be targeted by control flow ops. This can be modeled as part of +/// the Value Stack as Wasm control flow ops can only target enclosing labels. class ValueStack { private: struct LabelLevel { @@ -206,6 +214,16 @@ public: /// if an error occurs. LogicalResult pushResults(ValueRange results, Location *opLoc); + void addLabelLevel(LabelLevelOpInterface levelOp) { + labelLevel.push_back({values.size(), levelOp}); + LDBG() << "Adding a new frame context to ValueStack"; + } + + void dropLabelLevel() { + assert(!labelLevel.empty() && "Trying to drop a frame from empty context"); + auto newSize = labelLevel.pop_back_val().stackIdx; + values.truncate(newSize); + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// A simple dump function for debugging. /// Writes output to llvm::dbgs(). @@ -214,6 +232,7 @@ public: private: SmallVector<Value> values; + SmallVector<LabelLevel> labelLevel; }; using local_val_t = TypedValue<wasmssa::LocalRefType>; @@ -248,6 +267,19 @@ private: buildNumericOp(OpBuilder &builder, std::enable_if_t<std::is_arithmetic_v<valueType>> * = nullptr); + /// Construct a conversion operation of type \p opType that takes a value of + /// type \p inputType from the stack and will produce a value of type + /// \p outputType. + /// + /// \p opType - The WASM dialect operation to build. + /// \p inputType - The operand type for the built instruction. + /// \p outputType - The result type for the built instruction. + /// + /// \returns The parsed instruction result, or failure. + template <typename opType, typename inputType, typename outputType, + typename... extraArgsT> + inline parsed_inst_t buildConvertOp(OpBuilder &builder, extraArgsT...); + /// This function generates a dispatch tree to associate an opcode with a /// parser. Parsers are registered by specialising the /// `parseSpecificInstruction` function for the op code to handle.
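[Editor's note] A standalone sketch (not part of the patch) of the specialization-plus-dispatch idiom the comment above describes. Opcode 0x41 really is Wasm's i32.const, but the simplified signatures and names here are invented for illustration:

#include <cstddef>
#include <cstdio>

// Primary template: opcodes without a registered parser report an error.
template <std::byte OpCode>
int parseSpecificInstruction() {
  std::printf("unhandled opcode 0x%02x\n", static_cast<unsigned>(OpCode));
  return -1;
}

// Registering a parser amounts to specializing for the opcode.
template <>
int parseSpecificInstruction<std::byte{0x41}>() { // i32.const
  std::printf("parsed i32.const\n");
  return 0;
}

// A dispatch tree over a runtime opcode byte, generated at compile time.
template <unsigned Candidate = 0>
int dispatch(std::byte opcode) {
  if (opcode == std::byte{Candidate})
    return parseSpecificInstruction<std::byte{Candidate}>();
  if constexpr (Candidate < 255)
    return dispatch<Candidate + 1>(opcode);
  return -1;
}

int main() { return dispatch(std::byte{0x41}); } // prints "parsed i32.const"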
@@ -280,11 +312,105 @@ private: } } + /// + /// RAII guard class for creating a nesting level + /// + struct NestingContextGuard { + NestingContextGuard(ExpressionParser &parser, LabelLevelOpInterface levelOp) + : parser{parser} { + parser.addNestingContextLevel(levelOp); + } + NestingContextGuard(NestingContextGuard &&other) : parser{other.parser} { + other.shouldDropOnDestruct = false; + } + NestingContextGuard(NestingContextGuard const &) = delete; + ~NestingContextGuard() { + if (shouldDropOnDestruct) + parser.dropNestingContextLevel(); + } + ExpressionParser &parser; + bool shouldDropOnDestruct = true; + }; + + void addNestingContextLevel(LabelLevelOpInterface levelOp) { + valueStack.addLabelLevel(levelOp); + } + + void dropNestingContextLevel() { + // Should always succeed as we are dropping the frame that was previously + // created. + valueStack.dropLabelLevel(); + } + + llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder, + EmptyBlockMarker) { + return builder.getFunctionType({}, {}); + } + + llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder, + TypeIdxRecord type) { + if (type.id >= symbols.moduleFuncTypes.size()) + return emitError(*currentOpLoc, + "type index references nonexistent type (") + << type.id << "). Only " << symbols.moduleFuncTypes.size() + << " types are registered"; + return symbols.moduleFuncTypes[type.id]; + } + + llvm::FailureOr<FunctionType> getFuncTypeFor(OpBuilder &builder, + Type valType) { + return builder.getFunctionType({}, {valType}); + } + + llvm::FailureOr<FunctionType> + getFuncTypeFor(OpBuilder &builder, BlockTypeParseResult parseResult) { + return std::visit( + [this, &builder](auto value) { return getFuncTypeFor(builder, value); }, + parseResult); + } + + llvm::FailureOr<FunctionType> + getFuncTypeFor(OpBuilder &builder, + llvm::FailureOr<BlockTypeParseResult> parseResult) { + if (llvm::failed(parseResult)) + return failure(); + return getFuncTypeFor(builder, *parseResult); + } + + llvm::FailureOr<FunctionType> parseBlockFuncType(OpBuilder &builder); + struct ParseResultWithInfo { SmallVector<Value> opResults; std::byte endingByte; }; + template <typename FilterT = ByteSequence<WasmBinaryEncoding::endByte>> + /// @param blockToFill: the block whose content will be populated + /// @param resTypes: the types that this block is supposed to return + llvm::FailureOr<std::byte> + parseBlockContent(OpBuilder &builder, Block *blockToFill, TypeRange resTypes, + Location opLoc, LabelLevelOpInterface levelOp, + FilterT parseEndBytes = {}) { + OpBuilder::InsertionGuard guard{builder}; + builder.setInsertionPointToStart(blockToFill); + LDBG() << "parsing a block of type " + << builder.getFunctionType(blockToFill->getArgumentTypes(), + resTypes); + auto nC = addNesting(levelOp); + + if (failed(pushResults(blockToFill->getArguments()))) + return failure(); + auto bodyParsingRes = parse(builder, parseEndBytes); + if (failed(bodyParsingRes)) + return failure(); + auto returnOperands = popOperands(resTypes); + if (failed(returnOperands)) + return failure(); + builder.create<BlockReturnOp>(opLoc, *returnOperands); + LDBG() << "end of parsing of a block"; + return bodyParsingRes->endingByte; + } + public: template <std::byte ParseEndByte = WasmBinaryEncoding::endByte> parsed_inst_t parse(OpBuilder &builder, UniqueByte<ParseEndByte> = {}); @@ -294,7 +420,11 @@ public: parse(OpBuilder &builder, ByteSequence<ExpressionParseEnd...> parsingEndFilters); - FailureOr<SmallVector<Value>> popOperands(TypeRange operandTypes) { + NestingContextGuard
addNesting(LabelLevelOpInterface levelOp) { + return NestingContextGuard{*this, levelOp}; + } + + FailureOr<llvm::SmallVector<Value>> popOperands(TypeRange operandTypes) { return valueStack.popOperands(operandTypes, &currentOpLoc.value()); } @@ -308,6 +438,12 @@ public: template <typename OpToCreate> parsed_inst_t parseSetOrTee(OpBuilder &); + /// Blocks and Loops have a similar format and differ only in how their exit + /// is handled, which doesn't matter at parsing time. They are factored into + /// one function. + template <typename OpToCreate> + parsed_inst_t parseBlockLikeOp(OpBuilder &); + private: std::optional<Location> currentOpLoc; ParserHead &parser; @@ -586,6 +722,29 @@ public: return success(); } + llvm::FailureOr<BlockTypeParseResult> parseBlockType(MLIRContext *ctx) { + auto loc = getLocation(); + auto blockIndicator = peek(); + if (failed(blockIndicator)) + return failure(); + if (*blockIndicator == WasmBinaryEncoding::Type::emptyBlockType) { + offset += 1; + return {EmptyBlockMarker{}}; + } + if (isValueOneOf(*blockIndicator, valueTypesEncodings)) + return parseValueType(ctx); + /// Block type idx is a 32-bit positive integer encoded as a 33-bit signed + /// value + auto typeIdx = parseI64(); + if (failed(typeIdx)) + return failure(); + if (*typeIdx < 0 || *typeIdx > std::numeric_limits<uint32_t>::max()) + return emitError(loc, "type ID should be representable with an unsigned " + "32-bit integer, got ") + << *typeIdx; + return {TypeIdxRecord{static_cast<uint32_t>(*typeIdx)}}; + } + bool end() const { return curHead().empty(); } ParserHead copy() const { return *this; } @@ -701,17 +860,41 @@ inline parsed_inst_t ExpressionParser::parseSpecificInstruction(OpBuilder &) { void ValueStack::dump() const { llvm::dbgs() << "================= Wasm ValueStack =======================\n"; llvm::dbgs() << "size: " << size() << "\n"; + llvm::dbgs() << "nbFrames: " << labelLevel.size() << '\n'; llvm::dbgs() << "<Top>" << "\n"; // Stack is pushed to via push_back. Therefore the top of the stack is the // end of the vector. Iterate in reverse so that the first thing we print // is the top of the stack. + auto indexGetter = [this]() { + size_t idx = labelLevel.size(); + return [this, idx]() mutable -> std::optional<std::pair<size_t, size_t>> { + llvm::dbgs() << "IDX: " << idx << '\n'; + if (idx == 0) + return std::nullopt; + auto frameId = idx - 1; + auto frameLimit = labelLevel[frameId].stackIdx; + idx -= 1; + return {{frameId, frameLimit}}; + }; + }; + auto getNextFrameIndex = indexGetter(); + auto nextFrameIdx = getNextFrameIndex(); size_t stackSize = size(); - for (size_t idx = 0; idx < stackSize; idx++) { + for (size_t idx = 0; idx < stackSize; ++idx) { size_t actualIdx = stackSize - 1 - idx; + while (nextFrameIdx && (nextFrameIdx->second > actualIdx)) { + llvm::dbgs() << " --------------- Frame (" << nextFrameIdx->first + << ")\n"; + nextFrameIdx = getNextFrameIndex(); + } llvm::dbgs() << " "; values[actualIdx].dump(); } + while (nextFrameIdx) { + llvm::dbgs() << " --------------- Frame (" << nextFrameIdx->first << ")\n"; + nextFrameIdx = getNextFrameIndex(); + } llvm::dbgs() << "<Bottom>" << "\n"; llvm::dbgs() << "=========================================================\n"; @@ -726,7 +909,7 @@ parsed_inst_t ValueStack::popOperands(TypeRange operandTypes, Location *opLoc) { return emitError(*opLoc, "stack doesn't contain enough values.
trying to get ") << operandTypes.size() << " operands on a stack containing only " - << values.size() << " values."; + << values.size() << " values"; size_t stackIdxOffset = values.size() - operandTypes.size(); SmallVector<Value> res{}; res.reserve(operandTypes.size()); @@ -735,8 +918,7 @@ parsed_inst_t ValueStack::popOperands(TypeRange operandTypes, Location *opLoc) { Type stackType = operand.getType(); if (stackType != operandTypes[i]) return emitError(*opLoc, "invalid operand type on stack. expecting ") - << operandTypes[i] << ", value on stack is of type " << stackType - << "."; + << operandTypes[i] << ", value on stack is of type " << stackType; LDBG() << " POP: " << operand; res.push_back(operand); } @@ -792,6 +974,151 @@ ExpressionParser::parse(OpBuilder &builder, } } +llvm::FailureOr<FunctionType> +ExpressionParser::parseBlockFuncType(OpBuilder &builder) { + return getFuncTypeFor(builder, parser.parseBlockType(builder.getContext())); +} + +template <typename OpToCreate> +parsed_inst_t ExpressionParser::parseBlockLikeOp(OpBuilder &builder) { + auto opLoc = currentOpLoc; + auto funcType = parseBlockFuncType(builder); + if (failed(funcType)) + return failure(); + + auto inputTypes = funcType->getInputs(); + auto inputOps = popOperands(inputTypes); + if (failed(inputOps)) + return failure(); + + Block *curBlock = builder.getBlock(); + Region *curRegion = curBlock->getParent(); + auto resTypes = funcType->getResults(); + llvm::SmallVector<Location> locations{}; + locations.resize(resTypes.size(), *currentOpLoc); + auto *successor = + builder.createBlock(curRegion, curRegion->end(), resTypes, locations); + builder.setInsertionPointToEnd(curBlock); + auto blockOp = + builder.create<OpToCreate>(*currentOpLoc, *inputOps, successor); + auto *blockBody = blockOp.createBlock(); + if (failed(parseBlockContent(builder, blockBody, resTypes, *opLoc, blockOp))) + return failure(); + builder.setInsertionPointToStart(successor); + return {ValueRange{successor->getArguments()}}; +} + +template <> +inline parsed_inst_t +ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::block>( + OpBuilder &builder) { + return parseBlockLikeOp<BlockOp>(builder); +} + +template <> +inline parsed_inst_t +ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::loop>( + OpBuilder &builder) { + return parseBlockLikeOp<LoopOp>(builder); +} + +template <> +inline parsed_inst_t ExpressionParser::parseSpecificInstruction< + WasmBinaryEncoding::OpCode::ifOpCode>(OpBuilder &builder) { + auto opLoc = currentOpLoc; + auto funcType = parseBlockFuncType(builder); + if (failed(funcType)) + return failure(); + + LDBG() << "Parsing an if instruction of type " << *funcType; + auto inputTypes = funcType->getInputs(); + auto conditionValue = popOperands(builder.getI32Type()); + if (failed(conditionValue)) + return failure(); + auto inputOps = popOperands(inputTypes); + if (failed(inputOps)) + return failure(); + + Block *curBlock = builder.getBlock(); + Region *curRegion = curBlock->getParent(); + auto resTypes = funcType->getResults(); + llvm::SmallVector<Location> locations{}; + locations.resize(resTypes.size(), *currentOpLoc); + auto *successor = + builder.createBlock(curRegion, curRegion->end(), resTypes, locations); + builder.setInsertionPointToEnd(curBlock); + auto ifOp = builder.create<IfOp>(*currentOpLoc, conditionValue->front(), + *inputOps, successor); + auto *ifEntryBlock = ifOp.createIfBlock(); + constexpr auto ifElseFilter = + ByteSequence<WasmBinaryEncoding::endByte, + 
WasmBinaryEncoding::OpCode::elseOpCode>{}; + auto parseIfRes = parseBlockContent(builder, ifEntryBlock, resTypes, *opLoc, + ifOp, ifElseFilter); + if (failed(parseIfRes)) + return failure(); + if (*parseIfRes == WasmBinaryEncoding::OpCode::elseOpCode) { + LDBG() << " else block is present."; + Block *elseEntryBlock = ifOp.createElseBlock(); + auto parseElseRes = + parseBlockContent(builder, elseEntryBlock, resTypes, *opLoc, ifOp); + if (failed(parseElseRes)) + return failure(); + } + builder.setInsertionPointToStart(successor); + return {ValueRange{successor->getArguments()}}; +} + +template <> +inline parsed_inst_t ExpressionParser::parseSpecificInstruction< + WasmBinaryEncoding::OpCode::branchIf>(OpBuilder &builder) { + auto level = parser.parseLiteral<uint32_t>(); + if (failed(level)) + return failure(); + Block *curBlock = builder.getBlock(); + Region *curRegion = curBlock->getParent(); + auto sip = builder.saveInsertionPoint(); + Block *elseBlock = builder.createBlock(curRegion, curRegion->end()); + auto condition = popOperands(builder.getI32Type()); + if (failed(condition)) + return failure(); + builder.restoreInsertionPoint(sip); + auto targetOp = + LabelBranchingOpInterface::getTargetOpFromBlock(curBlock, *level); + if (failed(targetOp)) + return failure(); + auto inputTypes = targetOp->getLabelTarget()->getArgumentTypes(); + auto branchArgs = popOperands(inputTypes); + if (failed(branchArgs)) + return failure(); + builder.create<BranchIfOp>(*currentOpLoc, condition->front(), + builder.getUI32IntegerAttr(*level), *branchArgs, + elseBlock); + builder.setInsertionPointToStart(elseBlock); + return {*branchArgs}; +} + +template <> +inline parsed_inst_t +ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::call>( + OpBuilder &builder) { + auto loc = *currentOpLoc; + auto funcIdx = parser.parseLiteral<uint32_t>(); + if (failed(funcIdx)) + return failure(); + if (*funcIdx >= symbols.funcSymbols.size()) + return emitError(loc, "invalid function index: ") << *funcIdx; + auto callee = symbols.funcSymbols[*funcIdx]; + llvm::ArrayRef<Type> inTypes = callee.functionType.getInputs(); + llvm::ArrayRef<Type> resTypes = callee.functionType.getResults(); + parsed_inst_t inOperands = popOperands(inTypes); + if (failed(inOperands)) + return failure(); + auto callOp = + builder.create<FuncCallOp>(loc, resTypes, callee.symbol, *inOperands); + return {callOp.getResults()}; +} + template <> inline parsed_inst_t ExpressionParser::parseSpecificInstruction< WasmBinaryEncoding::OpCode::localGet>(OpBuilder &builder) { @@ -834,7 +1161,7 @@ parsed_inst_t ExpressionParser::parseSetOrTee(OpBuilder &builder) { if (valueStack.empty()) return emitError( *currentOpLoc, - "invalid stack access, trying to access a value on an empty stack."); + "invalid stack access, trying to access a value on an empty stack"); parsed_inst_t poppedOp = popOperands(locals[*id].getType().getElementType()); if (failed(poppedOp)) @@ -1000,11 +1327,23 @@ inline parsed_inst_t ExpressionParser::buildNumericOp( BUILD_NUMERIC_BINOP_FP(CopySignOp, copysign) BUILD_NUMERIC_BINOP_FP(DivOp, div) +BUILD_NUMERIC_BINOP_FP(GeOp, ge) +BUILD_NUMERIC_BINOP_FP(GtOp, gt) +BUILD_NUMERIC_BINOP_FP(LeOp, le) +BUILD_NUMERIC_BINOP_FP(LtOp, lt) BUILD_NUMERIC_BINOP_FP(MaxOp, max) BUILD_NUMERIC_BINOP_FP(MinOp, min) BUILD_NUMERIC_BINOP_INT(AndOp, and) BUILD_NUMERIC_BINOP_INT(DivSIOp, divS) BUILD_NUMERIC_BINOP_INT(DivUIOp, divU) +BUILD_NUMERIC_BINOP_INT(GeSIOp, geS) +BUILD_NUMERIC_BINOP_INT(GeUIOp, geU) +BUILD_NUMERIC_BINOP_INT(GtSIOp, gtS)
+BUILD_NUMERIC_BINOP_INT(GtUIOp, gtU) +BUILD_NUMERIC_BINOP_INT(LeSIOp, leS) +BUILD_NUMERIC_BINOP_INT(LeUIOp, leU) +BUILD_NUMERIC_BINOP_INT(LtSIOp, ltS) +BUILD_NUMERIC_BINOP_INT(LtUIOp, ltU) BUILD_NUMERIC_BINOP_INT(OrOp, or) BUILD_NUMERIC_BINOP_INT(RemSIOp, remS) BUILD_NUMERIC_BINOP_INT(RemUIOp, remU) @@ -1015,7 +1354,9 @@ BUILD_NUMERIC_BINOP_INT(ShRSOp, shrS) BUILD_NUMERIC_BINOP_INT(ShRUOp, shrU) BUILD_NUMERIC_BINOP_INT(XOrOp, xor) BUILD_NUMERIC_BINOP_INTFP(AddOp, add) +BUILD_NUMERIC_BINOP_INTFP(EqOp, eq) BUILD_NUMERIC_BINOP_INTFP(MulOp, mul) +BUILD_NUMERIC_BINOP_INTFP(NeOp, ne) BUILD_NUMERIC_BINOP_INTFP(SubOp, sub) BUILD_NUMERIC_UNARY_OP_FP(AbsOp, abs) BUILD_NUMERIC_UNARY_OP_FP(CeilOp, ceil) @@ -1025,6 +1366,7 @@ BUILD_NUMERIC_UNARY_OP_FP(SqrtOp, sqrt) BUILD_NUMERIC_UNARY_OP_FP(TruncOp, trunc) BUILD_NUMERIC_UNARY_OP_INT(ClzOp, clz) BUILD_NUMERIC_UNARY_OP_INT(CtzOp, ctz) +BUILD_NUMERIC_UNARY_OP_INT(EqzOp, eqz) BUILD_NUMERIC_UNARY_OP_INT(PopCntOp, popcnt) // Don't need these anymore so let's undef them. @@ -1036,6 +1378,105 @@ BUILD_NUMERIC_UNARY_OP_INT(PopCntOp, popcnt) #undef BUILD_NUMERIC_OP #undef BUILD_NUMERIC_CAST_OP +template <typename opType, typename inputType, typename outputType, + typename... extraArgsT> +inline parsed_inst_t ExpressionParser::buildConvertOp(OpBuilder &builder, + extraArgsT... extraArgs) { + static_assert(std::is_arithmetic_v<inputType>, + "InputType should be an arithmetic type"); + static_assert(std::is_arithmetic_v<outputType>, + "OutputType should be an arithmetic type"); + auto intype = buildLiteralType<inputType>(builder); + auto outType = buildLiteralType<outputType>(builder); + auto operand = popOperands(intype); + if (failed(operand)) + return failure(); + auto op = builder.create<opType>(*currentOpLoc, outType, operand->front(), + extraArgs...); + LDBG() << "Built operation: " << op; + return {{op.getResult()}}; +} + +template <> +inline parsed_inst_t ExpressionParser::parseSpecificInstruction< + WasmBinaryEncoding::OpCode::demoteF64ToF32>(OpBuilder &builder) { + return buildConvertOp<DemoteOp, double, float>(builder); +} + +template <> +inline parsed_inst_t +ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::wrap>( + OpBuilder &builder) { + return buildConvertOp<WrapOp, int64_t, int32_t>(builder); +} + +#define BUILD_CONVERSION_OP(IN_T, OUT_T, SOURCE_OP, TARGET_OP) \ + template <> \ + inline parsed_inst_t ExpressionParser::parseSpecificInstruction< \ + WasmBinaryEncoding::OpCode::SOURCE_OP>(OpBuilder & builder) { \ + return buildConvertOp<TARGET_OP, IN_T, OUT_T>(builder); \ + } + +#define BUILD_CONVERT_OP_FOR(DEST_T, WIDTH) \ + BUILD_CONVERSION_OP(uint32_t, DEST_T, convertUI32F##WIDTH, ConvertUOp) \ + BUILD_CONVERSION_OP(int32_t, DEST_T, convertSI32F##WIDTH, ConvertSOp) \ + BUILD_CONVERSION_OP(uint64_t, DEST_T, convertUI64F##WIDTH, ConvertUOp) \ + BUILD_CONVERSION_OP(int64_t, DEST_T, convertSI64F##WIDTH, ConvertSOp) + +BUILD_CONVERT_OP_FOR(float, 32) +BUILD_CONVERT_OP_FOR(double, 64) + +#undef BUILD_CONVERT_OP_FOR + +BUILD_CONVERSION_OP(int32_t, int64_t, extendS, ExtendSI32Op) +BUILD_CONVERSION_OP(int32_t, int64_t, extendU, ExtendUI32Op) + +#undef BUILD_CONVERSION_OP + +#define BUILD_SLICE_EXTEND_PARSER(IT_WIDTH, EXTRACT_WIDTH) \ + template <> \ + parsed_inst_t ExpressionParser::parseSpecificInstruction< \ + WasmBinaryEncoding::OpCode::extendI##IT_WIDTH##EXTRACT_WIDTH##S>( \ + OpBuilder & builder) { \ + using inout_t = int##IT_WIDTH##_t; \ + auto attr = builder.getUI32IntegerAttr(EXTRACT_WIDTH); \ + return 
buildConvertOp<ExtendLowBitsSOp, inout_t, inout_t>(builder, attr); \ } + +BUILD_SLICE_EXTEND_PARSER(32, 8) +BUILD_SLICE_EXTEND_PARSER(32, 16) +BUILD_SLICE_EXTEND_PARSER(64, 8) +BUILD_SLICE_EXTEND_PARSER(64, 16) +BUILD_SLICE_EXTEND_PARSER(64, 32) + +#undef BUILD_SLICE_EXTEND_PARSER + +template <> +inline parsed_inst_t ExpressionParser::parseSpecificInstruction< + WasmBinaryEncoding::OpCode::promoteF32ToF64>(OpBuilder &builder) { + return buildConvertOp<PromoteOp, float, double>(builder); +} + +#define BUILD_REINTERPRET_PARSER(WIDTH, FP_TYPE) \ + template <> \ + inline parsed_inst_t ExpressionParser::parseSpecificInstruction< \ + WasmBinaryEncoding::OpCode::reinterpretF##WIDTH##AsI##WIDTH>(OpBuilder & \ + builder) { \ + return buildConvertOp<ReinterpretOp, FP_TYPE, int##WIDTH##_t>(builder); \ + } \ + \ + template <> \ + inline parsed_inst_t ExpressionParser::parseSpecificInstruction< \ + WasmBinaryEncoding::OpCode::reinterpretI##WIDTH##AsF##WIDTH>(OpBuilder & \ + builder) { \ + return buildConvertOp<ReinterpretOp, int##WIDTH##_t, FP_TYPE>(builder); \ + } + +BUILD_REINTERPRET_PARSER(32, float) +BUILD_REINTERPRET_PARSER(64, double) + +#undef BUILD_REINTERPRET_PARSER + class WasmBinaryParser { private: struct SectionRegistry { @@ -1153,7 +1594,7 @@ private: if (tid.id >= symbols.moduleFuncTypes.size()) return emitError(loc, "invalid type id: ") << tid.id << ". Only " << symbols.moduleFuncTypes.size() - << " type registration."; + << " type registrations"; FunctionType type = symbols.moduleFuncTypes[tid.id]; std::string symbol = symbols.getNewFuncSymbolName(); auto funcOp = FuncImportOp::create(builder, loc, symbol, moduleName, @@ -1221,7 +1662,7 @@ public: FileLineColLoc magicLoc = parser.getLocation(); FailureOr<StringRef> magic = parser.consumeNBytes(wasmHeader.size()); if (failed(magic) || magic->compare(wasmHeader)) { - emitError(magicLoc, "source file does not contain valid Wasm header."); + emitError(magicLoc, "source file does not contain valid Wasm header"); return; } auto const expectedVersionString = StringRef{"\1\0\0\0", 4}; @@ -1391,7 +1832,7 @@ WasmBinaryParser::parseSectionItem<WasmSectionType::EXPORT>(ParserHead &ph, return failure(); Operation *op = SymbolTable::lookupSymbolIn(mOp, *currentSymbol); - SymbolTable::setSymbolVisibility(op, SymbolTable::Visibility::Public); + op->setAttr("exported", UnitAttr::get(op->getContext())); StringAttr symName = SymbolTable::getSymbolName(op); return SymbolTable{mOp}.rename(symName, *exportName); }
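[Editor's note] A standalone sketch (not part of the patch) of the XeGPU blocked-layout arithmetic introduced in XeGPUDialect.cpp above. The 32x64 matrix, [8, 16] block, and blocked strides [512, 128, 16, 1] are the same invented values as in the worked-example comment: getBlockedOffsets splits [y, x] into [y/8, x/16, y%8, x%16], and getLinearOffsets dots that with the strides:

#include <cstdint>
#include <cstdio>
#include <vector>

// Blocked offsets for a 32x64 matrix with an 8x16 inner block, dotted with
// the blocked strides computed as in MemDescType::getStrideShape().
int64_t linearOffset(int64_t y, int64_t x) {
  const std::vector<int64_t> offsets = {y / 8, x / 16, y % 8, x % 16};
  const std::vector<int64_t> strides = {512, 128, 16, 1};
  int64_t linear = 0;
  for (size_t i = 0; i < offsets.size(); ++i)
    linear += offsets[i] * strides[i];
  return linear;
}

int main() {
  // Element (9, 17) sits in block (1, 1) at in-block position (1, 1):
  // 1*512 + 1*128 + 1*16 + 1*1 = 657.
  std::printf("%lld\n", static_cast<long long>(linearOffset(9, 17)));
}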