//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utility methods for working with the XeGPU dialect.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>

using namespace mlir;

/// Convert an ArrayRef<ValueRange> into a flat SmallVector<Value>.
SmallVector<Value> xegpu::flattenValues(ArrayRef<ValueRange> values) {
  SmallVector<Value> result;
  for (const auto &vals : values)
    llvm::append_range(result, vals);
  return result;
}

FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
  // This only works for subgroup-level layouts, which carry only lane_layout
  // and lane_data and are used to distribute SIMD code into SIMT code.
  if (!layout || !layout.isForSubgroup())
    return failure();

  SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
  SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
  auto tdescShape = tdescTy.getShape();
  auto elementType = tdescTy.getElementType();

  // Compute sgSize by multiplying the elements of laneLayout,
  // e.g. for a 2D layout, sgSize = laneLayout[0] * laneLayout[1];
  // for a 1D layout, sgSize = laneLayout[0].
  int64_t sgSize = llvm::product_of(laneLayout);

  // Case 1: regular loads/stores.
  auto scatterAttr = tdescTy.getEncodingOfType<ScatterTensorDescAttr>();
  if (scatterAttr) {
    auto chunkSize = scatterAttr.getChunkSize().getInt();
    // Verify that the first dimension of the tensor descriptor shape is
    // distributable.
    assert(tdescShape[0] == laneLayout[0] &&
           "tensor descriptor shape is not distributable");
    return VectorType::get({chunkSize}, elementType);
  }

  // Case 2: block loads/stores.
  // Check if the tensor descriptor shape is distributable.
  int64_t tensorSize = 1;
  for (auto [tdescDim, laneDim, laneDataDim] :
       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
           "tensor descriptor shape is not distributable");
    tensorSize *= tdescDim;
  }
  // tensorSize must be adjusted for array_length.
  tensorSize *= tdescTy.getArrayLength();

  return VectorType::get({tensorSize / sgSize}, elementType);
}
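// Illustrative example (the shapes and layout values below are hypothetical,
// not taken from a test): a block tensor_desc of shape 8x16xf16 with
// lane_layout = [1, 16], lane_data = [1, 1] and array_length = 1 gives
// tensorSize = 8 * 16 = 128 and sgSize = 16, so each lane receives
// vector<8xf16>. For a scattered tensor_desc with chunk_size = 4, each lane
// receives vector<4xf16> irrespective of the descriptor shape.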
FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(VectorType originalType,
                                      xegpu::LayoutAttr layout) {
  int64_t rank = originalType.getRank();
  // Distributed vector types are only supported for 1D, 2D and 3D vectors.
  if (rank < 1 || rank > 3)
    return failure();
  ArrayRef<int64_t> shape = originalType.getShape();
  // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
  // of a 3D vector.
  int arrayLength = 1;
  if (rank == 3) {
    arrayLength = shape[0];
    shape = shape.drop_front();
  }
  auto helperTdescTy = xegpu::TensorDescType::get(
      shape, originalType.getElementType(), arrayLength,
      /*boundary_check=*/true,
      /*memory_space=*/xegpu::MemorySpace::Global, layout);
  return xegpu::getDistributedVectorType(helperTdescTy);
}

std::string xegpu::getLayoutName(const OpOperand &operand) {
  const StringRef prefix("layout_operand_");
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
  return llvm::formatv("{0}{1}", prefix, idx).str();
}

std::string xegpu::getLayoutName(const OpResult result) {
  const StringRef prefix = "layout_result_";
  return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}

xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
  if (!value)
    return nullptr;

  if (auto tdescTy =
          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
    return tdescTy.getLayoutAttr();

  if (auto result = dyn_cast<OpResult>(value)) {
    Operation *defOp = result.getDefiningOp();
    assert(defOp && "result must have a defining op");

    // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr.
    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
      return convertOp.getTargetLayoutAttr();

    // For LoadNdOp, the layout is stored in the tensor descriptor.
    if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
      return getDistributeLayoutAttr(loadNd.getTensorDesc());

    // For LoadMatrixOp, the layout is attached as a property of the op.
    if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
      return loadOp.getLayoutAttr();

    // For StoreMatrixOp, the layout is attached as a property of the op.
    if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
      return storeOp.getLayoutAttr();

    std::string layoutName = getLayoutName(result);
    if (defOp->hasAttr(layoutName))
      return defOp->getAttrOfType<DistributeLayoutAttr>(layoutName);
  }

  if (auto arg = dyn_cast<BlockArgument>(value)) {
    auto parentOp = arg.getOwner()->getParentOp();
    if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
      OpOperand *tiedInit = loop.getTiedLoopInit(arg);
      if (tiedInit)
        return getDistributeLayoutAttr(tiedInit->get());
    }
  }

  return nullptr;
}

xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
  Operation *op = opr.getOwner();

  if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
    return loadOp.getLayoutAttr();

  if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
    return storeOp.getLayoutAttr();

  std::string layoutName = xegpu::getLayoutName(opr);
  if (op->hasAttr(layoutName))
    return op->getAttrOfType<DistributeLayoutAttr>(layoutName);

  return getDistributeLayoutAttr(opr.get());
}

template <typename T>
void xegpu::setDistributeLayoutAttr(const T &operandOrResult,
                                    const DistributeLayoutAttr layout) {
  Operation *owner = operandOrResult.getOwner();
  std::string name = xegpu::getLayoutName(operandOrResult);
  if (layout && !owner->hasAttrOfType<DistributeLayoutAttr>(name))
    owner->setAttr(name, layout);
}

// Explicit instantiation for OpResult.
template void xegpu::setDistributeLayoutAttr<mlir::OpResult>(
    const mlir::OpResult &result,
    const mlir::xegpu::DistributeLayoutAttr layout);

// Explicit instantiation for OpOperand.
template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>(
    const mlir::OpOperand &operand,
    const mlir::xegpu::DistributeLayoutAttr layout);

void xegpu::setDistributeLayoutAttrs(
    Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) {
  op->walk([&](Operation *nestOp) {
    // LoadMatrixOp/StoreMatrixOp carry their layout as an op property, so no
    // discardable attribute is attached to them.
    if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
      return;
    for (OpOperand &opr : nestOp->getOpOperands()) {
      auto layout = getLayoutImpl(opr.get());
      setDistributeLayoutAttr(opr, layout);
    }
    for (OpResult result : nestOp->getOpResults()) {
      auto layout = getLayoutImpl(result);
      setDistributeLayoutAttr(result, layout);
    }
  });
}
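// Illustrative usage sketch (the `layoutAnalysis` object and `funcOp` below
// are hypothetical, not part of this file): a propagation pass that has
// computed a layout per value can stamp the "layout_operand_*" /
// "layout_result_*" attributes onto every nested op with:
//
//   xegpu::setDistributeLayoutAttrs(funcOp, [&](Value v) {
//     return layoutAnalysis.getLayout(v);
//   });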
template <typename T>
void xegpu::removeLayoutAttr(const T &operandOrResult) {
  Operation *owner = operandOrResult.getOwner();
  std::string name = xegpu::getLayoutName(operandOrResult);
  if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
    owner->removeAttr(name);
}

// Explicit instantiation for OpResult.
template void
xegpu::removeLayoutAttr<mlir::OpResult>(const mlir::OpResult &result);

// Explicit instantiation for OpOperand.
template void
xegpu::removeLayoutAttr<mlir::OpOperand>(const mlir::OpOperand &operand);

void xegpu::removeLayoutAttrs(Operation *op) {
  op->walk([&](Operation *nestOp) {
    for (OpOperand &opr : nestOp->getOpOperands())
      removeLayoutAttr(opr);
    for (OpResult result : nestOp->getOpResults())
      removeLayoutAttr(result);
  });
}

SmallVector<Value>
xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
                                        Value value, ArrayRef<int64_t> shape) {
  auto vecTy = dyn_cast<VectorType>(value.getType());
  if (!vecTy)
    return {value};

  ArrayRef<int64_t> srcShape = vecTy.getShape();
  if (!computeShapeRatio(srcShape, shape))
    return {value};

  SmallVector<Value> result;
  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(srcShape, shape)) {
    SmallVector<int64_t> staticStrides(offsets.size(), 1);
    result.push_back(vector::ExtractStridedSliceOp::create(
        builder, loc, value, offsets, shape, staticStrides));
  }

  return result;
}

Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                             ValueRange values,
                                             ArrayRef<int64_t> shape) {
  VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
  assert(llvm::all_of(values.getTypes(),
                      [&](Type type) { return type == inputTy; }) &&
         "values must be of the same VectorType");

  Type elemTy = inputTy.getElementType();
  ArrayRef<int64_t> tileShape = inputTy.getShape();

  VectorType resultTy = VectorType::get(shape, elemTy);
  auto zeroAttr = builder.getZeroAttr(elemTy);
  Value result = arith::ConstantOp::create(
      builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));

  for (auto [src, offsets] :
       llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
    SmallVector<int64_t> staticStrides(offsets.size(), 1);
    result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
                                                  offsets, staticStrides);
  }
  return result;
}
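// Illustrative round trip (the shapes below are hypothetical): extracting
// 2x16 tiles from a vector<4x32xf32> value yields 4 slices, ordered by tile
// offset (0,0), (0,16), (2,0), (2,16). Feeding those slices back into
// createVectorWithShapeFromValues with shape [4, 32] reassembles the original
// vector by inserting each tile at the same offsets.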
// Performs the SCF structural type conversion in three steps: (1) convert
// VectorType values flowing through SCF structural ops into RankedTensorType,
// (2) propagate the DistributeLayoutAttr onto those tensor types as their
// encoding, and (3) convert the tensors back to vectors using the
// caller-provided converter.
void xegpu::doSCFStructuralTypeConversionWithTensorType(
    Operation *op, TypeConverter converter) {
  MLIRContext *context = op->getContext();

  auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
                            Location loc) -> Value {
    return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
        .getResult(0);
  };

  { // Step 1: convert VectorType to RankedTensorType for SCF structural ops.
    TypeConverter converter;
    converter.addConversion([](Type type) -> Type { return type; });
    converter.addConversion([](VectorType type) -> Type {
      return RankedTensorType::get(type.getShape(), type.getElementType());
    });
    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization(materializeCast);

    mlir::ConversionTarget target(*context);
    target.addLegalOp<UnrealizedConversionCastOp>();

    mlir::RewritePatternSet patterns(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }

  { // Step 2: propagate the layout attribute to the RankedTensorType by
    // inspecting the UnrealizedConversionCastOps created for
    // VectorType-to-RankedTensorType casts.
    op->walk([](UnrealizedConversionCastOp castOp) {
      if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
        return WalkResult::skip();

      Value input = castOp.getInputs()[0];
      Value result = castOp.getResults()[0];
      auto inputTy = dyn_cast<VectorType>(input.getType());
      auto resultTy = dyn_cast<RankedTensorType>(result.getType());

      // Only look at ops casting from VectorType to RankedTensorType.
      if (!inputTy || !resultTy)
        return WalkResult::skip();

      xegpu::DistributeLayoutAttr layout =
          xegpu::getDistributeLayoutAttr(input);
      if (!layout)
        return WalkResult::skip();

      RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
      result.setType(newTy);

      // Update the region iter args if the user is a LoopLike op.
      for (OpOperand &use : result.getUses()) {
        if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
          BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
          arg.setType(newTy);
        }
        // scf.while has two regions; the BlockArguments of the after region
        // are not exposed by LoopLikeOpInterface.
        if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
          unsigned idx = use.getOperandNumber();
          BlockArgument arg = whileOp.getAfterArguments()[idx];
          arg.setType(newTy);
        }
      }
      return WalkResult::advance();
    });

    // Use scf.yield as the anchor to update the result types of its parent op.
    op->walk([](scf::YieldOp yieldOp) {
      Operation *parentOp = yieldOp->getParentOp();
      for (OpResult r : parentOp->getOpResults()) {
        unsigned idx = r.getResultNumber();
        Type resultTy = r.getType();
        Type yieldTy = yieldOp.getResults()[idx].getType();
        if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
          r.setType(yieldTy);
      }
    });
  }

  { // Step 3: perform the conversion from RankedTensorType to VectorType based
    // on the DistributeLayoutAttr.

    // Handle the UnrealizedConversionCastOps introduced by the first step.
    // For VectorType-to-RankedTensorType casts, simply forward the inputs.
    // For RankedTensorType-to-VectorType casts, update the inputs with the
    // ones from the adaptor.
    class UnrealizedConversionCastOpPattern
        : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
      using OpConversionPattern<
          mlir::UnrealizedConversionCastOp>::OpConversionPattern;

      mlir::LogicalResult
      matchAndRewrite(mlir::UnrealizedConversionCastOp op,
                      OneToNOpAdaptor adaptor,
                      ConversionPatternRewriter &rewriter) const override {
        auto inputs = op.getOperands();
        auto outputs = op.getOutputs();

        if (inputs.size() != 1 || outputs.size() != 1)
          return failure();

        auto inputTy = inputs[0].getType();
        auto outputTy = outputs[0].getType();

        if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
          rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
          return success();
        }

        if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
          SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
          auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
                                                          outputTy, values);
          rewriter.replaceOp(op, newOp);
          return success();
        }
        return failure();
      }
    };

    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
                                           ValueRange inputs, Location loc) {
      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
          .getResults();
    });

    mlir::ConversionTarget target(*context);
    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
        [](UnrealizedConversionCastOp op) {
          auto isTensorTy = [](Type type) {
            return isa<RankedTensorType>(type);
          };
          return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
                 llvm::none_of(op->getResultTypes(), isTensorTy);
        });

    mlir::RewritePatternSet patterns(context);
    patterns.insert<UnrealizedConversionCastOpPattern>(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }
}
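// Illustrative usage sketch (the converter set-up below is an assumption
// about the caller, not part of this file): a distribution pass typically
// builds a TypeConverter that maps each SIMD vector type to its per-lane
// (SIMT) counterpart and then runs the structural conversion over a function:
//
//   TypeConverter converter;
//   converter.addConversion([](Type t) { return t; });
//   converter.addConversion([](VectorType t) -> Type {
//     // Return the distributed (per-lane) vector type for `t`.
//     return getDistributedTypeFor(t); // hypothetical helper
//   });
//   xegpu::doSCFStructuralTypeConversionWithTensorType(funcOp, converter);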
std::optional<std::string> xegpu::getChipStr(Operation *op) {
  auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();

  if (!gpuModuleOp)
    return std::nullopt;

  auto targetAttrs = gpuModuleOp.getTargets();
  if (targetAttrs) {
    for (auto &attr : *targetAttrs) {
      auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
      if (xevmAttr)
        return xevmAttr.getChip().str();
    }
  }

  return std::nullopt;
}

/// Generates element-wise addition ops for two arrays of the same length.
SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
                                                Location loc,
                                                ArrayRef<OpFoldResult> lhs,
                                                ArrayRef<OpFoldResult> rhs) {
  assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
  SmallVector<OpFoldResult> results;
  for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
    results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
  }
  return results;
}

/// Generates element-wise addition ops for two arrays with automatic
/// alignment. When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements of
/// the longer array are preserved unchanged. This is commonly used for offset
/// computation, where higher-dimensional offsets need to be added to
/// lower-dimensional adjustments.
///
/// Example:
///   lhs = [l1, l2, l3], rhs = [r1, r2]
///   Result: [l1, l2 + r1, l3 + r2]
SmallVector<OpFoldResult>
xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
                           ArrayRef<OpFoldResult> lhs,
                           ArrayRef<OpFoldResult> rhs) {
  // Ensure that a is the longer of the two arrays.
  ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
  ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
  SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
  a = a.slice(a.size() - b.size());
  results.append(addElementwise(builder, loc, a, b));
  return results;
}