//===- XeGPUDialect.cpp - MLIR XeGPU dialect implementation -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"

using std::optional;

namespace mlir {
namespace xegpu {

void XeGPUDialect::initialize() {
  addTypes<
#define GET_TYPEDEF_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc>
      >();
  addOperations<
#define GET_OP_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
      >();
  addAttributes<
#define GET_ATTRDEF_LIST
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
      >();
}

/// Generates instructions to compute offsets for a subgroup identified by
/// its multidimensional indices (sgId), using the specified subgroup layout
/// (sgLayout), subgroup data dimensions (sizePerSg), and the overall data
/// dimensions (sizePerWg).
static SmallVector<SmallVector<Value>>
genOffsetsComputingInsts(OpBuilder &builder, Location loc,
                         SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout,
                         ArrayRef<int64_t> sizePerSg,
                         ArrayRef<int64_t> sizePerWg) {

  SmallVector<SmallVector<Value>> offsets;

  // nd local offset, localOffset[i] = sgId[i] * sizePerSg[i]
  SmallVector<Value> localOffsets = llvm::map_to_vector(
      llvm::zip(sgId, sizePerSg), [&](const auto &t) -> Value {
        return builder.createOrFold<index::MulOp>(
            loc, std::get<0>(t),
            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
      });

  // distUnit[i] is the minimum value between sizePerWg[i] and
  // sgLayout[i] * sizePerSg[i]
  SmallVector<int64_t> distUnit = llvm::map_to_vector(
      llvm::zip_equal(sizePerWg, computeElementwiseMul(sgLayout, sizePerSg)),
      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });

  for (SmallVector<int64_t> unitOffs :
       StaticTileOffsetRange(sizePerWg, distUnit)) {
    SmallVector<Value> base =
        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
          return arith::ConstantIndexOp::create(builder, loc, d);
        });

    SmallVector<Value> adds = llvm::map_to_vector(
        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
          return builder.createOrFold<index::AddOp>(loc, std::get<0>(t),
                                                    std::get<1>(t));
        });

    SmallVector<Value> mods = llvm::map_to_vector(
        llvm::zip_equal(adds, sizePerWg), [&](const auto &t) -> Value {
          return builder.createOrFold<index::RemUOp>(
              loc, std::get<0>(t),
              arith::ConstantIndexOp::create(builder, loc, std::get<1>(t)));
        });

    offsets.push_back(mods);
  }
  return offsets;
}

// Checks if the given shape can be evenly distributed based on the layout
// and data factors provided by the LayoutAttr.
bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
                                         xegpu::DistributeLayoutAttr attr) {
  assert(attr && "Layout attribute is missing.");

  // Checks whether the given shape can be evenly distributed using the
  // specified layout and data attributes. If successful, it returns the work
  // size for each compute unit; otherwise, it returns `std::nullopt`. The work
  // size per compute unit is calculated as follows:
  //   - If `data` is null: newShape[i] = shape[i] / layout[i]
  //   - If `data` is not null: newShape[i] = data[i]
  // When round-robin distribution (`rr`) is enabled, `shape[i]` can be
  // smaller than `layout[i] * data[i]`, allowing multiple compute units to
  // share the data.
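  // Illustrative example with assumed values (not tied to any particular
  // target): for shape = [128, 64] and layout = [4, 2] with null data, the
  // per-unit work size is [32, 32]; adding data = [16, 16] yields [16, 16].
  // Under round-robin (`rr = true`), a smaller shape such as [32, 16] is
  // also accepted, because the ratio may be computed in the reverse
  // direction when the per-unit shape does not divide evenly by data.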
  auto tryDistribute = [&](llvm::ArrayRef<int64_t> shape,
                           SmallVector<int64_t> layout,
                           SmallVector<int64_t> data,
                           bool rr = true) -> optional<SmallVector<int64_t>> {
    llvm::SmallVector<int64_t> newShape(shape);
    if (layout.size()) {
      if (layout.size() != shape.size())
        return std::nullopt;
      auto ratio = computeShapeRatio(shape, layout);
      if (!ratio.has_value())
        return std::nullopt;
      newShape = ratio.value();
    }

    if (data.size()) {
      if (data.size() != shape.size())
        return std::nullopt;
      auto ratio = computeShapeRatio(newShape, data);
      if (!ratio.has_value() && rr)
        ratio = computeShapeRatio(data, newShape);
      if (!ratio.has_value())
        return std::nullopt;
      // If data is not null, we always return it for the next phase.
      newShape = data;
    }
    return newShape;
  };

  // Check the sgLayout and sgData.
  auto maybeSgShape = tryDistribute(shape, attr.getEffectiveSgLayoutAsInt(),
                                    attr.getEffectiveSgDataAsInt());
  if (!maybeSgShape)
    return false;
  auto sgShape = maybeSgShape.value();

  // Check InstData; it has neither a layout nor round-robin distribution.
  auto maybeInstShape =
      tryDistribute(sgShape, {}, attr.getEffectiveInstDataAsInt(), false);
  if (!maybeInstShape)
    return false;
  auto instShape = maybeInstShape.value();

  // Check LaneLayout and LaneData.
  auto maybeLaneShape =
      tryDistribute(instShape, attr.getEffectiveLaneLayoutAsInt(),
                    attr.getEffectiveLaneDataAsInt(), false);
  return maybeLaneShape.has_value();
}

//===----------------------------------------------------------------------===//
// XeGPU_BlockTensorDescAttr
//===----------------------------------------------------------------------===//
BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context,
                                             xegpu::MemorySpace memory_space,
                                             int array_length,
                                             bool boundary_check) {
  auto scopeAttr = MemorySpaceAttr::get(context, memory_space);
  auto lengthAttr =
      IntegerAttr::get(IntegerType::get(context, 64), array_length);
  auto boundaryAttr = BoolAttr::get(context, boundary_check);
  return Base::get(context, scopeAttr, lengthAttr, boundaryAttr);
}

bool BlockTensorDescAttr::hasDefaultsOnly() {
  return getMemorySpace().getValue() == xegpu::MemorySpace::Global &&
         getArrayLength().getInt() == 1 && getBoundaryCheck().getValue();
}

//===----------------------------------------------------------------------===//
// XeGPU_ScatterTensorDescAttr
//===----------------------------------------------------------------------===//
ScatterTensorDescAttr
ScatterTensorDescAttr::get(mlir::MLIRContext *context,
                           xegpu::MemorySpace memory_space, int chunk_size) {
  auto scopeAttr = MemorySpaceAttr::get(context, memory_space);
  auto chunkSizeAttr =
      IntegerAttr::get(IntegerType::get(context, 64), chunk_size);
  return Base::get(context, scopeAttr, chunkSizeAttr);
}

LogicalResult ScatterTensorDescAttr::verify(
    llvm::function_ref<InFlightDiagnostic()> emitError,
    MemorySpaceAttr memory_space, IntegerAttr chunk_size) {
  int64_t chunkSize = chunk_size.getInt();
  if (chunkSize <= 0)
    return emitError() << "invalid chunk size";
  return success();
}

//===----------------------------------------------------------------------===//
// XeGPU_LayoutAttr
//===----------------------------------------------------------------------===//
LogicalResult
LayoutAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
                   DenseI32ArrayAttr sg_layout, DenseI32ArrayAttr sg_data,
                   DenseI32ArrayAttr inst_data, DenseI32ArrayAttr lane_layout,
                   DenseI32ArrayAttr lane_data, DenseI32ArrayAttr order) {

  // A valid layout must include at least one of sg_layout, inst_data, or
  // lane_layout. sg_layout is essential for the workgroup-level layout, while
  // lane_layout is required for the subgroup-level layout.
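  // Informal illustration (the attribute text below is a sketch, not verified
  // assembly output): a workgroup-level layout typically carries subgroup
  // fields, e.g.
  //   #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 16]>
  // whereas a subgroup-level layout may only carry lane fields, e.g.
  //   #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>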
  if (!sg_layout && !inst_data && !lane_layout) {
    return emitError()
           << "expected at least one of sg_layout, inst_data or lane_layout";
  }

  // Check that sg_layout, inst_data and lane_layout have the same rank if
  // they are not null.
  if (sg_layout && inst_data && sg_layout.size() != inst_data.size()) {
    return emitError()
           << "expected sg_layout and inst_data to have the same rank";
  }

  if (sg_layout && lane_layout && sg_layout.size() != lane_layout.size()) {
    return emitError()
           << "expected sg_layout and lane_layout to have the same rank";
  }

  if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) {
    return emitError()
           << "expected inst_data and lane_layout to have the same rank";
  }

  // sg_data is optional for Workgroup layout, but its presence requires
  // sg_layout.
  if (sg_data) {
    if (!sg_layout)
      return emitError() << "expected sg_layout being used with sg_data";
    if (sg_data.size() != sg_layout.size())
      return emitError()
             << "expected sg_data and sg_layout to have the same rank";
  }

  // lane_data is optional for Subgroup layout, but its presence requires
  // lane_layout.
  if (lane_data) {
    if (!lane_layout)
      return emitError() << "expected lane_layout being used with lane_data";
    if (lane_data.size() != lane_layout.size())
      return emitError()
             << "expected lane_data and lane_layout to have the same rank";
  }

  if (order) {
    if (!sg_layout && !lane_layout)
      return emitError()
             << "expected sg_layout/lane_layout being used with order";
    if (sg_layout && order.size() != sg_layout.size())
      return emitError()
             << "expected order and sg_layout to have the same rank";
    if (lane_layout && order.size() != lane_layout.size())
      return emitError()
             << "expected order and lane_layout to have the same rank";
  }

  return success();
}

FailureOr<SmallVector<Value>>
LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
                                  Value linearId) {
  // delinearizeSubgroupId is only available for the workgroup-level layout
  // attribute.
  if (!isForWorkgroup())
    return failure();

  // TODO: handle order attribute
  auto hasDefaultOrder = [&]() {
    DenseI32ArrayAttr order = getOrder();
    return !order || isIdentityPermutation(llvm::to_vector_of<int64_t>(
                         llvm::reverse(order.asArrayRef())));
  };
  if (!hasDefaultOrder())
    return mlir::emitError(loc, "order attribute is currently not supported.");

  auto dims = llvm::map_to_vector(
      getEffectiveSgLayoutAsInt(), [&](int64_t d) -> Value {
        return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
      });

  return affine::delinearizeIndex(builder, loc, linearId, dims);
}

/// Implements DistributeLayoutAttr::getOffsets to generate
/// instructions for computing multi-dimensional offsets when distributed by
/// LayoutAttr.
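/// For example (a sketch with assumed values): given shape = [128, 128],
/// sg_layout = [4, 8] and sg_data = [32, 16], each subgroup covers a [32, 16]
/// tile and the generated index arithmetic evaluates to
/// [sgId[0] * 32 % 128, sgId[1] * 16 % 128] for each distribution unit.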
FailureOr<SmallVector<SmallVector<Value>>>
LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
                       ArrayRef<int64_t> shape) {
  if (!isForWorkgroup())
    return failure();

  SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt();
  SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt();
  if (sgShape.empty()) {
    if (auto derivedShape = computeShapeRatio(shape, sgLayout))
      sgShape = derivedShape.value();
    else
      return failure();
  }

  // delinearize Ids
  auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
  if (failed(maybeIds))
    return failure();
  SmallVector<Value> sgIds = *maybeIds;

  return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape,
                                  shape);
}

//===----------------------------------------------------------------------===//
// XeGPU_SliceAttr
//===----------------------------------------------------------------------===//
LogicalResult
SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
                  xegpu::DistributeLayoutAttr parent, DenseI64ArrayAttr dims) {
  if (!parent || !dims)
    return emitError() << "expected parent layout and dims attribute";

  int64_t rank = parent.getRank();

  // Check that every element in dims is unique and smaller than rank.
  llvm::SmallDenseSet<int64_t> seen;
  for (int64_t dim : dims.asArrayRef()) {
    if (dim < 0 || dim >= rank)
      return emitError() << "invalid dim (" << dim << ") in slice attribute.";
    if (!seen.insert(dim).second)
      return emitError() << "repeated dim (" << dim << ") in slice attribute.";
  }
  return success();
}

SliceAttr SliceAttr::flatten() const {
  xegpu::DistributeLayoutAttr parent = getParent();
  SmallVector<DenseI64ArrayAttr> slicedDims({getDims()});

  while (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(parent)) {
    parent = sliceAttr.getParent();
    slicedDims.push_back(sliceAttr.getDims());
  }

  auto layoutAttr = dyn_cast<xegpu::LayoutAttr>(parent);
  SmallVector<int64_t> indices =
      llvm::to_vector(llvm::seq<int64_t>(0, layoutAttr.getRank()));

  // Get the remaining (flattened) dims by applying slice ops with all
  // slicedDims.
  SmallVector<int64_t> remainingDims(indices);
  for (auto dim : llvm::reverse(slicedDims))
    remainingDims =
        XeGPUDialect::slice(llvm::ArrayRef(remainingDims), dim.asArrayRef());

  // Get the flattened sliced dims by applying slice ops with the remaining
  // dims.
  SmallVector<int64_t> flattenedDims = XeGPUDialect::slice(
      llvm::ArrayRef(indices), llvm::ArrayRef(remainingDims));

  return xegpu::SliceAttr::get(
      getContext(), layoutAttr,
      DenseI64ArrayAttr::get(getContext(), flattenedDims));
}

FailureOr<SmallVector<Value>>
SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
                                 Value linearId) {
  SliceAttr attr = flatten();
  auto parent = dyn_cast<LayoutAttr>(attr.getParent());
  return parent.delinearizeSubgroupId(builder, loc, linearId);
}

/// Implements DistributeLayoutAttr::getOffsets to generate
/// instructions for computing multi-dimensional offsets when distributed by
/// SliceAttr.
FailureOr<SmallVector<SmallVector<Value>>>
SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
                      ArrayRef<int64_t> shape) {
  assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape.");
  if (!isForWorkgroup())
    return failure();

  SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt();
  SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt();
  if (sgShape.empty()) {
    if (auto derivedShape = computeShapeRatio(shape, sgLayout))
      sgShape = derivedShape.value();
    else
      return failure();
  }

  // delinearize Ids
  auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
  if (failed(maybeIds))
    return failure();

  // The effective sgIds for the offset computation correspond to the dims
  // that are not sliced.
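  // Example (hypothetical values): for a 2D parent layout sliced over dim 1,
  // flatten().getDims() is [1], so maybeIds[1] is dropped below and only
  // maybeIds[0] participates in the offset computation.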
  ArrayRef<int64_t> dims = flatten().getDims().asArrayRef();
  SmallVector<Value> sgIds = XeGPUDialect::slice(ArrayRef(*maybeIds), dims);

  return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape,
                                  shape);
}

bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
  auto flattenedThis = flatten();

  // If other is a LayoutAttr, just compare directly with the parent of
  // flattenedThis.
  if (auto otherLayout = dyn_cast<xegpu::LayoutAttr>(other))
    return flattenedThis.getParent() == otherLayout;

  // If other is a SliceAttr, flatten it first before comparing.
  auto flattenedOther = dyn_cast<xegpu::SliceAttr>(other).flatten();

  // Both must have a common parent LayoutAttr.
  if (flattenedThis.getParent() != flattenedOther.getParent())
    return false;

  // flattenedOther's sliced dims must be a subset of flattenedThis's sliced
  // dims.
  llvm::SmallDenseSet<int64_t> thisDims(
      flattenedThis.getDims().asArrayRef().begin(),
      flattenedThis.getDims().asArrayRef().end());
  return llvm::all_of(flattenedOther.getDims().asArrayRef(),
                      [&](int64_t dim) { return thisDims.contains(dim); });
}

//===----------------------------------------------------------------------===//
// XeGPU_RangeAttr
//===----------------------------------------------------------------------===//
LogicalResult
RangeAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
                  IntegerAttr startOfRange, IntegerAttr endOfRange) {
  if (startOfRange.getInt() >= endOfRange.getInt())
    return emitError() << "'end' : " << endOfRange.getInt()
                       << " must be greater than 'start' : "
                       << startOfRange.getInt();
  return success();
}

//===----------------------------------------------------------------------===//
// XeGPU_TensorDescType
//===----------------------------------------------------------------------===//
mlir::Type TensorDescType::parse(AsmParser &parser) {
  llvm::SmallVector<int64_t> shape;
  mlir::Type elementType;
  mlir::FailureOr<mlir::Attribute> encoding;
  mlir::FailureOr<mlir::Attribute> layout;

  // Parse literal '<'
  if (parser.parseLess())
    return {};

  auto shapeLoc = parser.getCurrentLocation();
  if (mlir::failed(parser.parseDimensionList(shape))) {
    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
    return {};
  }

  auto elemTypeLoc = parser.getCurrentLocation();
  if (mlir::failed(parser.parseType(elementType))) {
    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
    return {};
  }

  // parse optional attributes
  while (mlir::succeeded(parser.parseOptionalComma())) {
    mlir::Attribute attr;
    ParseResult res = parser.parseAttribute(attr);
    if (mlir::succeeded(res)) {
      if (mlir::isa<DistributeLayoutAttr>(attr)) {
        layout = attr;
        continue;
      }
      if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) {
        encoding = attr;
        continue;
      }
    }
    return {};
  }

  // Parse literal '>'
  if (parser.parseGreater())
    return {};

  MLIRContext *ctxt = parser.getContext();
  return TensorDescType::getChecked(
      [&]() { return parser.emitError(parser.getNameLoc()); }, ctxt, shape,
      elementType, encoding.value_or(BlockTensorDescAttr::get(ctxt)),
      layout.value_or(mlir::Attribute()));
}

void TensorDescType::print(AsmPrinter &printer) const {
  printer << "<";

  auto shape = getShape();
  for (int64_t dim : shape) {
    if (mlir::ShapedType::isDynamic(dim))
      printer << '?';
    else
      printer << dim;
    printer << 'x';
  }

  printer << getElementType();

  auto encoding = getEncoding();
  auto blockAttr = llvm::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
  if (encoding && (!blockAttr || !blockAttr.hasDefaultsOnly()))
    printer << ", " << encoding;

  if (auto layout = getLayout())
    printer << ", " << layout;

  printer << ">";
}
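// As an informal illustration of the printed form (derived from the printer
// above, not an exhaustive reference): a descriptor with a default block
// encoding prints as
//   !xegpu.tensor_desc<8x16xf16>
// and any non-default encoding or layout attribute is appended as a trailing
// ", <attr>" entry before the closing '>'.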
TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
                                   mlir::Type elementType, int array_length,
                                   bool boundary_check,
                                   MemorySpace memory_space,
                                   mlir::Attribute layout) {
  auto context = elementType.getContext();
  auto attr = BlockTensorDescAttr::get(context, memory_space, array_length,
                                       boundary_check);
  return Base::get(context, shape, elementType, attr, layout);
}

TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
                                   mlir::Type elementType, int chunk_size,
                                   MemorySpace memory_space,
                                   mlir::Attribute layout) {
  auto context = elementType.getContext();
  auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size);
  return Base::get(context, shape, elementType, attr, layout);
}

LogicalResult
TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
                       llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
                       mlir::Attribute encoding, mlir::Attribute layout) {
  size_t rank = shape.size();

  if (rank == 0)
    return emitError() << "expected non-zero rank tensor";

  auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
  if (blockAttr) {
    MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
    if (rank > 1 && memorySpaceAttr &&
        memorySpaceAttr.getValue() == MemorySpace::SLM)
      return emitError() << "SLM is only supported for 1D block tensor";
  }

  // For gather and scatter ops, low-precision types are packed in 32-bit
  // units.
  unsigned bitWidth = elementType.getIntOrFloatBitWidth();
  int chunkAlignmentFactor =
      bitWidth < targetinfo::packedSizeInBitsForGatherScatter
          ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth
          : 1;
  auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
  if (scatterAttr) {
    int64_t chunkSize = scatterAttr.getChunkSizeAsInt();
    if (rank == 1 && chunkSize != 1)
      return emitError() << "expected non-contiguous elements for 1D tensor";

    // If chunk size > 1, the second dimension of the tensor shape must be
    // equal to chunk size and it must be a multiple of the
    // chunkAlignmentFactor.
    if (chunkSize > 1) {
      if (shape.back() != chunkSize)
        return emitError() << "expected last dim of tensor to match chunk size";
      if (shape.back() % chunkAlignmentFactor != 0)
        return emitError() << "expected last dim of tensor to be a multiple of "
                           << chunkAlignmentFactor;
    }
  }

  auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout);
  if (layoutAttr) {
    if (rank != (size_t)layoutAttr.getRank())
      return emitError() << "expected layout rank to match tensor rank";

    auto laneData = layoutAttr.getLaneData();
    if (scatterAttr && laneData) {
      // Validate subgroup mapping rules for scattered tensors.
      // If chunkSize > 1, the last dimension of the tensor should be
      // distributed in units divisible by chunkAlignmentFactor.
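      // Worked example (assumed values, for illustration only): for an f16
      // element type, packedSizeInBitsForGatherScatter / bitWidth = 32 / 16
      // gives a chunkAlignmentFactor of 2, so with chunkSize > 1 the last
      // entry of lane_data must be a multiple of 2.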
      int64_t chunkSize = scatterAttr.getChunkSizeAsInt();
      if (chunkSize > 1 && laneData[rank - 1] % chunkAlignmentFactor)
        return emitError()
               << "expected last dim of lane_data to be a multiple of: "
               << chunkAlignmentFactor;
    }

    if (!XeGPUDialect::isEvenlyDistributable(shape, layoutAttr)) {
      std::string shapeStr;
      llvm::raw_string_ostream stream(shapeStr);
      llvm::interleaveComma(shape, stream);
      return emitError() << "cannot distribute [" << shapeStr << "] using "
                         << layoutAttr;
    }
  }

  return success();
}

//===----------------------------------------------------------------------===//
// XeGPU_MemDescType
//===----------------------------------------------------------------------===//
mlir::Type MemDescType::parse(AsmParser &parser) {
  llvm::SmallVector<int64_t> shape;
  mlir::Type elementType;
  mlir::FailureOr<MemLayoutAttr> layout;

  // Parse literal '<'
  if (parser.parseLess())
    return {};

  auto shapeLoc = parser.getCurrentLocation();
  if (mlir::failed(parser.parseDimensionList(shape, false, true))) {
    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
    return {};
  }

  auto elemTypeLoc = parser.getCurrentLocation();
  if (mlir::failed(parser.parseType(elementType))) {
    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
    return {};
  }

  // parse optional attributes
  if (mlir::succeeded(parser.parseOptionalComma())) {
    MemLayoutAttr attr;
    ParseResult res = parser.parseAttribute(attr);
    if (mlir::failed(res))
      return {};
    layout = attr;
  }

  // Parse literal '>'
  if (parser.parseGreater())
    return {};

  MLIRContext *ctxt = parser.getContext();
  return MemDescType::getChecked(
      [&]() { return parser.emitError(parser.getNameLoc()); }, ctxt, shape,
      elementType, layout.value_or(MemLayoutAttr()));
}

void MemDescType::print(AsmPrinter &printer) const {
  printer << "<";

  printer.printDimensionList(getShape());
  printer << 'x';
  printer << getElementType();

  if (auto layout = getMemLayout())
    printer << ", " << layout;

  printer << ">";
}

//===----------------------------------------------------------------------===//
// XeGPU_MemLayoutAttr
//===----------------------------------------------------------------------===//
Attribute MemLayoutAttr::parse(AsmParser &parser, Type type) {
  auto context = parser.getContext();
  llvm::SMLoc loc = parser.getCurrentLocation();

  llvm::SmallDenseSet<StringRef> seenKeys;
  SmallVector<NamedAttribute> attributes;

  auto parseElt = [&]() -> ParseResult {
    StringRef nameId;
    if (failed(parser.parseKeyword(&nameId)))
      return parser.emitError(loc, "expected valid attribute name");

    if (!seenKeys.insert(nameId).second)
      return parser.emitError(loc, "duplicate key '")
             << nameId << "' in mem layout attribute";

    if (failed(parser.parseEqual()))
      return failure();

    Attribute attr;
    if (failed(parser.parseAttribute(attr)))
      return failure();
    attributes.emplace_back(nameId, attr);
    return success();
  };

  // Parse literal '<'
  if (parser.parseLess())
    return {};

  if (failed(parser.parseCommaSeparatedList(parseElt)))
    return {};

  // Parse literal '>'
  if (parser.parseGreater())
    return {};

  return parser.getChecked<MemLayoutAttr>(
      loc, context, DictionaryAttr::get(context, attributes));
}

void MemLayoutAttr::print(AsmPrinter &printer) const {
  printer << "<";
  ArrayRef<NamedAttribute> attrs = getAttrs().getValue();
  for (size_t i = 0; i < attrs.size(); i++) {
    printer << attrs[i].getName().str() << " = " << attrs[i].getValue();
    if (i < attrs.size() - 1)
      printer << ", ";
  }
  printer << ">";
}

} // namespace xegpu
} // namespace mlir

#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.cpp.inc>
#define GET_ATTRDEF_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
#define GET_TYPEDEF_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc>