Diffstat (limited to 'mlir/lib/Dialect')
5 files changed, 291 insertions, 81 deletions
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 81c3069..ec1571a 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -416,13 +416,39 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, if (ci.clusterSize >= 32) { if (chipset.majorVersion <= 9) { // Broadcast last value from each row to next row. - // Use row mask to avoid polluting rows 1 and 3. + // Use row mask to avoid polluting row 0 (and row 2 if wave-64). dpp = amdgpu::DPPOp::create(rewriter, loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15, rewriter.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false); res = vector::makeArithReduction( rewriter, loc, gpu::convertReductionKind(mode), res, dpp); + + // For subgroupSize = 64, at this point lanes [16, 32) contain the full + // reduction over lanes [0, 32), but lanes [0, 16) do not. Similarly, + // lanes [48, 64) contain the full reduction over lanes [32, 64), but + // lanes [32, 48) do not. + // + // If subgroup size is 64 and cluster size is 64, we don't need lanes [0, + // 16) and [32, 48) to have the correct cluster-32 reduction values at + // this point, because only lane 63's value will ultimately be read in + // this full-cluster case. + // + // If subgroup size is 64 and cluster size is 32, we need to ensure that + // lanes [0, 16) and [32, 48) have the correct final cluster-32 reduction + // values (subgroup_reduce guarantees that all lanes within each cluster + // contain the final reduction value). We do this by broadcasting lane + // 31's value to lanes [0, 16) and lanes 63's value to lanes [32, 48). + // + // See https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations + // for an illustration of how this within-cluster broadcast works with a + // swizzle. + if (ci.subgroupSize == 64 && ci.clusterSize == 32) { + res = + amdgpu::SwizzleBitModeOp::create(rewriter, loc, res, /*and_mask=*/0, + /*or_mask=*/31, + /*xor_mask=*/0); + } } else if (chipset.majorVersion <= 12) { // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). 
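      // Illustration (based on the bit-mode swizzle semantics documented in the
      // gpuopen article linked above, not part of the patch): within each group
      // of 32 lanes the source lane is computed as
      //   srcLane = ((lane & and_mask) | or_mask) ^ xor_mask
      // so with and_mask = 0, or_mask = 31, xor_mask = 0 every lane reads lane 31
      // of its own 32-lane half. On wave-64 this means lanes [0, 32) read lane 31
      // (the reduction of lanes [0, 32)) and lanes [32, 64) read lane 63 (the
      // reduction of lanes [32, 64)), completing the cluster-32 broadcast.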
Value uint32Max = arith::ConstantOp::create( diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp index 660c313..fbac28e 100644 --- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp +++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp @@ -145,3 +145,13 @@ std::string mlir::acc::getRecipeName(mlir::acc::RecipeKind kind, return recipeName; } + +mlir::Value mlir::acc::getBaseEntity(mlir::Value val) { + if (auto partialEntityAccessOp = + dyn_cast<PartialEntityAccessOpInterface>(val.getDefiningOp())) { + if (!partialEntityAccessOp.isCompleteView()) + return partialEntityAccessOp.getBaseEntity(); + } + + return val; +} diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index f9aa28d5..83406c8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -11,7 +11,6 @@ #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" @@ -229,8 +228,10 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, } if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) { - return emitError() - << "expected inst_data and lane_layout to have the same rank"; + return emitError() << "expected inst_data and lane_layout to have the same " + "rank, got inst_data " + << inst_data.size() << ", lane_layout " + << lane_layout.size(); } // sg_data is optional for Workgroup layout, but its presence requires @@ -569,8 +570,8 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError, // for gather and scatter ops, Low-precision types are packed in 32-bit units. unsigned bitWidth = elementType.getIntOrFloatBitWidth(); int chunkAlignmentFactor = - bitWidth < targetinfo::packedSizeInBitsForGatherScatter - ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth + bitWidth < xegpu::uArch::generalPackedFormatBitSize + ? 
xegpu::uArch::generalPackedFormatBitSize / bitWidth : 1; auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding); if (scatterAttr) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 8fab255..90eae87 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -14,7 +14,6 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/IR/Attributes.h" @@ -37,6 +36,8 @@ #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" + namespace mlir { namespace xegpu { #define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT @@ -104,6 +105,8 @@ public: SmallVector<int> getLaneData() const; + SmallVector<int> getInstData() const; + bool isSliceLayout() const { if (!isAssigned()) return false; @@ -137,6 +140,13 @@ SmallVector<int> LayoutInfo::getLaneData() const { [](int64_t val) { return static_cast<int>(val); }); } +SmallVector<int> LayoutInfo::getInstData() const { + if (!isAssigned()) + return {}; + return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(), + [](int64_t val) { return static_cast<int>(val); }); +} + void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { os << storage; @@ -174,12 +184,14 @@ LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const { SmallVector<int32_t> laneLayout; SmallVector<int32_t> laneData; + SmallVector<int32_t> instData; for (int64_t idx : permutation) { laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx])); laneData.push_back(static_cast<int32_t>(getLaneData()[idx])); + instData.push_back(static_cast<int32_t>(getInstData()[idx])); } - return LayoutInfo( - xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData)); + return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData, + laneLayout, laneData)); } //===----------------------------------------------------------------------===// @@ -192,6 +204,28 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> { using Lattice::Lattice; }; +/// Helper Function to find a proper instruction multiple for the user-supplied +/// sg-level data shape. `candidates` are uArch allowed shapes. +/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count). +template <typename T> +int getLargestDivisor(T dim, ArrayRef<T> candidates, + ArrayRef<T> candidateMultiples = {}) { + static_assert(std::is_integral<T>::value, "T must be an integer type"); + int largest = -1; + SmallVector<T> multiples = {1}; + if (!candidateMultiples.empty()) + multiples = + SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); + for (T candidate : candidates) { + for (T multiple : multiples) { + int value = static_cast<int>(candidate * multiple); + if (value != 0 && dim % value == 0 && value > largest) + largest = value; + } + } + return largest; +} + /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation /// (like DPAS). @@ -200,18 +234,32 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> { /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. 
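// Illustrative use of getLargestDivisor (the candidate values here are
// hypothetical, not taken from a real uArch description):
//   int w = getLargestDivisor(48, ArrayRef<int>({8, 16, 32}),
//                             ArrayRef<int>({1, 2, 4}));
// The candidate products are {8, 16, 32, 64, 128}; only 8 and 16 divide 48,
// so w == 16. When no product divides the dimension the helper returns -1,
// which the visitors below surface as a warning.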
/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, - unsigned rank) { + unsigned rank, + const xegpu::uArch::uArch *uArch, + ArrayRef<int> instData) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); if (rank == 1) { return LayoutInfo( - xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1})); + xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1})); } return LayoutInfo(xegpu::LayoutAttr::get( - ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1})); + ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1})); +} + +static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, + unsigned rank, int subgroupSize) { + assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); + if (rank == 1) { + return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1})); + } + return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1})); } /// Helper to get the default layout for a vector type. static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, + const xegpu::uArch::uArch *uArch, + ArrayRef<int> instData, + unsigned packingSize, bool isScattered = false) { // Expecting a 1D or 2D vector. assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) && @@ -221,28 +269,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1); + return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData); // Packing factor is determined by the element type bitwidth. - int packingFactor = 1; unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); + int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1; if (isScattered) { - packingFactor = - bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter - ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth - : 1; - return LayoutInfo(xegpu::LayoutAttr::get( - vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, - {1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + {uArch->getSubgroupSize(), 1}, + {1, packingFactor})); } - if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) - packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth; - return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), - {1, xegpu::targetinfo::subgroupSize}, + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + {1, uArch->getSubgroupSize()}, {1, packingFactor})); } /// Helper to get the default layout for a vector type. static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, + const xegpu::uArch::uArch *uArch, + ArrayRef<int> instData, + unsigned packingSize, bool isScattered = false) { // Expecting a 1D or 2D vector. assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) && @@ -252,27 +297,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (tdescTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1); + return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData); // Packing factor is determined by the element type bitwidth. 
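  // Worked example of the packing factor (parameter values are hypothetical):
  // with subgroupSize = 16, packingSize = 32 and an 8-bit element type the
  // factor is 32 / 8 = 4, giving
  //   non-scattered: lane_layout = [1, 16], lane_data = [1, 4]
  //   scattered:     lane_layout = [16, 1], lane_data = [1, 4]
  // while a 32-bit element type keeps packingFactor = 1.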
unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth(); - + int subgroupSize = uArch->getSubgroupSize(); + int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1; if (isScattered) { - int packingFactor = - bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter - ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth - : 1; return LayoutInfo(xegpu::LayoutAttr::get( - tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, - {1, packingFactor})); + tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor})); } - int packingFactor = - (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) - ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth - : 1; - return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), - {1, xegpu::targetinfo::subgroupSize}, - {1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get( + tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor})); } /// Helper Function to get the expected layouts for DPAS operands. `lane_data` @@ -281,25 +317,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, /// `packedSizeInBitsForDefault` /// * For B operand, the data must be packed in minimum /// `packedSizeInBitsForDpasB` -static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, - unsigned operandNum) { +static LayoutInfo +getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum, + const xegpu::uArch::uArch *uArch, + ArrayRef<int> instData, unsigned packingSize) { Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); - SmallVector<int32_t, 2> layout({1, xegpu::targetinfo::subgroupSize}); + SmallVector<int32_t, 2> layout({1, uArch->getSubgroupSize()}); // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and // must have the VNNI format. - if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < - xegpu::targetinfo::packedSizeInBitsForDpasB) { + if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < packingSize) { SmallVector<int32_t, 2> data( - {static_cast<int32_t>(xegpu::targetinfo::packedSizeInBitsForDpasB / - elementTy.getIntOrFloatBitWidth()), + {static_cast<int32_t>(packingSize / elementTy.getIntOrFloatBitWidth()), 1}); return LayoutInfo( - xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data)); + xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data)); } // Otherwise, return the default layout for the vector type. - return getDefaultSIMTLayoutInfo(vectorTy); + return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize); } //===----------------------------------------------------------------------===// @@ -456,7 +492,37 @@ void LayoutInfoPropagation::visitPrefetchNdOp( // Here we assign the default layout to the tensor descriptor operand of // prefetch. 
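  // Worked example of the DPAS-B packing above (hypothetical sizes): with
  // subgroupSize = 16, packingSize = 32 and a 16-bit B element type,
  // operandNum == 1 takes the VNNI path and yields
  //   lane_layout = [1, 16], lane_data = [2, 1]
  // i.e. each lane owns two vertically adjacent elements, the packed format
  // DPAS expects for B. A 32-bit B element type falls back to the default
  // layout instead.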
auto tdescTy = prefetch.getTensorDescType(); - auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy); + + auto uArch = getUArch(getChipStr(prefetch).value_or("")); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch)); + + auto blockWHC = + uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType()); + if (!blockWHC) + prefetch.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector<int> instData; + int instWidth = getLargestDivisor( + static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) + prefetch.emitWarning( + "No suitable instruction multiple found for the given shape."); + if (tdescTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = getLargestDivisor( + static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); + if (instHeight == -1) + prefetch.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } + auto prefetchLayout = getDefaultSIMTLayoutInfo( + tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize()); // Propagate the layout to the source tensor descriptor. propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); } @@ -475,10 +541,11 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( reduction.emitWarning("Expecting output type to be 1D vector."); return; } + auto uArch = getUArch(xegpu::getChipStr(reduction).value_or("")); // Given that the result is 1D, the layout of the operand should be 2D with // default layout. - LayoutInfo operandLayout = - getDefaultSIMTLayoutInfo(reduction->getContext(), 2); + LayoutInfo operandLayout = getDefaultSIMTLayoutInfo( + reduction->getContext(), 2, uArch->getSubgroupSize()); propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); // Accumulator should have the same layout as the result. 
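  // Worked example of the inst_data derivation for 2D block ops (the block
  // parameters here are hypothetical, not a specific Xe uArch): for a 64x48
  // tensor descriptor with supported block widths {16, 32}, heights {1..32}
  // and block counts {1, 2}:
  //   instWidth  = getLargestDivisor(48, widths, counts) = 16
  //   instHeight = getLargestDivisor(64, heights)        = 32
  // so inst_data = [32, 16]. A result of -1 from either call triggers the
  // "No suitable instruction multiple found" warning above.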
propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); @@ -557,15 +624,53 @@ void LayoutInfoPropagation::visitDpasOp( ArrayRef<const LayoutInfoLattice *> results) { VectorType aTy = dpas.getLhsType(); VectorType bTy = dpas.getRhsType(); - propagateIfChanged( - operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0))); - propagateIfChanged( - operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1))); + + auto uArch = getUArch(getChipStr(dpas).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction( + xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc)); + + const unsigned dataALen = aTy.getShape().front(); + auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); + const int maxALen = + getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); + if (maxALen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + + const unsigned dataBLen = bTy.getShape().back(); + auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); + const int maxBLen = + getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); + if (maxBLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector<int> instDataA = {maxALen, subgroupSize}; + SmallVector<int> instDataB = {subgroupSize, maxBLen}; + + propagateIfChanged(operands[0], + operands[0]->meet(getSIMTLayoutInfoForDPASOperand( + aTy, 0, uArch, instDataA, + uArchInstruction->getPackedFormatBitSizeA()))); + propagateIfChanged(operands[1], + operands[1]->meet(getSIMTLayoutInfoForDPASOperand( + bTy, 1, uArch, instDataB, + uArchInstruction->getPackedFormatBitSizeB()))); if (operands.size() > 2) { VectorType cTy = dpas.getAccType(); - propagateIfChanged( - operands[2], - operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2))); + const unsigned dataCLen = bTy.getShape().back(); + auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); + const int maxCLen = + getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); + if (maxCLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector<int> instDataC = {maxALen, maxCLen}; + propagateIfChanged(operands[2], + operands[2]->meet(getSIMTLayoutInfoForDPASOperand( + cTy, 2, uArch, instDataC, + uArchInstruction->getPackedFormatBitSizeB()))); } } @@ -573,7 +678,38 @@ void LayoutInfoPropagation::visitDpasOp( void LayoutInfoPropagation::visitStoreNdOp( xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results) { - LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType()); + + auto uArch = getUArch(getChipStr(store).value_or("")); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockStore)); + VectorType dataTy = store.getValueType(); + auto blockWHC = uArchInstruction->getBlockWidthHeightCount( + store.getValueType().getElementType()); + if (!blockWHC) + store.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector<int> instData; + int instWidth = getLargestDivisor( + static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) + store.emitWarning( + "No suitable 
instruction multiple found for the given shape."); + if (dataTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = getLargestDivisor( + static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); + if (instHeight == -1) + store.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } + LayoutInfo storeLayout = + getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData, + uArchInstruction->getPackedFormatBitSize()); // Both operands should have the same layout for (LayoutInfoLattice *operand : operands) propagateIfChanged(operand, operand->meet(storeLayout)); @@ -694,10 +830,23 @@ void LayoutInfoPropagation::visitLoadGatherOp( load.emitWarning("Not propagating, non-vector payload supplied."); return; } - LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true); + auto uArch = getUArch(getChipStr(load).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + SmallVector<int> instData{subgroupSize}; + if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1) + instData.push_back(chunkSize); + else if (auto srcTdescTy = + dyn_cast<xegpu::TensorDescType>(load.getSourceType())) { + if (srcTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + LayoutInfo layout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered*/ true); // Mask operand should have 1D default layout. - LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1); + LayoutInfo maskLayout = + getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); // Propagate the new layout to the tensor descriptor operand. if (isa<xegpu::TensorDescType>(load.getSourceType())) @@ -717,8 +866,10 @@ void LayoutInfoPropagation::visitCreateDescOp( // Need the layout of the descriptor to propagate to the operands. if (!descLayout.isAssigned()) return; + auto uArch = getUArch(getChipStr(createDesc).value_or("")); // For offset operand propagate 1D default layout. 
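  // Worked example of the DPAS inst_data derivation (the supported sizes are
  // hypothetical): with subgroupSize = 16, supported M = {1, 2, 4, 8} and
  // supported K = {16, 32}, an A operand of vector<32x16xf16> and a B operand
  // of vector<16x32xf16> give
  //   maxALen = getLargestDivisor(32u, supportedM) = 8
  //   maxBLen = getLargestDivisor(32u, supportedK) = 32
  // so instDataA = [8, 16] and instDataB = [16, 32]; the accumulator follows
  // the same pattern using the supported N values.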
- LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1); + LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1, + uArch->getSubgroupSize()); propagateIfChanged(operands[1], operands[1]->meet(layout)); } @@ -735,18 +886,30 @@ void LayoutInfoPropagation::visitStoreScatterOp( storeScatter.emitWarning("Not propagating, non-vector payload supplied."); return; } + auto uArch = getUArch(getChipStr(storeScatter).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + auto payloadShape = payloadTy.getShape(); if (payloadShape.size() > 1) assert( - payloadShape[0] == xegpu::targetinfo::subgroupSize && + payloadShape[0] == subgroupSize && "Expected the first dimension of 2D tensor descriptor to be equal to " "subgroup size."); - LayoutInfo payloadLayout = - getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true); + SmallVector<int> instData{subgroupSize}; + if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1) + instData.push_back(chunkSize); + else if (auto dstTdescTy = + dyn_cast<xegpu::TensorDescType>(storeScatter.getDestType())) { + if (dstTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered=*/true); LayoutInfo maskLayout = - getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1); + getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); // Propagate the payload operand layout propagateIfChanged(operands[0], operands[0]->meet(payloadLayout)); // Propagate the destination (if tdesc) operand layout @@ -1023,9 +1186,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() { LayoutInfo layout = analysis.getLayoutInfo(val); if (!layout.isAssigned()) return {}; + xegpu::DistributeLayoutAttr layoutAttr = + cast<xegpu::DistributeLayoutAttr>(layout.get()); + if (this->layoutKind == "lane") + layoutAttr = layoutAttr.dropInstData(); if (layout.isSliceLayout()) - return cast<xegpu::SliceAttr>(layout.get()); - return cast<xegpu::LayoutAttr>(layout.get()); + return cast<xegpu::SliceAttr>(layoutAttr); + return cast<xegpu::LayoutAttr>(layoutAttr); }; mlir::OpBuilder builder(&getContext()); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index d09dc19..5a3b27e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -11,10 +11,10 @@ #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -159,17 +159,18 @@ static bool requirePacked(const xegpu::LayoutAttr layout) { /// Helper function to check if the layout requires a transpose effect. static bool requireTranspose(const xegpu::LayoutAttr layout, - const std::string &chipStr) { + const xegpu::uArch::uArch *uArch) { // Return false for unsupported targets. // TODO: Add more support or move to target info. 
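  // Worked example of the scatter/gather inst_data (hypothetical subgroup
  // size 16): a store_scatter of vector<16x8xf16> with chunk_size = 8 uses
  // instData = {16, 8} and, with a 32-bit packed format, a payload layout of
  //   lane_layout = [16, 1], lane_data = [1, 2]
  // whereas chunk_size <= 1 leaves instData = {16}; the mask operand always
  // gets the 1D default lane_layout = [16], lane_data = [1].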
- if (chipStr != "pvc" && chipStr != "bmg") + if (uArch->getName().equals_insensitive("pvc") && + uArch->getName().equals_insensitive("bmg")) return false; if (!layout) return false; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); if (laneLayout.size() != 2) return false; - return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1; + return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1; } /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body @@ -199,6 +200,11 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> { using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern; LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const override { + auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or("")); + if (!uArch) + return rewriter.notifyMatchFailure( + gpuFuncOp, "Subgroup distribution requires target attribute attached " + "to set the warp size"); // If the function only contains a single void return, skip. if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) { return isa<gpu::ReturnOp>(op) && !op.getNumOperands(); @@ -230,7 +236,7 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> { ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults(); auto warpOp = gpu::WarpExecuteOnLane0Op::create( rewriter, laneId.getLoc(), gpuFuncResultType, laneId, - xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(), + uArch->getSubgroupSize(), newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes()); Block &warpBodyBlock = warpOp.getBodyRegion().front(); // Replace the ReturnOp of the original gpu function with a YieldOp. @@ -495,14 +501,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { warpOp, "warp result is not a xegpu::LoadNd op"); auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>(); + auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or("")); + if (!uArch) + return rewriter.notifyMatchFailure( + loadOp, "xegpu::LoadNdOp require target attribute attached to " + "determine transpose " + "requirement"); // Chip information is required to decide if the layout requires transpose // effect. - auto chipStr = xegpu::getChipStr(loadOp); - if (!chipStr) - return rewriter.notifyMatchFailure( - loadOp, - "xegpu::LoadNdOp require chip information to determine transpose " - "requirement"); // Expecting offsets to be present. SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets(); if (offsets.empty()) @@ -556,7 +562,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { // Set the packed attribute if the layout requires it. newLoadOp.setPacked(requirePacked(layout)); // Set the transpose attribute if the layout requires it. - if (requireTranspose(layout, chipStr.value())) + if (requireTranspose(layout, uArch)) newLoadOp.setTranspose( DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0})); Value distributedVal = newWarpOp.getResult(operandIdx); |
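      // Illustration of the transpose requirement above (hypothetical target
      // with subgroup size 16): a load_nd result annotated with
      // lane_layout = [16, 1] distributes one column per lane, so
      // requireTranspose returns true and the distributed xegpu::LoadNdOp is
      // emitted with transpose = [1, 0]; a row-wise lane_layout = [1, 16]
      // leaves the load untransposed.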
