path: root/mlir/lib
Diffstat (limited to 'mlir/lib')
-rw-r--r--  mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 9
-rw-r--r--  mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp | 4
-rw-r--r--  mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp | 5
-rw-r--r--  mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 28
-rw-r--r--  mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 57
-rw-r--r--  mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 9
-rw-r--r--  mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp | 10
-rw-r--r--  mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 2
-rw-r--r--  mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 12
-rw-r--r--  mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 293
-rw-r--r--  mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 30
-rw-r--r--  mlir/lib/Transforms/ViewOpGraph.cpp | 5
12 files changed, 363 insertions(+), 101 deletions(-)
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index a9efada..ec182f1 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -846,13 +846,8 @@ struct NVGPUMBarrierInitLowering
Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(),
adaptor.getMbarId(), rewriter);
Value count = truncToI32(b, adaptor.getCount());
- if (isMbarrierShared(mbarrierType)) {
- rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(
- op, barrier, count, adaptor.getPredicate());
- } else {
- rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count,
- adaptor.getPredicate());
- }
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count,
+ adaptor.getPredicate());
return success();
}
};
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 41d8d53..69a317ec 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -716,7 +716,7 @@ lowerReductionWithStartValue(ConversionPatternRewriter &rewriter, Location loc,
accumulator = getOrCreateAccumulator<ReductionNeutral>(rewriter, loc,
llvmType, accumulator);
return LLVMRedIntrinOp::create(rewriter, loc, llvmType,
- /*startValue=*/accumulator, vectorOperand,
+ /*start_value=*/accumulator, vectorOperand,
fmf);
}
@@ -743,7 +743,7 @@ static Value lowerPredicatedReductionWithStartValue(
Value vectorLength =
createVectorLengthValue(rewriter, loc, vectorOperand.getType());
return LLVMVPRedIntrinOp::create(rewriter, loc, llvmType,
- /*startValue=*/accumulator, vectorOperand,
+                                 /*start_value=*/accumulator, vectorOperand,
mask, vectorLength);
}
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index e08cc6f..d428fbf 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -1106,10 +1106,7 @@ static bool isUniformDefinition(Value value,
return false;
}
- if (!value.getType().isIntOrIndexOrFloat())
- return false;
-
- return true;
+ return value.getType().isIntOrIndexOrFloat();
}
/// Generates a broadcast op for the provided uniform value using the
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 81c3069..ec1571a 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -416,13 +416,39 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
if (ci.clusterSize >= 32) {
if (chipset.majorVersion <= 9) {
// Broadcast last value from each row to next row.
- // Use row mask to avoid polluting rows 1 and 3.
+ // Use row mask to avoid polluting row 0 (and row 2 if wave-64).
dpp = amdgpu::DPPOp::create(rewriter, loc, res.getType(), res, res,
amdgpu::DPPPerm::row_bcast_15,
rewriter.getUnitAttr(), 0xa, allBanks,
/*bound_ctrl*/ false);
res = vector::makeArithReduction(
rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+
+ // For subgroupSize = 64, at this point lanes [16, 32) contain the full
+ // reduction over lanes [0, 32), but lanes [0, 16) do not. Similarly,
+ // lanes [48, 64) contain the full reduction over lanes [32, 64), but
+ // lanes [32, 48) do not.
+ //
+ // If subgroup size is 64 and cluster size is 64, we don't need lanes [0,
+ // 16) and [32, 48) to have the correct cluster-32 reduction values at
+ // this point, because only lane 63's value will ultimately be read in
+ // this full-cluster case.
+ //
+ // If subgroup size is 64 and cluster size is 32, we need to ensure that
+ // lanes [0, 16) and [32, 48) have the correct final cluster-32 reduction
+ // values (subgroup_reduce guarantees that all lanes within each cluster
+ // contain the final reduction value). We do this by broadcasting lane
+      // 31's value to lanes [0, 16) and lane 63's value to lanes [32, 48).
+ //
+ // See https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations
+ // for an illustration of how this within-cluster broadcast works with a
+ // swizzle.
+ if (ci.subgroupSize == 64 && ci.clusterSize == 32) {
+ res =
+ amdgpu::SwizzleBitModeOp::create(rewriter, loc, res, /*and_mask=*/0,
+ /*or_mask=*/31,
+ /*xor_mask=*/0);
+ }
} else if (chipset.majorVersion <= 12) {
// Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
Value uint32Max = arith::ConstantOp::create(
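A minimal standalone sketch (not part of the patch above) of the lane mapping the SwizzleBitModeOp arguments rely on, assuming the bit-mode swizzle formula ((lane & and_mask) | or_mask) ^ xor_mask applied within each 32-lane group: with and_mask = 0, or_mask = 31, xor_mask = 0 every lane reads lane 31 of its own group, giving the within-cluster broadcast described in the comment.

// Standalone sketch: models the ds_swizzle bit-mode lane mapping assumed by
// the SwizzleBitModeOp above, i.e. within each group of 32 lanes the source
// lane is ((lane & and_mask) | or_mask) ^ xor_mask.
#include <cassert>

static unsigned swizzleSourceLane(unsigned lane, unsigned andMask,
                                  unsigned orMask, unsigned xorMask) {
  unsigned group = lane & ~31u;   // base of the lane's 32-lane group (0 or 32)
  unsigned inGroup = lane & 31u;  // position within the group
  return group | (((inGroup & andMask) | orMask) ^ xorMask);
}

int main() {
  // and_mask = 0, or_mask = 31, xor_mask = 0: every lane reads lane 31 of its
  // own 32-lane group, so lanes [0, 32) read lane 31 and lanes [32, 64) read
  // lane 63.
  for (unsigned lane = 0; lane < 32; ++lane)
    assert(swizzleSourceLane(lane, 0, 31, 0) == 31);
  for (unsigned lane = 32; lane < 64; ++lane)
    assert(swizzleSourceLane(lane, 0, 31, 0) == 63);
  return 0;
}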
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index f0de4db..a5ffb9e 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -896,6 +896,12 @@ std::pair<mlir::Type, unsigned> NVVM::inferMMAType(NVVM::MMATypes type,
} else if (type == NVVM::MMATypes::f32) {
elementType = builder.getF32Type();
numberElements = 8;
+ } else if (type == NVVM::MMATypes::f64) {
+ elementType = builder.getF64Type();
+ if (frag == NVVM::MMAFrag::a || frag == NVVM::MMAFrag::b)
+ numberElements = 1;
+ else
+ numberElements = 2;
} else if (type == NVVM::MMATypes::tf32) {
elementType = builder.getI32Type();
numberElements = 4;
@@ -954,6 +960,14 @@ LogicalResult NVVM::WMMALoadOp::verify() {
return emitOpError() << "invalid attribute combination";
std::pair<Type, unsigned> typeInfo = inferMMATypeFromMNK(
getEltype(), getFrag(), getM(), getN(), getK(), getContext());
+ // Special case for f64 fragments
+ Type f64Ty = Float64Type::get(getContext());
+ if (typeInfo.first == f64Ty && typeInfo.second == 1) {
+ if (getType() != f64Ty)
+ return emitOpError("expected destination type to be f64");
+ return success();
+ }
+ // Everything else is a struct
Type dstType = LLVM::LLVMStructType::getLiteral(
getContext(), SmallVector<Type, 8>(typeInfo.second, typeInfo.first));
if (getType() != dstType)
@@ -1608,9 +1622,52 @@ void Tcgen05MmaSmemDescOp::createSmemDescriptor(Operation &op,
}
//===----------------------------------------------------------------------===//
+// getPtx methods
+//===----------------------------------------------------------------------===//
+
+std::string NVVM::MBarrierInitOp::getPtx() {
+ unsigned addressSpace =
+ llvm::cast<LLVM::LLVMPointerType>(getAddr().getType()).getAddressSpace();
+ return (addressSpace == NVVMMemorySpace::Shared)
+ ? std::string("mbarrier.init.shared.b64 [%0], %1;")
+ : std::string("mbarrier.init.b64 [%0], %1;");
+}
+
+//===----------------------------------------------------------------------===//
// getIntrinsicID/getIntrinsicIDAndArgs methods
//===----------------------------------------------------------------------===//
+mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::MBarrierInitOp>(op);
+ unsigned addressSpace =
+ llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType())
+ .getAddressSpace();
+ llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared)
+ ? llvm::Intrinsic::nvvm_mbarrier_init_shared
+ : llvm::Intrinsic::nvvm_mbarrier_init;
+
+ // Fill the Intrinsic Args
+ llvm::SmallVector<llvm::Value *> args;
+ args.push_back(mt.lookupValue(thisOp.getAddr()));
+ args.push_back(mt.lookupValue(thisOp.getCount()));
+
+ return {id, std::move(args)};
+}
+
+mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::MBarrierInvalOp>(op);
+ unsigned addressSpace =
+ llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType())
+ .getAddressSpace();
+ llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared)
+ ? llvm::Intrinsic::nvvm_mbarrier_inval_shared
+ : llvm::Intrinsic::nvvm_mbarrier_inval;
+
+ return {id, {mt.lookupValue(thisOp.getAddr())}};
+}
+
#define CP_ASYNC_ID_IMPL(mod, size, suffix) \
llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix
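A minimal standalone sketch (not part of the patch above) of the address-space dispatch performed by the new MBarrierInitOp::getPtx and getIntrinsicIDAndArgs hooks: a pointer in shared memory selects the ".shared" PTX/intrinsic variant, anything else falls back to the generic form. The MemorySpace enum below is illustrative, not the dialect's definition.

// Standalone sketch of the shared-vs-generic selection used above.
#include <cassert>
#include <string>

namespace sketch {
enum class MemorySpace { Generic = 0, Shared = 3 }; // illustrative values only

std::string mbarrierInitPtx(MemorySpace addressSpace) {
  return addressSpace == MemorySpace::Shared
             ? "mbarrier.init.shared.b64 [%0], %1;"
             : "mbarrier.init.b64 [%0], %1;";
}
} // namespace sketch

int main() {
  assert(sketch::mbarrierInitPtx(sketch::MemorySpace::Shared) ==
         "mbarrier.init.shared.b64 [%0], %1;");
  assert(sketch::mbarrierInitPtx(sketch::MemorySpace::Generic) ==
         "mbarrier.init.b64 [%0], %1;");
  return 0;
}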
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index b09112b..3a43382 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1958,7 +1958,7 @@ enum class OuterOrInnerPerm { Outer = 0, Inner = 1 };
/// Return true if either `op` or `permutation` are empty to allow a simpler
/// polymorphic implementation.
template <typename RelayoutOpTy>
-bool isValidPackingPermutation(
+static bool isValidPackingPermutation(
RelayoutOpTy op, ArrayRef<int64_t> permutation,
OuterOrInnerPerm outerOrInnerPerm = OuterOrInnerPerm::Outer) {
static_assert(
@@ -4322,9 +4322,10 @@ DiagnosedSilenceableFailure transform::TransposeMatmulOp::applyToOne(
// InsertSliceToCopyOp
//===----------------------------------------------------------------------===//
template <typename OpTy>
-DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target,
- transform::ApplyToEachResultList &results,
- transform::TransformState &state) {
+static DiagnosedSilenceableFailure
+doit(RewriterBase &rewriter, OpTy target,
+ transform::ApplyToEachResultList &results,
+ transform::TransformState &state) {
static_assert(llvm::is_one_of<OpTy, tensor::InsertSliceOp,
tensor::ParallelInsertSliceOp>() &&
"wrong op type");
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
index 660c313..fbac28e 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
@@ -145,3 +145,13 @@ std::string mlir::acc::getRecipeName(mlir::acc::RecipeKind kind,
return recipeName;
}
+
+mlir::Value mlir::acc::getBaseEntity(mlir::Value val) {
+ if (auto partialEntityAccessOp =
+ dyn_cast<PartialEntityAccessOpInterface>(val.getDefiningOp())) {
+ if (!partialEntityAccessOp.isCompleteView())
+ return partialEntityAccessOp.getBaseEntity();
+ }
+
+ return val;
+}
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index ad8255a..ae3423c 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4336,7 +4336,7 @@ OpFoldResult ExtractStridedSliceOp::fold(FoldAdaptor adaptor) {
// ExtractStridedSliceOp(splat ConstantOp) -> ConstantOp.
if (auto splat =
llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getSource()))
- DenseElementsAttr::get(getType(), splat.getSplatValue<Attribute>());
+ return DenseElementsAttr::get(getType(), splat.getSplatValue<Attribute>());
// ExtractStridedSliceOp(non-splat ConstantOp) -> ConstantOp.
return foldExtractStridedSliceNonSplatConstant(*this, adaptor.getSource());
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index f9aa28d5..6b4c185 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -11,7 +11,7 @@
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"
@@ -229,8 +229,10 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
}
if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) {
- return emitError()
- << "expected inst_data and lane_layout to have the same rank";
+ return emitError() << "expected inst_data and lane_layout to have the same "
+ "rank, got inst_data "
+ << inst_data.size() << ", lane_layout "
+ << lane_layout.size();
}
// sg_data is optional for Workgroup layout, but its presence requires
@@ -569,8 +571,8 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
// for gather and scatter ops, Low-precision types are packed in 32-bit units.
unsigned bitWidth = elementType.getIntOrFloatBitWidth();
int chunkAlignmentFactor =
- bitWidth < targetinfo::packedSizeInBitsForGatherScatter
- ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth
+ bitWidth < xegpu::uArch::generalPackedFormatBitSize
+ ? xegpu::uArch::generalPackedFormatBitSize / bitWidth
: 1;
auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
if (scatterAttr) {
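A worked example of the chunk alignment factor computed above, assuming the packed unit is 32 bits as the surrounding comment ("packed in 32-bit units") indicates: elements narrower than the unit are packed packedBits / bitWidth per unit; wider or equal elements get a factor of 1.

// Standalone sketch of the chunk alignment factor computation.
#include <cassert>

static int chunkAlignmentFactor(unsigned bitWidth, unsigned packedBits = 32) {
  return bitWidth < packedBits ? static_cast<int>(packedBits / bitWidth) : 1;
}

int main() {
  assert(chunkAlignmentFactor(8) == 4);   // i8: four elements per 32-bit unit
  assert(chunkAlignmentFactor(16) == 2);  // f16/bf16: two elements per unit
  assert(chunkAlignmentFactor(32) == 1);  // f32/i32: no extra packing
  assert(chunkAlignmentFactor(64) == 1);  // f64: wider than the unit, factor 1
  return 0;
}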
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 8fab255..90eae87 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -14,7 +14,6 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/IR/Attributes.h"
@@ -37,6 +36,8 @@
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+
namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT
@@ -104,6 +105,8 @@ public:
SmallVector<int> getLaneData() const;
+ SmallVector<int> getInstData() const;
+
bool isSliceLayout() const {
if (!isAssigned())
return false;
@@ -137,6 +140,13 @@ SmallVector<int> LayoutInfo::getLaneData() const {
[](int64_t val) { return static_cast<int>(val); });
}
+SmallVector<int> LayoutInfo::getInstData() const {
+ if (!isAssigned())
+ return {};
+ return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(),
+ [](int64_t val) { return static_cast<int>(val); });
+}
+
void LayoutInfo::print(raw_ostream &os) const {
if (isAssigned()) {
os << storage;
@@ -174,12 +184,14 @@ LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const {
SmallVector<int32_t> laneLayout;
SmallVector<int32_t> laneData;
+ SmallVector<int32_t> instData;
for (int64_t idx : permutation) {
laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx]));
laneData.push_back(static_cast<int32_t>(getLaneData()[idx]));
+ instData.push_back(static_cast<int32_t>(getInstData()[idx]));
}
- return LayoutInfo(
- xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData));
+ return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData,
+ laneLayout, laneData));
}
//===----------------------------------------------------------------------===//
@@ -192,6 +204,28 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
using Lattice::Lattice;
};
+/// Helper function to find the largest instruction multiple that evenly
+/// divides the user-supplied sg-level data shape. `candidates` are the
+/// uArch-allowed shapes; `candidateMultiples` are their multiples (e.g., block count).
+template <typename T>
+int getLargestDivisor(T dim, ArrayRef<T> candidates,
+ ArrayRef<T> candidateMultiples = {}) {
+ static_assert(std::is_integral<T>::value, "T must be an integer type");
+ int largest = -1;
+ SmallVector<T> multiples = {1};
+ if (!candidateMultiples.empty())
+ multiples =
+ SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
+ for (T candidate : candidates) {
+ for (T multiple : multiples) {
+ int value = static_cast<int>(candidate * multiple);
+ if (value != 0 && dim % value == 0 && value > largest)
+ largest = value;
+ }
+ }
+ return largest;
+}
+
/// Helper Functions to get default layouts. A `default layout` is a layout that
/// is assigned to a value when the layout is not fixed by some anchor operation
/// (like DPAS).
@@ -200,18 +234,32 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
- unsigned rank) {
+ unsigned rank,
+ const xegpu::uArch::uArch *uArch,
+ ArrayRef<int> instData) {
assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
if (rank == 1) {
return LayoutInfo(
- xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1}));
+ xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1}));
}
return LayoutInfo(xegpu::LayoutAttr::get(
- ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1}));
+ ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1}));
+}
+
+static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
+ unsigned rank, int subgroupSize) {
+ assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
+ if (rank == 1) {
+ return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1}));
+ }
+ return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1}));
}
/// Helper to get the default layout for a vector type.
static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
+ const xegpu::uArch::uArch *uArch,
+ ArrayRef<int> instData,
+ unsigned packingSize,
bool isScattered = false) {
// Expecting a 1D or 2D vector.
assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
@@ -221,28 +269,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
"Expected int or float element type.");
// If the rank is 1, then return default layout for 1D vector.
if (vectorTy.getRank() == 1)
- return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1);
+ return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData);
// Packing factor is determined by the element type bitwidth.
- int packingFactor = 1;
unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
+ int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
if (isScattered) {
- packingFactor =
- bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
- ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
- : 1;
- return LayoutInfo(xegpu::LayoutAttr::get(
- vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1},
- {1, packingFactor}));
+ return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+ {uArch->getSubgroupSize(), 1},
+ {1, packingFactor}));
}
- if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
- packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth;
- return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(),
- {1, xegpu::targetinfo::subgroupSize},
+ return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+ {1, uArch->getSubgroupSize()},
{1, packingFactor}));
}
/// Helper to get the default layout for a vector type.
static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
+ const xegpu::uArch::uArch *uArch,
+ ArrayRef<int> instData,
+ unsigned packingSize,
bool isScattered = false) {
// Expecting a 1D or 2D vector.
assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) &&
@@ -252,27 +297,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
"Expected int or float element type.");
// If the rank is 1, then return default layout for 1D vector.
if (tdescTy.getRank() == 1)
- return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1);
+ return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData);
// Packing factor is determined by the element type bitwidth.
unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth();
-
+ int subgroupSize = uArch->getSubgroupSize();
+ int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
if (isScattered) {
- int packingFactor =
- bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
- ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
- : 1;
return LayoutInfo(xegpu::LayoutAttr::get(
- tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1},
- {1, packingFactor}));
+ tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor}));
}
- int packingFactor =
- (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
- ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth
- : 1;
- return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(),
- {1, xegpu::targetinfo::subgroupSize},
- {1, packingFactor}));
+ return LayoutInfo(xegpu::LayoutAttr::get(
+ tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor}));
}
/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
@@ -281,25 +317,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
/// `packedSizeInBitsForDefault`
/// * For B operand, the data must be packed in minimum
/// `packedSizeInBitsForDpasB`
-static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy,
- unsigned operandNum) {
+static LayoutInfo
+getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum,
+ const xegpu::uArch::uArch *uArch,
+ ArrayRef<int> instData, unsigned packingSize) {
Type elementTy = vectorTy.getElementType();
assert(elementTy.isIntOrFloat() &&
"Expected int or float type in DPAS operands");
- SmallVector<int32_t, 2> layout({1, xegpu::targetinfo::subgroupSize});
+ SmallVector<int32_t, 2> layout({1, uArch->getSubgroupSize()});
// For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
// must have the VNNI format.
- if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() <
- xegpu::targetinfo::packedSizeInBitsForDpasB) {
+ if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < packingSize) {
SmallVector<int32_t, 2> data(
- {static_cast<int32_t>(xegpu::targetinfo::packedSizeInBitsForDpasB /
- elementTy.getIntOrFloatBitWidth()),
+ {static_cast<int32_t>(packingSize / elementTy.getIntOrFloatBitWidth()),
1});
return LayoutInfo(
- xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data));
+ xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data));
}
// Otherwise, return the default layout for the vector type.
- return getDefaultSIMTLayoutInfo(vectorTy);
+ return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize);
}
//===----------------------------------------------------------------------===//
@@ -456,7 +492,37 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
// Here we assign the default layout to the tensor descriptor operand of
// prefetch.
auto tdescTy = prefetch.getTensorDescType();
- auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy);
+
+ auto uArch = getUArch(getChipStr(prefetch).value_or(""));
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
+ uArch->getInstruction(
+ xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
+
+ auto blockWHC =
+ uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
+ if (!blockWHC)
+ prefetch.emitWarning("No known block params found for the element type.");
+ auto [bWidth, bHeight, bCount] = blockWHC.value();
+ SmallVector<int> instData;
+ int instWidth = getLargestDivisor(
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
+ bCount);
+ if (instWidth == -1)
+ prefetch.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ if (tdescTy.getRank() == 1)
+ instData = {instWidth};
+ else {
+ int instHeight = getLargestDivisor(
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
+ if (instHeight == -1)
+ prefetch.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ instData = {instHeight, instWidth};
+ }
+ auto prefetchLayout = getDefaultSIMTLayoutInfo(
+ tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize());
// Propagate the layout to the source tensor descriptor.
propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
}
@@ -475,10 +541,11 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
reduction.emitWarning("Expecting output type to be 1D vector.");
return;
}
+ auto uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
// Given that the result is 1D, the layout of the operand should be 2D with
// default layout.
- LayoutInfo operandLayout =
- getDefaultSIMTLayoutInfo(reduction->getContext(), 2);
+ LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(
+ reduction->getContext(), 2, uArch->getSubgroupSize());
propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
// Accumulator should have the same layout as the result.
propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
@@ -557,15 +624,53 @@ void LayoutInfoPropagation::visitDpasOp(
ArrayRef<const LayoutInfoLattice *> results) {
VectorType aTy = dpas.getLhsType();
VectorType bTy = dpas.getRhsType();
- propagateIfChanged(
- operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0)));
- propagateIfChanged(
- operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1)));
+
+ auto uArch = getUArch(getChipStr(dpas).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
+ xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
+
+ const unsigned dataALen = aTy.getShape().front();
+ auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
+ const int maxALen =
+ getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
+ if (maxALen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+
+ const unsigned dataBLen = bTy.getShape().back();
+ auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType());
+ const int maxBLen =
+ getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
+ if (maxBLen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ SmallVector<int> instDataA = {maxALen, subgroupSize};
+ SmallVector<int> instDataB = {subgroupSize, maxBLen};
+
+ propagateIfChanged(operands[0],
+ operands[0]->meet(getSIMTLayoutInfoForDPASOperand(
+ aTy, 0, uArch, instDataA,
+ uArchInstruction->getPackedFormatBitSizeA())));
+ propagateIfChanged(operands[1],
+ operands[1]->meet(getSIMTLayoutInfoForDPASOperand(
+ bTy, 1, uArch, instDataB,
+ uArchInstruction->getPackedFormatBitSizeB())));
if (operands.size() > 2) {
VectorType cTy = dpas.getAccType();
- propagateIfChanged(
- operands[2],
- operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2)));
+ const unsigned dataCLen = bTy.getShape().back();
+ auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType());
+ const int maxCLen =
+ getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen));
+ if (maxCLen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ SmallVector<int> instDataC = {maxALen, maxCLen};
+ propagateIfChanged(operands[2],
+ operands[2]->meet(getSIMTLayoutInfoForDPASOperand(
+ cTy, 2, uArch, instDataC,
+ uArchInstruction->getPackedFormatBitSizeB())));
}
}
@@ -573,7 +678,38 @@ void LayoutInfoPropagation::visitDpasOp(
void LayoutInfoPropagation::visitStoreNdOp(
xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType());
+
+ auto uArch = getUArch(getChipStr(store).value_or(""));
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
+ uArch->getInstruction(
+ xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
+ VectorType dataTy = store.getValueType();
+ auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
+ store.getValueType().getElementType());
+ if (!blockWHC)
+ store.emitWarning("No known block params found for the element type.");
+ auto [bWidth, bHeight, bCount] = blockWHC.value();
+ SmallVector<int> instData;
+ int instWidth = getLargestDivisor(
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
+ bCount);
+ if (instWidth == -1)
+ store.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ if (dataTy.getRank() == 1)
+ instData = {instWidth};
+ else {
+ int instHeight = getLargestDivisor(
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
+ if (instHeight == -1)
+ store.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ instData = {instHeight, instWidth};
+ }
+ LayoutInfo storeLayout =
+ getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData,
+ uArchInstruction->getPackedFormatBitSize());
// Both operands should have the same layout
for (LayoutInfoLattice *operand : operands)
propagateIfChanged(operand, operand->meet(storeLayout));
@@ -694,10 +830,23 @@ void LayoutInfoPropagation::visitLoadGatherOp(
load.emitWarning("Not propagating, non-vector payload supplied.");
return;
}
- LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true);
+ auto uArch = getUArch(getChipStr(load).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+ SmallVector<int> instData{subgroupSize};
+ if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1)
+ instData.push_back(chunkSize);
+ else if (auto srcTdescTy =
+ dyn_cast<xegpu::TensorDescType>(load.getSourceType())) {
+ if (srcTdescTy.getChunkSizeAsInt() > 1)
+      instData.push_back(srcTdescTy.getChunkSizeAsInt());
+ }
+ LayoutInfo layout = getDefaultSIMTLayoutInfo(
+ payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
+ /*scattered*/ true);
// Mask operand should have 1D default layout.
- LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1);
+ LayoutInfo maskLayout =
+ getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
// Propagate the new layout to the tensor descriptor operand.
if (isa<xegpu::TensorDescType>(load.getSourceType()))
@@ -717,8 +866,10 @@ void LayoutInfoPropagation::visitCreateDescOp(
// Need the layout of the descriptor to propagate to the operands.
if (!descLayout.isAssigned())
return;
+ auto uArch = getUArch(getChipStr(createDesc).value_or(""));
// For offset operand propagate 1D default layout.
- LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1);
+ LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1,
+ uArch->getSubgroupSize());
propagateIfChanged(operands[1], operands[1]->meet(layout));
}
@@ -735,18 +886,30 @@ void LayoutInfoPropagation::visitStoreScatterOp(
storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
return;
}
+ auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+
auto payloadShape = payloadTy.getShape();
if (payloadShape.size() > 1)
assert(
- payloadShape[0] == xegpu::targetinfo::subgroupSize &&
+ payloadShape[0] == subgroupSize &&
"Expected the first dimension of 2D tensor descriptor to be equal to "
"subgroup size.");
- LayoutInfo payloadLayout =
- getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true);
+ SmallVector<int> instData{subgroupSize};
+ if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1)
+ instData.push_back(chunkSize);
+ else if (auto dstTdescTy =
+ dyn_cast<xegpu::TensorDescType>(storeScatter.getDestType())) {
+ if (dstTdescTy.getChunkSizeAsInt() > 1)
+      instData.push_back(dstTdescTy.getChunkSizeAsInt());
+ }
+ LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo(
+ payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
+ /*scattered=*/true);
LayoutInfo maskLayout =
- getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1);
+ getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
// Propagate the payload operand layout
propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
// Propagate the destination (if tdesc) operand layout
@@ -1023,9 +1186,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() {
LayoutInfo layout = analysis.getLayoutInfo(val);
if (!layout.isAssigned())
return {};
+ xegpu::DistributeLayoutAttr layoutAttr =
+ cast<xegpu::DistributeLayoutAttr>(layout.get());
+ if (this->layoutKind == "lane")
+ layoutAttr = layoutAttr.dropInstData();
if (layout.isSliceLayout())
- return cast<xegpu::SliceAttr>(layout.get());
- return cast<xegpu::LayoutAttr>(layout.get());
+ return cast<xegpu::SliceAttr>(layoutAttr);
+ return cast<xegpu::LayoutAttr>(layoutAttr);
};
mlir::OpBuilder builder(&getContext());
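A standalone copy of the getLargestDivisor helper added above, mirrored here only to show its behaviour on concrete shapes: it returns the largest candidate * multiple product that evenly divides the dimension, or -1 when nothing divides it, which is what triggers the "No suitable instruction multiple found" warnings.

// Standalone sketch mirroring the getLargestDivisor helper from the patch.
#include <cassert>
#include <type_traits>
#include <vector>

template <typename T>
static int getLargestDivisor(T dim, const std::vector<T> &candidates,
                             const std::vector<T> &candidateMultiples = {}) {
  static_assert(std::is_integral<T>::value, "T must be an integer type");
  int largest = -1;
  std::vector<T> multiples =
      candidateMultiples.empty() ? std::vector<T>{1} : candidateMultiples;
  for (T candidate : candidates)
    for (T multiple : multiples) {
      int value = static_cast<int>(candidate * multiple);
      if (value != 0 && dim % value == 0 && value > largest)
        largest = value;
    }
  return largest;
}

int main() {
  // Block widths {8, 16, 32} with block counts {1, 2, 4}: for a 48-wide
  // dimension the largest evenly dividing product is 16 (16 * 1).
  assert(getLargestDivisor(48, std::vector<int>{8, 16, 32},
                           std::vector<int>{1, 2, 4}) == 16);
  // No candidate divides 30, so the helper reports -1 and the pass warns.
  assert(getLargestDivisor(30, std::vector<int>{8, 16, 32}) == -1);
  return 0;
}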
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d09dc19..5a3b27e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -11,10 +11,10 @@
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
@@ -159,17 +159,18 @@ static bool requirePacked(const xegpu::LayoutAttr layout) {
/// Helper function to check if the layout requires a transpose effect.
static bool requireTranspose(const xegpu::LayoutAttr layout,
- const std::string &chipStr) {
+ const xegpu::uArch::uArch *uArch) {
// Return false for unsupported targets.
// TODO: Add more support or move to target info.
- if (chipStr != "pvc" && chipStr != "bmg")
+  if (!uArch->getName().equals_insensitive("pvc") &&
+      !uArch->getName().equals_insensitive("bmg"))
return false;
if (!layout)
return false;
auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
if (laneLayout.size() != 2)
return false;
- return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1;
+ return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
}
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
@@ -199,6 +200,11 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
PatternRewriter &rewriter) const override {
+ auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
+ if (!uArch)
+ return rewriter.notifyMatchFailure(
+ gpuFuncOp, "Subgroup distribution requires target attribute attached "
+ "to set the warp size");
// If the function only contains a single void return, skip.
if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
@@ -230,7 +236,7 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
auto warpOp = gpu::WarpExecuteOnLane0Op::create(
rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
- xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(),
+ uArch->getSubgroupSize(), newGpuFunc.getArguments(),
newGpuFunc.getArgumentTypes());
Block &warpBodyBlock = warpOp.getBodyRegion().front();
// Replace the ReturnOp of the original gpu function with a YieldOp.
@@ -495,14 +501,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
warpOp, "warp result is not a xegpu::LoadNd op");
auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+ auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
+ if (!uArch)
+ return rewriter.notifyMatchFailure(
+            loadOp, "xegpu::LoadNdOp requires a target attribute attached to "
+                    "determine the transpose requirement");
// Chip information is required to decide if the layout requires transpose
// effect.
- auto chipStr = xegpu::getChipStr(loadOp);
- if (!chipStr)
- return rewriter.notifyMatchFailure(
- loadOp,
- "xegpu::LoadNdOp require chip information to determine transpose "
- "requirement");
// Expecting offsets to be present.
SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
if (offsets.empty())
@@ -556,7 +562,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
// Set the packed attribute if the layout requires it.
newLoadOp.setPacked(requirePacked(layout));
// Set the transpose attribute if the layout requires it.
- if (requireTranspose(layout, chipStr.value()))
+ if (requireTranspose(layout, uArch))
newLoadOp.setTranspose(
DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
Value distributedVal = newWarpOp.getResult(operandIdx);
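A minimal sketch (not part of the patch above) of the lane-layout condition requireTranspose checks on supported chips: a 2-D lane layout of [subgroupSize, 1] distributes lanes along the outer dimension, which is what forces the transposed load. The helper name and container types below are illustrative.

// Standalone sketch of the lane-layout condition used by requireTranspose.
#include <cassert>
#include <cstdint>
#include <vector>

static bool requiresTranspose(const std::vector<int64_t> &laneLayout,
                              int subgroupSize) {
  return laneLayout.size() == 2 && laneLayout[0] == subgroupSize &&
         laneLayout[1] == 1;
}

int main() {
  assert(requiresTranspose({16, 1}, 16));  // lanes along dim 0: transpose
  assert(!requiresTranspose({1, 16}, 16)); // lanes along dim 1: no transpose
  assert(!requiresTranspose({16}, 16));    // 1-D layout: no transpose
  return 0;
}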
diff --git a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp
index 08cac1f..5790a77 100644
--- a/mlir/lib/Transforms/ViewOpGraph.cpp
+++ b/mlir/lib/Transforms/ViewOpGraph.cpp
@@ -158,7 +158,8 @@ private:
/// Emit a cluster (subgraph). The specified builder generates the body of the
/// cluster. Return the anchor node of the cluster.
- Node emitClusterStmt(function_ref<void()> builder, std::string label = "") {
+ Node emitClusterStmt(function_ref<void()> builder,
+ const std::string &label = "") {
int clusterId = ++counter;
os << "subgraph cluster_" << clusterId << " {\n";
os.indent();
@@ -269,7 +270,7 @@ private:
}
/// Emit a node statement.
- Node emitNodeStmt(std::string label, StringRef shape = kShapeNode,
+ Node emitNodeStmt(const std::string &label, StringRef shape = kShapeNode,
StringRef background = "") {
int nodeId = ++counter;
AttributeMap attrs;