Diffstat (limited to 'mlir/lib/Dialect')
-rw-r--r--  mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp                         | 23
-rw-r--r--  mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp  |  2
-rw-r--r--  mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp   | 67
3 files changed, 66 insertions(+), 26 deletions(-)
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index ee3e402..6598ac1 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2674,6 +2674,11 @@ LogicalResult acc::LoopOp::verify() {
"privatizations", false)))
return failure();
+ if (failed(checkSymOperandList<mlir::acc::FirstprivateRecipeOp>(
+ *this, getFirstprivatizationRecipes(), getFirstprivateOperands(),
+ "firstprivate", "firstprivatizations", /*checkOperandType=*/false)))
+ return failure();
+
if (failed(checkSymOperandList<mlir::acc::ReductionRecipeOp>(
*this, getReductionRecipes(), getReductionOperands(), "reduction",
"reductions", false)))
@@ -2737,7 +2742,8 @@ LogicalResult acc::LoopOp::verify() {
}
unsigned LoopOp::getNumDataOperands() {
- return getReductionOperands().size() + getPrivateOperands().size();
+ return getReductionOperands().size() + getPrivateOperands().size() +
+ getFirstprivateOperands().size();
}
Value LoopOp::getDataOperand(unsigned i) {
@@ -3117,6 +3123,21 @@ void acc::LoopOp::addPrivatization(MLIRContext *context,
setPrivatizationRecipesAttr(mlir::ArrayAttr::get(context, recipes));
}
+void acc::LoopOp::addFirstPrivatization(
+ MLIRContext *context, mlir::acc::FirstprivateOp op,
+ mlir::acc::FirstprivateRecipeOp recipe) {
+ getFirstprivateOperandsMutable().append(op.getResult());
+
+ llvm::SmallVector<mlir::Attribute> recipes;
+
+ if (getFirstprivatizationRecipesAttr())
+ llvm::copy(getFirstprivatizationRecipesAttr(), std::back_inserter(recipes));
+
+ recipes.push_back(
+ mlir::SymbolRefAttr::get(context, recipe.getSymName().str()));
+ setFirstprivatizationRecipesAttr(mlir::ArrayAttr::get(context, recipes));
+}
+
void acc::LoopOp::addReduction(MLIRContext *context, mlir::acc::ReductionOp op,
mlir::acc::ReductionRecipeOp recipe) {
getReductionOperandsMutable().append(op.getResult());
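The new addFirstPrivatization helper mirrors the existing addPrivatization/addReduction pattern: append the clause operand, then extend the symbol-ref array attribute with the recipe. A minimal usage sketch, where `builder`, `loopOp`, `firstprivateOp`, and `recipeOp` are hypothetical values assumed to have been created earlier in the same module:

// Hypothetical caller code, not part of this commit:
loopOp.addFirstPrivatization(builder.getContext(), firstprivateOp, recipeOp);
// Afterwards getFirstprivateOperands() contains firstprivateOp's result and
// getFirstprivatizationRecipesAttr() names recipeOp's symbol, which the new
// checkSymOperandList call in LoopOp::verify() validates.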
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index 3a6684f..255f2bf 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -796,7 +796,7 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
currentSourceIndex, remainingElements, 0);
// Generate back mask.
- auto maskValues = SmallVector<bool>(emulatedPerContainerElem, 0);
+ auto maskValues = SmallVector<bool>(emulatedPerContainerElem, false);
std::fill_n(maskValues.begin(), remainingElements, 1);
auto backMask = arith::ConstantOp::create(
rewriter, loc,
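The Vector change above is a small type-correctness cleanup: the SmallVector<bool> was being value-initialized from the int literal 0, which only converts to false implicitly. A standalone before/after sketch with an assumed size of 8:

// Before: relies on implicit int-to-bool conversion of the literal 0.
llvm::SmallVector<bool> oldMask(/*Size=*/8, 0);
// After: the intent (eight false entries) is explicit.
llvm::SmallVector<bool> newMask(/*Size=*/8, false);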
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 882691f..f1dbc5d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -875,14 +875,17 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
storeScatterOp,
"Some vector operands have no layouts, using defaults instead.");
}
- VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
- VectorType expectedPayloadTy = VectorType::get(
- {distPayloadTy.getNumElements()}, distPayloadTy.getElementType());
+ // Distributed store payload type according to the lane layout.
+ VectorType distPayloadTyByWarpOp = distStoreVecByWarpOpOrFailure.value();
+ // Expected distributed payload type is always 1D.
+ VectorType expectedPayloadTy =
+ VectorType::get({distPayloadTyByWarpOp.getNumElements()},
+ distPayloadTyByWarpOp.getElementType());
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = storeScatterOp->getOperands();
SmallVector<Type> operandTypesToYield = {
- expectedPayloadTy, operands[1].getType(),
+ distPayloadTyByWarpOp, operands[1].getType(),
distOffsetsByWarpOpOrFailure.value(),
distMaskByWarpOpOrFailure.value()};
@@ -890,8 +893,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
+ // The payload operand may need a type adjustment, because the
+ // warp-distributed type can differ from the expected SIMT type.
rewriter.setInsertionPointAfter(newWarpOp);
+ newStoreScatterOpOperands[0] = resolveDistributedTy(
+ newStoreScatterOpOperands[0], expectedPayloadTy, rewriter);
xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
storeScatterOp->getAttrs());
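The StoreDistribution change makes the warp op yield the payload in its warp-distributed (possibly 2D) type and reconciles it to the 1D SIMT type only at the point of use. resolveDistributedTy is an existing helper in this file; the body below is a hedged sketch of the kind of bridging it performs here, not the upstream implementation:

// Sketch only: reconcile two distributed vector types carrying the same
// number of elements by inserting a shape cast.
static Value resolveDistributedTySketch(Value val, VectorType expectedTy,
                                        PatternRewriter &rewriter) {
  if (val.getType() == expectedTy)
    return val; // Types already agree; nothing to insert.
  return vector::ShapeCastOp::create(rewriter, val.getLoc(), expectedTy, val)
      .getResult();
}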
@@ -976,8 +982,11 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
distMaskByWarpOpOrFailure.value()};
const unsigned operandIdx = producedByLastLoad->getOperandNumber();
- VectorType loadVecTy =
+ VectorType distResultTy =
cast<VectorType>(warpOp.getResult(operandIdx).getType());
+ // The distributed load op is always 1D.
+ VectorType loadVecTy = VectorType::get({distResultTy.getNumElements()},
+ distResultTy.getElementType());
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
@@ -991,13 +1000,16 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
loadGatherOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
- rewriter.replaceAllUsesWith(distributedVal, newOp->getResult(0));
+ // Resolve the output type and replace all uses.
+ rewriter.replaceAllUsesWith(
+ distributedVal,
+ resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
return success();
}
};
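LoadDistribution applies the same convention from the opposite direction: the SIMT gather is created with the flattened 1D type, and its result is cast back to the warp-distributed type before replacing uses. An illustrative round-trip under assumed shapes:

// Assumed example shapes, for illustration only:
//   distResultTy = vector<2x1xf32>  // from warpOp.getResult(operandIdx)
//   loadVecTy    = vector<2xf32>    // same element count, flattened to 1D
// The new xegpu::LoadGatherOp returns vector<2xf32>; resolveDistributedTy
// shape-casts it back to vector<2x1xf32> so existing users type-check
// unchanged.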
/// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
-/// VectorReductionOps.
+/// VectorReductionOps. We also insert layouts for the newly created ops.
static Value lowerToVectorReductions(TypedValue<VectorType> src,
TypedValue<VectorType> acc,
vector::CombiningKind kind,
@@ -1014,6 +1026,9 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
+ // Reduction result should have the same layout as the accumulator.
+ xegpu::setDistributeLayoutAttr(cast<OpResult>(reductionResult),
+ xegpu::getDistributeLayoutAttr(acc));
// For each slice of the source, extract the slice vector, do a reduction
// and, insert the reduced value back to the result vector.
for (int i = 0; i < nSlices; ++i) {
@@ -1029,13 +1044,23 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
- Value slice = vector::ShapeCastOp::create(
+ vector::ShapeCastOp slice = vector::ShapeCastOp::create(
rewriter, loc,
VectorType::get({nSliceElements}, sourceType.getElementType()),
extractOp.getResult());
+ // Shape casts are currently handled on the XeGPU side, so layouts must be
+ // retained during lowering. The shape cast output has the same layout as
+ // the accumulator; the shape cast source has the same layout as the
+ // original reduction source.
+ // TODO: other ops generated here may also need layout attributes.
+ xegpu::setDistributeLayoutAttr(slice->getOpOperand(0),
+ xegpu::getDistributeLayoutAttr(src));
+ xegpu::setDistributeLayoutAttr(slice->getOpResult(0),
+ xegpu::getDistributeLayoutAttr(acc));
+ // Extract and reduction produce scalars, so no result layouts are needed.
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
- Value reduction =
- vector::ReductionOp::create(rewriter, loc, kind, slice, accExtract);
+ Value reduction = vector::ReductionOp::create(
+ rewriter, loc, kind, slice.getResult(), accExtract);
reductionResult =
vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
}
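To make the slicing geometry concrete, a worked example with assumed shapes: a vector<4x8xf32> source reduced along dim 1 into a vector<4xf32> accumulator gives nSlices == 4, and iteration i processes one full row:

// Illustration only, for the assumed vector<4x8xf32> source:
//   extract_strided_slice offsets {i, 0}, sizes {1, 8}, strides {1, 1}
//     -> vector<1x8xf32>
//   shape_cast -> vector<8xf32> (source layout on its input, accumulator
//     layout on its output, as set above)
//   vector.reduction <kind> with the extracted acc[i] -> f32
//   vector.insert the scalar at result[i]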
@@ -1107,7 +1132,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
return failure();
auto reductionOp =
cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
- unsigned operandNumber = yieldOperand->getOperandNumber();
+ unsigned operandIdx = yieldOperand->getOperandNumber();
VectorType sourceType = reductionOp.getSourceVectorType();
// Only 2D vectors are supported.
if (sourceType.getRank() != 2)
@@ -1121,7 +1146,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
warpOp, "Only 1 reduction dimension is supported.");
int64_t reductionDim = reductionDims[0];
VectorType distributedResultType =
- cast<VectorType>(warpOp.getResult(operandNumber).getType());
+ cast<VectorType>(warpOp.getResult(operandIdx).getType());
VectorType resultType = cast<VectorType>(reductionOp.getType());
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getDistributeLayoutAttr(reductionOp.getSource());
@@ -1184,7 +1209,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
// Replace the warp op result with the final result.
- rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
+ rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
return success();
}
// For non-lane-local case, we simply rewrite the MultiReductionOp in terms
@@ -1217,7 +1242,7 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
auto resultDistTy =
cast<VectorType>(warpOp.getResult(operandNumber).getType());
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(shapeCastOp.getSource());
+ xegpu::getDistributeLayoutAttr(shapeCastOp->getOpOperand(0));
xegpu::DistributeLayoutAttr resultLayout =
xegpu::getDistributeLayoutAttr(shapeCastOp.getResult());
if (!sourceLayout || !resultLayout)
@@ -1403,11 +1428,6 @@ namespace {
struct XeGPUSubgroupDistributePass final
: public xegpu::impl::XeGPUSubgroupDistributeBase<
XeGPUSubgroupDistributePass> {
- XeGPUSubgroupDistributePass() = default;
- XeGPUSubgroupDistributePass(const XeGPUSubgroupDistributePass &other) =
- default;
- XeGPUSubgroupDistributePass(xegpu::XeGPUSubgroupDistributeOptions options)
- : XeGPUSubgroupDistributeBase(options) {}
void runOnOperation() override;
};
} // namespace
@@ -1515,10 +1535,9 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
return laneVal;
};
- if (enableSGReductions)
- vector::populateDistributeReduction(
- patterns, warpReduction,
- /*pattern benefit=*/regularPatternBenefit);
+ vector::populateDistributeReduction(
+ patterns, warpReduction,
+ /*pattern benefit=*/regularPatternBenefit);
vector::populatePropagateWarpVectorDistributionPatterns(
patterns, distributionFn, shuffleFn,