Diffstat (limited to 'mlir/lib/Dialect')
-rw-r--r--  mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp                         | 23
-rw-r--r--  mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp  |  2
-rw-r--r--  mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp   | 67
3 files changed, 66 insertions, 26 deletions
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index ee3e402..6598ac1 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2674,6 +2674,11 @@ LogicalResult acc::LoopOp::verify() {
           "privatizations", false)))
     return failure();
 
+  if (failed(checkSymOperandList<mlir::acc::FirstprivateRecipeOp>(
+          *this, getFirstprivatizationRecipes(), getFirstprivateOperands(),
+          "firstprivate", "firstprivatizations", /*checkOperandType=*/false)))
+    return failure();
+
   if (failed(checkSymOperandList<mlir::acc::ReductionRecipeOp>(
           *this, getReductionRecipes(), getReductionOperands(), "reduction",
           "reductions", false)))
@@ -2737,7 +2742,8 @@ LogicalResult acc::LoopOp::verify() {
 }
 
 unsigned LoopOp::getNumDataOperands() {
-  return getReductionOperands().size() + getPrivateOperands().size();
+  return getReductionOperands().size() + getPrivateOperands().size() +
+         getFirstprivateOperands().size();
 }
 
 Value LoopOp::getDataOperand(unsigned i) {
@@ -3117,6 +3123,21 @@ void acc::LoopOp::addPrivatization(MLIRContext *context,
   setPrivatizationRecipesAttr(mlir::ArrayAttr::get(context, recipes));
 }
 
+void acc::LoopOp::addFirstPrivatization(
+    MLIRContext *context, mlir::acc::FirstprivateOp op,
+    mlir::acc::FirstprivateRecipeOp recipe) {
+  getFirstprivateOperandsMutable().append(op.getResult());
+
+  llvm::SmallVector<mlir::Attribute> recipes;
+
+  if (getFirstprivatizationRecipesAttr())
+    llvm::copy(getFirstprivatizationRecipesAttr(), std::back_inserter(recipes));
+
+  recipes.push_back(
+      mlir::SymbolRefAttr::get(context, recipe.getSymName().str()));
+  setFirstprivatizationRecipesAttr(mlir::ArrayAttr::get(context, recipes));
+}
+
 void acc::LoopOp::addReduction(MLIRContext *context, mlir::acc::ReductionOp op,
                                mlir::acc::ReductionRecipeOp recipe) {
   getReductionOperandsMutable().append(op.getResult());
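
The new LoopOp::addFirstPrivatization mirrors the existing addPrivatization and
addReduction helpers: it appends the acc.firstprivate value to the loop's data
operands and records the recipe symbol in lockstep, which is exactly the pairing
the added verifier check enforces. A minimal caller sketch, assuming the
firstprivate op and its recipe were already created by the surrounding clause
lowering (the helper function name is illustrative; only the final call comes
from this patch):

    #include "mlir/Dialect/OpenACC/OpenACC.h"

    // Sketch: attach a firstprivate clause to an acc.loop. loopOp, fpOp, and
    // recipe are assumed to come from the caller's clause lowering.
    static void attachFirstprivate(mlir::acc::LoopOp loopOp,
                                   mlir::acc::FirstprivateOp fpOp,
                                   mlir::acc::FirstprivateRecipeOp recipe) {
      // Appends fpOp's result to the firstprivate operands and pushes the
      // recipe's symbol onto the firstprivatizations attribute, keeping the
      // two lists the same length (as the verifier now requires).
      loopOp.addFirstPrivatization(loopOp.getContext(), fpOp, recipe);
    }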
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index 3a6684f..255f2bf 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -796,7 +796,7 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
           currentSourceIndex, remainingElements, 0);
 
       // Generate back mask.
-      auto maskValues = SmallVector<bool>(emulatedPerContainerElem, 0);
+      auto maskValues = SmallVector<bool>(emulatedPerContainerElem, false);
       std::fill_n(maskValues.begin(), remainingElements, 1);
       auto backMask = arith::ConstantOp::create(
           rewriter, loc,
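
The VectorEmulateNarrowType change is a type-hygiene fix: maskValues is a
SmallVector<bool>, so the fill value should be spelled as a bool rather than an
int that converts implicitly (note the std::fill_n literal 1 in the surrounding
context still relies on the same conversion). A standalone illustration of the
resulting idiom; the function and parameter names are illustrative:

    #include "llvm/ADT/SmallVector.h"
    #include <algorithm>

    llvm::SmallVector<bool> makeBackMask(unsigned numElements,
                                         unsigned remainingElements) {
      // Initialize all lanes to false with a bool literal (previously the
      // int literal 0, which converted implicitly to false).
      llvm::SmallVector<bool> maskValues(numElements, false);
      // Enable the first remainingElements lanes.
      std::fill_n(maskValues.begin(), remainingElements, true);
      return maskValues;
    }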
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 882691f..f1dbc5d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -875,14 +875,17 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
           storeScatterOp,
           "Some vector operands have no layouts, using defaults instead.");
     }
-    VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
-    VectorType expectedPayloadTy = VectorType::get(
-        {distPayloadTy.getNumElements()}, distPayloadTy.getElementType());
+    // Distributed store payload type according to the lane layout.
+    VectorType distPayloadTyByWarpOp = distStoreVecByWarpOpOrFailure.value();
+    // Expected distributed payload type is always 1D.
+    VectorType expectedPayloadTy =
+        VectorType::get({distPayloadTyByWarpOp.getNumElements()},
+                        distPayloadTyByWarpOp.getElementType());
 
     SmallVector<size_t> newRetIndices;
     SmallVector<Value> operands = storeScatterOp->getOperands();
     SmallVector<Type> operandTypesToYield = {
-        expectedPayloadTy, operands[1].getType(),
+        distPayloadTyByWarpOp, operands[1].getType(),
         distOffsetsByWarpOpOrFailure.value(), distMaskByWarpOpOrFailure.value()};
 
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
@@ -890,8 +893,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
         rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
     SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
         newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
+    // The payload operand may need type adjustment due to mismatch between warp
+    // distributed type and expected SIMT type.
     rewriter.setInsertionPointAfter(newWarpOp);
+    newStoreScatterOpOperands[0] = resolveDistributedTy(
+        newStoreScatterOpOperands[0], expectedPayloadTy, rewriter);
     xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
         rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
         storeScatterOp->getAttrs());
@@ -976,8 +982,11 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
         distMaskByWarpOpOrFailure.value()};
 
     const unsigned operandIdx = producedByLastLoad->getOperandNumber();
-    VectorType loadVecTy =
+    VectorType distResultTy =
         cast<VectorType>(warpOp.getResult(operandIdx).getType());
+    // Distributed load op will always be 1D.
+    VectorType loadVecTy = VectorType::get({distResultTy.getNumElements()},
+                                           distResultTy.getElementType());
 
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
         rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
@@ -991,13 +1000,16 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
         loadGatherOp->getAttrs());
     xegpu::removeLayoutAttrs(newOp);
     Value distributedVal = newWarpOp.getResult(operandIdx);
-    rewriter.replaceAllUsesWith(distributedVal, newOp->getResult(0));
+    // Resolve the output type and replace all uses.
+    rewriter.replaceAllUsesWith(
+        distributedVal,
+        resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
     return success();
   }
 };
 
 /// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
-/// VectorReductionOps.
+/// VectorReductionOps. We also insert layouts for the newly created ops.
 static Value lowerToVectorReductions(TypedValue<VectorType> src,
                                      TypedValue<VectorType> acc,
                                      vector::CombiningKind kind,
@@ -1014,6 +1026,9 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
   Value reductionResult = arith::ConstantOp::create(
       rewriter, loc, acc.getType(),
       DenseElementsAttr::get(acc.getType(), zeroAttr));
+  // Reduction result should have the same layout as the accumulator.
+  xegpu::setDistributeLayoutAttr(cast<OpResult>(reductionResult),
+                                 xegpu::getDistributeLayoutAttr(acc));
   // For each slice of the source, extract the slice vector, do a reduction
   // and, insert the reduced value back to the result vector.
   for (int i = 0; i < nSlices; ++i) {
@@ -1029,13 +1044,23 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
         vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                               sliceSizes, {1, 1});
     int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
-    Value slice = vector::ShapeCastOp::create(
+    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
         rewriter, loc,
         VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());
+    // Shape cast is currently handled in xegpu side. So layouts must be
+    // retained during lowering. Shape cast output has the same layout as the
+    // accumulator. Shape cast source has the same layout as the original
+    // reduction source.
+    // TODO: other ops generated here may also need layout attributes.
+    xegpu::setDistributeLayoutAttr(slice->getOpOperand(0),
+                                   xegpu::getDistributeLayoutAttr(src));
+    xegpu::setDistributeLayoutAttr(slice->getOpResult(0),
+                                   xegpu::getDistributeLayoutAttr(acc));
+    // Extract and reduction results in scalars, so no result layout is needed.
     Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
-    Value reduction =
-        vector::ReductionOp::create(rewriter, loc, kind, slice, accExtract);
+    Value reduction = vector::ReductionOp::create(
+        rewriter, loc, kind, slice.getResult(), accExtract);
     reductionResult =
         vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
   }
@@ -1107,7 +1132,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
       return failure();
     auto reductionOp =
        cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
-    unsigned operandNumber = yieldOperand->getOperandNumber();
+    unsigned operandIdx = yieldOperand->getOperandNumber();
     VectorType sourceType = reductionOp.getSourceVectorType();
     // Only 2D vectors are supported.
     if (sourceType.getRank() != 2)
@@ -1121,7 +1146,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
           warpOp, "Only 1 reduction dimension is supported.");
     int64_t reductionDim = reductionDims[0];
     VectorType distributedResultType =
-        cast<VectorType>(warpOp.getResult(operandNumber).getType());
+        cast<VectorType>(warpOp.getResult(operandIdx).getType());
     VectorType resultType = cast<VectorType>(reductionOp.getType());
     xegpu::DistributeLayoutAttr sourceLayout =
         xegpu::getDistributeLayoutAttr(reductionOp.getSource());
@@ -1184,7 +1209,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
         cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
         reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
     // Replace the warp op result with the final result.
-    rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
+    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
     return success();
   }
   // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
@@ -1217,7 +1242,7 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
     auto resultDistTy =
         cast<VectorType>(warpOp.getResult(operandNumber).getType());
     xegpu::DistributeLayoutAttr sourceLayout =
-        xegpu::getDistributeLayoutAttr(shapeCastOp.getSource());
+        xegpu::getDistributeLayoutAttr(shapeCastOp->getOpOperand(0));
     xegpu::DistributeLayoutAttr resultLayout =
         xegpu::getDistributeLayoutAttr(shapeCastOp.getResult());
     if (!sourceLayout || !resultLayout)
@@ -1403,11 +1428,6 @@ namespace {
 struct XeGPUSubgroupDistributePass final
     : public xegpu::impl::XeGPUSubgroupDistributeBase<
           XeGPUSubgroupDistributePass> {
-  XeGPUSubgroupDistributePass() = default;
-  XeGPUSubgroupDistributePass(const XeGPUSubgroupDistributePass &other) =
-      default;
-  XeGPUSubgroupDistributePass(xegpu::XeGPUSubgroupDistributeOptions options)
-      : XeGPUSubgroupDistributeBase(options) {}
   void runOnOperation() override;
 };
 } // namespace
@@ -1515,10 +1535,9 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     return laneVal;
   };
 
-  if (enableSGReductions)
-    vector::populateDistributeReduction(
-        patterns, warpReduction,
-        /*pattern benefit=*/regularPatternBenefit);
+  vector::populateDistributeReduction(
+      patterns, warpReduction,
+      /*pattern benefit=*/regularPatternBenefit);
 
   vector::populatePropagateWarpVectorDistributionPatterns(
       patterns, distributionFn, shuffleFn,
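
The store and load scatter hunks above are mirror images of one pattern: the
warp op trafficks in the layout-distributed type, which can stay 2D (for
instance, a lane's share of a chunked payload might be vector<1x2xf32>), while
the lane-level xegpu op takes a flat 1D vector (vector<2xf32>), and the file's
existing resolveDistributedTy helper bridges the two. A condensed sketch using
names from the hunks; the example shapes are illustrative and the surrounding
pattern machinery is elided:

    // Store side: cast the warp-yielded payload down to the flat 1D type the
    // new xegpu store op expects (a no-op when the types already match).
    newStoreScatterOpOperands[0] = resolveDistributedTy(
        newStoreScatterOpOperands[0], expectedPayloadTy, rewriter);

    // Load side: cast the 1D result of the new load op back up to the
    // warp-distributed result type before replacing uses.
    rewriter.replaceAllUsesWith(
        distributedVal,
        resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));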