diff options
Diffstat (limited to 'mlir/lib/Conversion')
7 files changed, 66 insertions, 19 deletions
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 41e333c..3a307a0 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -935,7 +935,7 @@ static std::optional<uint32_t> mfmaTypeSelectCode(Type mlirElemType) { .Case([](Float6E2M3FNType) { return 2u; }) .Case([](Float6E3M2FNType) { return 3u; }) .Case([](Float4E2M1FNType) { return 4u; }) - .Default([](Type) { return std::nullopt; }); + .Default(std::nullopt); } /// If there is a scaled MFMA instruction for the input element types `aType` diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index 247dba1..cfdcd9c 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -432,7 +432,7 @@ static Value getOriginalVectorValue(Value value) { current = op.getSource(); return false; }) - .Default([](Operation *) { return false; }); + .Default(false); if (!skipOp) { break; diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index 25f1e1b..425594b 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -259,7 +259,7 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern<gpu::ShuffleOp> { } return std::nullopt; }) - .Default([](auto) { return std::nullopt; }); + .Default(std::nullopt); } static std::optional<std::string> getFuncName(gpu::ShuffleMode mode, diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index a9efada..ec182f1 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -846,13 +846,8 @@ struct NVGPUMBarrierInitLowering Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(), adaptor.getMbarId(), rewriter); Value count = truncToI32(b, adaptor.getCount()); - if (isMbarrierShared(mbarrierType)) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>( - op, barrier, count, adaptor.getPredicate()); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count, - adaptor.getPredicate()); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count, + adaptor.getPredicate()); return success(); } }; diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp index 7d0a236..76a822b 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp @@ -14,6 +14,7 @@ #include "mlir/Conversion/SCFToGPU/SCFToGPU.h" +#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -27,6 +28,7 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/Support/DebugLog.h" #include <optional> @@ -625,18 +627,49 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, bool seenSideeffects = false; // Whether we have left a nesting scope (and hence are no longer innermost). bool leftNestingScope = false; + LocalAliasAnalysis aliasAnalysis; + llvm::DenseSet<Value> writtenBuffer; while (!worklist.empty()) { Operation *op = worklist.pop_back_val(); // Now walk over the body and clone it. // TODO: This is only correct if there either is no further scf.parallel - // nested or this code is side-effect free. Otherwise we might need - // predication. We are overly conservative for now and only allow - // side-effects in the innermost scope. + // nested or this code has side-effect but the memory buffer is not + // alias to inner loop access buffer. Otherwise we might need + // predication. if (auto nestedParallel = dyn_cast<ParallelOp>(op)) { // Before entering a nested scope, make sure there have been no - // sideeffects until now. - if (seenSideeffects) - return failure(); + // sideeffects until now or the nested operations do not access the + // buffer written by outer scope. + if (seenSideeffects) { + WalkResult walkRes = nestedParallel.walk([&](Operation *nestedOp) { + if (isMemoryEffectFree(nestedOp)) + return WalkResult::advance(); + + auto memEffectInterface = dyn_cast<MemoryEffectOpInterface>(nestedOp); + if (!memEffectInterface) + return WalkResult::advance(); + + SmallVector<MemoryEffects::EffectInstance> effects; + memEffectInterface.getEffects(effects); + for (const MemoryEffects::EffectInstance &effect : effects) { + if (isa<MemoryEffects::Read>(effect.getEffect()) || + isa<MemoryEffects::Write>(effect.getEffect())) { + Value baseBuffer = effect.getValue(); + if (!baseBuffer) + return WalkResult::interrupt(); + for (Value val : writtenBuffer) { + if (aliasAnalysis.alias(baseBuffer, val) != + AliasResult::NoAlias) { + return WalkResult::interrupt(); + } + } + } + } + return WalkResult::advance(); + }); + if (walkRes.wasInterrupted()) + return failure(); + } // A nested scf.parallel needs insertion of code to compute indices. // Insert that now. This will also update the worklist with the loops // body. @@ -650,6 +683,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, rewriter.setInsertionPointAfter(parent); leftNestingScope = true; seenSideeffects = false; + writtenBuffer.clear(); } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) { // Convert scf.reduction op auto parentLoop = op->getParentOfType<ParallelOp>(); @@ -682,6 +716,24 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, Operation *clone = rewriter.clone(*op, cloningMap); cloningMap.map(op->getResults(), clone->getResults()); // Check for side effects. + if (!isMemoryEffectFree(clone)) { + // Record the buffer accessed by the operations with write effects. + if (auto memEffectInterface = + dyn_cast<MemoryEffectOpInterface>(clone)) { + SmallVector<MemoryEffects::EffectInstance> effects; + memEffectInterface.getEffects(effects); + for (const MemoryEffects::EffectInstance &effect : effects) { + if (isa<MemoryEffects::Write>(effect.getEffect())) { + Value writtenBase = effect.getValue(); + // Conservatively return failure if we cannot find the written + // address. + if (!writtenBase) + return failure(); + writtenBuffer.insert(writtenBase); + } + } + } + } // TODO: Handle region side effects properly. seenSideeffects |= !isMemoryEffectFree(clone) || clone->getNumRegions() != 0; diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 41d8d53..69a317ec 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -716,7 +716,7 @@ lowerReductionWithStartValue(ConversionPatternRewriter &rewriter, Location loc, accumulator = getOrCreateAccumulator<ReductionNeutral>(rewriter, loc, llvmType, accumulator); return LLVMRedIntrinOp::create(rewriter, loc, llvmType, - /*startValue=*/accumulator, vectorOperand, + /*start_value=*/accumulator, vectorOperand, fmf); } @@ -743,7 +743,7 @@ static Value lowerPredicatedReductionWithStartValue( Value vectorLength = createVectorLengthValue(rewriter, loc, vectorOperand.getType()); return LLVMVPRedIntrinOp::create(rewriter, loc, llvmType, - /*startValue=*/accumulator, vectorOperand, + /*satrt_value=*/accumulator, vectorOperand, mask, vectorLength); } diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index e2c7d80..91c1aa5 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -46,7 +46,7 @@ static bool isZeroConstant(Value val) { [](auto floatAttr) { return floatAttr.getValue().isZero(); }) .Case<IntegerAttr>( [](auto intAttr) { return intAttr.getValue().isZero(); }) - .Default([](auto) { return false; }); + .Default(false); } static LogicalResult storeLoadPreconditions(PatternRewriter &rewriter, |
