diff options
author | Noah Goldstein <goldstein.w.n@gmail.com> | 2024-03-13 13:13:52 -0700 |
---|---|---|
committer | Fangrui Song <i@maskray.me> | 2024-03-13 13:13:52 -0700 |
commit | 9ce8691dea8dadc1302abacf4302f3b805e1448d (patch) | |
tree | fdc2da3081156b4c9b80b0d417f090efadac946c /mlir | |
parent | 795e3c3d94da0a664642d4580d87c82c02d5eca4 (diff) | |
parent | 744a23f24b08e8b988b176173c433d64761e66b3 (diff) | |
download | llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.zip llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.tar.gz llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.tar.bz2 |
[𝘀𝗽𝗿] changes introduced through rebase (users/MaskRay/spr/main.llvm-objcopy-add-compress-sections)
Created using spr 1.3.5-bogner
[skip ci]
Diffstat (limited to 'mlir')
-rw-r--r-- | mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 12 | ||||
-rw-r--r-- | mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td | 2 | ||||
-rw-r--r-- | mlir/include/mlir/Transforms/Inliner.h | 43 | ||||
-rw-r--r-- | mlir/include/mlir/Transforms/Passes.td | 7 | ||||
-rw-r--r-- | mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 5 | ||||
-rw-r--r-- | mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 99 | ||||
-rw-r--r-- | mlir/lib/Transforms/InlinerPass.cpp | 38 | ||||
-rw-r--r-- | mlir/lib/Transforms/Utils/Inliner.cpp | 3 | ||||
-rw-r--r-- | mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir | 3 | ||||
-rw-r--r-- | mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir | 20 | ||||
-rw-r--r-- | mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir | 3 | ||||
-rw-r--r-- | mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir | 67 | ||||
-rw-r--r-- | mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir | 66 | ||||
-rw-r--r-- | mlir/test/Transforms/inlining-dump-default-pipeline.mlir | 2 | ||||
-rw-r--r-- | mlir/test/Transforms/inlining-threshold.mlir | 18 |
15 files changed, 307 insertions, 81 deletions
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 39ff49f..0bd402d 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -268,6 +268,9 @@ def ParallelOp : OpenMP_Op<"parallel", [ The optional $proc_bind_val attribute controls the thread affinity for the execution of the parallel region. + + The optional byref attribute controls whether reduction arguments are passed by + reference or by value. }]; let arguments = (ins Optional<I1>:$if_expr_var, @@ -278,7 +281,8 @@ def ParallelOp : OpenMP_Op<"parallel", [ OptionalAttr<SymbolRefArrayAttr>:$reductions, OptionalAttr<ProcBindKindAttr>:$proc_bind_val, Variadic<AnyType>:$private_vars, - OptionalAttr<SymbolRefArrayAttr>:$privatizers); + OptionalAttr<SymbolRefArrayAttr>:$privatizers, + UnitAttr:$byref); let regions = (region AnyRegion:$region); @@ -299,6 +303,7 @@ def ParallelOp : OpenMP_Op<"parallel", [ $allocators_vars, type($allocators_vars) ) `)` | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)` + | `byref` $byref ) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars), $reductions, $private_vars, type($private_vars), $privatizers) attr-dict @@ -570,6 +575,9 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, The optional `order` attribute specifies which order the iterations of the associate loops are executed in. Currently the only option for this attribute is "concurrent". + + The optional `byref` attribute indicates that reduction arguments should be + passed by reference. 
}]; let arguments = (ins Variadic<IntLikeType>:$lowerBound, @@ -584,6 +592,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, OptionalAttr<ScheduleModifierAttr>:$schedule_modifier, UnitAttr:$simd_modifier, UnitAttr:$nowait, + UnitAttr:$byref, ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered_val, OptionalAttr<OrderKindAttr>:$order_val, UnitAttr:$inclusive); @@ -613,6 +622,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, $schedule_val, $schedule_modifier, $simd_modifier, $schedule_chunk_var, type($schedule_chunk_var)) `)` |`nowait` $nowait + |`byref` $byref |`ordered` `(` $ordered_val `)` |`order` `(` custom<ClauseAttr>($order_val) `)` ) custom<WsLoop>($region, $lowerBound, $upperBound, $step, diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index 670202f..cf7f3e8 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -1364,7 +1364,7 @@ def Tensor_PadOp : Tensor_Op<"pad", [ unsigned count = staticAttrs.size(); for (unsigned idx = 0; idx < count; ++idx) { if (ShapedType::isDynamic(staticAttrs[idx])) - res.push_back(values[numDynamic++]); + res.push_back(getAsOpFoldResult(values[numDynamic++])); else res.push_back(builder.getI64IntegerAttr(staticAttrs[idx])); } diff --git a/mlir/include/mlir/Transforms/Inliner.h b/mlir/include/mlir/Transforms/Inliner.h index 1fe61fb..073b83f 100644 --- a/mlir/include/mlir/Transforms/Inliner.h +++ b/mlir/include/mlir/Transforms/Inliner.h @@ -69,19 +69,6 @@ private: /// of inlining decisions from the leafs to the roots of the callgraph. 
class Inliner { public: - using RunPipelineHelperTy = std::function<LogicalResult( - Pass &pass, OpPassManager &pipeline, Operation *op)>; - - Inliner(Operation *op, CallGraph &cg, Pass &pass, AnalysisManager am, - RunPipelineHelperTy runPipelineHelper, const InlinerConfig &config) - : op(op), cg(cg), pass(pass), am(am), - runPipelineHelper(std::move(runPipelineHelper)), config(config) {} - Inliner(Inliner &) = delete; - void operator=(const Inliner &) = delete; - - /// Perform inlining on a OpTrait::SymbolTable operation. - LogicalResult doInlining(); - /// This struct represents a resolved call to a given callgraph node. Given /// that the call does not actually contain a direct reference to the /// Region(CallGraphNode) that it is dispatching to, we need to resolve them @@ -94,7 +81,29 @@ public: CallGraphNode *sourceNode, *targetNode; }; -protected: + using RunPipelineHelperTy = std::function<LogicalResult( + Pass &pass, OpPassManager &pipeline, Operation *op)>; + + /// Type of the callback answering if it is profitable + /// to inline a callable operation at a call site. + /// It might be the case that the ResolvedCall does not provide + /// enough context to make the profitability decision, so + /// this hook's interface might need to be extended in future. + using ProfitabilityCallbackTy = std::function<bool(const ResolvedCall &)>; + + Inliner(Operation *op, CallGraph &cg, Pass &pass, AnalysisManager am, + RunPipelineHelperTy runPipelineHelper, const InlinerConfig &config, + ProfitabilityCallbackTy isProfitableToInline) + : op(op), cg(cg), pass(pass), am(am), + runPipelineHelper(std::move(runPipelineHelper)), config(config), + isProfitableToInline(std::move(isProfitableToInline)) {} + Inliner(Inliner &) = delete; + void operator=(const Inliner &) = delete; + + /// Perform inlining on a OpTrait::SymbolTable operation. + LogicalResult doInlining(); + +private: /// An OpTrait::SymbolTable operation to run the inlining on. 
Operation *op; /// A CallGraph analysis for the given operation. @@ -108,12 +117,12 @@ protected: const RunPipelineHelperTy runPipelineHelper; /// The inliner configuration parameters. const InlinerConfig &config; + /// Returns true, if it is profitable to inline the callable operation + /// at the call site. + ProfitabilityCallbackTy isProfitableToInline; -private: /// Forward declaration of the class providing the actual implementation. class Impl; - -public: }; } // namespace mlir diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td index b8fdf7a..51b2a27 100644 --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -278,6 +278,13 @@ def Inliner : Pass<"inline"> { Option<"maxInliningIterations", "max-iterations", "unsigned", /*default=*/"4", "Maximum number of iterations when inlining within an SCC">, + Option<"inliningThreshold", "inlining-threshold", "unsigned", + /*default=*/"-1U", + "If the ratio between the number of the operations " + "in the callee and the number of the operations " + "in the caller exceeds this value (in percentage), " + "then the callee is not inlined even if it is legal " + "to inline it">, ]; } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 8a6980e..e7b899a 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1209,7 +1209,7 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state, /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(), /*reductions=*/nullptr, /*proc_bind_val=*/nullptr, /*private_vars=*/ValueRange(), - /*privatizers=*/nullptr); + /*privatizers=*/nullptr, /*byref=*/false); state.addAttributes(attributes); } @@ -1674,7 +1674,8 @@ void WsLoopOp::build(OpBuilder &builder, OperationState &state, /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(), 
/*reductions=*/nullptr, /*schedule_val=*/nullptr, /*schedule_chunk_var=*/nullptr, /*schedule_modifier=*/nullptr, - /*simd_modifier=*/false, /*nowait=*/false, /*ordered_val=*/nullptr, + /*simd_modifier=*/false, /*nowait=*/false, /*byref=*/false, + /*ordered_val=*/nullptr, /*order_val=*/nullptr, /*inclusive=*/false); state.addAttributes(attributes); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index bef227f..5027f2a 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -805,12 +805,12 @@ convertOmpTaskgroupOp(omp::TaskGroupOp tgOp, llvm::IRBuilderBase &builder, /// Allocate space for privatized reduction variables. template <typename T> static void -allocReductionVars(T loop, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, - llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, - SmallVector<omp::ReductionDeclareOp> &reductionDecls, - SmallVector<llvm::Value *> &privateReductionVariables, - DenseMap<Value, llvm::Value *> &reductionVariableMap) { +allocByValReductionVars(T loop, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, + SmallVector<omp::ReductionDeclareOp> &reductionDecls, + SmallVector<llvm::Value *> &privateReductionVariables, + DenseMap<Value, llvm::Value *> &reductionVariableMap) { llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.restoreIP(allocaIP); auto args = @@ -863,6 +863,7 @@ static LogicalResult convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { auto loop = cast<omp::WsLoopOp>(opInst); + const bool isByRef = loop.getByref(); // TODO: this should be in the op verifier instead. 
if (loop.getLowerBound().empty()) return failure(); @@ -888,18 +889,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder, SmallVector<llvm::Value *> privateReductionVariables; DenseMap<Value, llvm::Value *> reductionVariableMap; - allocReductionVars(loop, builder, moduleTranslation, allocaIP, reductionDecls, - privateReductionVariables, reductionVariableMap); - - // Store the mapping between reduction variables and their private copies on - // ModuleTranslation stack. It can be then recovered when translating - // omp.reduce operations in a separate call. - LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard( - moduleTranslation, reductionVariableMap); + if (!isByRef) { + allocByValReductionVars(loop, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, + reductionVariableMap); + } // Before the loop, store the initial values of reductions into reduction // variables. Although this could be done after allocas, we don't want to mess // up with the alloca insertion point. 
+ MutableArrayRef<BlockArgument> reductionArgs = + loop.getRegion().getArguments().take_back(loop.getNumReductionVars()); for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) { SmallVector<llvm::Value *> phis; if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(), @@ -908,9 +908,31 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder, return failure(); assert(phis.size() == 1 && "expected one value to be yielded from the " "reduction neutral element declaration region"); - builder.CreateStore(phis[0], privateReductionVariables[i]); + if (isByRef) { + // Allocate reduction variable (which is a pointer to the real reduction + // variable allocated in the inlined region) + llvm::Value *var = builder.CreateAlloca( + moduleTranslation.convertType(reductionDecls[i].getType())); + // Store the result of the inlined region to the allocated reduction var + // ptr + builder.CreateStore(phis[0], var); + + privateReductionVariables.push_back(var); + moduleTranslation.mapValue(reductionArgs[i], phis[0]); + reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]); + } else { + // for by-ref case the store is inside of the reduction region + builder.CreateStore(phis[0], privateReductionVariables[i]); + // the rest was handled in allocByValReductionVars + } } + // Store the mapping between reduction variables and their private copies on + // ModuleTranslation stack. It can be then recovered when translating + // omp.reduce operations in a separate call. + LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard( + moduleTranslation, reductionVariableMap); + // Set up the source location value for OpenMP runtime. 
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); @@ -1014,7 +1036,7 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder, builder.SetInsertPoint(tempTerminator); llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint = ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos, - loop.getNowait()); + loop.getNowait(), isByRef); if (!contInsertPoint.getBlock()) return loop->emitOpError() << "failed to convert reductions"; auto nextInsertionPoint = @@ -1068,6 +1090,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; OmpParallelOpConversionManager raii(opInst); + const bool isByRef = opInst.getByref(); // TODO: support error propagation in OpenMPIRBuilder and use it instead of // relying on captured variables. @@ -1082,18 +1105,17 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, // Allocate reduction vars SmallVector<llvm::Value *> privateReductionVariables; DenseMap<Value, llvm::Value *> reductionVariableMap; - allocReductionVars(opInst, builder, moduleTranslation, allocaIP, - reductionDecls, privateReductionVariables, - reductionVariableMap); - - // Store the mapping between reduction variables and their private copies on - // ModuleTranslation stack. It can be then recovered when translating - // omp.reduce operations in a separate call. 
- LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard( - moduleTranslation, reductionVariableMap); + if (!isByRef) { + allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, + reductionVariableMap); + } // Initialize reduction vars builder.restoreIP(allocaIP); + MutableArrayRef<BlockArgument> reductionArgs = + opInst.getRegion().getArguments().take_back( + opInst.getNumReductionVars()); for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) { SmallVector<llvm::Value *> phis; if (failed(inlineConvertOmpRegions( @@ -1104,9 +1126,32 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, "expected one value to be yielded from the " "reduction neutral element declaration region"); builder.restoreIP(allocaIP); - builder.CreateStore(phis[0], privateReductionVariables[i]); + + if (isByRef) { + // Allocate reduction variable (which is a pointer to the real reduction + // variable allocated in the inlined region) + llvm::Value *var = builder.CreateAlloca( + moduleTranslation.convertType(reductionDecls[i].getType())); + // Store the result of the inlined region to the allocated reduction var + // ptr + builder.CreateStore(phis[0], var); + + privateReductionVariables.push_back(var); + moduleTranslation.mapValue(reductionArgs[i], phis[0]); + reductionVariableMap.try_emplace(opInst.getReductionVars()[i], phis[0]); + } else { + // for by-ref case the store is inside of the reduction init region + builder.CreateStore(phis[0], privateReductionVariables[i]); + // the rest is done in allocByValReductionVars + } } + // Store the mapping between reduction variables and their private copies on + // ModuleTranslation stack. It can be then recovered when translating + // omp.reduce operations in a separate call. 
+ LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard( + moduleTranslation, reductionVariableMap); + // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame( @@ -1137,7 +1182,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint = ompBuilder->createReductions(builder.saveIP(), allocaIP, - reductionInfos, false); + reductionInfos, false, isByRef); if (!contInsertPoint.getBlock()) { bodyGenStatus = opInst->emitOpError() << "failed to convert reductions"; return; diff --git a/mlir/lib/Transforms/InlinerPass.cpp b/mlir/lib/Transforms/InlinerPass.cpp index c058e80..08d8dbf 100644 --- a/mlir/lib/Transforms/InlinerPass.cpp +++ b/mlir/lib/Transforms/InlinerPass.cpp @@ -24,6 +24,8 @@ namespace mlir { #include "mlir/Transforms/Passes.h.inc" } // namespace mlir +#define DEBUG_TYPE "inliner-pass" + using namespace mlir; /// This function implements the inliner optimization pipeline. @@ -88,6 +90,35 @@ InlinerPass::InlinerPass(std::function<void(OpPassManager &)> defaultPipeline, config.setOpPipelines(std::move(opPipelines)); } +// Return true if the inlining ratio does not exceed the threshold. +static bool isProfitableToInline(const Inliner::ResolvedCall &resolvedCall, + unsigned inliningThreshold) { + Region *callerRegion = resolvedCall.sourceNode->getCallableRegion(); + Region *calleeRegion = resolvedCall.targetNode->getCallableRegion(); + + // We should not get external nodes here, but just return true + // for now to preserve the original behavior of the inliner pass. + if (!calleeRegion || !calleeRegion) + return true; + + auto countOps = [](Region *region) { + unsigned count = 0; + region->walk([&](Operation *) { ++count; }); + return count; + }; + + unsigned callerOps = countOps(callerRegion); + + // Always inline empty callees (if it is possible at all). 
+ if (callerOps == 0) + return true; + + unsigned ratio = countOps(calleeRegion) * 100 / callerOps; + LLVM_DEBUG(llvm::dbgs() << "Callee / caller operation ratio (max: " + << inliningThreshold << "%): " << ratio << "%\n"); + return ratio <= inliningThreshold; +} + void InlinerPass::runOnOperation() { CallGraph &cg = getAnalysis<CallGraph>(); @@ -100,9 +131,14 @@ void InlinerPass::runOnOperation() { return signalPassFailure(); } + // By default, assume that any inlining is profitable. + auto profitabilityCb = [=](const Inliner::ResolvedCall &call) { + return isProfitableToInline(call, inliningThreshold); + }; + // Get an instance of the inliner. Inliner inliner(op, cg, *this, getAnalysisManager(), runPipelineHelper, - config); + config, profitabilityCb); // Run the inlining. if (failed(inliner.doInlining())) diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp index f227ced..8acfc96 100644 --- a/mlir/lib/Transforms/Utils/Inliner.cpp +++ b/mlir/lib/Transforms/Utils/Inliner.cpp @@ -741,6 +741,9 @@ bool Inliner::Impl::shouldInline(ResolvedCall &resolvedCall) { if (calleeHasMultipleBlocks && !callerRegionSupportsMultipleBlocks()) return false; + if (!inliner.isProfitableToInline(resolvedCall)) + return false; + // Otherwise, inline. 
return true; } diff --git a/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir b/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir index 238c0c5..a0a676e 100644 --- a/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir +++ b/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir @@ -22,7 +22,6 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t // CHECK-LABEL: func @generalize_pad_tensor_dynamic_shape( // CHECK-SAME: %[[IN:.*]]: tensor<4x?x2x?xf32>, // CHECK-SAME: %[[OFFSET:.*]]: index) -> tensor<4x?x?x?xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[DIM1:.*]] = tensor.dim %[[IN]], %[[C1]] : tensor<4x?x2x?xf32> @@ -33,7 +32,7 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t // CHECK: %[[OUT_DIM3:.*]] = arith.addi %[[DIM3]], %[[OFFSET]] : index // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : tensor<4x?x?x?xf32> // CHECK: %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) -> tensor<4x?x?x?xf32> -// CHECK: %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[OFFSET]], %[[C0]]] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32> +// CHECK: %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]][0, 0, %[[OFFSET]], 0] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32> // CHECK: return %[[PADDED]] : tensor<4x?x?x?xf32> // CHECK: } func.func @generalize_pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> { diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir index f2c490b..c140b6a 100644 --- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir +++ 
b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir @@ -1033,3 +1033,23 @@ func.func @do_not_drop_non_constant_padding(%arg0: tensor<1x1x3x1x1xf32>, %pad: // CHECK-SLICES-LABEL: func @do_not_drop_non_constant_padding // CHECK-SLICES: tensor.pad %{{.*}} low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2] // CHECK-SLICES: } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32> + +// ----- + +func.func @drop_known_unit_constant_low_high(%arg0: tensor<1x383x128xf32>) -> tensor<1x384x128xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f32 + %padded = tensor.pad %arg0 low[%c0, %c1, %c0] high[%c0, %c0, %c0] { + ^bb0(%arg1: index, %arg2: index, %arg3: index): + tensor.yield %cst : f32 + } : tensor<1x383x128xf32> to tensor<1x384x128xf32> + return %padded : tensor<1x384x128xf32> +} +// CHECK-LABEL: func @drop_known_unit_constant_low_high +// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape +// CHECK-SAME: {{\[}}[0, 1], [2]] : tensor<1x383x128xf32> into tensor<383x128xf32> +// CHECK: %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[1, 0] high[0, 0] +// CHECK: } : tensor<383x128xf32> to tensor<384x128xf32> +// CHECK: tensor.expand_shape %[[PADDED]] +// CHECK-SAME: {{\[}}[0, 1], [2]] : tensor<384x128xf32> into tensor<1x384x128xf32> diff --git a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir index ac0eb48..2beab31 100644 --- a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir +++ b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir @@ -19,7 +19,6 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t // CHECK-LABEL: func @generalize_pad_tensor_dynamic_shape( // CHECK-SAME: %[[IN:.*]]: tensor<4x?x2x?xf32>, // CHECK-SAME: %[[OFFSET:.*]]: index) -> tensor<4x?x?x?xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // 
CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index @@ -32,7 +31,7 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t // CHECK: %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) -> tensor<4x?x?x?xf32> // CHECK: %[[DIM1_1:.*]] = tensor.dim %[[IN]], %[[C1]] : tensor<4x?x2x?xf32> // CHECK: %[[DIM3_1:.*]] = tensor.dim %[[IN]], %[[C3]] : tensor<4x?x2x?xf32> -// CHECK: %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[OFFSET]], %[[C0]]] [4, %[[DIM1_1]], 2, %[[DIM3_1]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32> +// CHECK: %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]][0, 0, %[[OFFSET]], 0] [4, %[[DIM1_1]], 2, %[[DIM3_1]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32> // CHECK: return %[[PADDED]] : tensor<4x?x?x?xf32> // CHECK: } func.func @generalize_pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> { diff --git a/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir b/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir index ee66073..a1a0c41 100644 --- a/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir @@ -1,38 +1,51 @@ // RUN: mlir-opt %s -transform-interpreter | FileCheck %s -func.func @outerproduct_matmul(%A: memref<3x3xf32>, %B: memref<3x3xf32>, %C: memref<3x3xf32>) { - linalg.matmul ins(%A, %B: memref<3x3xf32>, memref<3x3xf32>) +func.func @matmul_to_outerproduct(%A: memref<3x4xf32>, %B: memref<4x3xf32>, %C: memref<3x3xf32>) { + linalg.matmul ins(%A, %B: memref<3x4xf32>, memref<4x3xf32>) outs(%C: memref<3x3xf32>) return } -// CHECK-LABEL: func.func @outerproduct_matmul( -// CHECK-SAME: %[[VAL_0:.*]]: memref<3x3xf32>, %[[VAL_1:.*]]: memref<3x3xf32>, %[[VAL_2:.*]]: memref<3x3xf32>) { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = 
arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_5:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32> -// CHECK: %[[VAL_6:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32> -// CHECK: %[[VAL_7:.*]] = vector.transfer_read %[[VAL_2]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32> -// CHECK: %[[VAL_8:.*]] = vector.transpose %[[VAL_5]], [1, 0] : vector<3x3xf32> to vector<3x3xf32> -// CHECK: %[[VAL_9:.*]] = vector.extract %[[VAL_8]][0] : vector<3xf32> from vector<3x3xf32> -// CHECK: %[[VAL_10:.*]] = vector.extract %[[VAL_6]][0] : vector<3xf32> from vector<3x3xf32> -// CHECK: %[[VAL_11:.*]] = vector.outerproduct %[[VAL_9]], %[[VAL_10]], %[[VAL_7]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32> -// CHECK: %[[VAL_12:.*]] = vector.extract %[[VAL_8]][1] : vector<3xf32> from vector<3x3xf32> -// CHECK: %[[VAL_13:.*]] = vector.extract %[[VAL_6]][1] : vector<3xf32> from vector<3x3xf32> -// CHECK: %[[VAL_14:.*]] = vector.outerproduct %[[VAL_12]], %[[VAL_13]], %[[VAL_11]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32> -// CHECK: %[[VAL_15:.*]] = vector.extract %[[VAL_8]][2] : vector<3xf32> from vector<3x3xf32> -// CHECK: %[[VAL_16:.*]] = vector.extract %[[VAL_6]][2] : vector<3xf32> from vector<3x3xf32> -// CHECK: %[[VAL_17:.*]] = vector.outerproduct %[[VAL_15]], %[[VAL_16]], %[[VAL_14]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32> -// CHECK: vector.transfer_write %[[VAL_17]], %[[VAL_2]]{{\[}}%[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true]} : vector<3x3xf32>, memref<3x3xf32> -// CHECK: return -// CHECK: } +// CHECK-LABEL: func.func @matmul_to_outerproduct( +// CHECK-SAME: %[[A:.*]]: memref<3x4xf32>, +// CHECK-SAME: %[[B:.*]]: memref<4x3xf32>, +// CHECK-SAME: %[[C:.*]]: memref<3x3xf32>) { +// CHECK: 
%[[VEC_A:.*]] = vector.transfer_read %[[A]] +// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]] +// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C]] +// CHECK: %[[VEC_A_T:.*]] = vector.transpose %[[VEC_A]], [1, 0] : vector<3x4xf32> to vector<4x3xf32> +// CHECK: %[[A0:.*]] = vector.extract %[[VEC_A_T]][0] : vector<3xf32> from vector<4x3xf32> +// CHECK: %[[B0:.*]] = vector.extract %[[VEC_B]][0] : vector<3xf32> from vector<4x3xf32> +// CHECK: %[[OP_0:.*]] = vector.outerproduct %[[A0]], %[[B0]], %[[VEC_C]] +// CHECK: %[[A1:.*]] = vector.extract %[[VEC_A_T]][1] : vector<3xf32> from vector<4x3xf32> +// CHECK: %[[B1:.*]] = vector.extract %[[VEC_B]][1] : vector<3xf32> from vector<4x3xf32> +// CHECK: %[[OP_1:.*]] = vector.outerproduct %[[A1]], %[[B1]], %[[OP_0]] +// CHECK: %[[A_2:.*]] = vector.extract %[[VEC_A_T]][2] : vector<3xf32> from vector<4x3xf32> +// CHECK: %[[B_2:.*]] = vector.extract %[[VEC_B]][2] : vector<3xf32> from vector<4x3xf32> +// CHECK: %[[OP_2:.*]] = vector.outerproduct %[[A_2]], %[[B_2]], %[[OP_1]] +// CHECK: %[[A_3:.*]] = vector.extract %[[VEC_A_T]][3] : vector<3xf32> from vector<4x3xf32> +// CHECK: %[[B_3:.*]] = vector.extract %[[VEC_B]][3] : vector<3xf32> from vector<4x3xf32> +// CHECK: %[[RES:.*]] = vector.outerproduct %[[A_3]], %[[B_3]], %[[OP_2]] +// CHECK: vector.transfer_write %[[RES]], %[[C]]{{.*}} : vector<3x3xf32>, memref<3x3xf32> module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op - %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %2 { + transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) { + %func = 
transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op + + // Vectorize: linalg.matmul -> vector.multi_reduction + %matmul = transform.structured.match ops{["linalg.matmul"]} in %func : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %matmul : !transform.any_op + + // vector.multi_reduction --> vector.contract + transform.apply_patterns to %func { + transform.apply_patterns.vector.reduction_to_contract + // Reduce the rank of xfer ops. This transforms vector.contract to be + // more matmul-like and to enable the lowering to outer product Ops. + transform.apply_patterns.vector.transfer_permutation_patterns + } : !transform.any_op + + // vector.contract --> vector.outerproduct + transform.apply_patterns to %func { transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct" } : !transform.any_op transform.yield diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir new file mode 100644 index 0000000..4ac1ebd --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir @@ -0,0 +1,66 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + + omp.reduction.declare @add_reduction_i_32 : !llvm.ptr init { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.constant(1 : i64) : i64 + %2 = llvm.alloca %1 x i32 : (i64) -> !llvm.ptr + llvm.store %0, %2 : i32, !llvm.ptr + omp.yield(%2 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + %1 = llvm.load %arg1 : !llvm.ptr -> i32 + %2 = llvm.add %0, %1 : i32 + llvm.store %2, %arg0 : i32, !llvm.ptr + omp.yield(%arg0 : !llvm.ptr) + } + + // CHECK-LABEL: @main + llvm.func @main() { + %0 = llvm.mlir.constant(-1 : i32) : i32 + %1 = llvm.mlir.addressof @i : !llvm.ptr + omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr) { + 
llvm.store %0, %arg0 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } + llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + +// CHECK: %{{.+}} = +// Call to the outlined function. +// CHECK: call void {{.*}} @__kmpc_fork_call +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variable and its initialization. +// CHECK: %tid.addr.local = alloca i32 +// CHECK: %[[PRIVATE:.+]] = alloca i32 +// CHECK: store i32 0, ptr %[[PRIVATE]] +// CHECK: store ptr %[[PRIVATE]], ptr %[[PRIV_PTR:.+]], + +// Call to the reduction function. +// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + + +// Non-atomic reduction: +// CHECK: %[[PRIV_VAL_PTR:.+]] = load ptr, ptr %[[PRIV_PTR]] +// CHECK: %[[LOAD:.+]] = load i32, ptr @i +// CHECK: %[[PRIV_VAL:.+]] = load i32, ptr %[[PRIV_VAL_PTR]] +// CHECK: %[[SUM:.+]] = add i32 %[[LOAD]], %[[PRIV_VAL]] +// CHECK: store i32 %[[SUM]], ptr @i +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: + +// Reduction function. 
+// CHECK: define internal void @[[REDFUNC]] +// CHECK: add i32 diff --git a/mlir/test/Transforms/inlining-dump-default-pipeline.mlir b/mlir/test/Transforms/inlining-dump-default-pipeline.mlir index e2c3186..4f86380 100644 --- a/mlir/test/Transforms/inlining-dump-default-pipeline.mlir +++ b/mlir/test/Transforms/inlining-dump-default-pipeline.mlir @@ -1,2 +1,2 @@ // RUN: mlir-opt %s -pass-pipeline="builtin.module(inline)" -dump-pass-pipeline 2>&1 | FileCheck %s -// CHECK: builtin.module(inline{default-pipeline=canonicalize max-iterations=4 }) +// CHECK: builtin.module(inline{default-pipeline=canonicalize inlining-threshold=4294967295 max-iterations=4 }) diff --git a/mlir/test/Transforms/inlining-threshold.mlir b/mlir/test/Transforms/inlining-threshold.mlir new file mode 100644 index 0000000..649408a --- /dev/null +++ b/mlir/test/Transforms/inlining-threshold.mlir @@ -0,0 +1,18 @@ +// RUN: mlir-opt %s -inline='default-pipeline= inlining-threshold=100' | FileCheck %s + +// Check that inlining does not happen when the threshold is exceeded. +func.func @callee1(%arg : i32) -> i32 { + %v1 = arith.addi %arg, %arg : i32 + %v2 = arith.addi %v1, %arg : i32 + %v3 = arith.addi %v2, %arg : i32 + return %v3 : i32 +} + +// CHECK-LABEL: func @caller1 +func.func @caller1(%arg0 : i32) -> i32 { + // CHECK-NEXT: call @callee1 + // CHECK-NEXT: return + + %0 = call @callee1(%arg0) : (i32) -> i32 + return %0 : i32 +} |