[𝘀𝗽𝗿] changes introduced through rebaseusers/MaskRay/spr/main.llvm-objcopy-add-compress-sections

Created using spr 1.3.5-bogner [skip ci]
author: Noah Goldstein <goldstein.w.n@gmail.com> 2024-03-13 13:13:52 -0700
committer: Fangrui Song <i@maskray.me> 2024-03-13 13:13:52 -0700
commit: 9ce8691dea8dadc1302abacf4302f3b805e1448d (patch)
tree: fdc2da3081156b4c9b80b0d417f090efadac946c /mlir
parent: 795e3c3d94da0a664642d4580d87c82c02d5eca4 (diff)
parent: 744a23f24b08e8b988b176173c433d64761e66b3 (diff)
download: llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.zip
llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.tar.gz
llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.tar.bz2
15 files changed, 307 insertions, 81 deletions
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 39ff49f..0bd402d 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -268,6 +268,9 @@ def ParallelOp : OpenMP_Op<"parallel", [
 
     The optional $proc_bind_val attribute controls the thread affinity for the execution
     of the parallel region.
+
+    The optional byref attribute controls whether reduction arguments are passed by
+    reference or by value.
   }];
 
   let arguments = (ins Optional<I1>:$if_expr_var,
@@ -278,7 +281,8 @@ def ParallelOp : OpenMP_Op<"parallel", [
              OptionalAttr<SymbolRefArrayAttr>:$reductions,
              OptionalAttr<ProcBindKindAttr>:$proc_bind_val,
              Variadic<AnyType>:$private_vars,
-             OptionalAttr<SymbolRefArrayAttr>:$privatizers);
+             OptionalAttr<SymbolRefArrayAttr>:$privatizers,
+             UnitAttr:$byref);
 
   let regions = (region AnyRegion:$region);
 
@@ -299,6 +303,7 @@ def ParallelOp : OpenMP_Op<"parallel", [
                 $allocators_vars, type($allocators_vars)
               ) `)`
           | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+          | `byref` $byref
     ) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars),
                              $reductions, $private_vars, type($private_vars),
                              $privatizers) attr-dict
@@ -570,6 +575,9 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
     The optional `order` attribute specifies which order the iterations of the
     associate loops are executed in. Currently the only option for this
     attribute is "concurrent".
+    
+    The optional `byref` attribute indicates that reduction arguments should be
+    passed by reference.
   }];
 
   let arguments = (ins Variadic<IntLikeType>:$lowerBound,
@@ -584,6 +592,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
              OptionalAttr<ScheduleModifierAttr>:$schedule_modifier,
              UnitAttr:$simd_modifier,
              UnitAttr:$nowait,
+             UnitAttr:$byref,
              ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered_val,
              OptionalAttr<OrderKindAttr>:$order_val,
              UnitAttr:$inclusive);
@@ -613,6 +622,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
                 $schedule_val, $schedule_modifier, $simd_modifier,
                 $schedule_chunk_var, type($schedule_chunk_var)) `)`
           |`nowait` $nowait
+          |`byref` $byref
           |`ordered` `(` $ordered_val `)`
           |`order` `(` custom<ClauseAttr>($order_val) `)`
     ) custom<WsLoop>($region, $lowerBound, $upperBound, $step,
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index 670202f..cf7f3e8 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1364,7 +1364,7 @@ def Tensor_PadOp : Tensor_Op<"pad", [
       unsigned count = staticAttrs.size();
       for (unsigned idx = 0; idx < count; ++idx) {
         if (ShapedType::isDynamic(staticAttrs[idx]))
-          res.push_back(values[numDynamic++]);
+          res.push_back(getAsOpFoldResult(values[numDynamic++]));
         else
           res.push_back(builder.getI64IntegerAttr(staticAttrs[idx]));
       }
diff --git a/mlir/include/mlir/Transforms/Inliner.h b/mlir/include/mlir/Transforms/Inliner.h
index 1fe61fb..073b83f 100644
--- a/mlir/include/mlir/Transforms/Inliner.h
+++ b/mlir/include/mlir/Transforms/Inliner.h
@@ -69,19 +69,6 @@ private:
 /// of inlining decisions from the leafs to the roots of the callgraph.
 class Inliner {
 public:
-  using RunPipelineHelperTy = std::function<LogicalResult(
-      Pass &pass, OpPassManager &pipeline, Operation *op)>;
-
-  Inliner(Operation *op, CallGraph &cg, Pass &pass, AnalysisManager am,
-          RunPipelineHelperTy runPipelineHelper, const InlinerConfig &config)
-      : op(op), cg(cg), pass(pass), am(am),
-        runPipelineHelper(std::move(runPipelineHelper)), config(config) {}
-  Inliner(Inliner &) = delete;
-  void operator=(const Inliner &) = delete;
-
-  /// Perform inlining on a OpTrait::SymbolTable operation.
-  LogicalResult doInlining();
-
   /// This struct represents a resolved call to a given callgraph node. Given
   /// that the call does not actually contain a direct reference to the
   /// Region(CallGraphNode) that it is dispatching to, we need to resolve them
@@ -94,7 +81,29 @@ public:
     CallGraphNode *sourceNode, *targetNode;
   };
 
-protected:
+  using RunPipelineHelperTy = std::function<LogicalResult(
+      Pass &pass, OpPassManager &pipeline, Operation *op)>;
+
+  /// Type of the callback answering if it is profitable
+  /// to inline a callable operation at a call site.
+  /// It might be the case that the ResolvedCall does not provide
+  /// enough context to make the profitability decision, so
+  /// this hook's interface might need to be extended in future.
+  using ProfitabilityCallbackTy = std::function<bool(const ResolvedCall &)>;
+
+  Inliner(Operation *op, CallGraph &cg, Pass &pass, AnalysisManager am,
+          RunPipelineHelperTy runPipelineHelper, const InlinerConfig &config,
+          ProfitabilityCallbackTy isProfitableToInline)
+      : op(op), cg(cg), pass(pass), am(am),
+        runPipelineHelper(std::move(runPipelineHelper)), config(config),
+        isProfitableToInline(std::move(isProfitableToInline)) {}
+  Inliner(Inliner &) = delete;
+  void operator=(const Inliner &) = delete;
+
+  /// Perform inlining on a OpTrait::SymbolTable operation.
+  LogicalResult doInlining();
+
+private:
   /// An OpTrait::SymbolTable operation to run the inlining on.
   Operation *op;
   /// A CallGraph analysis for the given operation.
@@ -108,12 +117,12 @@ protected:
   const RunPipelineHelperTy runPipelineHelper;
   /// The inliner configuration parameters.
   const InlinerConfig &config;
+  /// Returns true, if it is profitable to inline the callable operation
+  /// at the call site.
+  ProfitabilityCallbackTy isProfitableToInline;
 
-private:
   /// Forward declaration of the class providing the actual implementation.
   class Impl;
-
-public:
 };
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index b8fdf7a..51b2a27 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -278,6 +278,13 @@ def Inliner : Pass<"inline"> {
     Option<"maxInliningIterations", "max-iterations", "unsigned",
            /*default=*/"4",
            "Maximum number of iterations when inlining within an SCC">,
+    Option<"inliningThreshold", "inlining-threshold", "unsigned",
+           /*default=*/"-1U",
+           "If the ratio between the number of the operations "
+           "in the callee and the number of the operations "
+           "in the caller exceeds this value (in percentage), "
+           "then the callee is not inlined even if it is legal "
+           "to inline it">,
   ];
 }
 
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 8a6980e..e7b899a 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1209,7 +1209,7 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state,
       /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
       /*reduction_vars=*/ValueRange(), /*reductions=*/nullptr,
       /*proc_bind_val=*/nullptr, /*private_vars=*/ValueRange(),
-      /*privatizers=*/nullptr);
+      /*privatizers=*/nullptr, /*byref=*/false);
   state.addAttributes(attributes);
 }
 
@@ -1674,7 +1674,8 @@ void WsLoopOp::build(OpBuilder &builder, OperationState &state,
         /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(),
         /*reductions=*/nullptr, /*schedule_val=*/nullptr,
         /*schedule_chunk_var=*/nullptr, /*schedule_modifier=*/nullptr,
-        /*simd_modifier=*/false, /*nowait=*/false, /*ordered_val=*/nullptr,
+        /*simd_modifier=*/false, /*nowait=*/false, /*byref=*/false,
+        /*ordered_val=*/nullptr,
         /*order_val=*/nullptr, /*inclusive=*/false);
   state.addAttributes(attributes);
 }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index bef227f..5027f2a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -805,12 +805,12 @@ convertOmpTaskgroupOp(omp::TaskGroupOp tgOp, llvm::IRBuilderBase &builder,
 /// Allocate space for privatized reduction variables.
 template <typename T>
 static void
-allocReductionVars(T loop, llvm::IRBuilderBase &builder,
-                   LLVM::ModuleTranslation &moduleTranslation,
-                   llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
-                   SmallVector<omp::ReductionDeclareOp> &reductionDecls,
-                   SmallVector<llvm::Value *> &privateReductionVariables,
-                   DenseMap<Value, llvm::Value *> &reductionVariableMap) {
+allocByValReductionVars(T loop, llvm::IRBuilderBase &builder,
+                        LLVM::ModuleTranslation &moduleTranslation,
+                        llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+                        SmallVector<omp::ReductionDeclareOp> &reductionDecls,
+                        SmallVector<llvm::Value *> &privateReductionVariables,
+                        DenseMap<Value, llvm::Value *> &reductionVariableMap) {
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.restoreIP(allocaIP);
   auto args =
@@ -863,6 +863,7 @@ static LogicalResult
 convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
                  LLVM::ModuleTranslation &moduleTranslation) {
   auto loop = cast<omp::WsLoopOp>(opInst);
+  const bool isByRef = loop.getByref();
   // TODO: this should be in the op verifier instead.
   if (loop.getLowerBound().empty())
     return failure();
@@ -888,18 +889,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
 
   SmallVector<llvm::Value *> privateReductionVariables;
   DenseMap<Value, llvm::Value *> reductionVariableMap;
-  allocReductionVars(loop, builder, moduleTranslation, allocaIP, reductionDecls,
-                     privateReductionVariables, reductionVariableMap);
-
-  // Store the mapping between reduction variables and their private copies on
-  // ModuleTranslation stack. It can be then recovered when translating
-  // omp.reduce operations in a separate call.
-  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
-      moduleTranslation, reductionVariableMap);
+  if (!isByRef) {
+    allocByValReductionVars(loop, builder, moduleTranslation, allocaIP,
+                            reductionDecls, privateReductionVariables,
+                            reductionVariableMap);
+  }
 
   // Before the loop, store the initial values of reductions into reduction
   // variables. Although this could be done after allocas, we don't want to mess
   // up with the alloca insertion point.
+  MutableArrayRef<BlockArgument> reductionArgs =
+      loop.getRegion().getArguments().take_back(loop.getNumReductionVars());
   for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) {
     SmallVector<llvm::Value *> phis;
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
@@ -908,9 +908,31 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
       return failure();
     assert(phis.size() == 1 && "expected one value to be yielded from the "
                                "reduction neutral element declaration region");
-    builder.CreateStore(phis[0], privateReductionVariables[i]);
+    if (isByRef) {
+      // Allocate reduction variable (which is a pointer to the real reduction
+      // variable allocated in the inlined region)
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+      // Store the result of the inlined region to the allocated reduction var
+      // ptr
+      builder.CreateStore(phis[0], var);
+
+      privateReductionVariables.push_back(var);
+      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+      reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]);
+    } else {
+      // for by-ref case the store is inside of the reduction region
+      builder.CreateStore(phis[0], privateReductionVariables[i]);
+      // the rest was handled in allocByValReductionVars
+    }
   }
 
+  // Store the mapping between reduction variables and their private copies on
+  // ModuleTranslation stack. It can be then recovered when translating
+  // omp.reduce operations in a separate call.
+  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+      moduleTranslation, reductionVariableMap);
+
   // Set up the source location value for OpenMP runtime.
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
 
@@ -1014,7 +1036,7 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   builder.SetInsertPoint(tempTerminator);
   llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
       ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
-                                   loop.getNowait());
+                                   loop.getNowait(), isByRef);
   if (!contInsertPoint.getBlock())
     return loop->emitOpError() << "failed to convert reductions";
   auto nextInsertionPoint =
@@ -1068,6 +1090,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   OmpParallelOpConversionManager raii(opInst);
+  const bool isByRef = opInst.getByref();
 
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
@@ -1082,18 +1105,17 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     // Allocate reduction vars
     SmallVector<llvm::Value *> privateReductionVariables;
     DenseMap<Value, llvm::Value *> reductionVariableMap;
-    allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
-                       reductionDecls, privateReductionVariables,
-                       reductionVariableMap);
-
-    // Store the mapping between reduction variables and their private copies on
-    // ModuleTranslation stack. It can be then recovered when translating
-    // omp.reduce operations in a separate call.
-    LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
-        moduleTranslation, reductionVariableMap);
+    if (!isByRef) {
+      allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP,
+                              reductionDecls, privateReductionVariables,
+                              reductionVariableMap);
+    }
 
     // Initialize reduction vars
     builder.restoreIP(allocaIP);
+    MutableArrayRef<BlockArgument> reductionArgs =
+        opInst.getRegion().getArguments().take_back(
+            opInst.getNumReductionVars());
     for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       SmallVector<llvm::Value *> phis;
       if (failed(inlineConvertOmpRegions(
@@ -1104,9 +1126,32 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
              "expected one value to be yielded from the "
              "reduction neutral element declaration region");
       builder.restoreIP(allocaIP);
-      builder.CreateStore(phis[0], privateReductionVariables[i]);
+
+      if (isByRef) {
+        // Allocate reduction variable (which is a pointer to the real reduciton
+        // variable allocated in the inlined region)
+        llvm::Value *var = builder.CreateAlloca(
+            moduleTranslation.convertType(reductionDecls[i].getType()));
+        // Store the result of the inlined region to the allocated reduction var
+        // ptr
+        builder.CreateStore(phis[0], var);
+
+        privateReductionVariables.push_back(var);
+        moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+        reductionVariableMap.try_emplace(opInst.getReductionVars()[i], phis[0]);
+      } else {
+        // for by-ref case the store is inside of the reduction init region
+        builder.CreateStore(phis[0], privateReductionVariables[i]);
+        // the rest is done in allocByValReductionVars
+      }
     }
 
+    // Store the mapping between reduction variables and their private copies on
+    // ModuleTranslation stack. It can be then recovered when translating
+    // omp.reduce operations in a separate call.
+    LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+        moduleTranslation, reductionVariableMap);
+
     // Save the alloca insertion point on ModuleTranslation stack for use in
     // nested regions.
     LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
@@ -1137,7 +1182,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
       llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
           ompBuilder->createReductions(builder.saveIP(), allocaIP,
-                                       reductionInfos, false);
+                                       reductionInfos, false, isByRef);
       if (!contInsertPoint.getBlock()) {
         bodyGenStatus = opInst->emitOpError() << "failed to convert reductions";
         return;
diff --git a/mlir/lib/Transforms/InlinerPass.cpp b/mlir/lib/Transforms/InlinerPass.cpp
index c058e80..08d8dbf 100644
--- a/mlir/lib/Transforms/InlinerPass.cpp
+++ b/mlir/lib/Transforms/InlinerPass.cpp
@@ -24,6 +24,8 @@ namespace mlir {
 #include "mlir/Transforms/Passes.h.inc"
 } // namespace mlir
 
+#define DEBUG_TYPE "inliner-pass"
+
 using namespace mlir;
 
 /// This function implements the inliner optimization pipeline.
@@ -88,6 +90,35 @@ InlinerPass::InlinerPass(std::function<void(OpPassManager &)> defaultPipeline,
   config.setOpPipelines(std::move(opPipelines));
 }
 
+// Return true if the inlining ratio does not exceed the threshold.
+static bool isProfitableToInline(const Inliner::ResolvedCall &resolvedCall,
+                                 unsigned inliningThreshold) {
+  Region *callerRegion = resolvedCall.sourceNode->getCallableRegion();
+  Region *calleeRegion = resolvedCall.targetNode->getCallableRegion();
+
+  // We should not get external nodes here, but just return true
+  // for now to preserve the original behavior of the inliner pass.
+  if (!calleeRegion || !calleeRegion)
+    return true;
+
+  auto countOps = [](Region *region) {
+    unsigned count = 0;
+    region->walk([&](Operation *) { ++count; });
+    return count;
+  };
+
+  unsigned callerOps = countOps(callerRegion);
+
+  // Always inline empty callees (if it is possible at all).
+  if (callerOps == 0)
+    return true;
+
+  unsigned ratio = countOps(calleeRegion) * 100 / callerOps;
+  LLVM_DEBUG(llvm::dbgs() << "Callee / caller operation ratio (max: "
+                          << inliningThreshold << "%): " << ratio << "%\n");
+  return ratio <= inliningThreshold;
+}
+
 void InlinerPass::runOnOperation() {
   CallGraph &cg = getAnalysis<CallGraph>();
 
@@ -100,9 +131,14 @@ void InlinerPass::runOnOperation() {
     return signalPassFailure();
   }
 
+  // By default, assume that any inlining is profitable.
+  auto profitabilityCb = [=](const Inliner::ResolvedCall &call) {
+    return isProfitableToInline(call, inliningThreshold);
+  };
+
   // Get an instance of the inliner.
   Inliner inliner(op, cg, *this, getAnalysisManager(), runPipelineHelper,
-                  config);
+                  config, profitabilityCb);
 
   // Run the inlining.
   if (failed(inliner.doInlining()))
diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp
index f227ced..8acfc96 100644
--- a/mlir/lib/Transforms/Utils/Inliner.cpp
+++ b/mlir/lib/Transforms/Utils/Inliner.cpp
@@ -741,6 +741,9 @@ bool Inliner::Impl::shouldInline(ResolvedCall &resolvedCall) {
   if (calleeHasMultipleBlocks && !callerRegionSupportsMultipleBlocks())
     return false;
 
+  if (!inliner.isProfitableToInline(resolvedCall))
+    return false;
+
   // Otherwise, inline.
   return true;
 }
diff --git a/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir b/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir
index 238c0c5..a0a676e 100644
--- a/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir
+++ b/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir
@@ -22,7 +22,6 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t
 // CHECK-LABEL:   func @generalize_pad_tensor_dynamic_shape(
 // CHECK-SAME:                                              %[[IN:.*]]: tensor<4x?x2x?xf32>,
 // CHECK-SAME:                                              %[[OFFSET:.*]]: index) -> tensor<4x?x?x?xf32> {
-// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[CST:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[DIM1:.*]] = tensor.dim %[[IN]], %[[C1]] : tensor<4x?x2x?xf32>
@@ -33,7 +32,7 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t
 // CHECK:           %[[OUT_DIM3:.*]] = arith.addi %[[DIM3]], %[[OFFSET]] : index
 // CHECK:           %[[INIT:.*]] = tensor.empty(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : tensor<4x?x?x?xf32>
 // CHECK:           %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) -> tensor<4x?x?x?xf32>
-// CHECK:           %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[OFFSET]], %[[C0]]] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
+// CHECK:           %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]][0, 0, %[[OFFSET]], 0] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
 // CHECK:           return %[[PADDED]] : tensor<4x?x?x?xf32>
 // CHECK:         }
 func.func @generalize_pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> {
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index f2c490b..c140b6a 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -1033,3 +1033,23 @@ func.func @do_not_drop_non_constant_padding(%arg0: tensor<1x1x3x1x1xf32>, %pad:
 // CHECK-SLICES-LABEL: func @do_not_drop_non_constant_padding
 //       CHECK-SLICES:   tensor.pad %{{.*}} low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2]
 //       CHECK-SLICES:   } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
+
+// -----
+
+func.func @drop_known_unit_constant_low_high(%arg0: tensor<1x383x128xf32>) -> tensor<1x384x128xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %padded = tensor.pad %arg0 low[%c0, %c1, %c0] high[%c0, %c0, %c0] {
+  ^bb0(%arg1: index, %arg2: index, %arg3: index):
+    tensor.yield %cst : f32
+  } : tensor<1x383x128xf32> to tensor<1x384x128xf32>
+  return %padded : tensor<1x384x128xf32>
+}
+// CHECK-LABEL: func @drop_known_unit_constant_low_high
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape
+//  CHECK-SAME:     {{\[}}[0, 1], [2]] : tensor<1x383x128xf32> into tensor<383x128xf32>
+//       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[1, 0] high[0, 0]
+//       CHECK:   } : tensor<383x128xf32> to tensor<384x128xf32>
+//       CHECK:   tensor.expand_shape %[[PADDED]]
+//  CHECK-SAME:     {{\[}}[0, 1], [2]] : tensor<384x128xf32> into tensor<1x384x128xf32>
diff --git a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
index ac0eb48..2beab31 100644
--- a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
@@ -19,7 +19,6 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t
 // CHECK-LABEL:   func @generalize_pad_tensor_dynamic_shape(
 // CHECK-SAME:                                              %[[IN:.*]]: tensor<4x?x2x?xf32>,
 // CHECK-SAME:                                              %[[OFFSET:.*]]: index) -> tensor<4x?x?x?xf32> {
-// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[CST:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
 // CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
@@ -32,7 +31,7 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t
 // CHECK:           %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) -> tensor<4x?x?x?xf32>
 // CHECK:           %[[DIM1_1:.*]] = tensor.dim %[[IN]], %[[C1]] : tensor<4x?x2x?xf32>
 // CHECK:           %[[DIM3_1:.*]] = tensor.dim %[[IN]], %[[C3]] : tensor<4x?x2x?xf32>
-// CHECK:           %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[OFFSET]], %[[C0]]] [4, %[[DIM1_1]], 2, %[[DIM3_1]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
+// CHECK:           %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]][0, 0, %[[OFFSET]], 0] [4, %[[DIM1_1]], 2, %[[DIM3_1]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
 // CHECK:           return %[[PADDED]] : tensor<4x?x?x?xf32>
 // CHECK:         }
 func.func @generalize_pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> {
diff --git a/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir b/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir
index ee66073..a1a0c41 100644
--- a/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir
@@ -1,38 +1,51 @@
 // RUN: mlir-opt %s -transform-interpreter | FileCheck %s
 
-func.func @outerproduct_matmul(%A: memref<3x3xf32>, %B: memref<3x3xf32>, %C: memref<3x3xf32>) {
-  linalg.matmul ins(%A, %B: memref<3x3xf32>, memref<3x3xf32>)
+func.func @matmul_to_outerproduct(%A: memref<3x4xf32>, %B: memref<4x3xf32>, %C: memref<3x3xf32>) {
+  linalg.matmul ins(%A, %B: memref<3x4xf32>, memref<4x3xf32>)
             outs(%C: memref<3x3xf32>)
   return
 }
 
-// CHECK-LABEL:   func.func @outerproduct_matmul(
-// CHECK-SAME:      %[[VAL_0:.*]]: memref<3x3xf32>, %[[VAL_1:.*]]: memref<3x3xf32>, %[[VAL_2:.*]]: memref<3x3xf32>) {
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_5:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32>
-// CHECK:           %[[VAL_6:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32>
-// CHECK:           %[[VAL_7:.*]] = vector.transfer_read %[[VAL_2]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32>
-// CHECK:           %[[VAL_8:.*]] = vector.transpose %[[VAL_5]], [1, 0] : vector<3x3xf32> to vector<3x3xf32>
-// CHECK:           %[[VAL_9:.*]] = vector.extract %[[VAL_8]][0] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_10:.*]] = vector.extract %[[VAL_6]][0] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_11:.*]] = vector.outerproduct %[[VAL_9]], %[[VAL_10]], %[[VAL_7]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32>
-// CHECK:           %[[VAL_12:.*]] = vector.extract %[[VAL_8]][1] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_13:.*]] = vector.extract %[[VAL_6]][1] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_14:.*]] = vector.outerproduct %[[VAL_12]], %[[VAL_13]], %[[VAL_11]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32>
-// CHECK:           %[[VAL_15:.*]] = vector.extract %[[VAL_8]][2] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_16:.*]] = vector.extract %[[VAL_6]][2] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_17:.*]] = vector.outerproduct %[[VAL_15]], %[[VAL_16]], %[[VAL_14]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32>
-// CHECK:           vector.transfer_write %[[VAL_17]], %[[VAL_2]]{{\[}}%[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true]} : vector<3x3xf32>, memref<3x3xf32>
-// CHECK:           return
-// CHECK:         }
+// CHECK-LABEL:   func.func @matmul_to_outerproduct(
+// CHECK-SAME:      %[[A:.*]]: memref<3x4xf32>,
+// CHECK-SAME:      %[[B:.*]]: memref<4x3xf32>,
+// CHECK-SAME:      %[[C:.*]]: memref<3x3xf32>) {
+// CHECK:           %[[VEC_A:.*]] = vector.transfer_read %[[A]]
+// CHECK:           %[[VEC_B:.*]] = vector.transfer_read %[[B]]
+// CHECK:           %[[VEC_C:.*]] = vector.transfer_read %[[C]]
+// CHECK:           %[[VEC_A_T:.*]] = vector.transpose %[[VEC_A]], [1, 0] : vector<3x4xf32> to vector<4x3xf32>
+// CHECK:           %[[A0:.*]] = vector.extract %[[VEC_A_T]][0] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[B0:.*]] = vector.extract %[[VEC_B]][0] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[OP_0:.*]] = vector.outerproduct %[[A0]], %[[B0]], %[[VEC_C]]
+// CHECK:           %[[A1:.*]] = vector.extract %[[VEC_A_T]][1] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[B1:.*]] = vector.extract %[[VEC_B]][1] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[OP_1:.*]] = vector.outerproduct %[[A1]], %[[B1]], %[[OP_0]]
+// CHECK:           %[[A_2:.*]] = vector.extract %[[VEC_A_T]][2] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[B_2:.*]] = vector.extract %[[VEC_B]][2] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[OP_2:.*]] = vector.outerproduct %[[A_2]], %[[B_2]], %[[OP_1]]
+// CHECK:           %[[A_3:.*]] = vector.extract %[[VEC_A_T]][3] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[B_3:.*]] = vector.extract %[[VEC_B]][3] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[RES:.*]] = vector.outerproduct %[[A_3]], %[[B_3]], %[[OP_2]]
+// CHECK:           vector.transfer_write %[[RES]], %[[C]]{{.*}} : vector<3x3xf32>, memref<3x3xf32>
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %2 {
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op
+
+    // Vectorize: linalg.matmul -> vector.multi_reduction
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %func : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %matmul : !transform.any_op
+
+    // vector.multi_reduction --> vector.contract
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.reduction_to_contract
+      // Reduce the rank of xfer ops. This transform vector.contract to be more
+      // more matmul-like and to enable the lowering to outer product Ops.
+      transform.apply_patterns.vector.transfer_permutation_patterns
+    } : !transform.any_op
+
+    // vector.contract --> vector.outerproduct
+    transform.apply_patterns to %func {
       transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
     } : !transform.any_op
     transform.yield
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
new file mode 100644
index 0000000..4ac1ebd
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
@@ -0,0 +1,66 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+  omp.reduction.declare @add_reduction_i_32 : !llvm.ptr init {
+  ^bb0(%arg0: !llvm.ptr):
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.constant(1 : i64) : i64
+    %2 = llvm.alloca %1 x i32 : (i64) -> !llvm.ptr
+    llvm.store %0, %2 : i32, !llvm.ptr
+    omp.yield(%2 : !llvm.ptr)
+  } combiner {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.load %arg0 : !llvm.ptr -> i32
+    %1 = llvm.load %arg1 : !llvm.ptr -> i32
+    %2 = llvm.add %0, %1  : i32
+    llvm.store %2, %arg0 : i32, !llvm.ptr
+    omp.yield(%arg0 : !llvm.ptr)
+  } 
+
+  // CHECK-LABEL: @main
+  llvm.func @main()  {
+    %0 = llvm.mlir.constant(-1 : i32) : i32
+    %1 = llvm.mlir.addressof @i : !llvm.ptr
+    omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr) {
+      llvm.store %0, %arg0 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+  llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    llvm.return %0 : i32
+  }
+
+// CHECK: %{{.+}} = 
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %tid.addr.local = alloca i32
+// CHECK: %[[PRIVATE:.+]] = alloca i32
+// CHECK: store i32 0, ptr %[[PRIVATE]]
+// CHECK: store ptr %[[PRIVATE]], ptr %[[PRIV_PTR:.+]],
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+
+// Non-atomic reduction:
+// CHECK: %[[PRIV_VAL_PTR:.+]] = load ptr, ptr %[[PRIV_PTR]]
+// CHECK: %[[LOAD:.+]] = load i32, ptr @i
+// CHECK: %[[PRIV_VAL:.+]] = load i32, ptr %[[PRIV_VAL_PTR]]
+// CHECK: %[[SUM:.+]] = add i32 %[[LOAD]], %[[PRIV_VAL]]
+// CHECK: store i32 %[[SUM]], ptr @i
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: add i32
diff --git a/mlir/test/Transforms/inlining-dump-default-pipeline.mlir b/mlir/test/Transforms/inlining-dump-default-pipeline.mlir
index e2c3186..4f86380 100644
--- a/mlir/test/Transforms/inlining-dump-default-pipeline.mlir
+++ b/mlir/test/Transforms/inlining-dump-default-pipeline.mlir
@@ -1,2 +1,2 @@
 // RUN: mlir-opt %s -pass-pipeline="builtin.module(inline)" -dump-pass-pipeline 2>&1 | FileCheck %s
-// CHECK: builtin.module(inline{default-pipeline=canonicalize max-iterations=4 })
+// CHECK: builtin.module(inline{default-pipeline=canonicalize inlining-threshold=4294967295 max-iterations=4 })
diff --git a/mlir/test/Transforms/inlining-threshold.mlir b/mlir/test/Transforms/inlining-threshold.mlir
new file mode 100644
index 0000000..649408a
--- /dev/null
+++ b/mlir/test/Transforms/inlining-threshold.mlir
@@ -0,0 +1,18 @@
+// RUN: mlir-opt %s -inline='default-pipeline= inlining-threshold=100' | FileCheck %s
+
+// Check that inlining does not happen when the threshold is exceeded.
+func.func @callee1(%arg : i32) -> i32 {
+  %v1 = arith.addi %arg, %arg : i32
+  %v2 = arith.addi %v1, %arg : i32
+  %v3 = arith.addi %v2, %arg : i32
+  return %v3 : i32
+}
+
+// CHECK-LABEL: func @caller1
+func.func @caller1(%arg0 : i32) -> i32 {
+  // CHECK-NEXT: call @callee1
+  // CHECK-NEXT: return
+
+  %0 = call @callee1(%arg0) : (i32) -> i32
+  return %0 : i32
+}
author	Noah Goldstein <goldstein.w.n@gmail.com>	2024-03-13 13:13:52 -0700
committer	Fangrui Song <i@maskray.me>	2024-03-13 13:13:52 -0700
commit	9ce8691dea8dadc1302abacf4302f3b805e1448d (patch)
tree	fdc2da3081156b4c9b80b0d417f090efadac946c /mlir
parent	795e3c3d94da0a664642d4580d87c82c02d5eca4 (diff)
parent	744a23f24b08e8b988b176173c433d64761e66b3 (diff)
download	llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.zip llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.tar.gz llvm-users/MaskRay/spr/main.llvm-objcopy-add-compress-sections.tar.bz2