[VectorCombine] try to form vector binop to eliminate an extract element

binop (extelt X, C), (extelt Y, C) --> extelt (binop X, Y), C This is a transform that has been considered for canonicalization (instcombine) in the past because it reduces instruction count. But as shown in the x86 tests, it's impossible to know if it's profitable without a cost model. There are many potential target constraints to consider. We have implemented similar transforms in the backend (DAGCombiner and target-specific), but I don't think we have this exact fold there either (and if we did it in SDAG, it wouldn't work across blocks). Note: this patch was intended to handle the more general case where the extract indexes do not match, but it got too big, so I scaled it back to this pattern for now. Differential Revision: https://reviews.llvm.org/D74495
author: Sanjay Patel <spatel@rotateright.com> 2020-02-13 16:08:15 -0500
committer: Sanjay Patel <spatel@rotateright.com> 2020-02-13 17:23:27 -0500
commit: 19b62b79db1bb154b40e8baba9a28ab8aa935b6b (patch)
tree: 539aacfb0a34477e489d5342113738c6531471e3 /llvm/lib/Transforms
parent: fe36127982e0a5889cc0653718e62ba6acccf7c4 (diff)
download: llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.zip
llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.tar.gz
llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.tar.bz2
1 files changed, 82 insertions, 1 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 2c69878..a70d962 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -30,6 +31,7 @@ using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "vector-combine"
 STATISTIC(NumVecCmp, "Number of vector compares formed");
+STATISTIC(NumVecBO, "Number of vector binops formed");
 
 static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {
   // Match a cmp with extracted vector operands.
@@ -76,6 +78,85 @@ static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {
   return true;
 }
 
+/// Try to reduce extract element costs by converting scalar binops to vector
+/// binops followed by extract.
+static bool foldExtractBinop(Instruction &I, const TargetTransformInfo &TTI) {
+  // It is not safe to transform things like div, urem, etc. because we may
+  // create undefined behavior when executing those on unknown vector elements.
+  if (!isSafeToSpeculativelyExecute(&I))
+    return false;
+
+  // Match a scalar binop with extracted vector operands:
+  // bo (extelt X, C0), (extelt Y, C1)
+  Instruction *Ext0, *Ext1;
+  if (!match(&I, m_BinOp(m_Instruction(Ext0), m_Instruction(Ext1))))
+    return false;
+
+  Value *X, *Y;
+  uint64_t C0, C1;
+  if (!match(Ext0, m_ExtractElement(m_Value(X), m_ConstantInt(C0))) ||
+      !match(Ext1, m_ExtractElement(m_Value(Y), m_ConstantInt(C1))) ||
+      X->getType() != Y->getType())
+    return false;
+
+  // Check if using a vector binop would be cheaper.
+  Instruction::BinaryOps BOpcode = cast<BinaryOperator>(I).getOpcode();
+  Type *ScalarTy = I.getType();
+  Type *VecTy = X->getType();
+  int ScalarBOCost = TTI.getArithmeticInstrCost(BOpcode, ScalarTy);
+  int VecBOCost = TTI.getArithmeticInstrCost(BOpcode, VecTy);
+  int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, C0);
+  int Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, C1);
+
+  // Handle a special case - if the extract indexes are the same, the
+  // replacement sequence does not require a shuffle. Unless the vector binop is
+  // much more expensive than the scalar binop, this eliminates an extract.
+  // Extra uses of the extracts mean that we include those costs in the
+  // vector total because those instructions will not be eliminated.
+  if (C0 == C1) {
+    assert(Extract0Cost == Extract1Cost && "Different costs for same extract?");
+    int ExtractCost = Extract0Cost;
+    if (X != Y) {
+      int ScalarCost = ExtractCost + ExtractCost + ScalarBOCost;
+      int VecCost = VecBOCost + ExtractCost +
+                    !Ext0->hasOneUse() * ExtractCost +
+                    !Ext1->hasOneUse() * ExtractCost;
+      if (ScalarCost <= VecCost)
+        return false;
+    } else {
+      // Handle an extra-special case. If the 2 binop operands are identical,
+      // adjust the formulas to account for that:
+      // bo (extelt X, C), (extelt X, C) --> extelt (bo X, X), C
+      // The extra use charge allows for either the CSE'd pattern or an
+      // unoptimized form with identical values.
+      bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
+                                    : !Ext0->hasOneUse() || !Ext1->hasOneUse();
+      int ScalarCost = ExtractCost + ScalarBOCost;
+      int VecCost = VecBOCost + ExtractCost + HasUseTax * ExtractCost;
+      if (ScalarCost <= VecCost)
+        return false;
+    }
+
+    // bo (extelt X, C), (extelt Y, C) --> extelt (bo X, Y), C
+    ++NumVecBO;
+    IRBuilder<> Builder(&I);
+    Value *NewBO = Builder.CreateBinOp(BOpcode, X, Y);
+    if (auto *VecBOInst = dyn_cast<Instruction>(NewBO)) {
+      // All IR flags are safe to back-propagate because any potential poison
+      // created in unused vector elements is discarded by the extract.
+      VecBOInst->copyIRFlags(&I);
+    }
+    Value *Extract = Builder.CreateExtractElement(NewBO, Ext0->getOperand(1));
+    I.replaceAllUsesWith(Extract);
+    return true;
+  }
+
+  // TODO: Handle C0 != C1 by shuffling 1 of the operands.
+  return false;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -92,7 +173,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI,
     //       iteratively in this loop rather than waiting until the end.
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldExtractCmp(I, TTI);
-      // TODO: More transforms go here.
+      MadeChange |= foldExtractBinop(I, TTI);
     }
   }
author	Sanjay Patel <spatel@rotateright.com>	2020-02-13 16:08:15 -0500
committer	Sanjay Patel <spatel@rotateright.com>	2020-02-13 17:23:27 -0500
commit	19b62b79db1bb154b40e8baba9a28ab8aa935b6b (patch)
tree	539aacfb0a34477e489d5342113738c6531471e3 /llvm/lib/Transforms
parent	fe36127982e0a5889cc0653718e62ba6acccf7c4 (diff)
download	llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.zip llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.tar.gz llvm-19b62b79db1bb154b40e8baba9a28ab8aa935b6b.tar.bz2