author:    Luo, Yuanke <yuanke.luo@intel.com>  2023-04-21 19:28:58 +0800
committer: Luo, Yuanke <yuanke.luo@intel.com>  2023-04-27 16:42:04 +0800
commit:    8f7f9d86a7555263ef08fded15a6b778d796ec3f (patch)
tree:      980c7a8267ac590ec793ac3f1b049f7f8d58bf2e /llvm/lib/CodeGen/MachineCombiner.cpp
parent:    47d3cbcf842a036c20c3f1c74255cdfc213f41c2 (diff)
[X86] Machine combine vnni instruction.
"vpmaddwd + vpaddd" can be combined to vpdpwssd and the latency is reduced after combination. However when vpdpwssd is in a critical path the combination get less ILP. It happens when vpdpwssd is in a loop, the vpmaddwd can be executed in parallel in multi-iterations while vpdpwssd has data dependency for each iterations. If vpaddd is in a critical path while vpmaddwd is not, it is profitable to split vpdpwssd into "vpmaddwd + vpaddd". This patch is based on the machine combiner framework to acheive decision on "vpmaddwd + vpaddd" combination. The typical example code is as below. ``` __m256i foo(int cnt, __m256i c, __m256i b, __m256i *p) { for (int i = 0; i < cnt; ++i) { __m256i a = p[i]; __m256i m = _mm256_madd_epi16 (b, a); c = _mm256_add_epi32(m, c); } return c; } ``` Differential Revision: https://reviews.llvm.org/D148980
Diffstat (limited to 'llvm/lib/CodeGen/MachineCombiner.cpp')
-rw-r--r--  llvm/lib/CodeGen/MachineCombiner.cpp | 34
1 file changed, 29 insertions, 5 deletions
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index bb4d131..5c58d3b 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -91,7 +91,8 @@ public:
private:
bool combineInstructions(MachineBasicBlock *);
- MachineInstr *getOperandDef(const MachineOperand &MO);
+ MachineInstr *getOperandDef(const MachineOperand &MO,
+ SmallVectorImpl<MachineInstr *> &InsInstrs);
bool isTransientMI(const MachineInstr *MI);
unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
@@ -149,11 +150,29 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
+MachineInstr *
+MachineCombiner::getOperandDef(const MachineOperand &MO,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) {
MachineInstr *DefInstr = nullptr;
// We need a virtual register definition.
if (MO.isReg() && MO.getReg().isVirtual())
DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+ // Since the new instructions are not inserted into the machine function,
+ // the def-use information is not added in MRI. So it is possible that
+ // the register is defined in new instructions.
+ if (!DefInstr) {
+ for (auto *MI : InsInstrs) {
+ for (const MachineOperand &DefMO : MI->operands()) {
+ if (!(DefMO.isReg() && DefMO.getReg().isVirtual()))
+ continue;
+ if (!DefMO.isDef())
+ continue;
+ if (DefMO.getReg() != MO.getReg())
+ continue;
+ DefInstr = MI;
+ }
+ }
+ }
// PHI's have no depth etc.
if (DefInstr && DefInstr->isPHI())
DefInstr = nullptr;
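Editor's illustration (not part of the diff): the instructions in InsInstrs are the candidate replacement sequence produced by the target's genAlternativeCodeSequence(). They are typically built against the MachineFunction but not yet inserted into any basic block, so MRI->getUniqueVRegDef() cannot see the virtual registers they define; that is why getOperandDef() above also scans InsInstrs. A fragmentary sketch of how such an instruction is usually created, with Opcode, RC, SrcReg and the surrounding variables assumed for illustration:

```
// Hedged sketch (assumed names/opcode): building a replacement instruction the
// way target combiner hooks commonly do. BuildMI(MF, ...) creates the
// instruction without inserting it into a block, so MRI's def-use chains do
// not yet include the def of NewVR.
Register NewVR = MRI->createVirtualRegister(RC);        // fresh virtual register
MachineInstrBuilder MIB =
    BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), NewVR)
        .addReg(SrcReg);                                // operands of the new def
InsInstrs.push_back(MIB);                               // tracked only in this list
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));    // depth bookkeeping
```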
@@ -238,7 +257,7 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
InstrPtr, UseIdx);
} else {
- MachineInstr *DefInstr = getOperandDef(MO);
+ MachineInstr *DefInstr = getOperandDef(MO, InsInstrs);
if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
MachineTraceStrategy::TS_Local ||
DefInstr->getParent() == &MBB)) {
@@ -404,8 +423,13 @@ bool MachineCombiner::improvesCriticalPathLen(
// Account for the latency of the inserted and deleted instructions by
unsigned NewRootLatency, RootLatency;
- std::tie(NewRootLatency, RootLatency) =
- getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+ if (TII->accumulateInstrSeqToRootLatency(*Root)) {
+ std::tie(NewRootLatency, RootLatency) =
+ getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+ } else {
+ NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
+ RootLatency = TSchedModel.computeInstrLatency(Root);
+ }
unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
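Editor's note (not part of the diff): the TII->accumulateInstrSeqToRootLatency(*Root) query used above is a TargetInstrInfo hook whose declaration is not in this file. A hedged sketch of its assumed default shape:

```
// Hedged sketch of the assumed hook (declared on TargetInstrInfo, not shown in
// this diff). Returning true keeps the previous behaviour: the latency of the
// whole inserted sequence is accumulated into NewRootLatency. A target may
// return false so that only the latency of the last inserted instruction (the
// new root) is compared against the old root's latency.
virtual bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const {
  return true;
}
```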