author    | Luo, Yuanke <yuanke.luo@intel.com> | 2023-04-21 19:28:58 +0800
committer | Luo, Yuanke <yuanke.luo@intel.com> | 2023-04-27 16:42:04 +0800
commit    | 8f7f9d86a7555263ef08fded15a6b778d796ec3f (patch)
tree      | 980c7a8267ac590ec793ac3f1b049f7f8d58bf2e /llvm/lib/CodeGen/MachineCombiner.cpp
parent    | 47d3cbcf842a036c20c3f1c74255cdfc213f41c2 (diff)
[X86] Machine combine VNNI instructions.
"vpmaddwd + vpaddd" can be combined to vpdpwssd and the latency is
reduced after combination. However when vpdpwssd is in a critical path
the combination get less ILP. It happens when vpdpwssd is in a loop, the
vpmaddwd can be executed in parallel in multi-iterations while vpdpwssd
has data dependency for each iterations. If vpaddd is in a critical path
while vpmaddwd is not, it is profitable to split vpdpwssd into "vpmaddwd
+ vpaddd".
This patch builds on the machine combiner framework to make the combination
decision for "vpmaddwd + vpaddd". Typical example code is shown below.
```
#include <immintrin.h>

__m256i foo(int cnt, __m256i c, __m256i b, __m256i *p) {
  for (int i = 0; i < cnt; ++i) {
    __m256i a = p[i];
    __m256i m = _mm256_madd_epi16(b, a); // vpmaddwd
    c = _mm256_add_epi32(m, c);          // vpaddd (loop-carried accumulate)
  }
  return c;
}
```
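For comparison, here is a sketch of the combined form the pass aims to
produce, written at source level with the VNNI intrinsic
_mm256_dpwssd_epi32 (which lowers to vpdpwssd). It assumes a target with
AVX-VNNI or AVX512-VNNI+VL support; foo_combined is an illustrative name,
not part of the patch:
```
#include <immintrin.h>

// Sketch only: the machine combiner performs this rewrite on MachineInstrs;
// this source-level version just illustrates the resulting dependence shape.
__m256i foo_combined(int cnt, __m256i c, __m256i b, __m256i *p) {
  for (int i = 0; i < cnt; ++i)
    c = _mm256_dpwssd_epi32(c, b, p[i]); // c += madd of adjacent i16 pairs
  return c;
}
```
Note that the loop-carried dependence now runs through vpdpwssd itself,
which is exactly the case where splitting can pay off.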
Differential Revision: https://reviews.llvm.org/D148980
Diffstat (limited to 'llvm/lib/CodeGen/MachineCombiner.cpp')
-rw-r--r--  llvm/lib/CodeGen/MachineCombiner.cpp | 34
1 file changed, 29 insertions(+), 5 deletions(-)
```
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index bb4d131..5c58d3b 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -91,7 +91,8 @@ public:
 private:
   bool combineInstructions(MachineBasicBlock *);
-  MachineInstr *getOperandDef(const MachineOperand &MO);
+  MachineInstr *getOperandDef(const MachineOperand &MO,
+                              SmallVectorImpl<MachineInstr *> &InsInstrs);
   bool isTransientMI(const MachineInstr *MI);
   unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
@@ -149,11 +150,29 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
 }

-MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
+MachineInstr *
+MachineCombiner::getOperandDef(const MachineOperand &MO,
+                               SmallVectorImpl<MachineInstr *> &InsInstrs) {
   MachineInstr *DefInstr = nullptr;
   // We need a virtual register definition.
   if (MO.isReg() && MO.getReg().isVirtual())
     DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+  // Since the new instructions are not inserted into the machine function,
+  // the def-use information is not added in MRI. So it is possible that
+  // the register is defined in new instructions.
+  if (!DefInstr) {
+    for (auto *MI : InsInstrs) {
+      for (const MachineOperand &DefMO : MI->operands()) {
+        if (!(DefMO.isReg() && DefMO.getReg().isVirtual()))
+          continue;
+        if (!DefMO.isDef())
+          continue;
+        if (DefMO.getReg() != MO.getReg())
+          continue;
+        DefInstr = MI;
+      }
+    }
+  }
   // PHI's have no depth etc.
   if (DefInstr && DefInstr->isPHI())
     DefInstr = nullptr;
@@ -238,7 +257,7 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
           LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                         InstrPtr, UseIdx);
       } else {
-        MachineInstr *DefInstr = getOperandDef(MO);
+        MachineInstr *DefInstr = getOperandDef(MO, InsInstrs);
         if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                              MachineTraceStrategy::TS_Local ||
                          DefInstr->getParent() == &MBB)) {
@@ -404,8 +423,13 @@ bool MachineCombiner::improvesCriticalPathLen(
   // Account for the latency of the inserted and deleted instructions by
   unsigned NewRootLatency, RootLatency;
-  std::tie(NewRootLatency, RootLatency) =
-      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
+    std::tie(NewRootLatency, RootLatency) =
+        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+  } else {
+    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
+    RootLatency = TSchedModel.computeInstrLatency(Root);
+  }
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
   unsigned NewCycleCount = NewRootDepth + NewRootLatency;
```
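The else branch above relies on accumulateInstrSeqToRootLatency, a
TargetInstrInfo hook introduced alongside this change. A minimal sketch of
how a target might override it (MyTargetInstrInfo and isSplittableAccumulate
are hypothetical; the actual X86 override in this patch may differ):
```
// Hypothetical override; the condition is illustrative, not the real X86 one.
// Returning false makes improvesCriticalPathLen() compare
// computeInstrLatency(InsInstrs.back()) against computeInstrLatency(Root)
// instead of summing the whole inserted sequence onto the root.
bool MyTargetInstrInfo::accumulateInstrSeqToRootLatency(
    MachineInstr &Root) const {
  // When the root will be split into a multiply plus an accumulate, only the
  // accumulate stays on the loop-carried critical path, so accumulating the
  // whole inserted sequence would overstate the new latency.
  if (isSplittableAccumulate(Root)) // hypothetical helper
    return false;
  // Default behavior: accumulate the latency of all inserted instructions.
  return true;
}
```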