author:    Luo, Yuanke <yuanke.luo@intel.com>  2023-04-21 19:28:58 +0800
committer: Luo, Yuanke <yuanke.luo@intel.com>  2023-04-27 16:42:04 +0800
commit:    8f7f9d86a7555263ef08fded15a6b778d796ec3f (patch)
tree:      980c7a8267ac590ec793ac3f1b049f7f8d58bf2e /llvm/lib/CodeGen/MachineCombiner.cpp
parent:    47d3cbcf842a036c20c3f1c74255cdfc213f41c2 (diff)
[X86] Machine combine vnni instruction.
"vpmaddwd + vpaddd" can be combined to vpdpwssd and the latency is reduced after combination. However when vpdpwssd is in a critical path the combination get less ILP. It happens when vpdpwssd is in a loop, the vpmaddwd can be executed in parallel in multi-iterations while vpdpwssd has data dependency for each iterations. If vpaddd is in a critical path while vpmaddwd is not, it is profitable to split vpdpwssd into "vpmaddwd + vpaddd". This patch is based on the machine combiner framework to acheive decision on "vpmaddwd + vpaddd" combination. The typical example code is as below. ``` __m256i foo(int cnt, __m256i c, __m256i b, __m256i *p) { for (int i = 0; i < cnt; ++i) { __m256i a = p[i]; __m256i m = _mm256_madd_epi16 (b, a); c = _mm256_add_epi32(m, c); } return c; } ``` Differential Revision: https://reviews.llvm.org/D148980
Diffstat (limited to 'llvm/lib/CodeGen/MachineCombiner.cpp')
-rw-r--r--  llvm/lib/CodeGen/MachineCombiner.cpp | 34
1 file changed, 29 insertions, 5 deletions
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index bb4d131..5c58d3b 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -91,7 +91,8 @@ public:
private:
bool combineInstructions(MachineBasicBlock *);
- MachineInstr *getOperandDef(const MachineOperand &MO);
+ MachineInstr *getOperandDef(const MachineOperand &MO,
+ SmallVectorImpl<MachineInstr *> &InsInstrs);
bool isTransientMI(const MachineInstr *MI);
unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
@@ -149,11 +150,29 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
+MachineInstr *
+MachineCombiner::getOperandDef(const MachineOperand &MO,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) {
MachineInstr *DefInstr = nullptr;
// We need a virtual register definition.
if (MO.isReg() && MO.getReg().isVirtual())
DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+ // Since the new instructions are not inserted into the machine function,
+ // the def-use information is not added in MRI. So it is possible that
+ // the register is defined in new instructions.
+ if (!DefInstr) {
+ for (auto *MI : InsInstrs) {
+ for (const MachineOperand &DefMO : MI->operands()) {
+ if (!(DefMO.isReg() && DefMO.getReg().isVirtual()))
+ continue;
+ if (!DefMO.isDef())
+ continue;
+ if (DefMO.getReg() != MO.getReg())
+ continue;
+ DefInstr = MI;
+ }
+ }
+ }
// PHI's have no depth etc.
if (DefInstr && DefInstr->isPHI())
DefInstr = nullptr;
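Editor's illustration (not part of the diff): the instructions in InsInstrs are the candidate replacement sequence produced by the target's genAlternativeCodeSequence(). They are typically built against the MachineFunction but not yet inserted into any basic block, so MRI->getUniqueVRegDef() cannot see the virtual registers they define; that is why getOperandDef() above also scans InsInstrs. A fragmentary sketch of how such an instruction is usually created, with Opcode, RC, SrcReg and the surrounding variables assumed for illustration:

```
// Hedged sketch (assumed names/opcode): building a replacement instruction the
// way target combiner hooks commonly do. BuildMI(MF, ...) creates the
// instruction without inserting it into a block, so MRI's def-use chains do
// not yet include the def of NewVR.
Register NewVR = MRI->createVirtualRegister(RC);        // fresh virtual register
MachineInstrBuilder MIB =
    BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), NewVR)
        .addReg(SrcReg);                                // operands of the new def
InsInstrs.push_back(MIB);                               // tracked only in this list
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));    // depth bookkeeping
```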
@@ -238,7 +257,7 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
InstrPtr, UseIdx);
} else {
- MachineInstr *DefInstr = getOperandDef(MO);
+ MachineInstr *DefInstr = getOperandDef(MO, InsInstrs);
if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
MachineTraceStrategy::TS_Local ||
DefInstr->getParent() == &MBB)) {
@@ -404,8 +423,13 @@ bool MachineCombiner::improvesCriticalPathLen(
// Account for the latency of the inserted and deleted instructions by
unsigned NewRootLatency, RootLatency;
- std::tie(NewRootLatency, RootLatency) =
- getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+ if (TII->accumulateInstrSeqToRootLatency(*Root)) {
+ std::tie(NewRootLatency, RootLatency) =
+ getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+ } else {
+ NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
+ RootLatency = TSchedModel.computeInstrLatency(Root);
+ }
unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
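Editor's note (not part of the diff): the TII->accumulateInstrSeqToRootLatency(*Root) query used above is a TargetInstrInfo hook whose declaration is not in this file. A hedged sketch of its assumed default shape:

```
// Hedged sketch of the assumed hook (declared on TargetInstrInfo, not shown in
// this diff). Returning true keeps the previous behaviour: the latency of the
// whole inserted sequence is accumulated into NewRootLatency. A target may
// return false so that only the latency of the last inserted instruction (the
// new root) is compared against the old root's latency.
virtual bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const {
  return true;
}
```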