about | summary | refs | log | tree | commit | diff
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 38
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 63
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 311
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 23
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/SOPInstructions.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11
12 files changed, 224 insertions, 312 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index f01d5f6..a4ef524 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -647,7 +647,7 @@ public:
ModuleScopeVariables.insert(GV);
} else if (K.second.size() == 1) {
KernelAccessVariables.insert(GV);
- } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
+ } else if (K.second == HybridModuleRootKernels) {
ModuleScopeVariables.insert(GV);
} else {
TableLookupVariables.insert(GV);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b..280fbe2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
+ TargetPassConfig::addCodeGenPrepare();
+
+ if (isPassEnabled(EnableLoadStoreVectorizer))
+ addPass(createLoadStoreVectorizerPass());
+
if (TM->getTargetTriple().isAMDGCN()) {
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
@@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
- //
- // FIXME: This should ideally be put after the LoadStoreVectorizer.
- // However, due to some annoying facts about ResourceUsageAnalysis,
- // (especially as exercised in the resource-usage-dead-function test),
- // we need all the function passes codegenprepare all the way through
- // said resource usage analysis to run on the call graph produced
- // before codegenprepare runs (because codegenprepare will knock some
- // nodes out of the graph, which leads to function-level passes not
- // being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
addPass(createAMDGPULowerIntrinsicsLegacyPass());
// In accordance with the above FIXME, manually force all the
@@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
addPass(new DummyCGSCCPass());
}
- TargetPassConfig::addCodeGenPrepare();
-
- if (isPassEnabled(EnableLoadStoreVectorizer))
- addPass(createLoadStoreVectorizerPass());
-
// LowerSwitch pass may introduce unreachable blocks that can
// cause unexpected behavior for subsequent passes. Placing it
// here seems better that these blocks would get cleaned up by
@@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
if (EnableLowerKernelArguments)
addPass(AMDGPULowerKernelArgumentsPass(TM));
+ Base::addCodeGenPrepare(addPass);
+
+ if (isPassEnabled(EnableLoadStoreVectorizer))
+ addPass(LoadStoreVectorizerPass());
+
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
// It could be placed anywhere before uniformity annotations (an analysis
@@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
- //
- // FIXME: This should ideally be put after the LoadStoreVectorizer.
- // However, due to some annoying facts about ResourceUsageAnalysis,
- // (especially as exercised in the resource-usage-dead-function test),
- // we need all the function passes codegenprepare all the way through
- // said resource usage analysis to run on the call graph produced
- // before codegenprepare runs (because codegenprepare will knock some
- // nodes out of the graph, which leads to function-level passes not
- // being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));
addPass.requireCGSCCOrder();
addPass(AMDGPULowerIntrinsicsPass(TM));
- Base::addCodeGenPrepare(addPass);
-
- if (isPassEnabled(EnableLoadStoreVectorizer))
- addPass(LoadStoreVectorizerPass());
-
// LowerSwitch pass may introduce unreachable blocks that can cause unexpected
// behavior for subsequent passes. Placing it here seems better that these
// blocks would get cleaned up by UnreachableBlockElim inserted next in the
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fab78a9..bdc0810 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -29,6 +29,7 @@
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/ErrorHandling.h"
@@ -1633,64 +1634,6 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
-bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
- SlotIndex OriginalIdx,
- SlotIndex RematIdx) const {
-
- LiveIntervals *LIS = DAG.LIS;
- MachineRegisterInfo &MRI = DAG.MRI;
- OriginalIdx = OriginalIdx.getRegSlot(true);
- RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true));
- for (const MachineOperand &MO : InstToRemat->operands()) {
- if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
- continue;
-
- if (!MO.getReg().isVirtual()) {
- // Do not attempt to reason about PhysRegs
- // TODO: better analysis of PhysReg livness
- if (!DAG.MRI.isConstantPhysReg(MO.getReg()) &&
- !DAG.TII->isIgnorableUse(MO))
- return false;
-
- // Constant PhysRegs and IgnorableUses are okay
- continue;
- }
-
- LiveInterval &LI = LIS->getInterval(MO.getReg());
- const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx);
- assert(OVNI);
-
- // Don't allow rematerialization immediately after the original def.
- // It would be incorrect if InstToRemat redefines the register.
- // See PR14098.
- if (SlotIndex::isSameInstr(OriginalIdx, RematIdx))
- return false;
-
- if (OVNI != LI.getVNInfoAt(RematIdx))
- return false;
-
- // Check that subrange is live at RematIdx.
- if (LI.hasSubRanges()) {
- const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
- unsigned SubReg = MO.getSubReg();
- LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
- : MRI.getMaxLaneMaskForVReg(MO.getReg());
- for (LiveInterval::SubRange &SR : LI.subranges()) {
- if ((SR.LaneMask & LM).none())
- continue;
- if (!SR.liveAt(RematIdx))
- return false;
-
- // Early exit if all used lanes are checked. No need to continue.
- LM &= ~SR.LaneMask;
- if (LM.none())
- break;
- }
- }
- }
- return true;
-}
-
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
const Function &F = MF.getFunction();
@@ -1812,9 +1755,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
// Do not rematerialize an instruction it it uses registers that aren't
// available at its use. This ensures that we are not extending any live
// range while rematerializing.
- SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
- if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
+ if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
+ *DAG.TII))
continue;
REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 06b9b64..8ea4267 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -496,12 +496,6 @@ private:
/// stage to their pre-stage values.
void finalizeGCNSchedStage() override;
- /// \p Returns true if all the uses in \p InstToRemat defined at \p
- /// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual
- /// reg uses.
- bool allUsesAvailableAt(const MachineInstr *InstToRemat,
- SlotIndex OriginalIdx, SlotIndex RematIdx) const;
-
public:
bool initGCNSchedStage() override;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7c5d4fc..e4b3528 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -924,6 +924,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
case TargetStackID::SGPRSpill:
return true;
case TargetStackID::ScalableVector:
+ case TargetStackID::ScalablePredicateVector:
case TargetStackID::WasmLocal:
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1653008..f7265c5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -64,14 +64,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
cl::desc("Use indirect register addressing for divergent indexes"),
cl::init(false));
-// TODO: This option should be removed once we switch to always using PTRADD in
-// the SelectionDAG.
-static cl::opt<bool> UseSelectionDAGPTRADD(
- "amdgpu-use-sdag-ptradd", cl::Hidden,
- cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
- "SelectionDAG ISel"),
- cl::init(false));
-
static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
@@ -11466,7 +11458,7 @@ static bool isNoUnsignedWrap(SDValue Addr) {
bool SITargetLowering::shouldPreservePtrArith(const Function &F,
EVT PtrVT) const {
- return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
+ return PtrVT == MVT::i64;
}
bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index f291191..5e27b37 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -418,15 +418,14 @@ public:
class SIInsertWaitcnts {
public:
const GCNSubtarget *ST;
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
InstCounterType SmemAccessCounter;
InstCounterType MaxCounter;
const unsigned *WaitEventMaskForInst;
private:
- const SIInstrInfo *TII = nullptr;
- const SIRegisterInfo *TRI = nullptr;
- const MachineRegisterInfo *MRI = nullptr;
-
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
MachineLoopInfo *MLI;
@@ -495,13 +494,6 @@ public:
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
bool run(MachineFunction &MF);
- bool isForceEmitWaitcnt() const {
- for (auto T : inst_counter_types())
- if (ForceEmitWaitcnt[T])
- return true;
- return false;
- }
-
void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be
@@ -570,10 +562,6 @@ public:
return VmemReadMapping[getVmemType(Inst)];
}
- bool hasXcnt() const { return ST->hasWaitXCnt(); }
-
- bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
- bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -591,7 +579,6 @@ public:
WaitcntBrackets &ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
- static bool asynchronouslyWritesSCC(unsigned Opcode);
};
// This objects maintains the current score brackets of each wait counter, and
@@ -643,8 +630,6 @@ public:
bool merge(const WaitcntBrackets &Other);
RegInterval getRegInterval(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI,
const MachineOperand &Op) const;
bool counterOutOfOrder(InstCounterType T) const;
@@ -662,9 +647,7 @@ public:
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
void applyXcnt(const AMDGPU::Waitcnt &Wait);
- void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI, WaitEventType E,
- MachineInstr &MI);
+ void updateByEvent(WaitEventType E, MachineInstr &MI);
unsigned hasPendingEvent() const { return PendingEvents; }
unsigned hasPendingEvent(WaitEventType E) const {
@@ -773,10 +756,8 @@ private:
void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
unsigned Score);
- void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- const MachineOperand &Op, InstCounterType CntTy,
- unsigned Val);
+ void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op,
+ InstCounterType CntTy, unsigned Val);
const SIInsertWaitcnts *Context;
@@ -833,12 +814,13 @@ public:
} // end anonymous namespace
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI,
const MachineOperand &Op) const {
if (Op.getReg() == AMDGPU::SCC)
return {SCC, SCC + 1};
+ const SIRegisterInfo *TRI = Context->TRI;
+ const MachineRegisterInfo *MRI = Context->MRI;
+
if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};
@@ -903,11 +885,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
}
void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
const MachineOperand &Op,
InstCounterType CntTy, unsigned Score) {
- RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
+ RegInterval Interval = getRegInterval(MI, Op);
setScoreByInterval(Interval, CntTy, Score);
}
@@ -939,10 +919,7 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
}
-void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- WaitEventType E, MachineInstr &Inst) {
+void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
unsigned UB = getScoreUB(T);
@@ -955,6 +932,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
PendingEvents |= 1 << E;
setScoreUB(T, CurrScore);
+ const SIRegisterInfo *TRI = Context->TRI;
+ const MachineRegisterInfo *MRI = Context->MRI;
+ const SIInstrInfo *TII = Context->TII;
+
if (T == EXP_CNT) {
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
@@ -962,59 +943,56 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
// All GDS operations must protect their address register (same as
// export.)
if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
- setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore);
+ setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore);
if (Inst.mayStore()) {
if (const auto *Data0 =
TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
- setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore);
+ setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore);
if (const auto *Data1 =
TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
- setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore);
+ setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
Inst.getOpcode() != AMDGPU::DS_APPEND &&
Inst.getOpcode() != AMDGPU::DS_CONSUME &&
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (const MachineOperand &Op : Inst.all_uses()) {
if (TRI->isVectorRegister(*MRI, Op.getReg()))
- setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
+ setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
}
}
} else if (TII->isFLAT(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(&Inst, TRI, MRI,
+ setScoreByOperand(&Inst,
*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(&Inst, TRI, MRI,
+ setScoreByOperand(&Inst,
*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
} else if (TII->isMIMG(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
- CurrScore);
+ setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(&Inst, TRI, MRI,
+ setScoreByOperand(&Inst,
*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
} else if (TII->isMTBUF(Inst)) {
if (Inst.mayStore())
- setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
- CurrScore);
+ setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
} else if (TII->isMUBUF(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
- CurrScore);
+ setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(&Inst, TRI, MRI,
+ setScoreByOperand(&Inst,
*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
} else if (TII->isLDSDIR(Inst)) {
// LDSDIR instructions attach the score to the destination.
- setScoreByOperand(&Inst, TRI, MRI,
+ setScoreByOperand(&Inst,
*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
EXP_CNT, CurrScore);
} else {
@@ -1025,18 +1003,27 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
// score.
for (MachineOperand &DefMO : Inst.all_defs()) {
if (TRI->isVGPR(*MRI, DefMO.getReg())) {
- setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore);
+ setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore);
}
}
}
for (const MachineOperand &Op : Inst.all_uses()) {
if (TRI->isVectorRegister(*MRI, Op.getReg()))
- setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
+ setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
}
}
} else if (T == X_CNT) {
+ WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
+ if (PendingEvents & (1 << OtherEvent)) {
+ // Hardware inserts an implicit xcnt between interleaved
+ // SMEM and VMEM operations. So there will never be
+ // outstanding address translations for both SMEM and
+ // VMEM at the same time.
+ setScoreLB(T, CurrScore - 1);
+ PendingEvents &= ~(1 << OtherEvent);
+ }
for (const MachineOperand &Op : Inst.all_uses())
- setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore);
+ setScoreByOperand(&Inst, Op, T, CurrScore);
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
// Match the score to the destination registers.
//
@@ -1048,7 +1035,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
// Special cases where implicit register defs exists, such as M0 or VCC,
// but none with memory instructions.
for (const MachineOperand &Op : Inst.defs()) {
- RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
+ RegInterval Interval = getRegInterval(&Inst, Op);
if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
if (Interval.first >= NUM_ALL_VGPRS)
continue;
@@ -1109,7 +1096,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
}
- if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
+ if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
setRegScore(SCC, T, CurrScore);
PendingSCCWrite = &Inst;
}
@@ -1831,12 +1818,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
return Modified;
}
-static bool readsVCCZ(const MachineInstr &MI) {
- unsigned Opc = MI.getOpcode();
- return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
- !MI.getOperand(1).isUndef();
-}
-
/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
// Currently all conventions wait, but this may not always be the case.
@@ -1871,26 +1852,24 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
assert(!MI.isMetaInstruction());
AMDGPU::Waitcnt Wait;
+ const unsigned Opc = MI.getOpcode();
// FIXME: This should have already been handled by the memory legalizer.
// Removing this currently doesn't affect any lit tests, but we need to
// verify that nothing was relying on this. The number of buffer invalidates
// being handled here should not be expanded.
- if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
- MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
- MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
+ if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
+ Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
+ Opc == AMDGPU::BUFFER_GL1_INV) {
Wait.LoadCnt = 0;
}
// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
- if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
- MI.getOpcode() == AMDGPU::SI_RETURN ||
- MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
- MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
+ Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
+ Opc == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
}
@@ -1902,8 +1881,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// send a message to explicitly release all VGPRs before the stores have
// completed, but it is only safe to do this if there are no outstanding
// scratch stores.
- else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
- MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+ else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
if (!WCG->isOptNone() &&
(MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
@@ -1912,8 +1890,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ReleaseVGPRInsts.insert(&MI);
}
// Resolve vm waits before gs-done.
- else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
- MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+ else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
ST->hasLegacyGeometry() &&
((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
@@ -1938,7 +1915,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// Wait for any pending GDS instruction to complete before any
// "Always GDS" instruction.
- if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
+ if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
@@ -1950,7 +1927,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
if (CallAddrOp.isReg()) {
RegInterval CallAddrOpInterval =
- ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp);
+ ScoreBrackets.getRegInterval(&MI, CallAddrOp);
ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
Wait);
@@ -1958,13 +1935,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (const auto *RtnAddrOp =
TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
RegInterval RtnAddrOpInterval =
- ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp);
+ ScoreBrackets.getRegInterval(&MI, *RtnAddrOp);
ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
Wait);
}
}
- } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
+ } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
ScoreBrackets.tryClearSCCWriteEvent(&MI);
} else {
// FIXME: Should not be relying on memoperands.
@@ -2022,7 +1999,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
continue;
- RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);
+ RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op);
const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
if (IsVGPR) {
@@ -2061,7 +2038,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
}
- if (hasXcnt() && Op.isDef())
+ if (ST->hasWaitXCnt() && Op.isDef())
ScoreBrackets.determineWait(X_CNT, Interval, Wait);
}
}
@@ -2079,18 +2056,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
//
// In all other cases, ensure safety by ensuring that there are no outstanding
// memory operations.
- if (MI.getOpcode() == AMDGPU::S_BARRIER &&
- !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
+ if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
+ !ST->supportsBackOffBarrier()) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
- if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
- if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- Wait.DsCnt = 0;
- }
+ if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
+ ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+ Wait.DsCnt = 0;
}
// Verify that the wait is actually needed.
@@ -2165,19 +2141,19 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}
// XCnt may be already consumed by a load wait.
- if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
- !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
- Wait.XCnt = ~0u;
+ if (Wait.XCnt != ~0u) {
+ if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+ Wait.XCnt = ~0u;
- if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
- !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
- Wait.XCnt = ~0u;
+ if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+ Wait.XCnt = ~0u;
- // Since the translation for VMEM addresses occur in-order, we can skip the
- // XCnt if the current instruction is of VMEM type and has a memory dependency
- // with another VMEM instruction in flight.
- if (Wait.XCnt != ~0u && isVmemAccess(*It))
- Wait.XCnt = ~0u;
+ // Since the translation for VMEM addresses occur in-order, we can skip the
+ // XCnt if the current instruction is of VMEM type and has a memory
+ // dependency with another VMEM instruction in flight.
+ if (isVmemAccess(*It))
+ Wait.XCnt = ~0u;
+ }
if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
@@ -2185,75 +2161,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
return Modified;
}
-// This is a flat memory operation. Check to see if it has memory tokens other
-// than LDS. Other address spaces supported by flat memory operations involve
-// global memory.
-bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // All flat instructions use the VMEM counter except prefetch.
- if (!TII->usesVM_CNT(MI))
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access VMEM.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves VMEM.
- // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
- // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
- // (GDS) address space is not supported by flat operations. Therefore, simply
- // return true unless only the LDS address space is found.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- assert(AS != AMDGPUAS::REGION_ADDRESS);
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- return true;
- }
-
- return false;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either LDS or FLAT.
-bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
- if (!TII->usesLGKM_CNT(MI))
- return false;
-
- // If in tgsplit mode then there can be no use of LDS.
- if (ST->isTgSplitEnabled())
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access LDS.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves LDS.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
- return true;
- }
-
- return false;
-}
-
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
- return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+ return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
}
-static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
- auto Opc = Inst.getOpcode();
- return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
- Opc == AMDGPU::GLOBAL_WBINV;
-}
-
// Return true if the next instruction is S_ENDPGM, following fallthrough
// blocks if necessary.
bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -2317,6 +2229,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
+ // For architectures with X_CNT, mark the source address operands
+ // with the appropriate counter values.
// TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
bool IsVMEMAccess = false;
@@ -2324,16 +2238,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
if (TII->isAlwaysGDS(Inst.getOpcode()) ||
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
- ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+ ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
ScoreBrackets->setPendingGDS();
} else {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
}
} else if (TII->isFLAT(Inst)) {
- if (isGFX12CacheInvOrWBInst(Inst)) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
- Inst);
+ if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
+ ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
return;
}
@@ -2341,16 +2254,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
int FlatASCount = 0;
- if (mayAccessVMEMThroughFlat(Inst)) {
+ if (TII->mayAccessVMEMThroughFlat(Inst)) {
++FlatASCount;
IsVMEMAccess = true;
- ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
- Inst);
+ ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
}
- if (mayAccessLDSThroughFlat(Inst)) {
+ if (TII->mayAccessLDSThroughFlat(Inst)) {
++FlatASCount;
- ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
}
// This is a flat memory operation that access both VMEM and LDS, so note it
@@ -2361,16 +2273,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (SIInstrInfo::isVMEM(Inst) &&
!llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
IsVMEMAccess = true;
- ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
- Inst);
+ ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
+ ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
}
} else if (TII->isSMRD(Inst)) {
IsSMEMAccess = true;
- ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
} else if (Inst.isCall()) {
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
@@ -2382,45 +2293,45 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
}
} else if (SIInstrInfo::isLDSDIR(Inst)) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
} else if (TII->isVINTERP(Inst)) {
int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
} else if (SIInstrInfo::isEXP(Inst)) {
unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
else
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
- } else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
+ ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
+ } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
+ ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
case AMDGPU::S_SENDMSG_RTN_B32:
case AMDGPU::S_SENDMSG_RTN_B64:
case AMDGPU::S_SENDMSGHALT:
- ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
+ ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
break;
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
case AMDGPU::S_GET_BARRIER_STATE_M0:
case AMDGPU::S_GET_BARRIER_STATE_IMM:
- ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
break;
}
}
- if (!hasXcnt())
+ if (!ST->hasWaitXCnt())
return;
if (IsVMEMAccess)
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
+ ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
if (IsSMEMAccess)
- ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
+ ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
}
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
@@ -2478,9 +2389,8 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
if (!OldEventsHasSCCWrite) {
PendingSCCWrite = Other.PendingSCCWrite;
- } else {
- if (PendingSCCWrite != Other.PendingSCCWrite)
- PendingSCCWrite = nullptr;
+ } else if (PendingSCCWrite != Other.PendingSCCWrite) {
+ PendingSCCWrite = nullptr;
}
}
}
@@ -2516,12 +2426,6 @@ static bool isWaitInstr(MachineInstr &Inst) {
counterTypeForInstr(Opcode).has_value();
}
-bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
- return Opcode == AMDGPU::S_BARRIER_LEAVE ||
- Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
- Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
-}
-
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
MachineBasicBlock &Block,
@@ -2578,7 +2482,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
- bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+ bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
// Don't examine operands unless we need to track vccz correctness.
if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
@@ -2701,7 +2605,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush(
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
if (SIInstrInfo::isFLAT(MI))
- return mayAccessVMEMThroughFlat(MI);
+ return TII->mayAccessVMEMThroughFlat(MI);
return SIInstrInfo::isVMEM(MI);
}
@@ -2724,15 +2628,14 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
for (MachineBasicBlock *MBB : ML->blocks()) {
for (MachineInstr &MI : *MBB) {
if (isVMEMOrFlatVMEM(MI)) {
- if (MI.mayLoad())
- HasVMemLoad = true;
- if (MI.mayStore())
- HasVMemStore = true;
+ HasVMemLoad |= MI.mayLoad();
+ HasVMemStore |= MI.mayStore();
}
+
for (const MachineOperand &Op : MI.all_uses()) {
if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
- RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
+ RegInterval Interval = Brackets.getRegInterval(&MI, Op);
// Vgpr use
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
// If we find a register that is loaded inside the loop, 1. and 2.
@@ -2757,7 +2660,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
// VMem load vgpr def
if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
for (const MachineOperand &Op : MI.all_defs()) {
- RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
+ RegInterval Interval = Brackets.getRegInterval(&MI, Op);
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
// If we find a register that is loaded inside the loop, 1. and 2.
// are invalidated and we can exit.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 044ea86..56435a5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
});
}
+/// Determine whether the FLAT instruction \p MI may access VMEM.
+///
+/// \p MI must satisfy isFLAT (asserted). The answer is conservative: an
+/// instruction that uses the VMEM counter but carries no memory operands is
+/// reported as possibly accessing VMEM.
+bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
+  assert(isFLAT(MI));
+
+  // Every flat instruction except prefetch uses the VMEM counter.
+  if (!usesVM_CNT(MI))
+    return false;
+
+  // With no memory operands there is nothing to inspect, so conservatively
+  // report that VMEM may be accessed.
+  if (MI.memoperands_empty())
+    return true;
+
+  // Flat operations only support the FLAT and LOCAL (LDS) address spaces and
+  // those involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc.
+  // The REGION (GDS) address space is not supported by flat operations.
+  // Hence VMEM may be accessed as soon as one memory operand names an address
+  // space other than LDS.
+  return llvm::any_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+    const unsigned AddrSpace = MMO->getAddrSpace();
+    assert(AddrSpace != AMDGPUAS::REGION_ADDRESS);
+    return AddrSpace != AMDGPUAS::LOCAL_ADDRESS;
+  });
+}
+
+/// Determine whether the FLAT instruction \p MI may access LDS.
+///
+/// \p MI must satisfy isFLAT (asserted). The answer is conservative: an
+/// instruction that uses the lgkm counter outside tgsplit mode but carries no
+/// memory operands is reported as possibly accessing LDS.
+bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+  assert(isFLAT(MI));
+
+  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter,
+  // and in tgsplit mode there can be no use of LDS.
+  if (!usesLGKM_CNT(MI) || ST.isTgSplitEnabled())
+    return false;
+
+  // With no memory operands there is nothing to inspect, so conservatively
+  // report that LDS may be accessed.
+  if (MI.memoperands_empty())
+    return true;
+
+  // LDS is involved when some memory operand names the LOCAL or the FLAT
+  // address space.
+  return llvm::any_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+    const unsigned AddrSpace = MMO->getAddrSpace();
+    return AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+           AddrSpace == AMDGPUAS::FLAT_ADDRESS;
+  });
+}
+
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
// Skip the full operand and register alias search modifiesRegister
// does. There's only a handful of instructions that touch this, it's only an
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c2252af..a21089f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -688,6 +688,12 @@ public:
/// to not hit scratch.
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+ /// \returns true if the FLAT instruction \p MI may access VMEM. \p MI must
+ /// be a FLAT instruction (asserted). The result is conservative: when \p MI
+ /// uses the VMEM counter but has no memory operands, true is returned;
+ /// otherwise true is returned if any memory operand names an address space
+ /// other than LOCAL (LDS).
+ bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
+
+ /// \returns true if the FLAT instruction \p MI may access LDS. \p MI must
+ /// be a FLAT instruction (asserted). Always false when \p MI does not use
+ /// the lgkm counter or when tgsplit mode is enabled. Otherwise conservative:
+ /// true when \p MI has no memory operands, or when any memory operand names
+ /// the LOCAL or FLAT address space.
+ bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+
static bool isBlockLoadStore(uint16_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
@@ -748,6 +754,18 @@ public:
return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
}
+  /// \returns true if \p Opcode is S_BARRIER_LEAVE or one of the
+  /// S_BARRIER_SIGNAL_ISFIRST variants (IMM or M0).
+  static bool isSBarrierSCCWrite(unsigned Opcode) {
+    switch (Opcode) {
+    case AMDGPU::S_BARRIER_LEAVE:
+    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  /// \returns true if \p MI is S_CBRANCH_VCCNZ or S_CBRANCH_VCCZ and its
+  /// operand 1 is not marked undef.
+  static bool isCBranchVCCZRead(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::S_CBRANCH_VCCNZ:
+    case AMDGPU::S_CBRANCH_VCCZ:
+      // NOTE(review): operand 1 is presumably the VCC use of the branch;
+      // confirm against the instruction definition.
+      return !MI.getOperand(1).isUndef();
+    default:
+      return false;
+    }
+  }
+
static bool isWQM(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
@@ -1010,6 +1028,11 @@ public:
Opcode == AMDGPU::DS_GWS_BARRIER;
}
+  /// \returns true if \p Opc is one of GLOBAL_INV, GLOBAL_WB or GLOBAL_WBINV.
+  static bool isGFX12CacheInvOrWBInst(unsigned Opc) {
+    switch (Opc) {
+    case AMDGPU::GLOBAL_INV:
+    case AMDGPU::GLOBAL_WB:
+    case AMDGPU::GLOBAL_WBINV:
+      return true;
+    default:
+      return false;
+    }
+  }
+
static bool isF16PseudoScalarTrans(unsigned Opcode) {
return Opcode == AMDGPU::V_S_EXP_F16_e64 ||
Opcode == AMDGPU::V_S_LOG_F16_e64 ||
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 8f1dd62..5630580 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1163,6 +1163,22 @@ def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
let HasSGPR = 1;
let Size = 64;
}
+
+// VS_128: register class whose members are everything in VReg_128 plus
+// everything in SReg_128, using VReg_128's value types.
+def VS_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+ (add VReg_128, SReg_128)> {
+ // Marked non-allocatable (presumably only used as an operand constraint,
+ // not handed to the register allocator -- confirm).
+ let isAllocatable = 0;
+ // Members come from both a VReg_* and an SReg_* class, so both flags are set.
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ // Presumably the register width in bits -- confirm against SIRegisterClass.
+ let Size = 128;
+}
+
+// VS_128_Align2: same shape as a VReg_128/SReg_128 union, but the vector
+// members are taken from VReg_128_Align2 instead of VReg_128. Value types
+// still come from VReg_128.
+def VS_128_Align2 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+ (add VReg_128_Align2, SReg_128)> {
+ // Marked non-allocatable (presumably only used as an operand constraint,
+ // not handed to the register allocator -- confirm).
+ let isAllocatable = 0;
+ // Members come from both a VReg_* and an SReg_* class, so both flags are set.
+ let HasVGPR = 1;
+ let HasSGPR = 1;
+ // Presumably the register width in bits -- confirm against SIRegisterClass.
+ let Size = 128;
+}
} // End GeneratePressureSet = 0
// Define a register tuple class, along with one requiring an even
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index b3fd8c7..84287b6 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -352,10 +352,12 @@ def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">;
} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
+let Defs = [SCC] in {
def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32",
[(set i32:$sdst, (int_amdgcn_s_quadmask i32:$src0))]>;
def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64",
[(set i64:$sdst, (int_amdgcn_s_quadmask i64:$src0))]>;
+}
let Uses = [M0] in {
def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 77df721..54f57e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns
defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>;
}
let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in {
- defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16,
- VOPProfile_CVT_F32_BF16_gfx1250_t16,
- VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
+ let True16Predicate = UseRealTrue16Insts in
+ defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>;
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
}
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
@@ -899,6 +900,7 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p =
let DecoderNamespace = Gen.DecoderNamespace;
let OtherPredicates = !listconcat(ps.OtherPredicates,
!if(p.HasExt64BitDPP, [HasDPALU_DPP], []));
+ let True16Predicate = ps.True16Predicate;
}
class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
@@ -921,6 +923,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf
VOP1_DPP8<op, ps, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let DecoderNamespace = Gen.DecoderNamespace;
+ let True16Predicate = ps.True16Predicate;
}
//===----------------------------------------------------------------------===//
@@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
-defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">;
defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;