Diffstat (limited to 'llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp  185
1 file changed, 181 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bbed828..94886b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -520,8 +520,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
const MachineInstr *MI, IsExpiredFn IsExpired) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
- std::next(MI->getReverseIterator()),
- 0, IsExpired, Visited);
+ std::next(MI->getReverseIterator()), 0, IsExpired,
+ Visited, SIInstrInfo::getNumWaitStates);
}
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
@@ -1190,7 +1190,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
- fixWMMAHazards(MI);
+ fixWMMAHazards(MI); // fall-through if co-execution is enabled.
+ fixWMMACoexecutionHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -1909,6 +1910,182 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
return true;
}
+static bool isCoexecutableVALUInst(const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
+ !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
+}
+
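+// Classify an XDL WMMA/SWMMAC instruction into one of the four hazard
+// categories used to index the wait-state tables in
+// fixWMMACoexecutionHazards() below.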
+static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
+ const SIInstrInfo *TII, unsigned Latency,
+ unsigned Category) {
+ assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
+ "Handle me if the xdl wmma instruction latency changes");
+
+ switch (Category) {
+ case 0: // Dense WMMA Instructions:
+ // WMMA_*F16, WMMA_*BF16
+ // WMMA_*FP8FP8
+ // WMMA_*FP8BF8
+ // WMMA_*BF8FP8
+ // WMMA_*BF8BF8
+ // WMMA_*F8F6F4 if SRCA & SRCB != F8
+ return Latency == 8 && SIInstrInfo::isWMMA(MI);
+
+ case 1: // Dense WMMA Instructions:
+ // WMMA_IU8
+ // WMMA_IU4
+ // WMMA_*F8F6F4 if SRCA OR SRCB == F8
+ return Latency == 16 && SIInstrInfo::isWMMA(MI);
+
+ case 2: // Dense SWMMAC Instructions
+ // SWMMAC_*F16, SWMMAC_*BF16,
+ // SWMMAC_*FP8FP8
+ // SWMMAC_*BF8FP8
+ // SWMMAC_*FP8BF8
+ // SWMMAC_*BF8BF8
+ return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
+
+ case 3: // Sparse WMMA Instructions:
+ // SWMMAC_IU8
+ // SWMMAC_IU4
+ return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
+ default:
+ break;
+ } // end switch.
+
+ return false;
+}
+
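+// When a WMMA can co-execute with a following WMMA or VALU instruction, a
+// register dependency between the two requires a number of intervening VALU
+// wait states; insert V_NOPs to provide them when the schedule does not
+// already do so.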
+bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+ if (!AMDGPU::isGFX1250(ST))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
+ // must be in between the first WMMA and the second instruction to cover the
+ // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
+ // second is a VALU). Refer to SPG 4.6.12.1 "Requirements for WMMA data
+ // hazards" for the numbers, which depend on the category of the first WMMA.
+ const int WMMAWaitStates[] = {5, 9, 3, 5};
+ const int VALUWaitStates[] = {4, 8, 2, 4};
+ unsigned Category = 0;
+
+ auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ if (!TII->isXDLWMMA(I))
+ return false;
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&I);
+ if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+ return false;
+
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
+ Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
+
+ // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
+ if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(*MI)) {
+ Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(D0, Idx1))
+ return true;
+ }
+
+ return false;
+ };
+
+ auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+ if (!TII->isXDLWMMA(I))
+ return false;
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&I);
+ if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+ return false;
+
+ // WMMA writes, VALU reads.
+ Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+ for (const MachineOperand &ValuUse : MI->explicit_uses()) {
+ if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
+ return true;
+ }
+
+ auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ if (!ValuDst || !ValuDst->isReg())
+ return false;
+ Register D1 = ValuDst->getReg();
+
+ // WMMA writes, VALU writes.
+ if (TRI->regsOverlap(D0, D1))
+ return true;
+
+ // WMMA reads, VALU writes.
+ Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
+ Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
+ if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
+ return true;
+
+ if (SIInstrInfo::isSWMMAC(I)) {
+ Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(D1, Idx0))
+ return true;
+ }
+
+ return false;
+ };
+
+ int Limit = 0;
+ auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
+ return WaitStates >= Limit;
+ };
+
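+ // Only VALU instructions count toward the wait-state distance; any other
+ // instruction between the two contributes zero wait states here.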
+ auto GetWaitStatesFn = [](const MachineInstr &I) {
+ return SIInstrInfo::isVALU(I) ? 1 : 0;
+ };
+
+ int WaitStatesNeeded = -1;
+ if (TII->isXDLWMMA(*MI)) {
+ for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+ Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
+ DenseSet<const MachineBasicBlock *> Visited;
+ // '::getWaitStatesSince' returns the number of VALUs in between if a hazard
+ // exists, and INT_MAX if there is no hazard. As a result, a negative
+ // WaitStatesNeeded here means no hazard, and we will continue to search
+ // the other categories.
+ WaitStatesNeeded =
+ Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+ }
+ } else { // Must be a co-executable VALU.
+ for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+ Limit = VALUWaitStates[Category]; // for IsExpiredFn.
+ DenseSet<const MachineBasicBlock *> Visited;
+ // '::getWaitStatesSince' returns the number of VALUs in between if a hazard
+ // exists, and INT_MAX if there is no hazard. As a result, a negative
+ // WaitStatesNeeded here means no hazard, and we will continue to search
+ // the other categories.
+ WaitStatesNeeded =
+ Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, GetWaitStatesFn);
+ }
+ }
+
+ // WaitStatesNeeded is now the number of V_NOPs we need to insert; a negative
+ // value means none are needed.
+ for (int i = 0; i < WaitStatesNeeded; i++)
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_NOP_e32));
+
+ return true;
+}
+
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (!ST.hasShift64HighRegBug())
return false;
@@ -3206,7 +3383,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
// Check entry priority at each export (as there will only be a few).
// Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
bool Changed = false;
- if (CC != CallingConv::AMDGPU_Gfx)
+ if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
auto NextMI = std::next(It);