Diffstat (limited to 'llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 185
1 file changed, 181 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bbed828..94886b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -520,8 +520,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                               const MachineInstr *MI, IsExpiredFn IsExpired) {
   DenseSet<const MachineBasicBlock *> Visited;
   return getWaitStatesSince(IsHazard, MI->getParent(),
-                            std::next(MI->getReverseIterator()),
-                            0, IsExpired, Visited);
+                            std::next(MI->getReverseIterator()), 0, IsExpired,
+                            Visited, SIInstrInfo::getNumWaitStates);
 }
 
 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
@@ -1190,7 +1190,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixVALUPartialForwardingHazard(MI);
   fixVALUTransUseHazard(MI);
   fixVALUTransCoexecutionHazards(MI);
-  fixWMMAHazards(MI);
+  fixWMMAHazards(MI); // fall-through if co-execution is enabled.
+  fixWMMACoexecutionHazards(MI);
   fixShift64HighRegBug(MI);
   fixVALUMaskWriteHazard(MI);
   fixRequiredExportPriority(MI);
@@ -1909,6 +1910,182 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
   return true;
 }
 
+static bool isCoexecutableVALUInst(const MachineInstr &MI) {
+  return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
+         !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
+}
+
+static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
+                                       const SIInstrInfo *TII, unsigned Latency,
+                                       unsigned Category) {
+  assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
+         "Handle me if the xdl wmma instruction latency changes");
+
+  switch (Category) {
+  case 0: // Dense WMMA Instructions:
+          //   WMMA_*F16, WMMA_*BF16
+          //   WMMA_*FP8FP8
+          //   WMMA_*FP8BF8
+          //   WMMA_*BF8FP8
+          //   WMMA_*BF8BF8
+          //   WMMA_*F8F6F4 if SRCA & SRCB != F8
+    return Latency == 8 && SIInstrInfo::isWMMA(MI);
+
+  case 1: // Dense WMMA Instructions:
+          //   WMMA_IU8
+          //   WMMA_IU4
+          //   WMMA_*F8F6F4 if SRCA OR SRCB == F8
+    return Latency == 16 && SIInstrInfo::isWMMA(MI);
+
+  case 2: // Sparse SWMMAC Instructions:
+          //   SWMMAC_*F16, SWMMAC_*BF16,
+          //   SWMMAC_*FP8FP8
+          //   SWMMAC_*BF8FP8
+          //   SWMMAC_*FP8BF8
+          //   SWMMAC_*BF8BF8
+    return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
+
+  case 3: // Sparse SWMMAC Instructions:
+          //   SWMMAC_IU8
+          //   SWMMAC_IU4
+    return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
+  default:
+    break;
+  } // end switch.
+
+  return false;
+}
+
+bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+  if (!AMDGPU::isGFX1250(ST))
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  // WaitStates here is the number of V_NOPs or unrelated VALU instructions
+  // that must be in between the first WMMA and the second instruction to cover
+  // the hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if
+  // the second is a VALU). Refer to SPG 4.6.12.1 "Requirements for WMMA data
+  // hazards" for the numbers, which depend on the category of the first WMMA.
+  const int WMMAWaitStates[] = {5, 9, 3, 5};
+  const int VALUWaitStates[] = {4, 8, 2, 4};
+  unsigned Category = 0;
+
+  auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+    if (!TII->isXDLWMMA(I))
+      return false;
+
+    unsigned Latency = TSchedModel.computeInstrLatency(&I);
+    if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+      return false;
+
+    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+    Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
+    Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
+
+    // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
+    if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
+      return true;
+
+    if (SIInstrInfo::isSWMMAC(*MI)) {
+      Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+      if (TRI->regsOverlap(D0, Idx1))
+        return true;
+    }
+
+    return false;
+  };
+
+  auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
+    if (!TII->isXDLWMMA(I))
+      return false;
+
+    unsigned Latency = TSchedModel.computeInstrLatency(&I);
+    if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
+      return false;
+
+    // WMMA writes, VALU reads.
+    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
+    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
+      if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
+        return true;
+    }
+
+    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
+    if (!ValuDst || !ValuDst->isReg())
+      return false;
+    Register D1 = ValuDst->getReg();
+
+    // WMMA writes, VALU writes.
+    if (TRI->regsOverlap(D0, D1))
+      return true;
+
+    // WMMA reads, VALU writes.
+    Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
+    Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
+    if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
+      return true;
+
+    if (SIInstrInfo::isSWMMAC(I)) {
+      Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
+      if (TRI->regsOverlap(D1, Idx0))
+        return true;
+    }
+
+    return false;
+  };
+
+  int Limit = 0;
+  auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
+    return WaitStates >= Limit;
+  };
+
+  auto GetWaitStatesFn = [](const MachineInstr &I) {
+    return SIInstrInfo::isVALU(I) ? 1 : 0;
+  };
+
+  int WaitStatesNeeded = -1;
+  if (TII->isXDLWMMA(*MI)) {
+    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+      Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
+      DenseSet<const MachineBasicBlock *> Visited;
+      // '::getWaitStatesSince' returns the number of VALUs in between if a
+      // hazard exists, and INT_MAX if there is no hazard. As a result, a
+      // negative WaitStatesNeeded here means no hazard, and we will continue
+      // to search for other categories.
+      WaitStatesNeeded =
+          Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
+                                       std::next(MI->getReverseIterator()), 0,
+                                       IsExpiredFn, Visited, GetWaitStatesFn);
+    }
+  } else { // Must be a co-executable VALU.
+    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
+      Limit = VALUWaitStates[Category]; // for IsExpiredFn.
+      DenseSet<const MachineBasicBlock *> Visited;
+      // '::getWaitStatesSince' returns the number of VALUs in between if a
+      // hazard exists, and INT_MAX if there is no hazard. As a result, a
+      // negative WaitStatesNeeded here means no hazard, and we will continue
+      // to search for other categories.
+      WaitStatesNeeded =
+          Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
+                                       std::next(MI->getReverseIterator()), 0,
+                                       IsExpiredFn, Visited, GetWaitStatesFn);
+    }
+  }
+
+  // WaitStatesNeeded now is the number of V_NOPs we need to insert; a
+  // negative value means none are needed.
+  for (int i = 0; i < WaitStatesNeeded; i++)
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+            TII->get(AMDGPU::V_NOP_e32));
+
+  return true;
+}
+
 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
   if (!ST.hasShift64HighRegBug())
     return false;
@@ -3206,7 +3383,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
   // Check entry priority at each export (as there will only be a few).
   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
   bool Changed = false;
-  if (CC != CallingConv::AMDGPU_Gfx)
+  if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
 
   auto NextMI = std::next(It);
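To make the wait-state arithmetic concrete, here is a hypothetical before/after sequence; it is a sketch with invented register assignments, not taken from the patch or its tests, and v_wmma_f32_16x16x16_f16 merely stands in as a representative Category-0 dense F16 WMMA. The co-executed v_add_f32 reads the WMMA result, VALUWaitStates[0] is 4, and one unrelated VALU already separates the two instructions, so ::getWaitStatesSince returns 1 and the pass inserts 4 - 1 = 3 V_NOPs:

    ; before: v_add_f32 reads v[0:7] while the WMMA may still be writing it
    v_wmma_f32_16x16x16_f16 v[0:7], v[8:15], v[16:23], v[0:7]
    v_mov_b32 v24, v25          ; one unrelated co-executed VALU in between
    v_add_f32 v26, v0, v1       ; hazard: too close to the WMMA result

    ; after fixWMMACoexecutionHazards: 4 - 1 = 3 wait states of padding
    v_wmma_f32_16x16x16_f16 v[0:7], v[8:15], v[16:23], v[0:7]
    v_mov_b32 v24, v25
    v_nop
    v_nop
    v_nop
    v_add_f32 v26, v0, v1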