Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp              |   4
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp      |   2
-rw-r--r-- | llvm/lib/Target/AMDGPU/DSInstructions.td                   |   6
-rw-r--r-- | llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp |   1
-rw-r--r-- | llvm/lib/Target/AMDGPU/FLATInstructions.td                 |   6
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp                | 180
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.h                  |  19
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                     |  61
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h                       |   9
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp               |  47
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h              |   3
11 files changed, 284 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a6..8ed4062 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, - Legal); + setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, + MVT::i32, Legal); setOperationAction( {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index bfe2c80..a67b12a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -901,6 +901,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index d0ad120..b841171 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1488,6 +1488,12 @@ let AssemblerPredicate = isGFX12Plus in { def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>; def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>; +// Additional aliases for ds load transpose instructions. +def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>; + //===----------------------------------------------------------------------===// // GFX11. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e0375ea..e3f3aba 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -892,6 +892,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // have EXEC as implicit destination. Issue a warning if encoding for // vdst is not EXEC. 
if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) && + MCII->get(MI.getOpcode()).getNumDefs() == 0 && MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) { auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO); if (Bytes_[0] != ExecEncoding) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 6de59be..8ea64d1 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -3711,6 +3711,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +// Additional aliases for global load transpose instructions. +def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>; + defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 58482ea..9fbf9e5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -69,6 +69,12 @@ static cl::opt<bool> GCNTrackers( cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false)); +static cl::opt<unsigned> PendingQueueLimit( + "amdgpu-scheduler-pending-queue-limit", cl::Hidden, + cl::desc( + "Max (Available+Pending) size to inspect pending queue (0 disables)"), + cl::init(256)); + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) #define DUMP_MAX_REG_PRESSURE static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler( @@ -335,17 +341,52 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, } } +static bool shouldCheckPending(SchedBoundary &Zone, + const TargetSchedModel *SchedModel) { + bool HasBufferedModel = + SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize(); + unsigned Combined = Zone.Available.size() + Zone.Pending.size(); + return Combined <= PendingQueueLimit && HasBufferedModel; +} + +static SUnit *pickOnlyChoice(SchedBoundary &Zone, + const TargetSchedModel *SchedModel) { + // pickOnlyChoice() releases pending instructions and checks for new hazards. 
+ SUnit *OnlyChoice = Zone.pickOnlyChoice(); + if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty()) + return OnlyChoice; + + return nullptr; +} + +void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current, + const SchedCandidate &Preferred) { + LLVM_DEBUG({ + dbgs() << "Prefer:\t\t"; + DAG->dumpNode(*Preferred.SU); + + if (Current.SU) { + dbgs() << "Not:\t"; + DAG->dumpNode(*Current.SU); + } + + dbgs() << "Reason:\t\t"; + traceCandidate(Preferred); + }); +} + // This function is mostly cut and pasted from // GenericScheduler::pickNodeFromQueue() void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand, + SchedCandidate &Cand, bool &IsPending, bool IsBottomUp) { const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; + IsPending = false; if (DAG->isTrackingPressure()) { if (!GCNTrackers) { SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; @@ -358,8 +399,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, VGPRPressure = T->getPressure().getArchVGPRNum(); } } - ReadyQueue &Q = Zone.Available; - for (SUnit *SU : Q) { + LLVM_DEBUG(dbgs() << "Available Q:\n"); + ReadyQueue &AQ = Zone.Available; + for (SUnit *SU : AQ) { SchedCandidate TryCand(ZonePolicy); initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, @@ -371,27 +413,55 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, // Initialize resource delta if needed in case future heuristics query it. if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); + LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); Cand.setBest(TryCand); - LLVM_DEBUG(traceCandidate(Cand)); + } else { + printCandidateDecision(TryCand, Cand); + } + } + + if (!shouldCheckPending(Zone, SchedModel)) + return; + + LLVM_DEBUG(dbgs() << "Pending Q:\n"); + ReadyQueue &PQ = Zone.Pending; + for (SUnit *SU : PQ) { + + SchedCandidate TryCand(ZonePolicy); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, + VGPRPressure, IsBottomUp); + // Pass SchedBoundary only when comparing nodes from the same boundary. + SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; + tryPendingCandidate(Cand, TryCand, ZoneArg); + if (TryCand.Reason != NoCand) { + // Initialize resource delta if needed in case future heuristics query it. + if (TryCand.ResDelta == SchedResourceDelta()) + TryCand.initResourceDelta(Zone.DAG, SchedModel); + LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); + IsPending = true; + Cand.setBest(TryCand); + } else { + printCandidateDecision(TryCand, Cand); } } } // This function is mostly cut and pasted from // GenericScheduler::pickNodeBidirectional() -SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { +SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode, + bool &PickedPending) { // Schedule as far as possible in the direction of no choice. This is most // efficient, but also provides the best heuristics for CriticalPSets. 
- if (SUnit *SU = Bot.pickOnlyChoice()) { + if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) { IsTopNode = false; return SU; } - if (SUnit *SU = Top.pickOnlyChoice()) { + if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) { IsTopNode = true; return SU; } - // Set the bottom-up policy based on the state of the current bottom zone and - // the instructions outside the zone, including the top zone. + // Set the bottom-up policy based on the state of the current bottom zone + // and the instructions outside the zone, including the top zone. CandPolicy BotPolicy; setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); // Set the top-down policy based on the state of the current top zone and @@ -399,12 +469,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { CandPolicy TopPolicy; setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + bool BotPending = false; // See if BotCand is still valid (because we previously scheduled from Top). LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); if (!BotCand.isValid() || BotCand.SU->isScheduled || BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, + BotPending, /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { @@ -414,6 +486,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { SchedCandidate TCand; TCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, + BotPending, /*IsBottomUp=*/true); assert(TCand.SU == BotCand.SU && "Last pick result should correspond to re-picking right now"); @@ -421,12 +494,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { #endif } + bool TopPending = false; // Check if the top Q has a better candidate. LLVM_DEBUG(dbgs() << "Picking from Top:\n"); if (!TopCand.isValid() || TopCand.SU->isScheduled || TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, + TopPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { @@ -436,6 +511,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { SchedCandidate TCand; TCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, + TopPending, /*IsBottomUp=*/false); assert(TCand.SU == TopCand.SU && "Last pick result should correspond to re-picking right now"); @@ -446,12 +522,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { // Pick best from BotCand and TopCand. LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); - SchedCandidate Cand = BotCand; - TopCand.Reason = NoCand; - tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { - Cand.setBest(TopCand); + SchedCandidate Cand = BotPending ? TopCand : BotCand; + SchedCandidate TryCand = BotPending ? 
BotCand : TopCand; + PickedPending = BotPending && TopPending; + + TryCand.Reason = NoCand; + if (BotPending || TopPending) { + PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr); + } else { + tryCandidate(Cand, TryCand, nullptr); + } + + if (TryCand.Reason != NoCand) { + Cand.setBest(TryCand); } + LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); IsTopNode = Cand.AtTop; @@ -466,35 +551,55 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); return nullptr; } + bool PickedPending; SUnit *SU; do { + PickedPending = false; if (RegionPolicy.OnlyTopDown) { - SU = Top.pickOnlyChoice(); + SU = pickOnlyChoice(Top, SchedModel); if (!SU) { CandPolicy NoPolicy; TopCand.reset(NoPolicy); pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, + PickedPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find a candidate"); SU = TopCand.SU; } IsTopNode = true; } else if (RegionPolicy.OnlyBottomUp) { - SU = Bot.pickOnlyChoice(); + SU = pickOnlyChoice(Bot, SchedModel); if (!SU) { CandPolicy NoPolicy; BotCand.reset(NoPolicy); pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, + PickedPending, /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find a candidate"); SU = BotCand.SU; } IsTopNode = false; } else { - SU = pickNodeBidirectional(IsTopNode); + SU = pickNodeBidirectional(IsTopNode, PickedPending); } } while (SU->isScheduled); + if (PickedPending) { + unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle; + SchedBoundary &Zone = IsTopNode ? Top : Bot; + unsigned CurrentCycle = Zone.getCurrCycle(); + if (ReadyCycle > CurrentCycle) + Zone.bumpCycle(ReadyCycle); + + // FIXME: checkHazard() doesn't give information about which cycle the + // hazard will resolve so just keep bumping the cycle by 1. This could be + // made more efficient if checkHazard() returned more details. + while (Zone.checkHazard(SU)) + Zone.bumpCycle(Zone.getCurrCycle() + 1); + + Zone.releasePending(); + } + if (SU->isTopReady()) Top.removeReady(SU); if (SU->isBottomReady()) @@ -540,6 +645,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const { return *std::next(CurrentStage); } +bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return TryCand.Reason != NoCand; + + // Avoid exceeding the target's limit. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + // Avoid increasing the max critical pressure in the scheduled region. 
+ if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return TryCand.Reason != NoCand; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, TryCand, Cand, + ResourceDemand)) + return TryCand.Reason != NoCand; + } + + return false; +} + GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 8ea4267..975781f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -44,17 +44,32 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID); /// heuristics to determine excess/critical pressure sets. class GCNSchedStrategy : public GenericScheduler { protected: - SUnit *pickNodeBidirectional(bool &IsTopNode); + SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending); void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand, bool IsBottomUp); + SchedCandidate &Cand, bool &IsPending, + bool IsBottomUp); void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp); + /// Evaluates instructions in the pending queue using a subset of scheduling + /// heuristics. + /// + /// Instructions that cannot be issued due to hardware constraints are placed + /// in the pending queue rather than the available queue, making them normally + /// invisible to scheduling heuristics. However, in certain scenarios (such as + /// avoiding register spilling), it may be beneficial to consider scheduling + /// these not-yet-ready instructions. 
+ bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; + + void printCandidateDecision(const SchedCandidate &Current, + const SchedCandidate &Preferred); + std::vector<unsigned> Pressure; std::vector<unsigned> MaxPressure; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d516330..50447f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9072,6 +9072,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, MachineOperand &Src1 = Inst.getOperand(2); const DebugLoc &DL = Inst.getDebugLoc(); + if (ST.useRealTrue16Insts()) { + Register SrcReg0, SrcReg1; + if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) { + SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0); + } else { + SrcReg0 = Src0.getReg(); + } + + if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) { + SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1); + } else { + SrcReg1 = Src1.getReg(); + } + + bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass); + bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass); + + auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg); + switch (Inst.getOpcode()) { + case AMDGPU::S_PACK_LL_B32_B16: + NewMI + .addReg(SrcReg0, 0, + isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, 0, + isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_LH_B32_B16: + NewMI + .addReg(SrcReg0, 0, + isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, 0, AMDGPU::hi16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_HL_B32_B16: + NewMI.addReg(SrcReg0, 0, AMDGPU::hi16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, 0, + isSrc1Reg16 ? 
AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_HH_B32_B16: + NewMI.addReg(SrcReg0, 0, AMDGPU::hi16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, 0, AMDGPU::hi16) + .addImm(AMDGPU::hi16); + break; + default: + llvm_unreachable("unhandled s_pack_* instruction"); + } + + MachineOperand &Dest = Inst.getOperand(0); + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + return; + } + switch (Inst.getOpcode()) { case AMDGPU::S_PACK_LL_B32_B16: { Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e979eeb..df27ec1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -879,6 +879,11 @@ public: MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; } + bool isMFMA(uint16_t Opcode) const { + return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64; + } + static bool isDOT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; } @@ -895,6 +900,10 @@ public: return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI); } + bool isMFMAorWMMA(uint16_t Opcode) const { + return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode); + } + static bool isSWMMAC(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC; } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 01a40c1..7431e11 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -47,9 +47,6 @@ private: const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - // Check if the machine instruction being processed is a supported packed - // instruction. - bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. void collectUnpackingCandidates(MachineInstr &BeginMI, @@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return true; } -// If support is extended to new operations, add tests in -// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. 
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { - if (!TII->isNeverCoissue(MI)) - return false; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("Fully covered switch"); -} - bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); @@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates( for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; + uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr); + bool IsUnpackable = + !(UnpackedOpCode == std::numeric_limits<uint16_t>::max()); if (Instr.isMetaInstruction()) continue; if ((Instr.isTerminator()) || - (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) || + (TII->isNeverCoissue(Instr) && !IsUnpackable) || (SIInstrInfo::modifiesModeRegister(Instr) && Instr.modifiesRegister(AMDGPU::EXEC, TRI))) return; @@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) return; } - if (!isUnpackingSupportedInstr(Instr)) + if (!IsUnpackable) continue; if (canUnpackingClobberRegister(Instr)) @@ -687,8 +670,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, bool IsHiBits) { MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0); - const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1); + const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0); + const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1); Register DstReg = I.getOperand(0).getReg(); unsigned OpCode = I.getOpcode(); Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1) @@ -702,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1); - addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2); + addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0); + addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1); if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - const MachineOperand *SrcMO3 = + const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src2); unsigned Src2Mods = TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm(); - addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3); + addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2); } NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 @@ -787,9 +770,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // TODO: Fold this into previous block, if possible. Evaluate and handle any // side effects. + + // Perform the extra MF scans only for supported archs + if (!ST.hasGFX940Insts()) + return Changed; for (MachineBasicBlock &MBB : MF) { - // Unpack packed instructions overlapped by MFMAs. This allows the compiler - // to co-issue unpacked instructions with MFMA + // Unpack packed instructions overlapped by MFMAs. 
This allows the + // compiler to co-issue unpacked instructions with MFMA auto SchedModel = TII->getSchedModel(); SetVector<MachineInstr *> InstrsToUnpack; for (auto &MI : make_early_inc_range(MBB.instrs())) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a01a5fd..5e3195b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1697,9 +1697,6 @@ LLVM_READNONE bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE -bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi); - -LLVM_READNONE bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi); LLVM_READNONE |