diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 84 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp | 21 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 22 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 16 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/MIMGInstructions.td | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 84 |
11 files changed, 154 insertions, 93 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index aed325c..0c97741 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -224,13 +224,12 @@ bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { Register VCCReg = I.getOperand(1).getReg(); MachineInstr *Cmp; - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Set SCC as a side effect with S_CMP or S_OR. + if (STI.hasScalarCompareEq64()) { unsigned CmpOpc = STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); } else { - // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64 - // which sets SCC as a side effect. Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) .addReg(VCCReg) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 1e6589e..d7d0292 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding { static constexpr unsigned BitsPerField = 2; static constexpr unsigned NumFields = 4; static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; + static constexpr unsigned ModeWidth = NumFields * BitsPerField; + static constexpr unsigned ModeMask = (1 << ModeWidth) - 1; using ModeType = PackedVector<unsigned, BitsPerField, std::bitset<BitsPerField * NumFields>>; @@ -82,12 +84,12 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; + // Current basic block. + MachineBasicBlock *MBB; + /// Most recent s_set_* instruction. MachineInstr *MostRecentModeSet; - /// Whether the current mode is known. - bool CurrentModeKnown; - /// Current mode bits. ModeTy CurrentMode; @@ -108,10 +110,13 @@ private: MachineInstr *Clause; /// Insert mode change before \p I. \returns true if mode was changed. - bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + bool setMode(ModeTy NewMode, ModeTy Mask, + MachineBasicBlock::instr_iterator I); /// Reset mode to default. - void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + void resetMode(MachineBasicBlock::instr_iterator I) { + setMode(ModeTy(), ModeTy::fullMask(), I); + } /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. std::optional<unsigned> getMSBs(const MachineOperand &MO) const; @@ -130,38 +135,43 @@ private: /// Check if an instruction \p I is within a clause and returns a suitable /// iterator to insert mode change. It may also modify the S_CLAUSE /// instruction to extend it or drop the clause if it cannot be adjusted. - MachineInstr *handleClause(MachineInstr *I); + MachineBasicBlock::instr_iterator + handleClause(MachineBasicBlock::instr_iterator I); }; bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, - MachineInstr *I) { + MachineBasicBlock::instr_iterator I) { assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); - if (CurrentModeKnown) { - auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); - if ((Delta & Mask.raw_bits()).none()) { - CurrentMask |= Mask; - return false; - } + if ((Delta & Mask.raw_bits()).none()) { + CurrentMask |= Mask; + return false; + } - if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { - CurrentMode |= NewMode; - CurrentMask |= Mask; + if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { + CurrentMode |= NewMode; + CurrentMask |= Mask; - MostRecentModeSet->getOperand(0).setImm(CurrentMode); - return true; - } + MachineOperand &Op = MostRecentModeSet->getOperand(0); + + // Carry old mode bits from the existing instruction. + int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth); + + Op.setImm(CurrentMode | OldModeBits); + return true; } + // Record previous mode into high 8 bits of the immediate. + int64_t OldModeBits = CurrentMode << ModeWidth; + I = handleClause(I); - MostRecentModeSet = - BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) - .addImm(NewMode); + MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode | OldModeBits); CurrentMode = NewMode; CurrentMask = Mask; - CurrentModeKnown = true; return true; } @@ -233,21 +243,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { if (Ops.first) { ModeTy NewMode, Mask; computeMode(NewMode, Mask, MI, Ops.first, Ops.second); - return setMode(NewMode, Mask, &MI); + return setMode(NewMode, Mask, MI.getIterator()); } assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); return false; } -MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) { if (!ClauseRemaining) return I; // A clause cannot start with a special instruction, place it right before // the clause. if (ClauseRemaining == ClauseLen) { - I = Clause->getPrevNode(); + I = Clause->getPrevNode()->getIterator(); assert(I->isBundle()); return I; } @@ -284,9 +295,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { ClauseLen = ClauseRemaining = 0; CurrentMode.reset(); CurrentMask.reset(); - CurrentModeKnown = true; for (auto &MBB : MF) { MostRecentModeSet = nullptr; + this->MBB = &MBB; for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { if (MI.isMetaInstruction()) @@ -294,17 +305,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { if (MI.isTerminator() || MI.isCall()) { if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) CurrentMode.reset(); - CurrentModeKnown = true; - } else - resetMode(&MI); + else + resetMode(MI.getIterator()); continue; } if (MI.isInlineAsm()) { if (TII->hasVGPRUses(MI)) - resetMode(&MI); + resetMode(MI.getIterator()); continue; } @@ -323,14 +333,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { --ClauseRemaining; } - // If we're falling through to a block that has at least one other - // predecessor, we no longer know the mode. - MachineBasicBlock *Next = MBB.getNextNode(); - if (Next && Next->pred_size() >= 2 && - llvm::is_contained(Next->predecessors(), &MBB)) { - if (CurrentMode.raw_bits().any()) - CurrentModeKnown = false; - } + // Reset the mode if we are falling through. + resetMode(MBB.instr_end()); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 680e7eb..844649ebb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -412,7 +412,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { *OutStreamer); if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { - unsigned V = MI->getOperand(0).getImm(); + unsigned V = MI->getOperand(0).getImm() & 0xff; OutStreamer->AddComment( " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e187959..907f830 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) return RALSrc; + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return RALSrc; + } + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc // LoSgpr = G_AMDGPU_READANYLANE LoVgpr // HiSgpr = G_AMDGPU_READANYLANE HiVgpr diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index b84c30e..dc8fa7f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -626,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); + unsigned Opc = MI.getOpcode(); + auto Flags = MI.getFlags(); + auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); + auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); + auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); + auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); @@ -698,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -849,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return LLT::scalar(64); case Sgpr128: case Vgpr128: @@ -972,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: @@ -1104,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index ad3ff1d..e7598f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -72,6 +72,7 @@ class RegBankLegalizeHelper { static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -121,6 +122,7 @@ private: void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); void lowerSplitTo32(MachineInstr &MI); + void lowerSplitTo16(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); void lowerUnpackMinMax(MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 01abd35..103cdec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -913,14 +913,26 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); - addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) + .Uni(S64, {{Sgpr64}, {}}); bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 030bd75..e6df5d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID { V4S32, UniV2S16, + UniV2S32, DivV2S16, + DivV2S32, // B types B32, @@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID { UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -217,6 +221,7 @@ enum LoweringMethodID { V_BFE, VgprToVccCopy, SplitTo32, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 013cfeb..28b4da8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) - VgprMSBs = Inst.getOperand(0).getImm(); + VgprMSBs = Inst.getOperand(0).getImm() & 0xff; else if (isTerminator(Inst)) VgprMSBs = 0; } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index d950131..65dce74 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); + // Set VADDR4 to NULL + let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + // set to 0 based on SPG. - let vaddr4 = 0; let rsrc = 0; let vdata = 0; let d16 = 0; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d930a21..d9f76c9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10618,6 +10618,42 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +// SCC is already valid after SCCValid. +// SCCRedefine will redefine SCC to the same value already available after +// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and +// update kill/dead flags if necessary. +static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + const SIRegisterInfo &RI) { + MachineInstr *KillsSCC = nullptr; + for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), + SCCRedefine->getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + if (MachineOperand *SccDef = + SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + SCCRedefine->eraseFromParent(); + return true; +} + +static bool foldableSelect(const MachineInstr &Def) { + if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 && + Def.getOpcode() != AMDGPU::S_CSELECT_B64) + return false; + bool Op1IsNonZeroImm = + Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0; + if (!Op1IsNonZeroImm || !Op2IsZeroImm) + return false; + return true; +} + bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, @@ -10637,19 +10673,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!Def || Def->getParent() != CmpInstr.getParent()) return false; - const auto foldableSelect = [](MachineInstr *Def) -> bool { - if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || - Def->getOpcode() == AMDGPU::S_CSELECT_B64) { - bool Op1IsNonZeroImm = - Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; - bool Op2IsZeroImm = - Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; - if (Op1IsNonZeroImm && Op2IsZeroImm) - return true; - } - return false; - }; - // For S_OP that set SCC = DST!=0, do the transformation // // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) @@ -10660,24 +10683,12 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero // imm), 0) - if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def)) + if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; - if (MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); return true; }; @@ -10755,21 +10766,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } - - MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; if (!MRI->use_nodbg_empty(DefReg)) { assert(!IsReversedCC); |
