Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 356
1 file changed, 266 insertions(+), 90 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d9f76c9..6d21109 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies(
     cl::ReallyHidden);

 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
-    : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+    : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
+                         AMDGPU::ADJCALLSTACKDOWN),
       RI(ST), ST(ST) {
   SchedModel.init(&ST);
 }
@@ -161,7 +162,7 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
   if (!DstReg.isVirtual())
     return true;

-  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
     switch (Use.getOpcode()) {
     case AMDGPU::S_AND_SAVEEXEC_B32:
@@ -1667,8 +1668,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(

 void SIInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
+    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
     MachineInstr::MIFlag Flags) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -1680,7 +1680,7 @@ void SIInstrInfo::storeRegToStackSlot(
   MachineMemOperand *MMO = MF->getMachineMemOperand(
       PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
       FrameInfo.getObjectAlign(FrameIndex));
-  unsigned SpillSize = TRI->getSpillSize(*RC);
+  unsigned SpillSize = RI.getSpillSize(*RC);
   MachineRegisterInfo &MRI = MF->getRegInfo();

   if (RI.isSGPRClass(RC)) {
@@ -1862,14 +1862,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MI,
                                        Register DestReg, int FrameIndex,
                                        const TargetRegisterClass *RC,
-                                       const TargetRegisterInfo *TRI,
                                        Register VReg,
                                        MachineInstr::MIFlag Flags) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
   const DebugLoc &DL = MBB.findDebugLoc(MI);
-  unsigned SpillSize = TRI->getSpillSize(*RC);
+  unsigned SpillSize = RI.getSpillSize(*RC);

   MachinePointerInfo PtrInfo =
       MachinePointerInfo::getFixedStack(*MF, FrameIndex);
@@ -1964,6 +1963,10 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
     BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
     MF->push_back(TrapBB);
     MBB.addSuccessor(TrapBB);
+  } else {
+    // Since we're adding HaltLoopBB and modifying the CFG, we must return a
+    // different block to signal the change.
+    ContBB = HaltLoopBB;
   }

   // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
@@ -2518,8 +2521,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {

 void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I, Register DestReg,
-                                unsigned SubIdx, const MachineInstr &Orig,
-                                const TargetRegisterInfo &RI) const {
+                                unsigned SubIdx,
+                                const MachineInstr &Orig) const {
   // Try shrinking the instruction to remat only the part needed for current
   // context.
@@ -2569,7 +2572,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,

     const MCInstrDesc &TID = get(NewOpcode);
     const TargetRegisterClass *NewRC =
-        RI.getAllocatableClass(getRegClass(TID, 0, &RI));
+        RI.getAllocatableClass(getRegClass(TID, 0));
     MRI.setRegClass(DestReg, NewRC);

     UseMO->setReg(DestReg);
@@ -2599,7 +2602,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
     break;
   }

-  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
+  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
 }

 std::pair<MachineInstr*, MachineInstr*>
@@ -2935,7 +2938,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
   auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
     if (FlushSGPRWrites)
       BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
-          .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+          .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
   };

   // We need to compute the offset relative to the instruction immediately after
@@ -3461,6 +3464,21 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
   }
 }

+void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
+                                           const MCInstrDesc &NewDesc) const {
+  MI.setDesc(NewDesc);
+
+  // Remove any leftover implicit operands from mutating the instruction. e.g.
+  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
+  // anymore.
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
+                    Desc.implicit_defs().size();
+
+  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
+    MI.removeOperand(I);
+}
+
 std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
                                                          unsigned SubRegIndex) {
   switch (SubRegIndex) {
@@ -3612,7 +3630,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
       const MCInstrDesc &MovDesc = get(MovOp);
-      const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
+      const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);

       if (Is16Bit) {
         // We just need to find a correctly sized register class, so the
         // subregister index compatibility doesn't matter since we're statically
@@ -3917,6 +3935,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   if (isLDSDMA(MIa) || isLDSDMA(MIb))
     return false;

+  if (MIa.isBundle() || MIb.isBundle())
+    return false;
+
   // TODO: Should we check the address space from the MachineMemOperand? That
   // would allow us to distinguish objects we know don't alias based on the
   // underlying address space, even if it was lowered to a different one,
@@ -3982,7 +4003,7 @@ static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
                            MachineInstr **DefMI = nullptr) {
   if (!MO->isReg())
     return false;
-  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
+  const MachineFunction *MF = MO->getParent()->getMF();
   const MachineRegisterInfo &MRI = MF->getRegInfo();
   return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
 }
@@ -4044,10 +4065,29 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
                                                  LiveVariables *LV,
                                                  LiveIntervals *LIS) const {
   MachineBasicBlock &MBB = *MI.getParent();
+  MachineInstr *CandidateMI = &MI;
+
+  if (MI.isBundle()) {
+    // This is a temporary placeholder for bundle handling that enables us to
+    // exercise the relevant code paths in the two-address instruction pass.
+    if (MI.getBundleSize() != 1)
+      return nullptr;
+    CandidateMI = MI.getNextNode();
+  }
+
   ThreeAddressUpdates U;
-  MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
+  MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
+  if (!NewMI)
+    return nullptr;
+
+  if (MI.isBundle()) {
+    CandidateMI->eraseFromBundle();

-  if (NewMI) {
+    for (MachineOperand &MO : MI.all_defs()) {
+      if (MO.isTied())
+        MI.untieRegOperand(MO.getOperandNo());
+    }
+  } else {
     updateLiveVariables(LV, MI, *NewMI);
     if (LIS) {
       LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
@@ -4088,7 +4128,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
       LV->getVarInfo(DefReg).AliveBlocks.clear();
     }

-    if (LIS) {
+    if (MI.isBundle()) {
+      VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+      if (!VRI.Reads && !VRI.Writes) {
+        for (MachineOperand &MO : MI.all_uses()) {
+          if (MO.isReg() && MO.getReg() == DefReg) {
+            assert(MO.getSubReg() == 0 &&
+                   "tied sub-registers in bundles currently not supported");
+            MI.removeOperand(MO.getOperandNo());
+            break;
+          }
+        }
+
+        if (LIS)
+          LIS->shrinkToUses(&LIS->getInterval(DefReg));
+      }
+    } else if (LIS) {
       LiveInterval &DefLI = LIS->getInterval(DefReg);

       // We cannot delete the original instruction here, so hack out the use
@@ -4103,11 +4158,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
         }
       }

+      if (MI.isBundle()) {
+        VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+        if (!VRI.Reads && !VRI.Writes) {
+          for (MachineOperand &MIOp : MI.uses()) {
+            if (MIOp.isReg() && MIOp.getReg() == DefReg) {
+              MIOp.setIsUndef(true);
+              MIOp.setReg(DummyReg);
+            }
+          }
+        }
+
+        MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
+                                                false, /*isUndef=*/true));
+      }
+
       LIS->shrinkToUses(&DefLI);
     }
   }

-  return NewMI;
+  return MI.isBundle() ? &MI : NewMI;
 }

 MachineInstr *
@@ -4121,7 +4191,7 @@ SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
   if (NewMFMAOpc != -1) {
     MachineInstrBuilder MIB =
         BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+    for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));
     return MIB;
   }
@@ -4130,7 +4200,7 @@ SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
     unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
     MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                                   .setMIFlags(MI.getFlags());
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+    for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
       MIB->addOperand(MI.getOperand(I));
     return MIB;
   }
@@ -4329,8 +4399,9 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
          Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
 }

-bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
-  if (!isFLAT(MI) || isFLATGlobal(MI))
+bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
+  // Instructions that access scratch use FLAT encoding or BUF encodings.
+  if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
     return false;

   // If scratch is not initialized, we can never access it.
@@ -4948,7 +5019,7 @@ bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                     StringRef &ErrInfo) const {
   uint16_t Opcode = MI.getOpcode();
-  const MachineFunction *MF = MI.getParent()->getParent();
+  const MachineFunction *MF = MI.getMF();
   const MachineRegisterInfo &MRI = MF->getRegInfo();

   // FIXME: At this point the COPY verify is done only for non-ssa forms.
@@ -5452,9 +5523,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
         Desc.getNumOperands() + Desc.implicit_uses().size();
     const unsigned NumImplicitOps = IsDst ? 2 : 1;

-    // Allow additional implicit operands. This allows a fixup done by the post
-    // RA scheduler where the main implicit operand is killed and implicit-defs
-    // are added for sub-registers that remain live after this instruction.
+    // Require additional implicit operands. This allows a fixup done by the
+    // post RA scheduler where the main implicit operand is killed and
+    // implicit-defs are added for sub-registers that remain live after this
+    // instruction.
     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
       ErrInfo = "missing implicit register operands";
       return false;
     }
@@ -5736,6 +5808,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
   }

+  if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
+      MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
+    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
+    if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
+                    &AMDGPU::SReg_64RegClass) ||
+        Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
+      ErrInfo = "Instruction cannot read flat_scratch_base_hi";
+      return false;
+    }
+  }
+
   return true;
 }
@@ -5754,7 +5837,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
   case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
   case AMDGPU::S_MOV_B32: {
-    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+    const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
     return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
                ? AMDGPU::COPY
                : AMDGPU::V_MOV_B32_e32;
@@ -6021,19 +6104,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
   llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
 }

-// FIXME: This should not be an overridable function. All subtarget dependent
-// operand modifications should go through isLookupRegClassByHwMode in the
-// generic handling.
-const TargetRegisterClass *
-SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
-                         const TargetRegisterInfo *TRI) const {
-  if (OpNum >= TID.getNumOperands())
-    return nullptr;
-  const MCOperandInfo &OpInfo = TID.operands()[OpNum];
-  int16_t RegClass = getOpRegClassID(OpInfo);
-  return RI.getRegClass(RegClass);
-}
-
 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                       unsigned OpNo) const {
   const MCInstrDesc &Desc = get(MI.getOpcode());
@@ -6042,14 +6112,14 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
     Register Reg = MI.getOperand(OpNo).getReg();
     if (Reg.isVirtual()) {
-      const MachineRegisterInfo &MRI =
-          MI.getParent()->getParent()->getRegInfo();
+      const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
       return MRI.getRegClass(Reg);
     }
     return RI.getPhysRegBaseClass(Reg);
   }

-  return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
+  int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
+  return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
 }

 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
@@ -6133,7 +6203,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
   if (MO.getSubReg()) {
-    const MachineFunction *MF = MO->getParent()->getParent()->getParent();
+    const MachineFunction *MF = MO.getParent()->getMF();
     const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
     if (!SuperRC)
       return false;
@@ -6145,7 +6215,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,

 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
                                     const MachineOperand &MO) const {
-  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
   unsigned Opc = MI.getOpcode();
@@ -6153,7 +6223,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
   // information.
   if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
       MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
-    constexpr const AMDGPU::OpName OpNames[] = {
+    constexpr AMDGPU::OpName OpNames[] = {
         AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};

     for (auto [I, OpName] : enumerate(OpNames)) {
@@ -6198,6 +6268,18 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
       (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
       RI.isSGPRReg(MRI, MO.getReg()))
     return false;
+
+  if (ST.hasFlatScratchHiInB64InstHazard() &&
+      MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
+    if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+      if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
+          64)
+        return false;
+    }
+    if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
+      return false;
+  }
+
   return true;
 }
@@ -6215,8 +6297,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
     const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
     const MachineOperand *MO) const {
-  constexpr const unsigned NumOps = 3;
-  constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+  constexpr unsigned NumOps = 3;
+  constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
       AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2,
       AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
       AMDGPU::OpName::src2_modifiers};
@@ -6247,7 +6329,7 @@ bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                  const MachineOperand *MO) const {
-  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineFunction &MF = *MI.getMF();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   const MCInstrDesc &InstDesc = MI.getDesc();
   const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
@@ -6801,7 +6883,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
     return;

   const TargetRegisterClass *DeclaredRC =
-      getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
+      getRegClass(MI.getDesc(), SAddr->getOperandNo());

   Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
   SAddr->setReg(ToSGPR);
@@ -7143,7 +7225,7 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
 MachineBasicBlock *
 SIInstrInfo::legalizeOperands(MachineInstr &MI,
                               MachineDominatorTree *MDT) const {
-  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFunction &MF = *MI.getMF();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineBasicBlock *CreatedBB = nullptr;
@@ -7632,6 +7714,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   unsigned Opcode = Inst.getOpcode();
   unsigned NewOpcode = getVALUOp(Inst);
+  const DebugLoc &DL = Inst.getDebugLoc();
+
   // Handle some special cases
   switch (Opcode) {
   default:
@@ -7783,6 +7867,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     Inst.eraseFromParent();
     return;

+  case AMDGPU::S_ABSDIFF_I32:
+    lowerScalarAbsDiff(Worklist, Inst);
+    Inst.eraseFromParent();
+    return;
+
   case AMDGPU::S_CBRANCH_SCC0:
   case AMDGPU::S_CBRANCH_SCC1: {
     // Clear unused bits of vcc
@@ -7869,7 +7958,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     return;
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
-    const DebugLoc &DL = Inst.getDebugLoc();
     MachineOperand &Dest0 = Inst.getOperand(0);
     MachineOperand &Dest1 = Inst.getOperand(1);
     MachineOperand &Src0 = Inst.getOperand(2);
@@ -7889,12 +7977,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       legalizeOperands(*NewInstr, MDT);
       MRI.replaceRegWith(Dest0.getReg(), DestReg);
-      addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
-                                   Worklist);
+      addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
       Inst.eraseFromParent();
     }
     return;
+  case AMDGPU::S_LSHL1_ADD_U32:
+  case AMDGPU::S_LSHL2_ADD_U32:
+  case AMDGPU::S_LSHL3_ADD_U32:
+  case AMDGPU::S_LSHL4_ADD_U32: {
+    MachineOperand &Dest = Inst.getOperand(0);
+    MachineOperand &Src0 = Inst.getOperand(1);
+    MachineOperand &Src1 = Inst.getOperand(2);
+    unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32   ? 1
+                         : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+                         : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+                                                             : 4);
+    const TargetRegisterClass *NewRC =
+        RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
+    Register DestReg = MRI.createVirtualRegister(NewRC);
+    MachineInstr *NewInstr =
+        BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+            .add(Src0)
+            .addImm(ShiftAmt)
+            .add(Src1);
+
+    legalizeOperands(*NewInstr, MDT);
+    MRI.replaceRegWith(Dest.getReg(), DestReg);
+    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+    Inst.eraseFromParent();
+  }
+    return;
   case AMDGPU::S_CSELECT_B32:
   case AMDGPU::S_CSELECT_B64:
     lowerSelect(Worklist, Inst, MDT);
@@ -7945,7 +8058,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     }
     legalizeOperands(*NewInstr, MDT);
     int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
-    MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+    const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
     addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
     Inst.eraseFromParent();
     return;
@@ -7985,13 +8098,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     legalizeOperandsVALUt16(*NewInstr, MRI);
     legalizeOperands(*NewInstr, MDT);
     int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
-    MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+    const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
     addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
     Inst.eraseFromParent();
     return;
   }
   case AMDGPU::S_CVT_HI_F32_F16: {
-    const DebugLoc &DL = Inst.getDebugLoc();
     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     if (ST.useRealTrue16Insts()) {
@@ -8021,7 +8133,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
   }
   case AMDGPU::S_MINIMUM_F32:
   case AMDGPU::S_MAXIMUM_F32: {
-    const DebugLoc &DL = Inst.getDebugLoc();
     Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                                  .addImm(0) // src0_modifiers
@@ -8039,7 +8150,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
   }
   case AMDGPU::S_MINIMUM_F16:
   case AMDGPU::S_MAXIMUM_F16: {
-    const DebugLoc &DL = Inst.getDebugLoc();
     Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
                                                     ? &AMDGPU::VGPR_16RegClass
                                                     : &AMDGPU::VGPR_32RegClass);
@@ -8063,7 +8173,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
   case AMDGPU::V_S_RCP_F16_e64:
   case AMDGPU::V_S_RSQ_F16_e64:
   case AMDGPU::V_S_SQRT_F16_e64: {
-    const DebugLoc &DL = Inst.getDebugLoc();
     Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
                                                     ? &AMDGPU::VGPR_16RegClass
                                                     : &AMDGPU::VGPR_32RegClass);
@@ -8112,26 +8221,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     return;
   }

-  if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
-      NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
-    // Instead of creating a copy where src and dst are the same register
-    // class, we just replace all uses of dst with src. These kinds of
-    // copies interfere with the heuristics MachineSink uses to decide
-    // whether or not to split a critical edge. Since the pass assumes
-    // that copies will end up as machine instructions and not be
-    // eliminated.
-    addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+  if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
     Register NewDstReg = Inst.getOperand(1).getReg();
-    MRI.replaceRegWith(DstReg, NewDstReg);
-    MRI.clearKillFlags(NewDstReg);
-    Inst.getOperand(0).setReg(DstReg);
-    Inst.eraseFromParent();
-    // Legalize t16 operand since replaceReg is called after addUsersToVALU
-    for (MachineOperand &MO :
-         make_early_inc_range(MRI.use_operands(NewDstReg))) {
-      legalizeOperandsVALUt16(*MO.getParent(), MRI);
+    const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
+    if (const TargetRegisterClass *CommonRC =
+            RI.getCommonSubClass(NewDstRC, SrcRC)) {
+      // Instead of creating a copy where src and dst are the same register
+      // class, we just replace all uses of dst with src. These kinds of
+      // copies interfere with the heuristics MachineSink uses to decide
+      // whether or not to split a critical edge. Since the pass assumes
+      // that copies will end up as machine instructions and not be
+      // eliminated.
+      addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+      MRI.replaceRegWith(DstReg, NewDstReg);
+      MRI.clearKillFlags(NewDstReg);
+      Inst.getOperand(0).setReg(DstReg);
+
+      if (!MRI.constrainRegClass(NewDstReg, CommonRC))
+        llvm_unreachable("failed to constrain register");
+
+      Inst.eraseFromParent();
+      // Legalize t16 operand since replaceReg is called after addUsersToVALU
+      for (MachineOperand &MO :
+           make_early_inc_range(MRI.use_operands(NewDstReg))) {
+        legalizeOperandsVALUt16(*MO.getParent(), MRI);
+      }
+
+      return;
     }
-    return;
   }

   // If this is a v2s copy between 16bit and 32bit reg,
@@ -8183,7 +8300,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
                               AMDGPU::OpName::src0_modifiers) >= 0)
         NewInstr.addImm(0);
       if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
-        MachineOperand Src = Inst.getOperand(1);
+        const MachineOperand &Src = Inst.getOperand(1);
         NewInstr->addOperand(Src);
       }
@@ -8412,6 +8529,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }

+void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
+                                     MachineInstr &Inst) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src1 = Inst.getOperand(1);
+  MachineOperand &Src2 = Inst.getOperand(2);
+  Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  unsigned SubOp =
+      ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
+
+  BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
+      .addReg(Src1.getReg())
+      .addReg(Src2.getReg());
+
+  BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
+      .addReg(SubResultReg)
+      .addReg(TmpReg);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
 void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
                                   MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -9199,7 +9347,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }

-void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
                                                MachineInstr &SCCDefInst,
                                                SIInstrWorklist &Worklist,
                                                Register NewCond) const {
@@ -9217,7 +9365,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
     int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
     if (SCCIdx != -1) {
       if (MI.isCopy()) {
-        MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+        MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
         Register DestReg = MI.getOperand(0).getReg();

         MRI.replaceRegWith(DestReg, NewCond);
@@ -9329,7 +9477,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
     return SGPRReg;

   Register UsedSGPRs[3] = {Register()};
-  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

   for (unsigned i = 0; i < 3; ++i) {
     int Idx = OpIndices[i];
@@ -9579,7 +9727,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     return getInstBundleSize(MI);
   case TargetOpcode::INLINEASM:
   case TargetOpcode::INLINEASM_BR: {
-    const MachineFunction *MF = MI.getParent()->getParent();
+    const MachineFunction *MF = MI.getMF();
     const char *AsmStr = MI.getOperand(0).getSymbolName();
     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
   }
@@ -9714,7 +9862,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
   // needed by the prolog. However, the insertions for scalar registers can
   // always be placed at the BB top as they are independent of the exec mask
   // value.
-  const MachineFunction *MF = MI.getParent()->getParent();
+  const MachineFunction *MF = MI.getMF();
   bool IsNullOrVectorRegister = true;
   if (Reg) {
     const MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -10160,7 +10308,7 @@ static bool followSubRegDef(MachineInstr &MI,
 }

 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                                     MachineRegisterInfo &MRI) {
+                                     const MachineRegisterInfo &MRI) {
   assert(MRI.isSSA());
   if (!P.Reg.isVirtual())
     return nullptr;
@@ -10501,7 +10649,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
     return InstructionUniformity::Default;
   }

-  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();

   // FIXME: It's conceptually broken to report this for an instruction, and not
@@ -10625,6 +10773,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
 static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
                         const SIRegisterInfo &RI) {
   MachineInstr *KillsSCC = nullptr;
+  if (SCCValid->getParent() != SCCRedefine->getParent())
+    return false;
   for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
                                      SCCRedefine->getIterator())) {
     if (MI.modifiesRegister(AMDGPU::SCC, &RI))
@@ -10669,8 +10819,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (CmpValue != 0)
       return false;

-    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
-    if (!Def || Def->getParent() != CmpInstr.getParent())
+    MachineInstr *Def = MRI->getVRegDef(SrcReg);
+    if (!Def)
       return false;

     // For S_OP that set SCC = DST!=0, do the transformation
@@ -10689,6 +10839,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (!optimizeSCC(Def, &CmpInstr, RI))
       return false;

+    // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
+    // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
+    // 64-bit foldableSelect then delete s_or_b32 in the sequence:
+    //   sX = s_cselect_b64 (non-zero imm), 0
+    //   sLo = copy sX.sub0
+    //   sHi = copy sX.sub1
+    //   sY = s_or_b32 sLo, sHi
+    if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+        MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+      const MachineOperand &OrOpnd1 = Def->getOperand(1);
+      const MachineOperand &OrOpnd2 = Def->getOperand(2);
+      if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
+        MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
+        MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
+        if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
+            Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+            Def2->getOperand(1).isReg() &&
+            Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+            Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+            Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
+          MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
+          if (Select && foldableSelect(*Select))
+            optimizeSCC(Select, Def, RI);
+        }
+      }
+    }
     return true;
   };
@@ -10718,8 +10894,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
     // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n

-    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
-    if (!Def || Def->getParent() != CmpInstr.getParent())
+    MachineInstr *Def = MRI->getVRegDef(SrcReg);
+    if (!Def)
      return false;

     if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
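
Note (not part of the patch): the S_ABSDIFF_I32 expansion added above relies on the identity |a - b| == max(d, -d), where d is the 32-bit wrapped difference. The following minimal host-side C++ sketch only illustrates that identity and mirrors the v_sub / v_sub-from-zero / v_max_i32 sequence emitted by lowerScalarAbsDiff(); like the hardware sequence, it wraps at 32 bits rather than widening.

    // Illustrative only: same structure as the VALU sequence in the patch.
    #include <cstdint>
    #include <cstdio>

    static int32_t AbsDiffI32(int32_t A, int32_t B) {
      int32_t D = (int32_t)((uint32_t)A - (uint32_t)B); // v_sub_u32  d, a, b
      int32_t N = (int32_t)(0u - (uint32_t)D);          // v_sub_u32  n, 0, d
      return D > N ? D : N;                             // v_max_i32  r, d, n
    }

    int main() {
      printf("%d\n", AbsDiffI32(7, 19));  // prints 12
      printf("%d\n", AbsDiffI32(-5, 40)); // prints 45
      return 0;
    }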
