Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 356
1 file changed, 266 insertions(+), 90 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d9f76c9..6d21109 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies(
cl::ReallyHidden);
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
- : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
+ AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
SchedModel.init(&ST);
}
@@ -161,7 +162,7 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
if (!DstReg.isVirtual())
return true;
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
switch (Use.getOpcode()) {
case AMDGPU::S_AND_SAVEEXEC_B32:
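
Annotation: the getParent()->getParent() cleanups that recur throughout this patch all rely on the MachineInstr::getMF() convenience accessor; a minimal equivalence sketch (variable names illustrative, assuming MI is parented to a block):

    // getMF() walks instruction -> basic block -> function in one call.
    const MachineFunction *ViaParents = MI.getParent()->getParent();
    const MachineFunction *ViaGetMF = MI.getMF();
    assert(ViaParents == ViaGetMF && "same enclosing MachineFunction");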
@@ -1667,8 +1668,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
void SIInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
- bool isKill, int FrameIndex, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI, Register VReg,
+ bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
MachineInstr::MIFlag Flags) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -1680,7 +1680,7 @@ void SIInstrInfo::storeRegToStackSlot(
MachineMemOperand *MMO = MF->getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
FrameInfo.getObjectAlign(FrameIndex));
- unsigned SpillSize = TRI->getSpillSize(*RC);
+ unsigned SpillSize = RI.getSpillSize(*RC);
MachineRegisterInfo &MRI = MF->getRegInfo();
if (RI.isSGPRClass(RC)) {
@@ -1862,14 +1862,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI,
Register VReg,
MachineInstr::MIFlag Flags) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned SpillSize = TRI->getSpillSize(*RC);
+ unsigned SpillSize = RI.getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
@@ -1964,6 +1963,10 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
MF->push_back(TrapBB);
MBB.addSuccessor(TrapBB);
+ } else {
+ // Since we're adding HaltLoopBB and modifying the CFG, we must return a
+ // different block to signal the change.
+ ContBB = HaltLoopBB;
}
// Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
@@ -2518,8 +2521,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, Register DestReg,
- unsigned SubIdx, const MachineInstr &Orig,
- const TargetRegisterInfo &RI) const {
+ unsigned SubIdx,
+ const MachineInstr &Orig) const {
// Try shrinking the instruction to remat only the part needed for current
// context.
@@ -2569,7 +2572,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
const MCInstrDesc &TID = get(NewOpcode);
const TargetRegisterClass *NewRC =
- RI.getAllocatableClass(getRegClass(TID, 0, &RI));
+ RI.getAllocatableClass(getRegClass(TID, 0));
MRI.setRegClass(DestReg, NewRC);
UseMO->setReg(DestReg);
@@ -2599,7 +2602,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
break;
}
- TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
+ TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
}
std::pair<MachineInstr*, MachineInstr*>
@@ -2935,7 +2938,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
if (FlushSGPRWrites)
BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
};
// We need to compute the offset relative to the instruction immediately after
@@ -3461,6 +3464,21 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
}
}
+void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI,
+ const MCInstrDesc &NewDesc) const {
+ MI.setDesc(NewDesc);
+
+ // Remove any leftover implicit operands from mutating the instruction,
+ // e.g. if we replace an s_and_b32 with a copy, we don't need the implicit
+ // scc def anymore.
+ const MCInstrDesc &Desc = MI.getDesc();
+ unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
+ Desc.implicit_defs().size();
+
+ for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
+ MI.removeOperand(I);
+}
+
std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
unsigned SubRegIndex) {
switch (SubRegIndex) {
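
Annotation: mutateAndCleanupImplicit (added above) relies on the convention that a MachineInstr carries its explicit operands first, followed by the implicit defs/uses declared in its MCInstrDesc; anything past that combined count is a leftover appended implicit. A hypothetical before/after for the s_and_b32 case named in the comment (registers invented for illustration):

    ; before: 3 explicit operands + implicit SCC def from the descriptor
    %0:sreg_32 = S_AND_B32 %1, %2, implicit-def dead $scc
    ; after setDesc(COPY): the trailing $scc def exceeds COPY's operand
    ; count and is stripped by the backwards removeOperand loop
    %0:sreg_32 = COPY %1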
@@ -3612,7 +3630,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
const MCInstrDesc &MovDesc = get(MovOp);
- const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
+ const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
if (Is16Bit) {
// We just need to find a correctly sized register class, so the
// subregister index compatibility doesn't matter since we're statically
@@ -3917,6 +3935,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isLDSDMA(MIa) || isLDSDMA(MIb))
return false;
+ if (MIa.isBundle() || MIb.isBundle())
+ return false;
+
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,
@@ -3982,7 +4003,7 @@ static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
MachineInstr **DefMI = nullptr) {
if (!MO->isReg())
return false;
- const MachineFunction *MF = MO->getParent()->getParent()->getParent();
+ const MachineFunction *MF = MO->getParent()->getMF();
const MachineRegisterInfo &MRI = MF->getRegInfo();
return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
}
@@ -4044,10 +4065,29 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+ // This is a temporary placeholder for bundle handling that enables us to
+ // exercise the relevant code paths in the two-address instruction pass.
+ if (MI.getBundleSize() != 1)
+ return nullptr;
+ CandidateMI = MI.getNextNode();
+ }
+
ThreeAddressUpdates U;
- MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
- if (NewMI) {
+ MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
+ if (!NewMI)
+ return nullptr;
+
+ if (MI.isBundle()) {
+ CandidateMI->eraseFromBundle();
+ for (MachineOperand &MO : MI.all_defs()) {
+ if (MO.isTied())
+ MI.untieRegOperand(MO.getOperandNo());
+ }
+ } else {
updateLiveVariables(LV, MI, *NewMI);
if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
@@ -4088,7 +4128,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LV->getVarInfo(DefReg).AliveBlocks.clear();
}
- if (LIS) {
+ if (MI.isBundle()) {
+ VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+ if (!VRI.Reads && !VRI.Writes) {
+ for (MachineOperand &MO : MI.all_uses()) {
+ if (MO.isReg() && MO.getReg() == DefReg) {
+ assert(MO.getSubReg() == 0 &&
+ "tied sub-registers in bundles currently not supported");
+ MI.removeOperand(MO.getOperandNo());
+ break;
+ }
+ }
+
+ if (LIS)
+ LIS->shrinkToUses(&LIS->getInterval(DefReg));
+ }
+ } else if (LIS) {
LiveInterval &DefLI = LIS->getInterval(DefReg);
// We cannot delete the original instruction here, so hack out the use
@@ -4103,11 +4158,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
}
+ if (MI.isBundle()) {
+ VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+ if (!VRI.Reads && !VRI.Writes) {
+ for (MachineOperand &MIOp : MI.uses()) {
+ if (MIOp.isReg() && MIOp.getReg() == DefReg) {
+ MIOp.setIsUndef(true);
+ MIOp.setReg(DummyReg);
+ }
+ }
+ }
+
+ MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
+ false, /*isUndef=*/true));
+ }
+
LIS->shrinkToUses(&DefLI);
}
}
- return NewMI;
+ return MI.isBundle() ? &MI : NewMI;
}
MachineInstr *
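
Annotation: the bundle placeholder above only handles the simplest shape, where the BUNDLE header MI is followed by exactly one bundled instruction, so MI.getNextNode() is the real two-address candidate. A hypothetical single-instruction bundle in MIR form (registers and opcode illustrative):

    BUNDLE implicit-def %0, implicit %1, implicit %2 {
      %0:vgpr_32 = V_FMAC_F32_e32 %1, %2, %0(tied-def 0)
    }

After conversion succeeds, the bundled instruction is erased from the bundle, tied defs on the header are untied, and the header itself is returned to signal the change.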
@@ -4121,7 +4191,7 @@ SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
if (NewMFMAOpc != -1) {
MachineInstrBuilder MIB =
BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
MIB.add(MI.getOperand(I));
return MIB;
}
@@ -4130,7 +4200,7 @@ SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.setMIFlags(MI.getFlags());
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
MIB->addOperand(MI.getOperand(I));
return MIB;
}
@@ -4329,8 +4399,9 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
-bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
- if (!isFLAT(MI) || isFLATGlobal(MI))
+bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const {
+ // Instructions that access scratch use FLAT or BUF encodings.
+ if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
return false;
// If scratch is not initialized, we can never access it.
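
Annotation: the early-out reads more naturally in positive form; applying De Morgan to the condition above gives the following sketch (not part of the patch):

    // May access scratch only if it is a non-global FLAT instruction
    // or any BUF instruction:
    bool MayBeScratch = (isFLAT(MI) && !isFLATGlobal(MI)) || isBUF(MI);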
@@ -4948,7 +5019,7 @@ bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI.getOpcode();
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
const MachineRegisterInfo &MRI = MF->getRegInfo();
// FIXME: At this point the COPY verify is done only for non-ssa forms.
@@ -5452,9 +5523,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
Desc.getNumOperands() + Desc.implicit_uses().size();
const unsigned NumImplicitOps = IsDst ? 2 : 1;
- // Allow additional implicit operands. This allows a fixup done by the post
- // RA scheduler where the main implicit operand is killed and implicit-defs
- // are added for sub-registers that remain live after this instruction.
+ // Require additional implicit operands. This allows a fixup done by the
+ // post RA scheduler where the main implicit operand is killed and
+ // implicit-defs are added for sub-registers that remain live after this
+ // instruction.
if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
ErrInfo = "missing implicit register operands";
return false;
@@ -5736,6 +5808,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
+ MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
+ &AMDGPU::SReg_64RegClass) ||
+ Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
+ ErrInfo = "Instruction cannot read flat_scratch_base_hi";
+ return false;
+ }
+ }
+
return true;
}
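
Annotation: the two S_BITCMP*_B64 opcodes are singled out because they read a 64-bit source while writing only SCC, so the register-class check on the sdst alone would miss them. A condensed restatement of the rule enforced here and mirrored in isLegalRegOperand later in the patch (names as in the hunk above):

    // A SALU read of SRC_FLAT_SCRATCH_BASE_HI is rejected when the
    // instruction is effectively 64-bit: 64-bit sdst, or a B64 bitcmp.
    bool Illegal = (Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
                               &AMDGPU::SReg_64RegClass) ||
                   Opcode == AMDGPU::S_BITCMP0_B64 ||
                   Opcode == AMDGPU::S_BITCMP1_B64;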
@@ -5754,7 +5837,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
case AMDGPU::S_MOV_B32: {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
return MI.getOperand(1).isReg() ||
RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -6021,19 +6104,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
}
-// FIXME: This should not be an overridable function. All subtarget dependent
-// operand modifications should go through isLookupRegClassByHwMode in the
-// generic handling.
-const TargetRegisterClass *
-SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- if (OpNum >= TID.getNumOperands())
- return nullptr;
- const MCOperandInfo &OpInfo = TID.operands()[OpNum];
- int16_t RegClass = getOpRegClassID(OpInfo);
- return RI.getRegClass(RegClass);
-}
-
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const {
const MCInstrDesc &Desc = get(MI.getOpcode());
@@ -6042,14 +6112,14 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
Register Reg = MI.getOperand(OpNo).getReg();
if (Reg.isVirtual()) {
- const MachineRegisterInfo &MRI =
- MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
return MRI.getRegClass(Reg);
}
return RI.getPhysRegBaseClass(Reg);
}
- return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
+ int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
+ return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
}
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
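
Annotation: with this change getOpRegClass can return nullptr for operands whose descriptor carries no register-class ID (a negative ID, e.g. for immediate-only operands); a hedged caller sketch:

    // Callers should tolerate a null class rather than assuming every
    // operand is register-constrained.
    if (const TargetRegisterClass *RC = getOpRegClass(MI, OpNo))
      VReg = MRI.createVirtualRegister(RC);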
@@ -6133,7 +6203,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
if (MO.getSubReg()) {
- const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const MachineFunction *MF = MO.getParent()->getMF();
const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
if (!SuperRC)
return false;
@@ -6145,7 +6215,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &MO) const {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
unsigned Opc = MI.getOpcode();
@@ -6153,7 +6223,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
// information.
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
- constexpr const AMDGPU::OpName OpNames[] = {
+ constexpr AMDGPU::OpName OpNames[] = {
AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
for (auto [I, OpName] : enumerate(OpNames)) {
@@ -6198,6 +6268,18 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
(int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
RI.isSGPRReg(MRI, MO.getReg()))
return false;
+
+ if (ST.hasFlatScratchHiInB64InstHazard() &&
+ MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
+ if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
+ 64)
+ return false;
+ }
+ if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
+ return false;
+ }
+
return true;
}
@@ -6215,8 +6297,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
const MachineOperand *MO) const {
- constexpr const unsigned NumOps = 3;
- constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+ constexpr unsigned NumOps = 3;
+ constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
AMDGPU::OpName::src0, AMDGPU::OpName::src1,
AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
@@ -6247,7 +6329,7 @@ bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
- const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
@@ -6801,7 +6883,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
return;
const TargetRegisterClass *DeclaredRC =
- getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
+ getRegClass(MI.getDesc(), SAddr->getOperandNo());
Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
SAddr->setReg(ToSGPR);
@@ -7143,7 +7225,7 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
MachineBasicBlock *
SIInstrInfo::legalizeOperands(MachineInstr &MI,
MachineDominatorTree *MDT) const {
- MachineFunction &MF = *MI.getParent()->getParent();
+ MachineFunction &MF = *MI.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock *CreatedBB = nullptr;
@@ -7632,6 +7714,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = getVALUOp(Inst);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
// Handle some special cases
switch (Opcode) {
default:
@@ -7783,6 +7867,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
+ case AMDGPU::S_ABSDIFF_I32:
+ lowerScalarAbsDiff(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_CBRANCH_SCC0:
case AMDGPU::S_CBRANCH_SCC1: {
// Clear unused bits of vcc
@@ -7869,7 +7958,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
- const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest0 = Inst.getOperand(0);
MachineOperand &Dest1 = Inst.getOperand(1);
MachineOperand &Src0 = Inst.getOperand(2);
@@ -7889,12 +7977,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
legalizeOperands(*NewInstr, MDT);
MRI.replaceRegWith(Dest0.getReg(), DestReg);
- addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
- Worklist);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
Inst.eraseFromParent();
}
return;
+ case AMDGPU::S_LSHL1_ADD_U32:
+ case AMDGPU::S_LSHL2_ADD_U32:
+ case AMDGPU::S_LSHL3_ADD_U32:
+ case AMDGPU::S_LSHL4_ADD_U32: {
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
+ : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+ : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+ : 4);
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+ .add(Src0)
+ .addImm(ShiftAmt)
+ .add(Src1);
+
+ legalizeOperands(*NewInstr, MDT);
+ MRI.replaceRegWith(Dest.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+ Inst.eraseFromParent();
+ }
+ return;
case AMDGPU::S_CSELECT_B32:
case AMDGPU::S_CSELECT_B64:
lowerSelect(Worklist, Inst, MDT);
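
Annotation: the S_LSHL<n>_ADD_U32 lowering just above maps the scalar shift-and-add onto V_LSHL_ADD_U32_e64, which computes (src0 << src1) + src2, so the fixed shift amount becomes the middle inline-immediate operand. A worked example (registers illustrative):

    ; %d = (%a << 2) + %b
    %d:sgpr_32 = S_LSHL2_ADD_U32 %a, %b, implicit-def dead $scc
    ; lowers to:
    %d:vgpr_32 = V_LSHL_ADD_U32_e64 %a, 2, %b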
@@ -7945,7 +8058,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
return;
@@ -7985,13 +8098,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
legalizeOperandsVALUt16(*NewInstr, MRI);
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
return;
}
case AMDGPU::S_CVT_HI_F32_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.useRealTrue16Insts()) {
@@ -8021,7 +8133,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F32:
case AMDGPU::S_MAXIMUM_F32: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
@@ -8039,7 +8150,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F16:
case AMDGPU::S_MAXIMUM_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8063,7 +8173,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::V_S_RCP_F16_e64:
case AMDGPU::V_S_RSQ_F16_e64:
case AMDGPU::V_S_SQRT_F16_e64: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8112,26 +8221,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
- NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
- // Instead of creating a copy where src and dst are the same register
- // class, we just replace all uses of dst with src. These kinds of
- // copies interfere with the heuristics MachineSink uses to decide
- // whether or not to split a critical edge. Since the pass assumes
- // that copies will end up as machine instructions and not be
- // eliminated.
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
Register NewDstReg = Inst.getOperand(1).getReg();
- MRI.replaceRegWith(DstReg, NewDstReg);
- MRI.clearKillFlags(NewDstReg);
- Inst.getOperand(0).setReg(DstReg);
- Inst.eraseFromParent();
- // Legalize t16 operand since replaceReg is called after addUsersToVALU
- for (MachineOperand &MO :
- make_early_inc_range(MRI.use_operands(NewDstReg))) {
- legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
+ if (const TargetRegisterClass *CommonRC =
+ RI.getCommonSubClass(NewDstRC, SrcRC)) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+ // copies interfere with the heuristics MachineSink uses to decide
+ // whether or not to split a critical edge, since the pass assumes
+ // that copies will end up as machine instructions and not be
+ // eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ MRI.clearKillFlags(NewDstReg);
+ Inst.getOperand(0).setReg(DstReg);
+
+ if (!MRI.constrainRegClass(NewDstReg, CommonRC))
+ llvm_unreachable("failed to constrain register");
+
+ Inst.eraseFromParent();
+ // Legalize t16 operand since replaceReg is called after addUsersToVALU
+ for (MachineOperand &MO :
+ make_early_inc_range(MRI.use_operands(NewDstReg))) {
+ legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ }
+
+ return;
}
- return;
}
// If this is a v2s copy between 16bit and 32bit reg,
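
Annotation: the rewritten guard above accepts any common subclass of the copy's source class and the required VALU destination class instead of demanding an exact match; merging dst into src is then only sound after narrowing src, which the constrainRegClass call enforces. A sketch with illustrative classes:

    ; %src has the looser class; users of %dst require the aligned one
    %src:vreg_64 = ...
    %dst:vreg_64_align2 = COPY %src
    ; after replaceRegWith(%dst, %src), %src must be constrained to
    ; vreg_64_align2 (the common subclass) for its new users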
@@ -8183,7 +8300,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
AMDGPU::OpName::src0_modifiers) >= 0)
NewInstr.addImm(0);
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
- MachineOperand Src = Inst.getOperand(1);
+ const MachineOperand &Src = Inst.getOperand(1);
NewInstr->addOperand(Src);
}
@@ -8412,6 +8529,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src1 = Inst.getOperand(1);
+ MachineOperand &Src2 = Inst.getOperand(2);
+ Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ unsigned SubOp =
+ ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
+
+ BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
+ .addReg(Src1.getReg())
+ .addReg(Src2.getReg());
+
+ BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
+ .addReg(SubResultReg)
+ .addReg(TmpReg);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
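
Annotation: lowerScalarAbsDiff above expands s_absdiff_i32 via the identity |a - b| = max(a - b, -(a - b)), with the negation formed as 0 - (a - b); the signed max then selects whichever of x and -x is non-negative. A minimal sketch of the emitted sequence, assuming the no-carry subtarget path (registers illustrative):

    ; sub = a - b
    %sub:vgpr_32 = V_SUB_U32_e32 %a, %b
    ; neg = 0 - sub
    %neg:vgpr_32 = V_SUB_U32_e32 0, %sub
    ; res = smax(sub, neg) == |a - b|
    %res:vgpr_32 = V_MAX_I32_e64 %sub, %neg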
@@ -9199,7 +9347,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
MachineInstr &SCCDefInst,
SIInstrWorklist &Worklist,
Register NewCond) const {
@@ -9217,7 +9365,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
if (SCCIdx != -1) {
if (MI.isCopy()) {
- MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
Register DestReg = MI.getOperand(0).getReg();
MRI.replaceRegWith(DestReg, NewCond);
@@ -9329,7 +9477,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
return SGPRReg;
Register UsedSGPRs[3] = {Register()};
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
for (unsigned i = 0; i < 3; ++i) {
int Idx = OpIndices[i];
@@ -9579,7 +9727,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return getInstBundleSize(MI);
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR: {
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
}
@@ -9714,7 +9862,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
// needed by the prolog. However, the insertions for scalar registers can
// always be placed at the BB top as they are independent of the exec mask
// value.
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MI.getMF();
bool IsNullOrVectorRegister = true;
if (Reg) {
const MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -10160,7 +10308,7 @@ static bool followSubRegDef(MachineInstr &MI,
}
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
if (!P.Reg.isVirtual())
return nullptr;
@@ -10501,7 +10649,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
// FIXME: It's conceptually broken to report this for an instruction, and not
@@ -10625,6 +10773,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
const SIRegisterInfo &RI) {
MachineInstr *KillsSCC = nullptr;
+ if (SCCValid->getParent() != SCCRedefine->getParent())
+ return false;
for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
SCCRedefine->getIterator())) {
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
@@ -10669,8 +10819,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (CmpValue != 0)
return false;
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
+ if (!Def)
return false;
// For S_OP that set SCC = DST!=0, do the transformation
@@ -10689,6 +10839,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!optimizeSCC(Def, &CmpInstr, RI))
return false;
+ // If the s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
+ // s_cmp_lg of a register pair) and the inputs are the hi and lo halves of
+ // a 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
+ // sX = s_cselect_b64 (non-zero imm), 0
+ // sLo = copy sX.sub0
+ // sHi = copy sX.sub1
+ // sY = s_or_b32 sLo, sHi
+ if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+ MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+ const MachineOperand &OrOpnd1 = Def->getOperand(1);
+ const MachineOperand &OrOpnd2 = Def->getOperand(2);
+ if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
+ MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
+ MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
+ if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
+ Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+ Def2->getOperand(1).isReg() &&
+ Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+ Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+ Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
+ MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
+ if (Select && foldableSelect(*Select))
+ optimizeSCC(Select, Def, RI);
+ }
+ }
+ }
return true;
};
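
Annotation: a worked instance of the sequence named in the comment above (virtual registers hypothetical). The earlier optimizeSCC call already folds the s_cmp into the OR's SCC def; this addition then deletes the now-dead s_or_b32 as well, provided its inputs are the two halves of a foldable 64-bit select:

    %x:sreg_64  = S_CSELECT_B64 -1, 0, implicit $scc   ; foldableSelect
    %lo:sreg_32 = COPY %x.sub0
    %hi:sreg_32 = COPY %x.sub1
    %y:sreg_32  = S_OR_B32 %lo, %hi, implicit-def $scc ; %y otherwise unused
    S_CMP_LG_U32 %y, 0, implicit-def $scc              ; folded away first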
@@ -10718,8 +10894,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
// s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
+ if (!Def)
return false;
if (Def->getOpcode() != AMDGPU::S_AND_B32 &&