Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp                   |  12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp               |  11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp              |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp       |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td                    |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp  | 130
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h    |   8
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td                  |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp                   | 132
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h                     |  31
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp                 | 211
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h                   |  19
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td                  |  28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp                   |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp                 |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                      | 162
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h                        |  33
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td                    |  17
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp                |  49
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp                   |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h               |   3
21 files changed, 702 insertions, 172 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1..5700468 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2394,15 +2394,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
- Result = true;
+ TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
+ // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
+ // For our purposes, these shall not be classified as VALU as this results
+ // in unexpected behavior.
+ Result = !MI.mayLoadOrStore();
+ }
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
TII->isSALU(MI))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
TII->isMFMAorWMMA(MI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a44af5f..8ed4062 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
- Legal);
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
+ MVT::i32, Legal);
setOperationAction(
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
@@ -2833,8 +2833,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
R = getMad(DAG, DL, VT, YH, CH, Mad1);
}
- const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
- (Flags.hasNoInfs() || Options.NoInfsFPMath);
+ const bool IsFiniteOnly =
+ (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
// TODO: Check if known finite from source value.
if (!IsFiniteOnly) {
@@ -3161,9 +3161,8 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
- const auto &Options = getTargetMachine().Options;
- if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
+ if (!Flags.hasNoInfs()) {
SDValue OverflowCheckConst =
DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
SDValue Overflow =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee466ca..596a895 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3575,7 +3575,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
const bool IsFiniteOnly =
(MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
- (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
+ MI.getFlag(MachineInstr::FmNoInfs);
if (!IsFiniteOnly) {
// Expand isfinite(x) => fabs(x) < inf
@@ -3864,9 +3864,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
R = B.buildSelect(Ty, Underflow, Zero, R);
- const auto &Options = MF.getTarget().Options;
-
- if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
+ if (!(Flags & MachineInstr::FmNoInfs)) {
auto OverflowCheckConst =
B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index bfe2c80..a67b12a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -901,6 +901,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
+ addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+
bool hasSALUFloat = ST->hasSALUFloatInsts();
addRulesForGOpcs({G_FADD}, Standard)
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d0ad120..b841171 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1488,6 +1488,12 @@ let AssemblerPredicate = isGFX12Plus in {
def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>;
def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>;
+// Additional aliases for ds load transpose instructions.
+def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
//===----------------------------------------------------------------------===//
// GFX11.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index be62395..e3f3aba 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -513,8 +513,7 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
}
if (Imm == AMDGPU::EncValues::LITERAL_CONST) {
- Op = decodeLiteralConstant(
- Desc, OpDesc, OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64);
+ Op = decodeLiteralConstant(Desc, OpDesc);
continue;
}
@@ -893,6 +892,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// have EXEC as implicit destination. Issue a warning if encoding for
// vdst is not EXEC.
if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) &&
+ MCII->get(MI.getOpcode()).getNumDefs() == 0 &&
MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) {
auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO);
if (Bytes_[0] != ExecEncoding)
@@ -1545,21 +1545,21 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
MCOperand
AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const {
if (HasLiteral) {
- if (Literal64 != Val)
+ if (Literal != Val)
return errOperand(Val, "More than one unique literal is illegal");
}
HasLiteral = true;
- Literal = Literal64 = Val;
+ Literal = Val;
- bool UseLit64 = Hi_32(Literal64) == 0;
+ bool UseLit64 = Hi_32(Literal) == 0;
return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
- LitModifier::Lit64, Literal64, getContext()))
- : MCOperand::createImm(Literal64);
+ LitModifier::Lit64, Literal, getContext()))
+ : MCOperand::createImm(Literal);
}
-MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
- const MCOperandInfo &OpDesc,
- bool ExtendFP64) const {
+MCOperand
+AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
+ const MCOperandInfo &OpDesc) const {
// For now all literal constants are supposed to be unsigned integer
// ToDo: deal with signed/unsigned 64-bit integer constants
// ToDo: deal with float/double constants
@@ -1569,35 +1569,79 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
Twine(Bytes.size()));
}
HasLiteral = true;
- Literal = Literal64 = eatBytes<uint32_t>(Bytes);
- if (ExtendFP64)
- Literal64 <<= 32;
+ Literal = eatBytes<uint32_t>(Bytes);
}
- int64_t Val = ExtendFP64 ? Literal64 : Literal;
+ // For disassembling always assume all inline constants are available.
+ bool HasInv2Pi = true;
- bool CanUse64BitLiterals =
- STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
- !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P));
-
- bool UseLit64 = false;
- if (CanUse64BitLiterals) {
- if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64)
- UseLit64 = false;
- else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64)
- UseLit64 = Hi_32(Literal64) == 0;
+ // Invalid instruction codes may contain literals for inline-only
+ // operands, so we support them here as well.
+ int64_t Val = Literal;
+ bool UseLit = false;
+ switch (OpDesc.OperandType) {
+ default:
+ llvm_unreachable("Unexpected operand type!");
+ case AMDGPU::OPERAND_REG_IMM_BF16:
+ case AMDGPU::OPERAND_REG_INLINE_C_BF16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+ UseLit = AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2BF16:
+ UseLit = AMDGPU::isInlinableLiteralV2BF16(Val);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ UseLit = AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ UseLit = AMDGPU::isInlinableLiteralV2F16(Val);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ break;
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ UseLit = AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ UseLit = AMDGPU::isInlinableLiteralV2I16(Val);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_KIMM32:
+ UseLit = AMDGPU::isInlinableLiteral32(Val, HasInv2Pi);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ Val <<= 32;
+ break;
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ UseLit = AMDGPU::isInlinableLiteral64(Val, HasInv2Pi);
+ break;
+ case MCOI::OPERAND_REGISTER:
+ // TODO: Disassembling V_DUAL_FMAMK_F32_X_FMAMK_F32_gfx11 hits
+ // decoding a literal in a position of a register operand. Give
+ // it special handling in the caller, decodeImmOperands(), instead
+ // of quietly allowing it here.
+ break;
}
- return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
- LitModifier::Lit64, Val, getContext()))
- : MCOperand::createImm(Val);
+ return UseLit ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
+ LitModifier::Lit, Val, getContext()))
+ : MCOperand::createImm(Val);
}
-MCOperand
-AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const {
+MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const {
assert(STI.hasFeature(AMDGPU::Feature64BitLiterals));
if (!HasLiteral) {
@@ -1606,25 +1650,13 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const {
Twine(Bytes.size()));
}
HasLiteral = true;
- Literal64 = eatBytes<uint64_t>(Bytes);
- }
-
- bool UseLit64 = false;
- const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
- const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()];
- if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) {
- UseLit64 = false;
- } else {
- assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64);
- UseLit64 = Hi_32(Literal64) == 0;
+ Literal = eatBytes<uint64_t>(Bytes);
}
+ bool UseLit64 = Hi_32(Literal) == 0;
return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
- LitModifier::Lit64, Literal64, getContext()))
- : MCOperand::createImm(Literal64);
+ LitModifier::Lit64, Literal, getContext()))
+ : MCOperand::createImm(Literal);
}
MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -1913,7 +1945,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const MCInst &Inst,
return MCOperand::createImm(Val);
if (Val == LITERAL64_CONST && STI.hasFeature(AMDGPU::Feature64BitLiterals)) {
- return decodeLiteral64Constant(Inst);
+ return decodeLiteral64Constant();
}
switch (Width) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 2751857..d103d79 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -44,8 +44,7 @@ private:
const unsigned HwModeRegClass;
const unsigned TargetMaxInstBytes;
mutable ArrayRef<uint8_t> Bytes;
- mutable uint32_t Literal;
- mutable uint64_t Literal64;
+ mutable uint64_t Literal;
mutable bool HasLiteral;
mutable std::optional<bool> EnableWavefrontSize32;
unsigned CodeObjectVersion;
@@ -144,9 +143,8 @@ public:
MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
MCOperand decodeMandatoryLiteral64Constant(uint64_t Imm) const;
MCOperand decodeLiteralConstant(const MCInstrDesc &Desc,
- const MCOperandInfo &OpDesc,
- bool ExtendFP64) const;
- MCOperand decodeLiteral64Constant(const MCInst &Inst) const;
+ const MCOperandInfo &OpDesc) const;
+ MCOperand decodeLiteral64Constant() const;
MCOperand decodeSrcOp(const MCInst &Inst, unsigned Width, unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6de59be..8ea64d1 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -3711,6 +3711,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+// Additional aliases for global load transpose instructions.
+def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 71494be..4e11c4f 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -14,6 +14,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind) {
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = Register::index2VirtReg(I);
+ if (RegKind != GCNRegPressure::TOTAL_KINDS &&
+ GCNRegPressure::getRegKind(Reg, MRI) != RegKind)
+ continue;
if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
#undef PFX
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI) {
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ auto &OS = dbgs();
+ const char *RegName = GCNRegPressure::getName(Kind);
+
+ unsigned MaxNumRegs = 0;
+ const MachineInstr *MaxPressureMI = nullptr;
+ GCNUpwardRPTracker RPT(LIS);
+ for (const MachineBasicBlock &MBB : MF) {
+ RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot());
+ for (const MachineInstr &MI : reverse(MBB)) {
+ RPT.recede(MI);
+ unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind);
+ if (NumRegs > MaxNumRegs) {
+ MaxNumRegs = NumRegs;
+ MaxPressureMI = &MI;
+ }
+ }
+ }
+
+ SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI);
+
+ // Max pressure can occur at either the early-clobber or register slot.
+ // Choose the maximum liveset between both slots. This is ugly but this is
+ // diagnostic code.
+ SlotIndex ECSlot = MISlot.getRegSlot(true);
+ SlotIndex RSlot = MISlot.getRegSlot(false);
+ GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind);
+ GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind);
+ unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind);
+ unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind);
+ GCNRPTracker::LiveRegSet *LiveSet =
+ ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet;
+ SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot;
+ assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs);
+
+ // Split live registers into single-def and multi-def sets.
+ GCNRegPressure SDefPressure, MDefPressure;
+ SmallVector<Register, 16> SDefRegs, MDefRegs;
+ for (auto [Reg, LaneMask] : *LiveSet) {
+ assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind);
+ LiveInterval &LI = LIS.getInterval(Reg);
+ if (LI.getNumValNums() == 1 ||
+ (LI.hasSubRanges() &&
+ llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) {
+ return SR.getNumValNums() == 1;
+ }))) {
+ SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+ SDefRegs.push_back(Reg);
+ } else {
+ MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+ MDefRegs.push_back(Reg);
+ }
+ }
+ unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind);
+ unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind);
+ assert(SDefNumRegs + MDefNumRegs == MaxNumRegs);
+
+ auto printLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) {
+ return Printable([&, MBB, SI](raw_ostream &OS) {
+ OS << SI << ':' << printMBBReference(*MBB);
+ if (MLI)
+ if (const MachineLoop *ML = MLI->getLoopFor(MBB))
+ OS << " (LoopHdr " << printMBBReference(*ML->getHeader())
+ << ", Depth " << ML->getLoopDepth() << ")";
+ });
+ };
+
+ auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) {
+ GCNRegPressure RegPressure;
+ RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI);
+ OS << " " << printReg(Reg, TRI) << ':'
+ << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask "
+ << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' '
+ << RegName << "s)\n";
+
+ // Use std::map to sort def/uses by SlotIndex.
+ std::map<SlotIndex, const MachineInstr *> Instrs;
+ for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg)) {
+ Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI;
+ }
+
+ for (const auto &[SI, MI] : Instrs) {
+ OS << " ";
+ if (MI->definesRegister(Reg, TRI))
+ OS << "def ";
+ if (MI->readsRegister(Reg, TRI))
+ OS << "use ";
+ OS << printLoc(MI->getParent(), SI) << ": " << *MI;
+ }
+ };
+
+ OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName()
+ << " ***\n";
+ OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at "
+ << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": "
+ << *MaxPressureMI;
+
+ OS << "\nLive registers with single definition (" << SDefNumRegs << ' '
+ << RegName << "s):\n";
+
+ // Sort SDefRegs by number of uses (smallest first)
+ llvm::sort(SDefRegs, [&](Register A, Register B) {
+ return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) <
+ std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end());
+ });
+
+ for (const Register Reg : SDefRegs) {
+ PrintRegInfo(Reg, LiveSet->lookup(Reg));
+ }
+
+ OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' '
+ << RegName << "s):\n";
+ for (const Register Reg : MDefRegs) {
+ PrintRegInfo(Reg, LiveSet->lookup(Reg));
+ }
+}
+#endif
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 898d1ff..979a8b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -31,6 +31,12 @@ class SlotIndex;
struct GCNRegPressure {
enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
+ static constexpr const char *getName(RegKind Kind) {
+ const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"};
+ assert(Kind < TOTAL_KINDS);
+ return Names[Kind];
+ }
+
GCNRegPressure() {
clear();
}
@@ -41,6 +47,11 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ unsigned getNumRegs(RegKind Kind) const {
+ assert(Kind < TOTAL_KINDS);
+ return Value[Kind];
+ }
+
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
/// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
@@ -138,6 +149,12 @@ struct GCNRegPressure {
void dump() const;
+ static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI);
+ return (RegKind)getRegKind(MRI.getRegClass(Reg), STI);
+ }
+
private:
static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2;
@@ -294,8 +311,10 @@ public:
}
};
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
+GCNRPTracker::LiveRegSet
+getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS);
////////////////////////////////////////////////////////////////////////////////
// GCNUpwardRPTracker
@@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
const MachineRegisterInfo &MRI,
LaneBitmask LaneMaskFilter = LaneBitmask::getAll());
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
-
/// creates a map MachineInstr -> LiveRegSet
/// R - range of iterators on instructions
/// After - upon entry or exit of every instruction
@@ -524,6 +540,11 @@ public:
}
};
+LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bdc0810..9fbf9e5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,27 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+static cl::opt<unsigned> PendingQueueLimit(
+ "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
+ cl::desc(
+ "Max (Available+Pending) size to inspect pending queue (0 disables)"),
+ cl::init(256));
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+#define DUMP_MAX_REG_PRESSURE
+static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure before scheduling."),
+ cl::init(false));
+
+static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure after scheduling."),
+ cl::init(false));
+#endif
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -320,17 +341,52 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
}
}
+static bool shouldCheckPending(SchedBoundary &Zone,
+ const TargetSchedModel *SchedModel) {
+ bool HasBufferedModel =
+ SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
+ unsigned Combined = Zone.Available.size() + Zone.Pending.size();
+ return Combined <= PendingQueueLimit && HasBufferedModel;
+}
+
+static SUnit *pickOnlyChoice(SchedBoundary &Zone,
+ const TargetSchedModel *SchedModel) {
+ // pickOnlyChoice() releases pending instructions and checks for new hazards.
+ SUnit *OnlyChoice = Zone.pickOnlyChoice();
+ if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
+ return OnlyChoice;
+
+ return nullptr;
+}
+
+void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
+ const SchedCandidate &Preferred) {
+ LLVM_DEBUG({
+ dbgs() << "Prefer:\t\t";
+ DAG->dumpNode(*Preferred.SU);
+
+ if (Current.SU) {
+ dbgs() << "Not:\t";
+ DAG->dumpNode(*Current.SU);
+ }
+
+ dbgs() << "Reason:\t\t";
+ traceCandidate(Preferred);
+ });
+}
+
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand,
+ SchedCandidate &Cand, bool &IsPending,
bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
+ IsPending = false;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -343,8 +399,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
VGPRPressure = T->getPressure().getArchVGPRNum();
}
}
- ReadyQueue &Q = Zone.Available;
- for (SUnit *SU : Q) {
+ LLVM_DEBUG(dbgs() << "Available Q:\n");
+ ReadyQueue &AQ = Zone.Available;
+ for (SUnit *SU : AQ) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
@@ -356,27 +413,55 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+ Cand.setBest(TryCand);
+ } else {
+ printCandidateDecision(TryCand, Cand);
+ }
+ }
+
+ if (!shouldCheckPending(Zone, SchedModel))
+ return;
+
+ LLVM_DEBUG(dbgs() << "Pending Q:\n");
+ ReadyQueue &PQ = Zone.Pending;
+ for (SUnit *SU : PQ) {
+
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+ VGPRPressure, IsBottomUp);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ tryPendingCandidate(Cand, TryCand, ZoneArg);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+ IsPending = true;
Cand.setBest(TryCand);
- LLVM_DEBUG(traceCandidate(Cand));
+ } else {
+ printCandidateDecision(TryCand, Cand);
}
}
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
-SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
+ bool &PickedPending) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
- if (SUnit *SU = Bot.pickOnlyChoice()) {
+ if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
IsTopNode = false;
return SU;
}
- if (SUnit *SU = Top.pickOnlyChoice()) {
+ if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
IsTopNode = true;
return SU;
}
- // Set the bottom-up policy based on the state of the current bottom zone and
- // the instructions outside the zone, including the top zone.
+ // Set the bottom-up policy based on the state of the current bottom zone
+ // and the instructions outside the zone, including the top zone.
CandPolicy BotPolicy;
setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
// Set the top-down policy based on the state of the current top zone and
@@ -384,12 +469,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
CandPolicy TopPolicy;
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+ bool BotPending = false;
// See if BotCand is still valid (because we previously scheduled from Top).
LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
+ BotPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
@@ -399,6 +486,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
+ BotPending,
/*IsBottomUp=*/true);
assert(TCand.SU == BotCand.SU &&
"Last pick result should correspond to re-picking right now");
@@ -406,12 +494,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
#endif
}
+ bool TopPending = false;
// Check if the top Q has a better candidate.
LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
+ TopPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
@@ -421,6 +511,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
+ TopPending,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
@@ -431,12 +522,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
- SchedCandidate Cand = BotCand;
- TopCand.Reason = NoCand;
- tryCandidate(Cand, TopCand, nullptr);
- if (TopCand.Reason != NoCand) {
- Cand.setBest(TopCand);
+ SchedCandidate Cand = BotPending ? TopCand : BotCand;
+ SchedCandidate TryCand = BotPending ? BotCand : TopCand;
+ PickedPending = BotPending && TopPending;
+
+ TryCand.Reason = NoCand;
+ if (BotPending || TopPending) {
+ PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
+ } else {
+ tryCandidate(Cand, TryCand, nullptr);
+ }
+
+ if (TryCand.Reason != NoCand) {
+ Cand.setBest(TryCand);
}
+
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
@@ -451,35 +551,55 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
+ bool PickedPending;
SUnit *SU;
do {
+ PickedPending = false;
if (RegionPolicy.OnlyTopDown) {
- SU = Top.pickOnlyChoice();
+ SU = pickOnlyChoice(Top, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+ PickedPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
IsTopNode = true;
} else if (RegionPolicy.OnlyBottomUp) {
- SU = Bot.pickOnlyChoice();
+ SU = pickOnlyChoice(Bot, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+ PickedPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
SU = BotCand.SU;
}
IsTopNode = false;
} else {
- SU = pickNodeBidirectional(IsTopNode);
+ SU = pickNodeBidirectional(IsTopNode, PickedPending);
}
} while (SU->isScheduled);
+ if (PickedPending) {
+ unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
+ SchedBoundary &Zone = IsTopNode ? Top : Bot;
+ unsigned CurrentCycle = Zone.getCurrCycle();
+ if (ReadyCycle > CurrentCycle)
+ Zone.bumpCycle(ReadyCycle);
+
+ // FIXME: checkHazard() doesn't give information about which cycle the
+ // hazard will resolve so just keep bumping the cycle by 1. This could be
+ // made more efficient if checkHazard() returned more details.
+ while (Zone.checkHazard(SU))
+ Zone.bumpCycle(Zone.getCurrCycle() + 1);
+
+ Zone.releasePending();
+ }
+
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
@@ -525,6 +645,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
return *std::next(CurrentStage);
}
+bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Bias PhysReg Defs and copies to their uses and defined respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+ }
+
+ return false;
+}
+
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
@@ -960,6 +1121,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
RegionLiveOuts.buildLiveRegMap();
}
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageBeforeScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
+
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
@@ -995,6 +1164,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
Stage->finalizeGCNSchedStage();
}
+
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageAfterScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
}
#ifndef NDEBUG
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8ea4267..975781f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -44,17 +44,32 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
/// heuristics to determine excess/critical pressure sets.
class GCNSchedStrategy : public GenericScheduler {
protected:
- SUnit *pickNodeBidirectional(bool &IsTopNode);
+ SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand, bool IsBottomUp);
+ SchedCandidate &Cand, bool &IsPending,
+ bool IsBottomUp);
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);
+ /// Evaluates instructions in the pending queue using a subset of scheduling
+ /// heuristics.
+ ///
+ /// Instructions that cannot be issued due to hardware constraints are placed
+ /// in the pending queue rather than the available queue, making them normally
+ /// invisible to scheduling heuristics. However, in certain scenarios (such as
+ /// avoiding register spilling), it may be beneficial to consider scheduling
+ /// these not-yet-ready instructions.
+ bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
+
+ void printCandidateDecision(const SchedCandidate &Current,
+ const SchedCandidate &Preferred);
+
std::vector<unsigned> Pressure;
std::vector<unsigned> MaxPressure;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 64e34db..5f6d742 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -260,8 +260,12 @@ class NSAHelper {
}
class MIMGNSAHelper<int num_addrs,
- list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)>
- : NSAHelper<> {
+ list<RegisterOperand> addr_types_in=[]>
+ : NSAHelper<> {
+ list<RegisterOperand> addr_types =
+ !if(!empty(addr_types_in), !listsplat(VGPROp_32, num_addrs),
+ addr_types_in);
+
list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i);
let AddrIns = !dag(ins, addr_types, AddrAsmNames);
let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
@@ -358,7 +362,7 @@ class MIMG_gfx11<int op, dag outs, string dns = "">
// Base class for all NSA MIMG instructions.
// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[],
+ list<RegisterOperand> addr_types=[],
RegisterOperand LastAddrRC = VGPROp_32>
: MIMG<outs, dns>, MIMGe_gfx11<op> {
let SubtargetPredicate = isGFX11Only;
@@ -378,7 +382,7 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
}
class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[]>
+ list<RegisterOperand> addr_types=[]>
: VIMAGE<outs, dns>, VIMAGEe<op> {
let SubtargetPredicate = isGFX12Plus;
let AssemblerPredicate = isGFX12Plus;
@@ -1521,12 +1525,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> {
int VAddrDwords = !srl(Size, 5);
int GFX11PlusNSAAddrs = !if(IsA16, 4, 5);
- RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32);
- list<RegisterClass> GFX11PlusAddrTypes =
- !cond(isBVH8 : [node_ptr_type, VReg_64, VReg_96, VReg_96, VGPR_32],
- isDual : [node_ptr_type, VReg_64, VReg_96, VReg_96, VReg_64],
- IsA16 : [node_ptr_type, VGPR_32, VReg_96, VReg_96],
- true : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
+ RegisterOperand node_ptr_type = !if(Is64, VGPROp_64, VGPROp_32);
+ list<RegisterOperand> GFX11PlusAddrTypes =
+ !cond(isBVH8 : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_32],
+ isDual : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_64],
+ IsA16 : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96],
+ true : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96, VGPROp_96]);
}
class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterOperand AddrRC>
@@ -1552,7 +1556,7 @@ class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterOperand AddrRC>
}
class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
- list<RegisterClass> addr_types>
+ list<RegisterOperand> addr_types>
: MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11",
addr_types> {
let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
@@ -1561,7 +1565,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
bit isDual, bit isBVH8,
- list<RegisterClass> addr_types>
+ list<RegisterOperand> addr_types>
: VIMAGE_gfx12<op.GFX12, !if(!or(isDual, isBVH8),
(outs VReg_320:$vdata, VReg_96:$ray_origin_out,
VReg_96:$ray_dir_out),
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 80e985d..a2841c11 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18168,7 +18168,7 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
return CacheLineAlign;
}
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
assert(N->getOpcode() == ISD::CopyFromReg);
do {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 5e27b37..6dcbced 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1019,7 +1019,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// SMEM and VMEM operations. So there will never be
// outstanding address translations for both SMEM and
// VMEM at the same time.
- setScoreLB(T, CurrScore - 1);
+ setScoreLB(T, getScoreUB(T) - 1);
PendingEvents &= ~(1 << OtherEvent);
}
for (const MachineOperand &Op : Inst.all_uses())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ec5c5bb3..942e784 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -865,22 +865,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (DestReg == AMDGPU::VCC_LO) {
- if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
- .addReg(SrcReg, getKillRegState(KillSrc));
- } else {
+ if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ if (DestReg == AMDGPU::VCC_LO) {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
- .addImm(0)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
}
- return;
- }
-
- if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -898,22 +892,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (DestReg == AMDGPU::VCC) {
- if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
- .addReg(SrcReg, getKillRegState(KillSrc));
- } else {
+ if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
+ if (DestReg == AMDGPU::VCC) {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
- .addImm(0)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
}
- return;
- }
-
- if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -9084,6 +9072,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
MachineOperand &Src1 = Inst.getOperand(2);
const DebugLoc &DL = Inst.getDebugLoc();
+ if (ST.useRealTrue16Insts()) {
+ Register SrcReg0, SrcReg1;
+ if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
+ SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
+ } else {
+ SrcReg0 = Src0.getReg();
+ }
+
+ if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
+ SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
+ } else {
+ SrcReg1 = Src1.getReg();
+ }
+
+ bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
+ bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
+
+ auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_PACK_LL_B32_B16:
+ NewMI
+ .addReg(SrcReg0, 0,
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0,
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_LH_B32_B16:
+ NewMI
+ .addReg(SrcReg0, 0,
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HL_B32_B16:
+ NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0,
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HH_B32_B16:
+ NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ default:
+ llvm_unreachable("unhandled s_pack_* instruction");
+ }
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ return;
+ }
+
switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -10577,6 +10626,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
+ const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+ this]() -> bool {
+ if (CmpValue != 0)
+ return false;
+
+ MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+ if (!Def || Def->getParent() != CmpInstr.getParent())
+ return false;
+
+ bool CanOptimize = false;
+
+ // For S_OP that set SCC = DST!=0, do the transformation
+ //
+ // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
+ if (setsSCCifResultIsNonZero(*Def))
+ CanOptimize = true;
+
+ // s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has
+ // the same value that will be calculated by s_cmp_lg_*
+ //
+ // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
+ // imm), 0)
+ if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
+ bool Op1IsNonZeroImm =
+ Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
+ bool Op2IsZeroImm =
+ Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
+ if (Op1IsNonZeroImm && Op2IsZeroImm)
+ CanOptimize = true;
+ }
+
+ if (!CanOptimize)
+ return false;
+
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI :
+ make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
+ }
+
+ if (MachineOperand *SccDef =
+ Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+ SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+ CmpInstr.eraseFromParent();
+ return true;
+ };
+
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
@@ -10651,16 +10753,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
- I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
- I->killsRegister(AMDGPU::SCC, &RI))
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI :
+ make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
}
MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
CmpInstr.eraseFromParent();
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10704,7 +10810,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false);
+ return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10712,7 +10818,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false);
+ return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e979eeb..ee99a74 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -709,6 +709,30 @@ public:
}
}
+ static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
+ if (!MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+ return false;
+ // Compares have no result
+ if (MI.isCompare())
+ return false;
+ switch (MI.getOpcode()) {
+ default:
+ return true;
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_ADD_U32:
+ case AMDGPU::S_ADDC_U32:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U32:
+ case AMDGPU::S_SUBB_U32:
+ case AMDGPU::S_MIN_I32:
+ case AMDGPU::S_MIN_U32:
+ case AMDGPU::S_MAX_I32:
+ case AMDGPU::S_MAX_U32:
+ case AMDGPU::S_ADDK_I32:
+ return false;
+ }
+ }
+
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
@@ -879,6 +903,11 @@ public:
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
}
+ bool isMFMA(uint16_t Opcode) const {
+ return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ }
+
static bool isDOT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
}
@@ -895,6 +924,10 @@ public:
return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
}
+ bool isMFMAorWMMA(uint16_t Opcode) const {
+ return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode);
+ }
+
static bool isSWMMAC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index eac9fd4..27e5ee9c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3726,6 +3726,23 @@ def : GCNPat <
} // End foreach Ty = ...
} // End AddedComplexity = 1
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat<
+ (i32 (DivergentBinFrag<or>
+ (i32 (zext i16:$src_lo)),
+ (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi)))))
+ )),
+ (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
+>;
+def : GCNPat<
+ (i32 (DivergentBinFrag<or>
+ (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))),
+ (i32 (zext i16:$src_lo))
+ )),
+ (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
+>;
+}
+
let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))),
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index c684f9e..7431e11 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ private:
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
- // Check if the machine instruction being processed is a supported packed
- // instruction.
- bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
-// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
- if (!TII->isNeverCoissue(MI))
- return false;
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_FMA_F32:
- return true;
- default:
- return false;
- }
- llvm_unreachable("Fully covered switch");
-}
-
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
+ uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
+ bool IsUnpackable =
+ !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
if (Instr.isMetaInstruction())
continue;
if ((Instr.isTerminator()) ||
- (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+ (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
(SIInstrInfo::modifiesModeRegister(Instr) &&
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
return;
@@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
return;
}
- if (!isUnpackingSupportedInstr(Instr))
+ if (!IsUnpackable)
continue;
if (canUnpackingClobberRegister(Instr))
@@ -654,7 +637,6 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
InstrsToUnpack.insert(&Instr);
}
- return;
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
@@ -681,7 +663,6 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
HiDstOp.setIsRenamable(DstOp.isRenamable());
I.eraseFromParent();
- return;
}
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
@@ -689,8 +670,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
bool IsHiBits) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
- const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+ const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+ const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
Register DstReg = I.getOperand(0).getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -704,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
- addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
- addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+ addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+ addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
- const MachineOperand *SrcMO3 =
+ const MachineOperand *SrcMO2 =
TII->getNamedOperand(I, AMDGPU::OpName::src2);
unsigned Src2Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
- addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+ addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
}
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers. safe to assign them 0
@@ -789,9 +770,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// TODO: Fold this into previous block, if possible. Evaluate and handle any
// side effects.
+
+ // Perform the extra MF scans only for supported archs
+ if (!ST.hasGFX940Insts())
+ return Changed;
for (MachineBasicBlock &MBB : MF) {
- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
- // to co-issue unpacked instructions with MFMA
+ // Unpack packed instructions overlapped by MFMAs. This allows the
+ // compiler to co-issue unpacked instructions with MFMA
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index be1c883..ebd2e7e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2356,7 +2356,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
AMDGPU::M0)
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
}
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
@@ -2446,7 +2446,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
AMDGPU::M0)
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
}
case AMDGPU::SI_SPILL_V16_RESTORE:
case AMDGPU::SI_SPILL_V32_RESTORE:
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a01a5fd..5e3195b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1697,9 +1697,6 @@ LLVM_READNONE
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
-bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi);
-
-LLVM_READNONE
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi);
LLVM_READNONE