-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp                | 137
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                   |  15
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp          |  10
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h            |   3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll  |  29
5 files changed, 125 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 5f8e32c..709de61 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -80,6 +80,10 @@ public:
bool updateOperand(FoldCandidate &Fold) const;
+ bool canUseImmWithOpSel(FoldCandidate &Fold) const;
+
+ bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
+
bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold) const;
@@ -196,62 +200,85 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
-bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
MachineInstr *MI = Fold.UseMI;
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
- assert(Old.isReg());
+ const uint64_t TSFlags = MI->getDesc().TSFlags;
+ assert(Old.isReg() && Fold.isImm());
- const uint64_t TSFlags = MI->getDesc().TSFlags;
- if (Fold.isImm()) {
- if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
- AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
- ST->hasInv2PiInlineImm())) {
- if (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))
- return false; // Prevent further folding of this operand without opsel.
-
- // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
- // already set.
- unsigned Opcode = MI->getOpcode();
- int OpNo = MI->getOperandNo(&Old);
- int ModIdx = -1;
- if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
- ModIdx = AMDGPU::OpName::src0_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
- ModIdx = AMDGPU::OpName::src1_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
- ModIdx = AMDGPU::OpName::src2_modifiers;
- assert(ModIdx != -1);
- ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
- MachineOperand &Mod = MI->getOperand(ModIdx);
- unsigned Val = Mod.getImm();
- if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
- // Only apply the following transformation if that operand requires
- // a packed immediate.
- switch (TII->get(Opcode).operands()[OpNo].OperandType) {
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- // If upper part is all zero we do not need op_sel_hi.
- if (!isUInt<16>(Fold.ImmToFold)) {
- if (!(Fold.ImmToFold & 0xffff)) {
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
- return true;
- }
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
- return true;
- }
- break;
- default:
- break;
- }
- }
- }
+ if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
+ (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
+ isUInt<16>(Fold.ImmToFold) ||
+ !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+ return false;
+
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+ uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
+ switch (OpType) {
+ default:
+ return false;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ break;
+ }
+
+ return true;
+}
+
+bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
+ MachineInstr *MI = Fold.UseMI;
+ MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+
+ // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
+ // already set.
+ int ModIdx = -1;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ ModIdx = AMDGPU::OpName::src0_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ ModIdx = AMDGPU::OpName::src1_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ ModIdx = AMDGPU::OpName::src2_modifiers;
+ assert(ModIdx != -1);
+ ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
+ MachineOperand &Mod = MI->getOperand(ModIdx);
+ unsigned Val = Mod.getImm();
+ if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ return false;
+
+  // canUseImmWithOpSel already verified that this operand takes a packed
+  // immediate. If the low half is all zero, fold the high half and set
+  // op_sel to select it; otherwise fold the low half.
+ if (!(Fold.ImmToFold & 0xffff)) {
+ MachineOperand New =
+ MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
}
+ MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
+ return true;
+}
+
+bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+ MachineInstr *MI = Fold.UseMI;
+ MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+ assert(Old.isReg());
+
+ if (Fold.isImm() && canUseImmWithOpSel(Fold))
+ return tryFoldImmWithOpSel(Fold);
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();
@@ -383,7 +410,13 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
return false;
};
- if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+ bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
+ if (!IsLegal && OpToFold->isImm()) {
+ FoldCandidate Fold(MI, OpNo, OpToFold);
+ IsLegal = canUseImmWithOpSel(Fold);
+ }
+
+ if (!IsLegal) {
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned NewOpc = macToMad(Opc);
if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
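For reference, here is a minimal standalone sketch (not part of the patch) of the half-selection rule that tryFoldImmWithOpSel implements. The helper name splitPackedImm and the OpSelFold struct are invented for illustration, and the TII->isOperandLegal check is omitted:

#include <cstdint>

// Sketch: models the two fold cases for a packed 32-bit immediate once
// canUseImmWithOpSel has accepted it (packed non-MAI operand, no DOT
// op_sel hazard, value does not already fit in 16 bits).
struct OpSelFold {
  uint16_t Lit; // 16-bit literal that ends up encoded in the operand
  bool OpSel0;  // SISrcMods::OP_SEL_0 after the fold
  bool OpSelHi; // SISrcMods::OP_SEL_1 after the fold (always cleared)
};

static OpSelFold splitPackedImm(uint32_t Imm) {
  OpSelFold F;
  F.OpSelHi = false;                // both cases clear OP_SEL_1
  if ((Imm & 0xffff) == 0) {
    F.Lit = uint16_t(Imm >> 16);    // low half zero: encode the high half
    F.OpSel0 = true;                // ...and set OP_SEL_0 to select it
  } else {
    F.Lit = uint16_t(Imm & 0xffff); // otherwise encode the low half
    F.OpSel0 = false;
  }
  return F;
}
// splitPackedImm(0x3C000000) == {0x3C00, OpSel0=true,  OpSelHi=false}
// splitPackedImm(0x40004000) == {0x4000, OpSel0=false, OpSelHi=false}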
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 56adf26..328bc66 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4144,12 +4144,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- // This suffers the same problem as the scalar 16-bit cases.
- return AMDGPU::isInlinableIntLiteralV216(Imm);
+ return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
+ AMDGPU::isInlinableIntLiteral((int16_t)Imm);
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
// where 16-bit instructions are not legal.
@@ -4162,12 +4165,6 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
- uint32_t Trunc = static_cast<uint32_t>(Imm);
- return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
- }
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
return false;
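The rewritten V2INT16 case reduces to a simple predicate. A self-contained sketch of it, with a local mirror of AMDGPU::isInlinableIntLiteral's -16..64 range (the function names here are ours):

#include <cstdint>

// Mirrors AMDGPU::isInlinableIntLiteral: inline integers are -16..64.
static bool isInlinableIntLiteral(int64_t L) { return L >= -16 && L <= 64; }

// New OPERAND_REG_IMM_V2INT16 rule: the immediate must fit in 16 bits
// (signed or unsigned) and its 16-bit truncation must be inlinable.
static bool isInlineV2Int16(int64_t Imm) {
  bool Fits16 = (Imm >= INT16_MIN && Imm <= INT16_MAX) || // isInt<16>
                (Imm >= 0 && Imm <= UINT16_MAX);          // isUInt<16>
  return Fits16 && isInlinableIntLiteral(int16_t(Imm));
}
// isInlineV2Int16(64)      -> true   (inline constant)
// isInlineV2Int16(0xFFF0)  -> true   (truncates to -16)
// isInlineV2Int16(100)     -> false  (fits, but outside -16..64)
// isInlineV2Int16(0x10001) -> false  (needs more than 16 bits)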
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index bc5cedf..627132c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2507,6 +2507,16 @@ bool isInlinableIntLiteralV216(int32_t Literal) {
return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
}
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) {
+ switch (OpType) {
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ return isInlinableLiteralV216(Literal, HasInv2Pi);
+ default:
+ return isInlinableIntLiteralV216(Literal);
+ }
+}
+
bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
assert(HasInv2Pi);
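A hypothetical call site for the new overload, to show the OpType dispatch (the enumerators are real; the wrapper function and values are ours, and the usual AMDGPUBaseInfo.h/SIDefines.h includes are assumed):

// Sketch: FP operand types get the f16 inline-constant check, everything
// else falls back to the integer V216 rule.
static void isInlinableLiteralV216Example() {
  int32_t Packed = static_cast<int32_t>(0xBC00BC00); // <-1.0, -1.0> as f16
  bool FP  = AMDGPU::isInlinableLiteralV216(Packed, /*HasInv2Pi=*/true,
                                            AMDGPU::OPERAND_REG_IMM_V2FP16);
  bool Int = AMDGPU::isInlinableLiteralV216(Packed, /*HasInv2Pi=*/true,
                                            AMDGPU::OPERAND_REG_IMM_V2INT16);
  // FP is true: both halves are the inline f16 constant -1.0.
  // Int is false: 0xBC00 as an integer is far outside -16..64.
  (void)FP;
  (void)Int;
}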
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b60c0fc..7b99558 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1287,6 +1287,9 @@ LLVM_READNONE
bool isInlinableIntLiteralV216(int32_t Literal);
LLVM_READNONE
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType);
+
+LLVM_READNONE
bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
LLVM_READNONE
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 3d3bf7b..54bd78e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -72,16 +72,29 @@ entry:
ret void
}
-; FIXME: This test violates constant bus restriction.
+; Make sure we do not violate the constant bus restriction with 3 scalar inputs and a seemingly inlinable literal.
define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, 0x10001, s1
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
+; SDAG-GFX11: ; %bb.0: ; %entry
+; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, s1
+; SDAG-GFX11-NEXT: s_mov_b32 s1, 0x10001
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; SDAG-GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, s1, v2
+; SDAG-GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-NEXT: s_nop 0
+; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-GFX11-NEXT: s_endpgm
+;
+; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
+; GISEL-GFX11: ; %bb.0: ; %entry
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, 0x10001
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, v2, s1
+; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX11-NEXT: s_nop 0
+; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
<2 x i16> inreg %a,
i16 inreg %c) {