 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp               | 137
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                  |  15
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp         |  10
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h           |   3
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll |  29
 5 files changed, 125 insertions(+), 69 deletions(-)
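Context for the diff below: AMDGPU treats a packed v2i16 literal as inlinable only when both 16-bit halves are equal and that shared half is itself an inline constant, while the patched SIInstrInfo::isInlineConstant additionally insists the raw value fit in 16 bits. A minimal standalone sketch of the two rules, assuming the inline integer range -16..64 and simplified signatures (an illustration, not the LLVM API):

#include <cassert>
#include <cstdint>

// Assumed model of AMDGPU::isInlinableIntLiteral: integer inline
// constants cover -16..64 inclusive.
static bool isInlinableIntLiteral(int64_t Literal) {
  return Literal >= -16 && Literal <= 64;
}

// Mirrors the isInlinableIntLiteralV216 body visible in the
// AMDGPUBaseInfo.cpp hunk: both halves must match and be inlinable.
static bool isInlinableIntLiteralV216(int32_t Literal) {
  int16_t Lo16 = static_cast<int16_t>(Literal);
  int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
  return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
}

// Model of the new v2i16 rule in SIInstrInfo::isInlineConstant: the
// literal must fit in 16 bits before its truncation is tested.
static bool isInlineV2Int16(int64_t Imm) {
  bool Fits16 = (Imm >= INT16_MIN && Imm <= INT16_MAX) ||
                (Imm >= 0 && Imm <= UINT16_MAX);
  return Fits16 && isInlinableIntLiteral(static_cast<int16_t>(Imm));
}

int main() {
  assert(isInlinableIntLiteralV216(0x00010001)); // splat <1, 1>: "inlinable"
  assert(!isInlineV2Int16(0x00010001));          // new rule: too wide
  assert(isInlineV2Int16(64) && !isInlineV2Int16(65));
  return 0;
}

The splat 0x10001 passes the old splat test but fails the new width test, which is the case the updated fdot2.bf16 test at the end of the diff pins down: accepting it as an inline operand would have placed a third scalar value on the constant bus.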
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 5f8e32c..709de61 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -80,6 +80,10 @@ public:
 
   bool updateOperand(FoldCandidate &Fold) const;
 
+  bool canUseImmWithOpSel(FoldCandidate &Fold) const;
+
+  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
+
   bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                         MachineInstr *MI, unsigned OpNo,
                         MachineOperand *OpToFold) const;
@@ -196,62 +200,85 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
 
-bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
-  assert(Old.isReg());
+  const uint64_t TSFlags = MI->getDesc().TSFlags;
+
+  assert(Old.isReg() && Fold.isImm());
 
-  const uint64_t TSFlags = MI->getDesc().TSFlags;
-  if (Fold.isImm()) {
-    if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
-        AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
-                                      ST->hasInv2PiInlineImm())) {
-      if (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))
-        return false; // Prevent further folding of this operand without opsel.
-
-      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
-      // already set.
-      unsigned Opcode = MI->getOpcode();
-      int OpNo = MI->getOperandNo(&Old);
-      int ModIdx = -1;
-      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
-        ModIdx = AMDGPU::OpName::src0_modifiers;
-      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
-        ModIdx = AMDGPU::OpName::src1_modifiers;
-      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
-        ModIdx = AMDGPU::OpName::src2_modifiers;
-      assert(ModIdx != -1);
-      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
-      MachineOperand &Mod = MI->getOperand(ModIdx);
-      unsigned Val = Mod.getImm();
-      if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
-        // Only apply the following transformation if that operand requires
-        // a packed immediate.
-        switch (TII->get(Opcode).operands()[OpNo].OperandType) {
-        case AMDGPU::OPERAND_REG_IMM_V2FP16:
-        case AMDGPU::OPERAND_REG_IMM_V2INT16:
-        case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-        case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
-          // If upper part is all zero we do not need op_sel_hi.
-          if (!isUInt<16>(Fold.ImmToFold)) {
-            if (!(Fold.ImmToFold & 0xffff)) {
-              Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
-              Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-              Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
-              return true;
-            }
-            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-            Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
-            return true;
-          }
-          break;
-        default:
-          break;
-        }
-      }
-    }
+  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
+      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
+      isUInt<16>(Fold.ImmToFold) ||
+      !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+    return false;
+
+  unsigned Opcode = MI->getOpcode();
+  int OpNo = MI->getOperandNo(&Old);
+  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
+  switch (OpType) {
+  default:
+    return false;
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_IMM_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+    break;
+  }
+
+  return true;
+}
+
+bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
+  MachineInstr *MI = Fold.UseMI;
+  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+  unsigned Opcode = MI->getOpcode();
+  int OpNo = MI->getOperandNo(&Old);
+
+  // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
+  // already set.
+  int ModIdx = -1;
+  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+    ModIdx = AMDGPU::OpName::src0_modifiers;
+  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+    ModIdx = AMDGPU::OpName::src1_modifiers;
+  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+    ModIdx = AMDGPU::OpName::src2_modifiers;
+  assert(ModIdx != -1);
+  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
+  MachineOperand &Mod = MI->getOperand(ModIdx);
+  unsigned Val = Mod.getImm();
+  if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+    return false;
+
+  // Only apply the following transformation if that operand requires
+  // a packed immediate.
+  // If upper part is all zero we do not need op_sel_hi.
+  if (!(Fold.ImmToFold & 0xffff)) {
+    MachineOperand New =
+        MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
+    if (!TII->isOperandLegal(*MI, OpNo, &New))
+      return false;
+    Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+    Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+    Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+    return true;
   }
+
+  MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
+  if (!TII->isOperandLegal(*MI, OpNo, &New))
+    return false;
+  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+  Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
+  return true;
+}
+
+bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+  MachineInstr *MI = Fold.UseMI;
+  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+  assert(Old.isReg());
+
+  if (Fold.isImm() && canUseImmWithOpSel(Fold))
+    return tryFoldImmWithOpSel(Fold);
 
   if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
     MachineBasicBlock *MBB = MI->getParent();
@@ -383,7 +410,13 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     return false;
   };
 
-  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
+  if (!IsLegal && OpToFold->isImm()) {
+    FoldCandidate Fold(MI, OpNo, OpToFold);
+    IsLegal = canUseImmWithOpSel(Fold);
+  }
+
+  if (!IsLegal) {
     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned NewOpc = macToMad(Opc);
     if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 56adf26..328bc66 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4144,12 +4144,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
-    // This suffers the same problem as the scalar 16-bit cases.
-    return AMDGPU::isInlinableIntLiteralV216(Imm);
+    return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
+           AMDGPU::isInlinableIntLiteral((int16_t)Imm);
   case AMDGPU::OPERAND_REG_IMM_FP16:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
       // A few special case instructions have 16-bit operands on subtargets
       // where 16-bit instructions are not legal.
@@ -4162,12 +4165,6 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
     return false;
   }
 
-  case AMDGPU::OPERAND_REG_IMM_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
-    uint32_t Trunc = static_cast<uint32_t>(Imm);
-    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
-  }
   case AMDGPU::OPERAND_KIMM32:
   case AMDGPU::OPERAND_KIMM16:
     return false;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index bc5cedf..627132c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2507,6 +2507,16 @@ bool isInlinableIntLiteralV216(int32_t Literal) {
   return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
 }
 
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) {
+  switch (OpType) {
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+    return isInlinableLiteralV216(Literal, HasInv2Pi);
+  default:
+    return isInlinableIntLiteralV216(Literal);
+  }
+}
+
 bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
   assert(HasInv2Pi);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b60c0fc..7b99558 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1287,6 +1287,9 @@ LLVM_READNONE
 bool isInlinableIntLiteralV216(int32_t Literal);
 
 LLVM_READNONE
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType);
+
+LLVM_READNONE
 bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 3d3bf7b..54bd78e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -72,16 +72,29 @@ entry:
   ret void
 }
 
-; FIXME: This test violates constant bus restriction.
+; Make sure we do not violate the constant bus restriction with three scalar inputs and a seemingly inlinable literal.
 define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    v_dot2_bf16_bf16 v2, s0, 0x10001, s1
-; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
+; SDAG-GFX11:       ; %bb.0: ; %entry
+; SDAG-GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; SDAG-GFX11-NEXT:    s_mov_b32 s1, 0x10001
+; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; SDAG-GFX11-NEXT:    v_dot2_bf16_bf16 v2, s0, s1, v2
+; SDAG-GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-NEXT:    s_nop 0
+; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-GFX11-NEXT:    s_endpgm
+;
+; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
+; GISEL-GFX11:       ; %bb.0: ; %entry
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v2, 0x10001
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_dot2_bf16_bf16 v2, s0, v2, s1
+; GISEL-GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GISEL-GFX11-NEXT:    s_nop 0
+; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-GFX11-NEXT:    s_endpgm
 ptr addrspace(1) %r, <2 x i16> inreg %a, i16 inreg %c) {
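Finally, a standalone sketch of the arithmetic tryFoldImmWithOpSel performs on a 32-bit packed immediate. The OP_SEL bit positions here are placeholders (the real bits are SISrcMods::OP_SEL_0/OP_SEL_1 from SIDefines.h), and the isOperandLegal re-checks shown in the hunk above are omitted; this is an illustration under those assumptions, not the LLVM code:

#include <cassert>
#include <cstdint>

// Placeholder bit positions standing in for SISrcMods::OP_SEL_0/_1.
constexpr unsigned OP_SEL_0 = 1u << 2;
constexpr unsigned OP_SEL_1 = 1u << 3;

struct OpSelFold {
  uint16_t EncodedImm; // 16-bit half actually encoded in the operand
  unsigned SrcMods;    // updated source-modifier bits
};

// Assumes the preconditions canUseImmWithOpSel establishes: Imm does not
// fit in 16 bits, the operand is a packed v2 type, op_sel is clear and
// op_sel_hi is set, and the literal is foldable (low half zero or both
// halves equal).
OpSelFold foldImmWithOpSel(uint32_t Imm, unsigned SrcMods) {
  assert(!(SrcMods & OP_SEL_0) && (SrcMods & OP_SEL_1));
  if ((Imm & 0xffff) == 0) {
    // Low half is zero: encode the high half. op_sel points the low lane
    // at the literal's (zero) upper bits, and clearing op_sel_hi points
    // the high lane at the encoded 16-bit value.
    return {static_cast<uint16_t>(Imm >> 16),
            (SrcMods | OP_SEL_0) & ~OP_SEL_1};
  }
  // Both halves equal: encode the low half and clear op_sel_hi so both
  // lanes read the same 16-bit value.
  return {static_cast<uint16_t>(Imm & 0xffff), SrcMods & ~OP_SEL_1};
}

int main() {
  // <lo=0, hi=0x4200>: fold to the high half, selected via op_sel.
  OpSelFold A = foldImmWithOpSel(0x42000000u, OP_SEL_1);
  assert(A.EncodedImm == 0x4200 && (A.SrcMods & OP_SEL_0) &&
         !(A.SrcMods & OP_SEL_1));
  // <lo=1, hi=1>: fold to the low half, broadcast by clearing op_sel_hi.
  OpSelFold B = foldImmWithOpSel(0x00010001u, OP_SEL_1);
  assert(B.EncodedImm == 0x0001 && B.SrcMods == 0);
  return 0;
}

Literals that already fit in 16 bits never take this path: canUseImmWithOpSel rejects them up front, so they fold as ordinary immediates.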