 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp               | 137
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                  |  15
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp         |  10
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h           |   3
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll |  29
 5 files changed, 125 insertions(+), 69 deletions(-)
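Context for the diff below: AMDGPU treats a packed v2i16 literal as inlinable only when both 16-bit halves are equal and that shared half is itself an inline constant, while the patched SIInstrInfo::isInlineConstant additionally insists the raw value fit in 16 bits. A minimal standalone sketch of the two rules, assuming the inline integer range -16..64 and simplified signatures (an illustration, not the LLVM API):

#include <cassert>
#include <cstdint>

// Assumed model of AMDGPU::isInlinableIntLiteral: integer inline
// constants cover -16..64 inclusive.
static bool isInlinableIntLiteral(int64_t Literal) {
  return Literal >= -16 && Literal <= 64;
}

// Mirrors the isInlinableIntLiteralV216 body visible in the
// AMDGPUBaseInfo.cpp hunk: both halves must match and be inlinable.
static bool isInlinableIntLiteralV216(int32_t Literal) {
  int16_t Lo16 = static_cast<int16_t>(Literal);
  int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
  return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
}

// Model of the new v2i16 rule in SIInstrInfo::isInlineConstant: the
// literal must fit in 16 bits before its truncation is tested.
static bool isInlineV2Int16(int64_t Imm) {
  bool Fits16 = (Imm >= INT16_MIN && Imm <= INT16_MAX) ||
                (Imm >= 0 && Imm <= UINT16_MAX);
  return Fits16 && isInlinableIntLiteral(static_cast<int16_t>(Imm));
}

int main() {
  assert(isInlinableIntLiteralV216(0x00010001)); // splat <1, 1>: "inlinable"
  assert(!isInlineV2Int16(0x00010001));          // new rule: too wide
  assert(isInlineV2Int16(64) && !isInlineV2Int16(65));
  return 0;
}

The splat 0x10001 passes the old splat test but fails the new width test, which is the case the updated fdot2.bf16 test at the end of the diff pins down: accepting it as an inline operand would have placed a third scalar value on the constant bus.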
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 5f8e32c..709de61 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -80,6 +80,10 @@ public:
 
   bool updateOperand(FoldCandidate &Fold) const;
 
+  bool canUseImmWithOpSel(FoldCandidate &Fold) const;
+
+  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
+
   bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                         MachineInstr *MI, unsigned OpNo,
                         MachineOperand *OpToFold) const;
@@ -196,62 +200,85 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
 
-bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
-  assert(Old.isReg());
+  const uint64_t TSFlags = MI->getDesc().TSFlags;
+
+  assert(Old.isReg() && Fold.isImm());
 
-  const uint64_t TSFlags = MI->getDesc().TSFlags;
-  if (Fold.isImm()) {
-    if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
-        AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
-                                      ST->hasInv2PiInlineImm())) {
-      if (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))
-        return false; // Prevent further folding of this operand without opsel.
-
-      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
-      // already set.
-      unsigned Opcode = MI->getOpcode();
-      int OpNo = MI->getOperandNo(&Old);
-      int ModIdx = -1;
-      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
-        ModIdx = AMDGPU::OpName::src0_modifiers;
-      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
-        ModIdx = AMDGPU::OpName::src1_modifiers;
-      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
-        ModIdx = AMDGPU::OpName::src2_modifiers;
-      assert(ModIdx != -1);
-      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
-      MachineOperand &Mod = MI->getOperand(ModIdx);
-      unsigned Val = Mod.getImm();
-      if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
-        // Only apply the following transformation if that operand requires
-        // a packed immediate.
-        switch (TII->get(Opcode).operands()[OpNo].OperandType) {
-        case AMDGPU::OPERAND_REG_IMM_V2FP16:
-        case AMDGPU::OPERAND_REG_IMM_V2INT16:
-        case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-        case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
-          // If upper part is all zero we do not need op_sel_hi.
-          if (!isUInt<16>(Fold.ImmToFold)) {
-            if (!(Fold.ImmToFold & 0xffff)) {
-              Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
-              Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-              Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
-              return true;
-            }
-            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-            Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
-            return true;
-          }
-          break;
-        default:
-          break;
-        }
-      }
-    }
+  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
+      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
+      isUInt<16>(Fold.ImmToFold) ||
+      !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+    return false;
+
+  unsigned Opcode = MI->getOpcode();
+  int OpNo = MI->getOperandNo(&Old);
+  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
+  switch (OpType) {
+  default:
+    return false;
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_IMM_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+    break;
+  }
+
+  return true;
+}
+
+bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
+  MachineInstr *MI = Fold.UseMI;
+  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+  unsigned Opcode = MI->getOpcode();
+  int OpNo = MI->getOperandNo(&Old);
+
+  // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
+  // already set.
+  int ModIdx = -1;
+  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+    ModIdx = AMDGPU::OpName::src0_modifiers;
+  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+    ModIdx = AMDGPU::OpName::src1_modifiers;
+  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+    ModIdx = AMDGPU::OpName::src2_modifiers;
+  assert(ModIdx != -1);
+  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
+  MachineOperand &Mod = MI->getOperand(ModIdx);
+  unsigned Val = Mod.getImm();
+  if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+    return false;
+
+  // Only apply the following transformation if that operand requires
+  // a packed immediate.
+  // If upper part is all zero we do not need op_sel_hi.
+  if (!(Fold.ImmToFold & 0xffff)) {
+    MachineOperand New =
+        MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
+    if (!TII->isOperandLegal(*MI, OpNo, &New))
+      return false;
+    Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+    Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+    Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+    return true;
   }
+
+  MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
+  if (!TII->isOperandLegal(*MI, OpNo, &New))
+    return false;
+  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+  Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
+  return true;
+}
+
+bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+  MachineInstr *MI = Fold.UseMI;
+  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+  assert(Old.isReg());
+
+  if (Fold.isImm() && canUseImmWithOpSel(Fold))
+    return tryFoldImmWithOpSel(Fold);
 
   if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
     MachineBasicBlock *MBB = MI->getParent();
@@ -383,7 +410,13 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     return false;
   };
 
-  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
+  if (!IsLegal && OpToFold->isImm()) {
+    FoldCandidate Fold(MI, OpNo, OpToFold);
+    IsLegal = canUseImmWithOpSel(Fold);
+  }
+
+  if (!IsLegal) {
     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned NewOpc = macToMad(Opc);
     if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 56adf26..328bc66 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4144,12 +4144,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
-    // This suffers the same problem as the scalar 16-bit cases.
-    return AMDGPU::isInlinableIntLiteralV216(Imm);
+    return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
+           AMDGPU::isInlinableIntLiteral((int16_t)Imm);
   case AMDGPU::OPERAND_REG_IMM_FP16:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
       // A few special case instructions have 16-bit operands on subtargets
       // where 16-bit instructions are not legal.
@@ -4162,12 +4165,6 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
     return false;
   }
 
-  case AMDGPU::OPERAND_REG_IMM_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
-    uint32_t Trunc = static_cast<uint32_t>(Imm);
-    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
-  }
   case AMDGPU::OPERAND_KIMM32:
   case AMDGPU::OPERAND_KIMM16:
     return false;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index bc5cedf..627132c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2507,6 +2507,16 @@ bool isInlinableIntLiteralV216(int32_t Literal) {
   return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
 }
 
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) {
+  switch (OpType) {
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+    return isInlinableLiteralV216(Literal, HasInv2Pi);
+  default:
+    return isInlinableIntLiteralV216(Literal);
+  }
+}
+
 bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
   assert(HasInv2Pi);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b60c0fc..7b99558 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1287,6 +1287,9 @@ LLVM_READNONE
 bool isInlinableIntLiteralV216(int32_t Literal);
 
 LLVM_READNONE
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType);
+
+LLVM_READNONE
 bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 3d3bf7b..54bd78e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -72,16 +72,29 @@ entry:
   ret void
 }
 
-; FIXME: This test violates constant bus restriction.
+; Make sure we do not violate the constant bus restriction with three scalar inputs and a seemingly inlinable literal.
 define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    v_dot2_bf16_bf16 v2, s0, 0x10001, s1
-; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
+; SDAG-GFX11:       ; %bb.0: ; %entry
+; SDAG-GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; SDAG-GFX11-NEXT:    s_mov_b32 s1, 0x10001
+; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; SDAG-GFX11-NEXT:    v_dot2_bf16_bf16 v2, s0, s1, v2
+; SDAG-GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-NEXT:    s_nop 0
+; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-GFX11-NEXT:    s_endpgm
+;
+; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
+; GISEL-GFX11:       ; %bb.0: ; %entry
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v2, 0x10001
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_dot2_bf16_bf16 v2, s0, v2, s1
+; GISEL-GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GISEL-GFX11-NEXT:    s_nop 0
+; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-GFX11-NEXT:    s_endpgm
 ptr addrspace(1) %r, <2 x i16> inreg %a, i16 inreg %c) {
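Finally, a standalone sketch of the arithmetic tryFoldImmWithOpSel performs on a 32-bit packed immediate. The OP_SEL bit positions here are placeholders (the real bits are SISrcMods::OP_SEL_0/OP_SEL_1 from SIDefines.h), and the isOperandLegal re-checks shown in the hunk above are omitted; this is an illustration under those assumptions, not the LLVM code:

#include <cassert>
#include <cstdint>

// Placeholder bit positions standing in for SISrcMods::OP_SEL_0/_1.
constexpr unsigned OP_SEL_0 = 1u << 2;
constexpr unsigned OP_SEL_1 = 1u << 3;

struct OpSelFold {
  uint16_t EncodedImm; // 16-bit half actually encoded in the operand
  unsigned SrcMods;    // updated source-modifier bits
};

// Assumes the preconditions canUseImmWithOpSel establishes: Imm does not
// fit in 16 bits, the operand is a packed v2 type, op_sel is clear and
// op_sel_hi is set, and the literal is foldable (low half zero or both
// halves equal).
OpSelFold foldImmWithOpSel(uint32_t Imm, unsigned SrcMods) {
  assert(!(SrcMods & OP_SEL_0) && (SrcMods & OP_SEL_1));
  if ((Imm & 0xffff) == 0) {
    // Low half is zero: encode the high half. op_sel points the low lane
    // at the literal's (zero) upper bits, and clearing op_sel_hi points
    // the high lane at the encoded 16-bit value.
    return {static_cast<uint16_t>(Imm >> 16),
            (SrcMods | OP_SEL_0) & ~OP_SEL_1};
  }
  // Both halves equal: encode the low half and clear op_sel_hi so both
  // lanes read the same 16-bit value.
  return {static_cast<uint16_t>(Imm & 0xffff), SrcMods & ~OP_SEL_1};
}

int main() {
  // <lo=0, hi=0x4200>: fold to the high half, selected via op_sel.
  OpSelFold A = foldImmWithOpSel(0x42000000u, OP_SEL_1);
  assert(A.EncodedImm == 0x4200 && (A.SrcMods & OP_SEL_0) &&
         !(A.SrcMods & OP_SEL_1));
  // <lo=1, hi=1>: fold to the low half, broadcast by clearing op_sel_hi.
  OpSelFold B = foldImmWithOpSel(0x00010001u, OP_SEL_1);
  assert(B.EncodedImm == 0x0001 && B.SrcMods == 0);
  return 0;
}

Literals that already fit in 16 bits never take this path: canUseImmWithOpSel rejects them up front, so they fold as ordinary immediates.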