author     Nicolai Hähnle <nicolai.haehnle@amd.com>  2024-01-04 00:10:15 +0100
committer  GitHub <noreply@github.com>  2024-01-04 00:10:15 +0100
commit     49b492048af2b2093aaed899c0bbd6d740aad83c
tree       1f4f9b2ba68133bd36b607d09abed154b32ee783
parent     49029f926d359075d59ad4aec2d01a21d9514b02
AMDGPU: Fix packed 16-bit inline constants (#76522)
Consistently treat packed 16-bit operands as 32-bit values, because that's really what they are. The attempt to treat them differently was ultimately incorrect and led to miscompiles, e.g. when using non-splat constants such as (1, 0) as operands.

Recognize 32-bit float constants for i/u16 instructions. This is a bit odd conceptually, but it matches HW behavior and SP3.

Remove isFoldableLiteralV216; there was too much magic in the dependency between it and its use in SIFoldOperands. Instead, we now simply rely on checking whether a constant is an inline constant, and trying a bunch of permutations of the low and high halves. This is more obviously correct and leads to some new cases where inline constants are used, as shown by tests.

Move the logic for switching packed add vs. sub into SIFoldOperands. This has two benefits: all logic that optimizes for inline constants in packed math is now in one place; and it applies to both the SelectionDAG and GISel paths.

Disable the use of opsel with v_dot* instructions on gfx11. They are documented to ignore opsel on src0 and src1. It may be interesting to re-enable the use of opsel on src2 as a future optimization.

A similar "proper" fix of what inline constants mean could potentially be applied to unpacked 16-bit ops. However, it's less clear what the benefit would be, and there are surely places where we'd have to carefully audit whether values are properly sign- or zero-extended. It is best to keep such a change separate.

Fixes: Corruption in FSR 2.0 (latent bug exposed by an LLPC change)
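As a rough illustration of the permutation search described above, here is a minimal standalone sketch; it is not the in-tree code. isInlinableV2I16 is a hypothetical stand-in that models only the integer inline range [-16, 64] and ignores the 32-bit float encodings (e.g. 1.0f = 0x3F800000) that the real AMDGPU::isInlinableLiteralV216 also accepts after this change.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for AMDGPU::isInlinableLiteralV216, reduced to the
// integer case: the whole 32-bit value must equal a sign-extended inline
// constant in [-16, 64].
static bool isInlinableV2I16(uint32_t Imm) {
  int32_t S = static_cast<int32_t>(Imm);
  return S >= -16 && S <= 64;
}

// Sketch of the search in tryFoldImmWithOpSel: try the packed value as-is,
// then (for splats) the low half zero- and sign-extended, then the two
// halves swapped, each permutation undone at use time via op_sel.
static bool tryFoldToInline(uint32_t Imm) {
  if (isInlinableV2I16(Imm)) {
    printf("0x%08x: inline as-is\n", (unsigned)Imm);
    return true;
  }

  uint16_t Lo = static_cast<uint16_t>(Imm);
  uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
  if (Lo == Hi) {
    // A splat can broadcast the constant's low half by clearing op_sel_hi.
    if (isInlinableV2I16(Lo) ||
        isInlinableV2I16(static_cast<uint32_t>(
            static_cast<int32_t>(static_cast<int16_t>(Lo))))) {
      printf("0x%08x: splat of inline value 0x%04x\n", (unsigned)Imm, Lo);
      return true;
    }
  } else {
    // Swap the halves and select them back in the right order with op_sel.
    uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
    if (isInlinableV2I16(Swapped)) {
      printf("0x%08x: halves swapped to 0x%08x\n", (unsigned)Imm,
             (unsigned)Swapped);
      return true;
    }
  }

  printf("0x%08x: needs a 32-bit literal\n", (unsigned)Imm);
  return false;
}

int main() {
  tryFoldToInline(0x00000001); // (hi=0, lo=1): inline as-is
  tryFoldToInline(0x00010000); // (hi=1, lo=0): swapped halves yield 1
  tryFoldToInline(0xFFF0FFF0); // splat of -16: sign-extends to inline -16
  tryFoldToInline(0x00410041); // splat of 65: no permutation helps
}

Per the commit message, a non-splat constant such as (1, 0) previously risked miscompilation; with the permutation search it simply folds to the inline constant 1 plus an op_sel swap.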
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 148
1 file changed, 119 insertions(+), 29 deletions(-)
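The tail of tryFoldImmWithOpSel in the diff below retries the fold with the lane-wise negated immediate and, on success, flips V_PK_ADD_U16 to V_PK_SUB_U16 or vice versa. A minimal sketch of the negation, assuming the same two's-complement per-lane wrap as the NegLo/NegHi computation in the hunk (negateV2I16 is a hypothetical name):

#include <cstdint>
#include <cstdio>

// Negate each 16-bit lane of a packed immediate. The wrap-around is why the
// transform is only done when clamp is off: with saturation enabled, packed
// u16 add and sub are no longer mirror images of each other.
static uint32_t negateV2I16(uint32_t Imm) {
  uint16_t NegLo = -static_cast<uint16_t>(Imm);
  uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
  return (static_cast<uint32_t>(NegHi) << 16) | NegLo;
}

int main() {
  // v_pk_add_u16 v0, v1, (-32, -32): 0xFFE0FFE0 has no inlinable
  // permutation, but its lane-wise negation 0x00200020 is a splat of 32,
  // which is inline; the fold rewrites to v_pk_sub_u16 v0, v1, 32.
  printf("0x%08x -> 0x%08x\n", 0xFFE0FFE0u,
         (unsigned)negateV2I16(0xFFE0FFE0u));
}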
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 709de61..aa7639a 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
assert(Old.isReg() && Fold.isImm());
if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
- (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
- isUInt<16>(Fold.ImmToFold) ||
- !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+ (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
return false;
unsigned Opcode = MI->getOpcode();
@@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
unsigned Opcode = MI->getOpcode();
int OpNo = MI->getOperandNo(&Old);
+ uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
+
+ // If the literal can be inlined as-is, apply it and short-circuit the
+ // tests below. The main motivation for this is to avoid unintuitive
+ // uses of opsel.
+ if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
+ Old.ChangeToImmediate(Fold.ImmToFold);
+ return true;
+ }
- // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
- // already set.
+ // Refer to op_sel/op_sel_hi and check if we can change the immediate and
+ // op_sel in a way that allows an inline constant.
int ModIdx = -1;
- if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ unsigned SrcIdx = ~0;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
ModIdx = AMDGPU::OpName::src0_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ SrcIdx = 0;
+ } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
ModIdx = AMDGPU::OpName::src1_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ SrcIdx = 1;
+ } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
ModIdx = AMDGPU::OpName::src2_modifiers;
+ SrcIdx = 2;
+ }
assert(ModIdx != -1);
ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
MachineOperand &Mod = MI->getOperand(ModIdx);
- unsigned Val = Mod.getImm();
- if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ unsigned ModVal = Mod.getImm();
+
+ uint16_t ImmLo = static_cast<uint16_t>(
+ Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
+ uint16_t ImmHi = static_cast<uint16_t>(
+ Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
+ uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
+ unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
+
+ // Helper function that attempts to inline the given value with a newly
+ // chosen opsel pattern.
+ auto tryFoldToInline = [&](uint32_t Imm) -> bool {
+ if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(Imm);
+ return true;
+ }
+
+ // Try to shuffle the halves around and leverage opsel to get an inline
+ // constant.
+ uint16_t Lo = static_cast<uint16_t>(Imm);
+ uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
+ if (Lo == Hi) {
+ if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
+ Mod.setImm(NewModVal);
+ Old.ChangeToImmediate(Lo);
+ return true;
+ }
+
+ if (static_cast<int16_t>(Lo) < 0) {
+ int32_t SExt = static_cast<int16_t>(Lo);
+ if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
+ Mod.setImm(NewModVal);
+ Old.ChangeToImmediate(SExt);
+ return true;
+ }
+ }
+
+ // This check is only useful for integer instructions
+ if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
+ OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) {
+ if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
+ return true;
+ }
+ }
+ } else {
+ uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
+ if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
+ Old.ChangeToImmediate(Swapped);
+ return true;
+ }
+ }
+
return false;
+ };
- // Only apply the following transformation if that operand requires
- // a packed immediate.
- // If upper part is all zero we do not need op_sel_hi.
- if (!(Fold.ImmToFold & 0xffff)) {
- MachineOperand New =
- MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
- if (!TII->isOperandLegal(*MI, OpNo, &New))
- return false;
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ if (tryFoldToInline(Imm))
return true;
+
+ // Replace integer addition by subtraction and vice versa if it allows
+ // folding the immediate to an inline constant.
+ //
+ // We should only ever get here for SrcIdx == 1 due to canonicalization
+ // earlier in the pipeline, but we double-check here to be safe / fully
+ // general.
+ bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
+ bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
+ if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
+ unsigned ClampIdx =
+ AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
+ bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
+
+ if (!Clamp) {
+ uint16_t NegLo = -static_cast<uint16_t>(Imm);
+ uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
+ uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
+
+ if (tryFoldToInline(NegImm)) {
+ unsigned NegOpcode =
+ IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
+ MI->setDesc(TII->get(NegOpcode));
+ return true;
+ }
+ }
}
- MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
- if (!TII->isOperandLegal(*MI, OpNo, &New))
- return false;
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
- return true;
+
+ return false;
}
bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
@@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
assert(Old.isReg());
- if (Fold.isImm() && canUseImmWithOpSel(Fold))
- return tryFoldImmWithOpSel(Fold);
+ if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
+ if (tryFoldImmWithOpSel(Fold))
+ return true;
+
+ // We can't represent the candidate as an inline constant. Try as a literal
+ // with the original opsel, checking constant bus limitations.
+ MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
+ int OpNo = MI->getOperandNo(&Old);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Old.ChangeToImmediate(Fold.ImmToFold);
+ return true;
+ }
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();