Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 20
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 14
-rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 21
-rw-r--r-- | llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 6
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2
-rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 125
-rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 6
-rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 19
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 148
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 17
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 4
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 106
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 11
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 9
15 files changed, 324 insertions, 196 deletions
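The patch's central piece is a new helper in AMDGPUBaseInfo.cpp, AMDGPU::getInlineEncodingV216, which computes the actual hardware inline-constant encoding for packed V2I16/V2F16 operands instead of the old per-half heuristics. Below is a minimal standalone restatement of the encoding rules shown in the AMDGPUBaseInfo.cpp hunk further down; the table values and comments come from the patch, but this sketch is only illustrative and is not the in-tree code:

#include <cstdint>
#include <optional>

// Returns the hardware inline-constant encoding for a packed 16-bit literal,
// or std::nullopt if the value needs a real 32-bit literal.
std::optional<unsigned> inlineEncodingV216(bool IsFloat, uint32_t Literal) {
  // Integer encodings -16..64 are always produced as sign-extended 32-bit
  // values, so they are matched against the full 32-bit literal.
  int32_t Signed = static_cast<int32_t>(Literal);
  if (Signed >= 0 && Signed <= 64)
    return 128 + Signed;            // encodings 128..192 are 0..64
  if (Signed >= -16 && Signed <= -1)
    return 192 - Signed;            // encodings 193..208 are -1..-16

  // Float encodings 240..248 (0.5, -0.5, 1.0, -1.0, 2.0, -2.0, 4.0, -4.0,
  // 1/(2*pi)): F16 instructions match half-precision bit patterns, while
  // IU16 instructions match single-precision bit patterns.
  static const uint32_t F16[] = {0x3800, 0xB800, 0x3C00, 0xBC00, 0x4000,
                                 0xC000, 0x4400, 0xC400, 0x3118};
  static const uint32_t F32[] = {0x3F000000, 0xBF000000, 0x3F800000,
                                 0xBF800000, 0x40000000, 0xC0000000,
                                 0x40800000, 0xC0800000, 0x3E22F983};
  const uint32_t *Table = IsFloat ? F16 : F32;
  for (unsigned I = 0; I < 9; ++I)
    if (Literal == Table[I])
      return 240 + I;
  return std::nullopt;
}

For example, inlineEncodingV216(true, 0x3C00) yields 242 (1.0 as a V2F16 source), while inlineEncodingV216(false, 0x3C00) returns std::nullopt because an IU16 source would need the single-precision pattern 0x3F800000.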
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index bffea82..48ee0d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -317,26 +317,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() { } } -bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N, - bool Negated) const { +bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { if (N->isUndef()) return true; const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (Negated) { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(-C->getAPIntValue()); + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) + return TII->isInlineConstant(C->getAPIntValue()); - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt()); - - } else { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(C->getAPIntValue()); - - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); - } + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) + return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 374108a..df4a211 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -50,15 +50,13 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { } // TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, - bool Negate = false) { +static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); uint32_t LHSVal, RHSVal; if (getConstantValue(N->getOperand(0), LHSVal) && getConstantValue(N->getOperand(1), RHSVal)) { SDLoc SL(N); - uint32_t K = Negate ? 
(-LHSVal & 0xffff) | (-RHSVal << 16) - : (LHSVal & 0xffff) | (RHSVal << 16); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); } @@ -66,9 +64,6 @@ static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, return nullptr; } -static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - return packConstantV2I16(N, DAG, true); -} } // namespace /// AMDGPU specific code to select AMDGPU machine instructions for @@ -110,10 +105,7 @@ protected: private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; - bool isInlineImmediate(const SDNode *N, bool Negated = false) const; - bool isNegInlineImmediate(const SDNode *N) const { - return isInlineImmediate(N, true); - } + bool isInlineImmediate(const SDNode *N) const; bool isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm()); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5f2b7c0..b7f0438 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1865,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: return &APFloat::IEEEsingle(); @@ -1879,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); @@ -2033,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // We allow fp literals with f16x2 operands assuming that the specified // literal goes into the lower half and the upper half is zero. We also // require that the literal may be losslessly converted to f16. - MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : - (type == MVT::v2i16)? MVT::i16 : - (type == MVT::v2f32)? MVT::f32 : type; + // + // For i16x2 operands, we assume that the specified literal is encoded as a + // single-precision float. This is pretty odd, but it matches SP3 and what + // happens in hardware. + MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16 + : (type == MVT::v2i16) ? MVT::f32 + : (type == MVT::v2f32) ? 
MVT::f32 + : type; APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); @@ -3401,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16) - return AMDGPU::isInlinableIntLiteralV216(Val); + return AMDGPU::isInlinableLiteralV2I16(Val); if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteralV2F16(Val); return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 67be7b0..9dff3f6 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \ false, ImmWidth) +#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \ + DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth) + // Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc) // and decode using 'enum10' from decodeSrcOp. #define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \ @@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16) + DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 91a7093..b85eb76 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1096,7 +1096,7 @@ public: bool hasDstSelForwardingHazard() const { return GFX940Insts; } // Cannot use op_sel with v_dot instructions. - bool hasDOTOpSelHazard() const { return GFX940Insts; } + bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } // Does not have HW interlocs for VALU writing and then reading SGPRs. bool hasVDecCoExecHazard() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ef1b85f..6c7977e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, } } -void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - int16_t SImm = static_cast<int16_t>(Imm); - if (isInlinableIntLiteral(SImm)) { - O << SImm; - return; - } - +// This must accept a 32-bit immediate value to correctly handle packed 16-bit +// operations. 
+static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == 0x3C00) - O<< "1.0"; + O << "1.0"; else if (Imm == 0xBC00) - O<< "-1.0"; + O << "-1.0"; else if (Imm == 0x3800) - O<< "0.5"; + O << "0.5"; else if (Imm == 0xB800) - O<< "-0.5"; + O << "-0.5"; else if (Imm == 0x4000) - O<< "2.0"; + O << "2.0"; else if (Imm == 0xC000) - O<< "-2.0"; + O << "-2.0"; else if (Imm == 0x4400) - O<< "4.0"; + O << "4.0"; else if (Imm == 0xC400) - O<< "-4.0"; - else if (Imm == 0x3118 && - STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) { + O << "-4.0"; + else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; - } else { - uint64_t Imm16 = static_cast<uint16_t>(Imm); - O << formatHex(Imm16); - } -} + else + return false; -void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - printImmediate16(Lo16, STI, O); + return true; } -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, +void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { + int16_t SImm = static_cast<int16_t>(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + uint16_t HImm = static_cast<uint16_t>(Imm); + if (printImmediateFloat16(HImm, STI, O)) + return; + + uint64_t Imm16 = static_cast<uint16_t>(Imm); + O << formatHex(Imm16); +} + +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, + raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); - if (SImm >= -16 && SImm <= 64) { + if (isInlinableIntLiteral(SImm)) { O << SImm; return; } + switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + if (printImmediateFloat32(Imm, STI, O)) + return; + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + if (isUInt<16>(Imm) && + printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O)) + return; + break; + default: + llvm_unreachable("bad operand type"); + } + + O << formatHex(static_cast<uint64_t>(Imm)); +} + +bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == llvm::bit_cast<uint32_t>(0.0f)) O << "0.0"; else if (Imm == llvm::bit_cast<uint32_t>(1.0f)) @@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; else - O << formatHex(static_cast<uint64_t>(Imm)); + return false; + + return true; +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int32_t SImm = static_cast<int32_t>(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + if (printImmediateFloat32(Imm, STI, O)) + return; + + O << formatHex(static_cast<uint64_t>(Imm)); } void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, @@ -755,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, break; case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: - if (!isUInt<16>(Op.getImm()) && - STI.hasFeature(AMDGPU::FeatureVOP3Literal)) { - printImmediate32(Op.getImm(), STI, O); - break; - } - - // Deal with 16-bit FP inline immediates not working. 
- if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) { - printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O); - break; - } - [[fallthrough]]; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O); - break; case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - printImmediateV216(Op.getImm(), STI, O); + printImmediateV216(Op.getImm(), OpTy, STI, O); break; case MCOI::OPERAND_UNKNOWN: case MCOI::OPERAND_PCREL: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index f2f985f..e3958f8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -88,8 +88,10 @@ private: raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); - void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O); + void printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, raw_ostream &O); + bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index b403d69..de1abaf 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, // which does not have f16 support? 
return getLit16Encoding(static_cast<uint16_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: { - if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) - return getLit32Encoding(static_cast<uint32_t>(Imm), STI); - if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return getLit16Encoding(static_cast<uint16_t>(Imm), STI); - [[fallthrough]]; - } case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); + return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm)) + .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - uint32_t Encoding = getLit16Encoding(Lo16, STI); - return Encoding; - } + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm)) + .value_or(255); case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: return MO.getImm(); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 709de61..aa7639a 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || - (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) || - isUInt<16>(Fold.ImmToFold) || - !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm())) + (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; unsigned Opcode = MI->getOpcode(); @@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); unsigned Opcode = MI->getOpcode(); int OpNo = MI->getOperandNo(&Old); + uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; + + // If the literal can be inlined as-is, apply it and short-circuit the + // tests below. The main motivation for this is to avoid unintuitive + // uses of opsel. + if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } - // Set op_sel/op_sel_hi on this operand or bail out if op_sel is - // already set. + // Refer to op_sel/op_sel_hi and check if we can change the immediate and + // op_sel in a way that allows an inline constant. 
int ModIdx = -1; - if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) + unsigned SrcIdx = ~0; + if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) { ModIdx = AMDGPU::OpName::src0_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) + SrcIdx = 0; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) { ModIdx = AMDGPU::OpName::src1_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) + SrcIdx = 1; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) { ModIdx = AMDGPU::OpName::src2_modifiers; + SrcIdx = 2; + } assert(ModIdx != -1); ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); MachineOperand &Mod = MI->getOperand(ModIdx); - unsigned Val = Mod.getImm(); - if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) + unsigned ModVal = Mod.getImm(); + + uint16_t ImmLo = static_cast<uint16_t>( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0)); + uint16_t ImmHi = static_cast<uint16_t>( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0)); + uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo; + unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + + // Helper function that attempts to inline the given value with a newly + // chosen opsel pattern. + auto tryFoldToInline = [&](uint32_t Imm) -> bool { + if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(Imm); + return true; + } + + // Try to shuffle the halves around and leverage opsel to get an inline + // constant. + uint16_t Lo = static_cast<uint16_t>(Imm); + uint16_t Hi = static_cast<uint16_t>(Imm >> 16); + if (Lo == Hi) { + if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(Lo); + return true; + } + + if (static_cast<int16_t>(Lo) < 0) { + int32_t SExt = static_cast<int16_t>(Lo); + if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(SExt); + return true; + } + } + + // This check is only useful for integer instructions + if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 || + OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) { + if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16); + return true; + } + } + } else { + uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi; + if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0); + Old.ChangeToImmediate(Swapped); + return true; + } + } + return false; + }; - // Only apply the following transformation if that operand requires - // a packed immediate. - // If upper part is all zero we do not need op_sel_hi. - if (!(Fold.ImmToFold & 0xffff)) { - MachineOperand New = - MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + if (tryFoldToInline(Imm)) return true; + + // Replace integer addition by subtraction and vice versa if it allows + // folding the immediate to an inline constant. 
+ // + // We should only ever get here for SrcIdx == 1 due to canonicalization + // earlier in the pipeline, but we double-check here to be safe / fully + // general. + bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16; + bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16; + if (SrcIdx == 1 && (IsUAdd || IsUSub)) { + unsigned ClampIdx = + AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp); + bool Clamp = MI->getOperand(ClampIdx).getImm() != 0; + + if (!Clamp) { + uint16_t NegLo = -static_cast<uint16_t>(Imm); + uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16); + uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo; + + if (tryFoldToInline(NegImm)) { + unsigned NegOpcode = + IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16; + MI->setDesc(TII->get(NegOpcode)); + return true; + } + } } - MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); - return true; + + return false; } bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { @@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); assert(Old.isReg()); - if (Fold.isImm() && canUseImmWithOpSel(Fold)) - return tryFoldImmWithOpSel(Fold); + if (Fold.isImm() && canUseImmWithOpSel(Fold)) { + if (tryFoldImmWithOpSel(Fold)) + return true; + + // We can't represent the candidate as an inline constant. Try as a literal + // with the original opsel, checking constant bus limitations. + MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold); + int OpNo = MI->getOperandNo(&Old); + if (!TII->isOperandLegal(*MI, OpNo, &New)) + return false; + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 396d22c..6799292 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4153,15 +4153,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return (isInt<16>(Imm) || isUInt<16>(Imm)) && - AMDGPU::isInlinableIntLiteral((int16_t)Imm); + return AMDGPU::isInlinableLiteralV2I16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::isInlinableLiteralV2F16(Imm); case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 50724fd..f07b8fa0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -860,23 +860,6 @@ def ShiftAmt32Imm : ImmLeaf <i32, [{ return Imm < 32; }]>; -def getNegV2I16Imm : SDNodeXForm<build_vector, [{ - return SDValue(packNegConstantV2I16(N, *CurDAG), 0); -}]>; - -def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ - assert(N->getNumOperands() == 2); - assert(N->getOperand(0).getValueType().getSizeInBits() == 16); - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - if (Src0 == Src1) - return isNegInlineImmediate(Src0.getNode()); - - return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) || - (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); -}], getNegV2I16Imm>; - - def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ return fp16SrcZerosHighBits(N->getOpcode()); }]>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c94b894..1d197dc 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1152,11 +1152,11 @@ class RegOrF32 <string RegisterClass, string OperandTypePrefix> class RegOrV2B16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16", - !subst("_v2b16", "V2B16", NAME), "_Imm16">; + !subst("_v2b16", "V2B16", NAME), "_ImmV2I16">; class RegOrV2F16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP16", - !subst("_v2f16", "V2F16", NAME), "_Imm16">; + !subst("_v2f16", "V2F16", NAME), "_ImmV2F16">; class RegOrF64 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64", diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a91d771..26ba257 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2506,53 +2506,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { Val == 0x3118; // 1/2pi } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - if (isInt<16>(Literal) || isUInt<16>(Literal)) { - int16_t Trunc = static_cast<int16_t>(Literal); - return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi); +std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) { + // Unfortunately, the Instruction Set Architecture Reference Guide is + // misleading about how the inline operands work for (packed) 16-bit + // instructions. In a nutshell, the actual HW behavior is: + // + // - integer encodings (-16 .. 
64) are always produced as sign-extended + // 32-bit values + // - float encodings are produced as: + // - for F16 instructions: corresponding half-precision float values in + // the LSBs, 0 in the MSBs + // - for UI16 instructions: corresponding single-precision float value + int32_t Signed = static_cast<int32_t>(Literal); + if (Signed >= 0 && Signed <= 64) + return 128 + Signed; + + if (Signed >= -16 && Signed <= -1) + return 192 + std::abs(Signed); + + if (IsFloat) { + // clang-format off + switch (Literal) { + case 0x3800: return 240; // 0.5 + case 0xB800: return 241; // -0.5 + case 0x3C00: return 242; // 1.0 + case 0xBC00: return 243; // -1.0 + case 0x4000: return 244; // 2.0 + case 0xC000: return 245; // -2.0 + case 0x4400: return 246; // 4.0 + case 0xC400: return 247; // -4.0 + case 0x3118: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on + } else { + // clang-format off + switch (Literal) { + case 0x3F000000: return 240; // 0.5 + case 0xBF000000: return 241; // -0.5 + case 0x3F800000: return 242; // 1.0 + case 0xBF800000: return 243; // -1.0 + case 0x40000000: return 244; // 2.0 + case 0xC0000000: return 245; // -2.0 + case 0x40800000: return 246; // 4.0 + case 0xC0800000: return 247; // -4.0 + case 0x3E22F983: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on } - if (!(Literal & 0xffff)) - return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi); - int16_t Lo16 = static_cast<int16_t>(Literal); - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); + return {}; } -bool isInlinableIntLiteralV216(int32_t Literal) { - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_IU16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal) { + return getInlineEncodingV216(false, Literal); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return isInlinableIntLiteral(Hi16); - return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_F16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) { + return getInlineEncodingV216(true, Literal); } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) { +// Whether the given literal can be inlined for a V_PK_* instruction. +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + return getInlineEncodingV216(false, Literal).has_value(); case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - return isInlinableLiteralV216(Literal, HasInv2Pi); + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return getInlineEncodingV216(true, Literal).has_value(); default: - return isInlinableIntLiteralV216(Literal); + llvm_unreachable("bad packed operand type"); } } -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return true; +// Whether the given literal can be inlined for a V_PK_*_IU16 instruction. 
+bool isInlinableLiteralV2I16(uint32_t Literal) { + return getInlineEncodingV2I16(Literal).has_value(); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return true; - return Lo16 == Hi16; +// Whether the given literal can be inlined for a V_PK_*_F16 instruction. +bool isInlinableLiteralV2F16(uint32_t Literal) { + return getInlineEncodingV2F16(Literal).has_value(); } bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3c9f330..50c7417 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1291,16 +1291,19 @@ LLVM_READNONE bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal); LLVM_READNONE -bool isInlinableIntLiteralV216(int32_t Literal); +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType); +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType); LLVM_READNONE -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); +bool isInlinableLiteralV2I16(uint32_t Literal); + +LLVM_READNONE +bool isInlinableLiteralV2F16(uint32_t Literal); LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7f52501..985b77b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -125,15 +125,6 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2 let SubtargetPredicate = HasVOP3PInsts in { -// Undo sub x, c -> add x, -c canonicalization since c is more likely -// an inline immediate than -c. -// The constant will be emitted as a mov, and folded later. -// TODO: We could directly encode the immediate now -def : GCNPat< - (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1), - (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) ->; - // Integer operations with clamp bit set. class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat< (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), |