Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 20
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 125
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 19
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 148
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 106
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 9
15 files changed, 324 insertions, 196 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index bffea82..48ee0d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -317,26 +317,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
}
}
-bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
- bool Negated) const {
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
if (N->isUndef())
return true;
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (Negated) {
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
- return TII->isInlineConstant(-C->getAPIntValue());
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(C->getAPIntValue());
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
- return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
-
- } else {
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
- return TII->isInlineConstant(C->getAPIntValue());
-
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
- return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
- }
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 374108a..df4a211 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -50,15 +50,13 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) {
}
// TODO: Handle undef as zero
-static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
- bool Negate = false) {
+static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
uint32_t LHSVal, RHSVal;
if (getConstantValue(N->getOperand(0), LHSVal) &&
getConstantValue(N->getOperand(1), RHSVal)) {
SDLoc SL(N);
- uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16)
- : (LHSVal & 0xffff) | (RHSVal << 16);
+ uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
DAG.getTargetConstant(K, SL, MVT::i32));
}
@@ -66,9 +64,6 @@ static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
return nullptr;
}
-static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
- return packConstantV2I16(N, DAG, true);
-}
} // namespace
/// AMDGPU specific code to select AMDGPU machine instructions for
@@ -110,10 +105,7 @@ protected:
private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
- bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
- bool isNegInlineImmediate(const SDNode *N) const {
- return isInlineImmediate(N, true);
- }
+ bool isInlineImmediate(const SDNode *N) const;
bool isInlineImmediate16(int64_t Imm) const {
return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5f2b7c0..b7f0438 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1865,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return &APFloat::IEEEsingle();
@@ -1879,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_KIMM16:
return &APFloat::IEEEhalf();
@@ -2033,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
// We allow fp literals with f16x2 operands assuming that the specified
// literal goes into the lower half and the upper half is zero. We also
// require that the literal may be losslessly converted to f16.
- MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
- (type == MVT::v2i16)? MVT::i16 :
- (type == MVT::v2f32)? MVT::f32 : type;
+ //
+ // For i16x2 operands, we assume that the specified literal is encoded as a
+ // single-precision float. This is pretty odd, but it matches SP3 and what
+ // happens in hardware.
+ MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16
+ : (type == MVT::v2i16) ? MVT::f32
+ : (type == MVT::v2f32) ? MVT::f32
+ : type;
APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
@@ -3401,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16)
- return AMDGPU::isInlinableIntLiteralV216(Val);
+ return AMDGPU::isInlinableLiteralV2I16(Val);
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
- return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
+ return AMDGPU::isInlinableLiteralV2F16(Val);
return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
}
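
To make the new v2i16 literal rule concrete: a minimal standalone sketch of the expected-type mapping in isLiteralImm, where the enum and helper names are hypothetical, not LLVM's API. The point is that an fp literal supplied to a packed-i16 operand is now validated as a single-precision float, matching SP3 and the hardware.

    #include <cassert>

    enum class Ty { f16, i16, f32, v2f16, v2i16, v2f32, other };

    // Hypothetical mirror of the ternary chain in isLiteralImm().
    Ty expectedLiteralType(Ty T) {
      switch (T) {
      case Ty::v2f16: return Ty::f16; // literal goes into the lower half
      case Ty::v2i16: return Ty::f32; // encoded as a single-precision float
      case Ty::v2f32: return Ty::f32;
      default:        return T;
      }
    }

    int main() {
      assert(expectedLiteralType(Ty::v2i16) == Ty::f32); // was i16 before
      assert(expectedLiteralType(Ty::v2f16) == Ty::f16);
    }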
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 67be7b0..9dff3f6 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \
false, ImmWidth)
+#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \
+ DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth)
+
// Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)
// and decode using 'enum10' from decodeSrcOp.
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \
@@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16)
+
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 91a7093..b85eb76 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1096,7 +1096,7 @@ public:
bool hasDstSelForwardingHazard() const { return GFX940Insts; }
// Cannot use op_sel with v_dot instructions.
- bool hasDOTOpSelHazard() const { return GFX940Insts; }
+ bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
// Does not have HW interlocks for VALU writing and then reading SGPRs.
bool hasVDecCoExecHazard() const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ef1b85f..6c7977e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
}
}
-void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- int16_t SImm = static_cast<int16_t>(Imm);
- if (isInlinableIntLiteral(SImm)) {
- O << SImm;
- return;
- }
-
+// This must accept a 32-bit immediate value to correctly handle packed 16-bit
+// operations.
+static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
if (Imm == 0x3C00)
- O<< "1.0";
+ O << "1.0";
else if (Imm == 0xBC00)
- O<< "-1.0";
+ O << "-1.0";
else if (Imm == 0x3800)
- O<< "0.5";
+ O << "0.5";
else if (Imm == 0xB800)
- O<< "-0.5";
+ O << "-0.5";
else if (Imm == 0x4000)
- O<< "2.0";
+ O << "2.0";
else if (Imm == 0xC000)
- O<< "-2.0";
+ O << "-2.0";
else if (Imm == 0x4400)
- O<< "4.0";
+ O << "4.0";
else if (Imm == 0xC400)
- O<< "-4.0";
- else if (Imm == 0x3118 &&
- STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) {
+ O << "-4.0";
+ else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494";
- } else {
- uint64_t Imm16 = static_cast<uint16_t>(Imm);
- O << formatHex(Imm16);
- }
-}
+ else
+ return false;
-void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- uint16_t Lo16 = static_cast<uint16_t>(Imm);
- printImmediate16(Lo16, STI, O);
+ return true;
}
-void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
+void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ int16_t SImm = static_cast<int16_t>(Imm);
+ if (isInlinableIntLiteral(SImm)) {
+ O << SImm;
+ return;
+ }
+
+ uint16_t HImm = static_cast<uint16_t>(Imm);
+ if (printImmediateFloat16(HImm, STI, O))
+ return;
+
+ uint64_t Imm16 = static_cast<uint16_t>(Imm);
+ O << formatHex(Imm16);
+}
+
+void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
int32_t SImm = static_cast<int32_t>(Imm);
- if (SImm >= -16 && SImm <= 64) {
+ if (isInlinableIntLiteral(SImm)) {
O << SImm;
return;
}
+ switch (OpType) {
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ if (printImmediateFloat32(Imm, STI, O))
+ return;
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ if (isUInt<16>(Imm) &&
+ printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O))
+ return;
+ break;
+ default:
+ llvm_unreachable("bad operand type");
+ }
+
+ O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
if (Imm == llvm::bit_cast<uint32_t>(0.0f))
O << "0.0";
else if (Imm == llvm::bit_cast<uint32_t>(1.0f))
@@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494";
else
- O << formatHex(static_cast<uint64_t>(Imm));
+ return false;
+
+ return true;
+}
+
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ int32_t SImm = static_cast<int32_t>(Imm);
+ if (isInlinableIntLiteral(SImm)) {
+ O << SImm;
+ return;
+ }
+
+ if (printImmediateFloat32(Imm, STI, O))
+ return;
+
+ O << formatHex(static_cast<uint64_t>(Imm));
}
void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
@@ -755,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
break;
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
- if (!isUInt<16>(Op.getImm()) &&
- STI.hasFeature(AMDGPU::FeatureVOP3Literal)) {
- printImmediate32(Op.getImm(), STI, O);
- break;
- }
-
- // Deal with 16-bit FP inline immediates not working.
- if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) {
- printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O);
- break;
- }
- [[fallthrough]];
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O);
- break;
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
- printImmediateV216(Op.getImm(), STI, O);
+ printImmediateV216(Op.getImm(), OpTy, STI, O);
break;
case MCOI::OPERAND_UNKNOWN:
case MCOI::OPERAND_PCREL:
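
The dispatch in the new printImmediateV216 can be summarized with a self-contained sketch of the packed-f16 path: inline integer first, then the f16 float spellings (only when the upper half is zero), then a hex literal. The names and printf-based output are assumptions; the real code writes to a raw_ostream and additionally handles 0.15915494 behind FeatureInv2PiInlineImm.

    #include <cstdint>
    #include <cstdio>

    void printV2F16Imm(uint32_t Imm) {
      int32_t SImm = static_cast<int32_t>(Imm);
      if (SImm >= -16 && SImm <= 64) {  // isInlinableIntLiteral range
        std::printf("%d\n", SImm);
        return;
      }
      if (Imm <= 0xFFFFu) {  // upper half zero: low half may be an f16 float
        switch (Imm) {
        case 0x3800: std::puts("0.5");  return;
        case 0xB800: std::puts("-0.5"); return;
        case 0x3C00: std::puts("1.0");  return;
        case 0xBC00: std::puts("-1.0"); return;
        case 0x4000: std::puts("2.0");  return;
        case 0xC000: std::puts("-2.0"); return;
        case 0x4400: std::puts("4.0");  return;
        case 0xC400: std::puts("-4.0"); return;
        }
      }
      std::printf("0x%x\n", Imm);  // 32-bit literal fallback
    }

    int main() {
      printV2F16Imm(0xFFFFFFF1u); // -15: inline integer, sign-extended
      printV2F16Imm(0x00003C00u); // 1.0
      printV2F16Imm(0x3C003C00u); // 0x3c003c00: needs opsel or a literal
    }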
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index f2f985f..e3958f8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -88,8 +88,10 @@ private:
raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ void printImmediateV216(uint32_t Imm, uint8_t OpType,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index b403d69..de1abaf 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
// which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16: {
- if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal))
- return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
- if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
- return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
- [[fallthrough]];
- }
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+ return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm))
+ .value_or(255);
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
- uint16_t Lo16 = static_cast<uint16_t>(Imm);
- uint32_t Encoding = getLit16Encoding(Lo16, STI);
- return Encoding;
- }
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm))
+ .value_or(255);
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
return MO.getImm();
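
The value_or(255) fallback above works because 255 is the source-operand encoding that selects a trailing 32-bit literal. A tiny sketch of that selection, with an assumed constant name:

    #include <cstdint>
    #include <optional>

    // 255 is the src-operand encoding meaning "a 32-bit literal follows
    // the instruction" (assumed name for illustration).
    constexpr unsigned SrcLiteralEnc = 255;

    // Prefer the inline encoding; otherwise fall back to the literal slot.
    unsigned selectSrcEncoding(std::optional<unsigned> InlineEnc) {
      return InlineEnc.value_or(SrcLiteralEnc);
    }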
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 709de61..aa7639a 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
assert(Old.isReg() && Fold.isImm());
if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
- (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
- isUInt<16>(Fold.ImmToFold) ||
- !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+ (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
return false;
unsigned Opcode = MI->getOpcode();
@@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
unsigned Opcode = MI->getOpcode();
int OpNo = MI->getOperandNo(&Old);
+ uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
+
+ // If the literal can be inlined as-is, apply it and short-circuit the
+ // tests below. The main motivation for this is to avoid unintuitive
+ // uses of opsel.
+ if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
+ Old.ChangeToImmediate(Fold.ImmToFold);
+ return true;
+ }
- // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
- // already set.
+ // Refer to op_sel/op_sel_hi and check if we can change the immediate and
+ // op_sel in a way that allows an inline constant.
int ModIdx = -1;
- if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ unsigned SrcIdx = ~0;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
ModIdx = AMDGPU::OpName::src0_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ SrcIdx = 0;
+ } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
ModIdx = AMDGPU::OpName::src1_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ SrcIdx = 1;
+ } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
ModIdx = AMDGPU::OpName::src2_modifiers;
+ SrcIdx = 2;
+ }
assert(ModIdx != -1);
ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
MachineOperand &Mod = MI->getOperand(ModIdx);
- unsigned Val = Mod.getImm();
- if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ unsigned ModVal = Mod.getImm();
+
+ uint16_t ImmLo = static_cast<uint16_t>(
+ Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
+ uint16_t ImmHi = static_cast<uint16_t>(
+ Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
+ uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
+ unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
+
+ // Helper function that attempts to inline the given value with a newly
+ // chosen opsel pattern.
+ auto tryFoldToInline = [&](uint32_t Imm) -> bool {
+ if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(Imm);
+ return true;
+ }
+
+ // Try to shuffle the halves around and leverage opsel to get an inline
+ // constant.
+ uint16_t Lo = static_cast<uint16_t>(Imm);
+ uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
+ if (Lo == Hi) {
+ if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
+ Mod.setImm(NewModVal);
+ Old.ChangeToImmediate(Lo);
+ return true;
+ }
+
+ if (static_cast<int16_t>(Lo) < 0) {
+ int32_t SExt = static_cast<int16_t>(Lo);
+ if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
+ Mod.setImm(NewModVal);
+ Old.ChangeToImmediate(SExt);
+ return true;
+ }
+ }
+
+ // This check is only useful for integer instructions
+ if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
+ OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) {
+ if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
+ return true;
+ }
+ }
+ } else {
+ uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
+ if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
+ Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
+ Old.ChangeToImmediate(Swapped);
+ return true;
+ }
+ }
+
return false;
+ };
- // Only apply the following transformation if that operand requires
- // a packed immediate.
- // If upper part is all zero we do not need op_sel_hi.
- if (!(Fold.ImmToFold & 0xffff)) {
- MachineOperand New =
- MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
- if (!TII->isOperandLegal(*MI, OpNo, &New))
- return false;
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ if (tryFoldToInline(Imm))
return true;
+
+ // Replace integer addition by subtraction and vice versa if it allows
+ // folding the immediate to an inline constant.
+ //
+ // We should only ever get here for SrcIdx == 1 due to canonicalization
+ // earlier in the pipeline, but we double-check here to be safe / fully
+ // general.
+ bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
+ bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
+ if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
+ unsigned ClampIdx =
+ AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
+ bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
+
+ if (!Clamp) {
+ uint16_t NegLo = -static_cast<uint16_t>(Imm);
+ uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
+ uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
+
+ if (tryFoldToInline(NegImm)) {
+ unsigned NegOpcode =
+ IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
+ MI->setDesc(TII->get(NegOpcode));
+ return true;
+ }
+ }
}
- MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
- if (!TII->isOperandLegal(*MI, OpNo, &New))
- return false;
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
- return true;
+
+ return false;
}
bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
@@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
assert(Old.isReg());
- if (Fold.isImm() && canUseImmWithOpSel(Fold))
- return tryFoldImmWithOpSel(Fold);
+ if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
+ if (tryFoldImmWithOpSel(Fold))
+ return true;
+
+ // We can't represent the candidate as an inline constant. Try as a literal
+ // with the original opsel, checking constant bus limitations.
+ MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
+ int OpNo = MI->getOperandNo(&Old);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Old.ChangeToImmediate(Fold.ImmToFold);
+ return true;
+ }
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 396d22c..6799292 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4153,15 +4153,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
- AMDGPU::isInlinableIntLiteral((int16_t)Imm);
+ return AMDGPU::isInlinableLiteralV2I16(Imm);
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ return AMDGPU::isInlinableLiteralV2F16(Imm);
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
// where 16-bit instructions are not legal.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 50724fd..f07b8fa0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -860,23 +860,6 @@ def ShiftAmt32Imm : ImmLeaf <i32, [{
return Imm < 32;
}]>;
-def getNegV2I16Imm : SDNodeXForm<build_vector, [{
- return SDValue(packNegConstantV2I16(N, *CurDAG), 0);
-}]>;
-
-def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
- assert(N->getNumOperands() == 2);
- assert(N->getOperand(0).getValueType().getSizeInBits() == 16);
- SDValue Src0 = N->getOperand(0);
- SDValue Src1 = N->getOperand(1);
- if (Src0 == Src1)
- return isNegInlineImmediate(Src0.getNode());
-
- return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) ||
- (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
-}], getNegV2I16Imm>;
-
-
def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
return fp16SrcZerosHighBits(N->getOpcode());
}]>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c94b894..1d197dc 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1152,11 +1152,11 @@ class RegOrF32 <string RegisterClass, string OperandTypePrefix>
class RegOrV2B16 <string RegisterClass, string OperandTypePrefix>
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16",
- !subst("_v2b16", "V2B16", NAME), "_Imm16">;
+ !subst("_v2b16", "V2B16", NAME), "_ImmV2I16">;
class RegOrV2F16 <string RegisterClass, string OperandTypePrefix>
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP16",
- !subst("_v2f16", "V2F16", NAME), "_Imm16">;
+ !subst("_v2f16", "V2F16", NAME), "_ImmV2F16">;
class RegOrF64 <string RegisterClass, string OperandTypePrefix>
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64",
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a91d771..26ba257 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2506,53 +2506,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
Val == 0x3118; // 1/2pi
}
-bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
- assert(HasInv2Pi);
-
- if (isInt<16>(Literal) || isUInt<16>(Literal)) {
- int16_t Trunc = static_cast<int16_t>(Literal);
- return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi);
+std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) {
+ // Unfortunately, the Instruction Set Architecture Reference Guide is
+ // misleading about how the inline operands work for (packed) 16-bit
+ // instructions. In a nutshell, the actual HW behavior is:
+ //
+ // - integer encodings (-16 .. 64) are always produced as sign-extended
+ // 32-bit values
+ // - float encodings are produced as:
+ // - for F16 instructions: corresponding half-precision float values in
+ // the LSBs, 0 in the MSBs
+ // - for UI16 instructions: corresponding single-precision float value
+ int32_t Signed = static_cast<int32_t>(Literal);
+ if (Signed >= 0 && Signed <= 64)
+ return 128 + Signed;
+
+ if (Signed >= -16 && Signed <= -1)
+ return 192 + std::abs(Signed);
+
+ if (IsFloat) {
+ // clang-format off
+ switch (Literal) {
+ case 0x3800: return 240; // 0.5
+ case 0xB800: return 241; // -0.5
+ case 0x3C00: return 242; // 1.0
+ case 0xBC00: return 243; // -1.0
+ case 0x4000: return 244; // 2.0
+ case 0xC000: return 245; // -2.0
+ case 0x4400: return 246; // 4.0
+ case 0xC400: return 247; // -4.0
+ case 0x3118: return 248; // 1.0 / (2.0 * pi)
+ default: break;
+ }
+ // clang-format on
+ } else {
+ // clang-format off
+ switch (Literal) {
+ case 0x3F000000: return 240; // 0.5
+ case 0xBF000000: return 241; // -0.5
+ case 0x3F800000: return 242; // 1.0
+ case 0xBF800000: return 243; // -1.0
+ case 0x40000000: return 244; // 2.0
+ case 0xC0000000: return 245; // -2.0
+ case 0x40800000: return 246; // 4.0
+ case 0xC0800000: return 247; // -4.0
+ case 0x3E22F983: return 248; // 1.0 / (2.0 * pi)
+ default: break;
+ }
+ // clang-format on
}
- if (!(Literal & 0xffff))
- return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi);
- int16_t Lo16 = static_cast<int16_t>(Literal);
- int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
- return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
+ return {};
}
-bool isInlinableIntLiteralV216(int32_t Literal) {
- int16_t Lo16 = static_cast<int16_t>(Literal);
- if (isInt<16>(Literal) || isUInt<16>(Literal))
- return isInlinableIntLiteral(Lo16);
+// Encoding of the literal as an inline constant for a V_PK_*_IU16 instruction
+// or nullopt.
+std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal) {
+ return getInlineEncodingV216(false, Literal);
+}
- int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
- if (!(Literal & 0xffff))
- return isInlinableIntLiteral(Hi16);
- return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
+// Encoding of the literal as an inline constant for a V_PK_*_F16 instruction
+// or nullopt.
+std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) {
+ return getInlineEncodingV216(true, Literal);
}
-bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) {
+// Whether the given literal can be inlined for a V_PK_* instruction.
+bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) {
switch (OpType) {
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ return getInlineEncodingV216(false, Literal).has_value();
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- return isInlinableLiteralV216(Literal, HasInv2Pi);
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ return getInlineEncodingV216(true, Literal).has_value();
default:
- return isInlinableIntLiteralV216(Literal);
+ llvm_unreachable("bad packed operand type");
}
}
-bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
- assert(HasInv2Pi);
-
- int16_t Lo16 = static_cast<int16_t>(Literal);
- if (isInt<16>(Literal) || isUInt<16>(Literal))
- return true;
+// Whether the given literal can be inlined for a V_PK_*_IU16 instruction.
+bool isInlinableLiteralV2I16(uint32_t Literal) {
+ return getInlineEncodingV2I16(Literal).has_value();
+}
- int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
- if (!(Literal & 0xffff))
- return true;
- return Lo16 == Hi16;
+// Whether the given literal can be inlined for a V_PK_*_F16 instruction.
+bool isInlinableLiteralV2F16(uint32_t Literal) {
+ return getInlineEncodingV2F16(Literal).has_value();
}
bool isValid32BitLiteral(uint64_t Val, bool IsFP64) {
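
A standalone sketch of the integer branch of getInlineEncodingV216, mirroring (not calling) the new helper, shows the key semantic change: inlinability is now decided on the sign-extended 32-bit value, so 0x0000FFFF, i.e. packed <-1, 0>, no longer counts as an inline -1 the way the old per-half check allowed.

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <optional>

    std::optional<unsigned> intInlineEncoding(uint32_t Literal) {
      int32_t Signed = static_cast<int32_t>(Literal);
      if (Signed >= 0 && Signed <= 64)
        return 128 + Signed;                // 0..64 -> 128..192
      if (Signed >= -16 && Signed <= -1)
        return 192 + std::abs(Signed);      // -1..-16 -> 193..208
      return std::nullopt;
    }

    int main() {
      assert(intInlineEncoding(0) == 128u);
      assert(intInlineEncoding(64) == 192u);
      assert(intInlineEncoding(0xFFFFFFFFu) == 193u);      // -1, sign-extended
      assert(!intInlineEncoding(0x0000FFFFu).has_value()); // <-1, 0>: not inline
    }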
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 3c9f330..50c7417 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1291,16 +1291,19 @@ LLVM_READNONE
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
-bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
+std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal);
LLVM_READNONE
-bool isInlinableIntLiteralV216(int32_t Literal);
+std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal);
LLVM_READNONE
-bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType);
+bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType);
LLVM_READNONE
-bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
+bool isInlinableLiteralV2I16(uint32_t Literal);
+
+LLVM_READNONE
+bool isInlinableLiteralV2F16(uint32_t Literal);
LLVM_READNONE
bool isValid32BitLiteral(uint64_t Val, bool IsFP64);
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 7f52501..985b77b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -125,15 +125,6 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2
let SubtargetPredicate = HasVOP3PInsts in {
-// Undo sub x, c -> add x, -c canonicalization since c is more likely
-// an inline immediate than -c.
-// The constant will be emitted as a mov, and folded later.
-// TODO: We could directly encode the immediate now
-def : GCNPat<
- (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1),
- (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1)
->;
-
// Integer operations with clamp bit set.
class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat<
(pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)),