Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 20
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 14
-rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 21
-rw-r--r-- | llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 6
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2
-rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 125
-rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 6
-rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 19
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 148
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 17
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 4
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 106
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 11
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 9
15 files changed, 324 insertions, 196 deletions
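The patch's central piece is a new helper in AMDGPUBaseInfo.cpp, AMDGPU::getInlineEncodingV216, which computes the actual hardware inline-constant encoding for packed V2I16/V2F16 operands instead of the old per-half heuristics. Below is a minimal standalone restatement of the encoding rules shown in the AMDGPUBaseInfo.cpp hunk further down; the table values and comments come from the patch, but this sketch is only illustrative and is not the in-tree code:

#include <cstdint>
#include <optional>

// Returns the hardware inline-constant encoding for a packed 16-bit literal,
// or std::nullopt if the value needs a real 32-bit literal.
std::optional<unsigned> inlineEncodingV216(bool IsFloat, uint32_t Literal) {
  // Integer encodings -16..64 are always produced as sign-extended 32-bit
  // values, so they are matched against the full 32-bit literal.
  int32_t Signed = static_cast<int32_t>(Literal);
  if (Signed >= 0 && Signed <= 64)
    return 128 + Signed;            // encodings 128..192 are 0..64
  if (Signed >= -16 && Signed <= -1)
    return 192 - Signed;            // encodings 193..208 are -1..-16

  // Float encodings 240..248 (0.5, -0.5, 1.0, -1.0, 2.0, -2.0, 4.0, -4.0,
  // 1/(2*pi)): F16 instructions match half-precision bit patterns, while
  // IU16 instructions match single-precision bit patterns.
  static const uint32_t F16[] = {0x3800, 0xB800, 0x3C00, 0xBC00, 0x4000,
                                 0xC000, 0x4400, 0xC400, 0x3118};
  static const uint32_t F32[] = {0x3F000000, 0xBF000000, 0x3F800000,
                                 0xBF800000, 0x40000000, 0xC0000000,
                                 0x40800000, 0xC0800000, 0x3E22F983};
  const uint32_t *Table = IsFloat ? F16 : F32;
  for (unsigned I = 0; I < 9; ++I)
    if (Literal == Table[I])
      return 240 + I;
  return std::nullopt;
}

For example, inlineEncodingV216(true, 0x3C00) yields 242 (1.0 as a V2F16 source), while inlineEncodingV216(false, 0x3C00) returns std::nullopt because an IU16 source would need the single-precision pattern 0x3F800000.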
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index bffea82..48ee0d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -317,26 +317,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() { } } -bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N, - bool Negated) const { +bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { if (N->isUndef()) return true; const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (Negated) { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(-C->getAPIntValue()); + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) + return TII->isInlineConstant(C->getAPIntValue()); - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt()); - - } else { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(C->getAPIntValue()); - - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); - } + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) + return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 374108a..df4a211 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -50,15 +50,13 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { } // TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, - bool Negate = false) { +static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); uint32_t LHSVal, RHSVal; if (getConstantValue(N->getOperand(0), LHSVal) && getConstantValue(N->getOperand(1), RHSVal)) { SDLoc SL(N); - uint32_t K = Negate ? 
(-LHSVal & 0xffff) | (-RHSVal << 16) - : (LHSVal & 0xffff) | (RHSVal << 16); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); } @@ -66,9 +64,6 @@ static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, return nullptr; } -static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - return packConstantV2I16(N, DAG, true); -} } // namespace /// AMDGPU specific code to select AMDGPU machine instructions for @@ -110,10 +105,7 @@ protected: private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; - bool isInlineImmediate(const SDNode *N, bool Negated = false) const; - bool isNegInlineImmediate(const SDNode *N) const { - return isInlineImmediate(N, true); - } + bool isInlineImmediate(const SDNode *N) const; bool isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm()); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5f2b7c0..b7f0438 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1865,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: return &APFloat::IEEEsingle(); @@ -1879,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); @@ -2033,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // We allow fp literals with f16x2 operands assuming that the specified // literal goes into the lower half and the upper half is zero. We also // require that the literal may be losslessly converted to f16. - MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : - (type == MVT::v2i16)? MVT::i16 : - (type == MVT::v2f32)? MVT::f32 : type; + // + // For i16x2 operands, we assume that the specified literal is encoded as a + // single-precision float. This is pretty odd, but it matches SP3 and what + // happens in hardware. + MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16 + : (type == MVT::v2i16) ? MVT::f32 + : (type == MVT::v2f32) ? 
MVT::f32 + : type; APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); @@ -3401,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16) - return AMDGPU::isInlinableIntLiteralV216(Val); + return AMDGPU::isInlinableLiteralV2I16(Val); if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteralV2F16(Val); return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 67be7b0..9dff3f6 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \ false, ImmWidth) +#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \ + DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth) + // Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc) // and decode using 'enum10' from decodeSrcOp. #define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \ @@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16) + DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 91a7093..b85eb76 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1096,7 +1096,7 @@ public: bool hasDstSelForwardingHazard() const { return GFX940Insts; } // Cannot use op_sel with v_dot instructions. - bool hasDOTOpSelHazard() const { return GFX940Insts; } + bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } // Does not have HW interlocs for VALU writing and then reading SGPRs. bool hasVDecCoExecHazard() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ef1b85f..6c7977e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, } } -void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - int16_t SImm = static_cast<int16_t>(Imm); - if (isInlinableIntLiteral(SImm)) { - O << SImm; - return; - } - +// This must accept a 32-bit immediate value to correctly handle packed 16-bit +// operations. 
+static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == 0x3C00) - O<< "1.0"; + O << "1.0"; else if (Imm == 0xBC00) - O<< "-1.0"; + O << "-1.0"; else if (Imm == 0x3800) - O<< "0.5"; + O << "0.5"; else if (Imm == 0xB800) - O<< "-0.5"; + O << "-0.5"; else if (Imm == 0x4000) - O<< "2.0"; + O << "2.0"; else if (Imm == 0xC000) - O<< "-2.0"; + O << "-2.0"; else if (Imm == 0x4400) - O<< "4.0"; + O << "4.0"; else if (Imm == 0xC400) - O<< "-4.0"; - else if (Imm == 0x3118 && - STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) { + O << "-4.0"; + else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; - } else { - uint64_t Imm16 = static_cast<uint16_t>(Imm); - O << formatHex(Imm16); - } -} + else + return false; -void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - printImmediate16(Lo16, STI, O); + return true; } -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, +void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { + int16_t SImm = static_cast<int16_t>(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + uint16_t HImm = static_cast<uint16_t>(Imm); + if (printImmediateFloat16(HImm, STI, O)) + return; + + uint64_t Imm16 = static_cast<uint16_t>(Imm); + O << formatHex(Imm16); +} + +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, + raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); - if (SImm >= -16 && SImm <= 64) { + if (isInlinableIntLiteral(SImm)) { O << SImm; return; } + switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + if (printImmediateFloat32(Imm, STI, O)) + return; + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + if (isUInt<16>(Imm) && + printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O)) + return; + break; + default: + llvm_unreachable("bad operand type"); + } + + O << formatHex(static_cast<uint64_t>(Imm)); +} + +bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == llvm::bit_cast<uint32_t>(0.0f)) O << "0.0"; else if (Imm == llvm::bit_cast<uint32_t>(1.0f)) @@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; else - O << formatHex(static_cast<uint64_t>(Imm)); + return false; + + return true; +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int32_t SImm = static_cast<int32_t>(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + if (printImmediateFloat32(Imm, STI, O)) + return; + + O << formatHex(static_cast<uint64_t>(Imm)); } void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, @@ -755,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, break; case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: - if (!isUInt<16>(Op.getImm()) && - STI.hasFeature(AMDGPU::FeatureVOP3Literal)) { - printImmediate32(Op.getImm(), STI, O); - break; - } - - // Deal with 16-bit FP inline immediates not working. 
- if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) { - printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O); - break; - } - [[fallthrough]]; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O); - break; case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - printImmediateV216(Op.getImm(), STI, O); + printImmediateV216(Op.getImm(), OpTy, STI, O); break; case MCOI::OPERAND_UNKNOWN: case MCOI::OPERAND_PCREL: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index f2f985f..e3958f8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -88,8 +88,10 @@ private: raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); - void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O); + void printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, raw_ostream &O); + bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index b403d69..de1abaf 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, // which does not have f16 support? 
return getLit16Encoding(static_cast<uint16_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: { - if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) - return getLit32Encoding(static_cast<uint32_t>(Imm), STI); - if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return getLit16Encoding(static_cast<uint16_t>(Imm), STI); - [[fallthrough]]; - } case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); + return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm)) + .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - uint32_t Encoding = getLit16Encoding(Lo16, STI); - return Encoding; - } + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm)) + .value_or(255); case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: return MO.getImm(); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 709de61..aa7639a 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || - (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) || - isUInt<16>(Fold.ImmToFold) || - !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm())) + (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; unsigned Opcode = MI->getOpcode(); @@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); unsigned Opcode = MI->getOpcode(); int OpNo = MI->getOperandNo(&Old); + uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; + + // If the literal can be inlined as-is, apply it and short-circuit the + // tests below. The main motivation for this is to avoid unintuitive + // uses of opsel. + if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } - // Set op_sel/op_sel_hi on this operand or bail out if op_sel is - // already set. + // Refer to op_sel/op_sel_hi and check if we can change the immediate and + // op_sel in a way that allows an inline constant. 
int ModIdx = -1; - if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) + unsigned SrcIdx = ~0; + if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) { ModIdx = AMDGPU::OpName::src0_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) + SrcIdx = 0; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) { ModIdx = AMDGPU::OpName::src1_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) + SrcIdx = 1; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) { ModIdx = AMDGPU::OpName::src2_modifiers; + SrcIdx = 2; + } assert(ModIdx != -1); ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); MachineOperand &Mod = MI->getOperand(ModIdx); - unsigned Val = Mod.getImm(); - if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) + unsigned ModVal = Mod.getImm(); + + uint16_t ImmLo = static_cast<uint16_t>( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0)); + uint16_t ImmHi = static_cast<uint16_t>( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0)); + uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo; + unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + + // Helper function that attempts to inline the given value with a newly + // chosen opsel pattern. + auto tryFoldToInline = [&](uint32_t Imm) -> bool { + if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(Imm); + return true; + } + + // Try to shuffle the halves around and leverage opsel to get an inline + // constant. + uint16_t Lo = static_cast<uint16_t>(Imm); + uint16_t Hi = static_cast<uint16_t>(Imm >> 16); + if (Lo == Hi) { + if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(Lo); + return true; + } + + if (static_cast<int16_t>(Lo) < 0) { + int32_t SExt = static_cast<int16_t>(Lo); + if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(SExt); + return true; + } + } + + // This check is only useful for integer instructions + if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 || + OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) { + if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16); + return true; + } + } + } else { + uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi; + if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0); + Old.ChangeToImmediate(Swapped); + return true; + } + } + return false; + }; - // Only apply the following transformation if that operand requires - // a packed immediate. - // If upper part is all zero we do not need op_sel_hi. - if (!(Fold.ImmToFold & 0xffff)) { - MachineOperand New = - MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + if (tryFoldToInline(Imm)) return true; + + // Replace integer addition by subtraction and vice versa if it allows + // folding the immediate to an inline constant. 
+ // + // We should only ever get here for SrcIdx == 1 due to canonicalization + // earlier in the pipeline, but we double-check here to be safe / fully + // general. + bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16; + bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16; + if (SrcIdx == 1 && (IsUAdd || IsUSub)) { + unsigned ClampIdx = + AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp); + bool Clamp = MI->getOperand(ClampIdx).getImm() != 0; + + if (!Clamp) { + uint16_t NegLo = -static_cast<uint16_t>(Imm); + uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16); + uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo; + + if (tryFoldToInline(NegImm)) { + unsigned NegOpcode = + IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16; + MI->setDesc(TII->get(NegOpcode)); + return true; + } + } } - MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); - return true; + + return false; } bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { @@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); assert(Old.isReg()); - if (Fold.isImm() && canUseImmWithOpSel(Fold)) - return tryFoldImmWithOpSel(Fold); + if (Fold.isImm() && canUseImmWithOpSel(Fold)) { + if (tryFoldImmWithOpSel(Fold)) + return true; + + // We can't represent the candidate as an inline constant. Try as a literal + // with the original opsel, checking constant bus limitations. + MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold); + int OpNo = MI->getOperandNo(&Old); + if (!TII->isOperandLegal(*MI, OpNo, &New)) + return false; + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 396d22c..6799292 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4153,15 +4153,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return (isInt<16>(Imm) || isUInt<16>(Imm)) && - AMDGPU::isInlinableIntLiteral((int16_t)Imm); + return AMDGPU::isInlinableLiteralV2I16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::isInlinableLiteralV2F16(Imm); case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 50724fd..f07b8fa0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -860,23 +860,6 @@ def ShiftAmt32Imm : ImmLeaf <i32, [{ return Imm < 32; }]>; -def getNegV2I16Imm : SDNodeXForm<build_vector, [{ - return SDValue(packNegConstantV2I16(N, *CurDAG), 0); -}]>; - -def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ - assert(N->getNumOperands() == 2); - assert(N->getOperand(0).getValueType().getSizeInBits() == 16); - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - if (Src0 == Src1) - return isNegInlineImmediate(Src0.getNode()); - - return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) || - (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); -}], getNegV2I16Imm>; - - def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ return fp16SrcZerosHighBits(N->getOpcode()); }]>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c94b894..1d197dc 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1152,11 +1152,11 @@ class RegOrF32 <string RegisterClass, string OperandTypePrefix> class RegOrV2B16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16", - !subst("_v2b16", "V2B16", NAME), "_Imm16">; + !subst("_v2b16", "V2B16", NAME), "_ImmV2I16">; class RegOrV2F16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP16", - !subst("_v2f16", "V2F16", NAME), "_Imm16">; + !subst("_v2f16", "V2F16", NAME), "_ImmV2F16">; class RegOrF64 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64", diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a91d771..26ba257 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2506,53 +2506,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { Val == 0x3118; // 1/2pi } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - if (isInt<16>(Literal) || isUInt<16>(Literal)) { - int16_t Trunc = static_cast<int16_t>(Literal); - return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi); +std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) { + // Unfortunately, the Instruction Set Architecture Reference Guide is + // misleading about how the inline operands work for (packed) 16-bit + // instructions. In a nutshell, the actual HW behavior is: + // + // - integer encodings (-16 .. 
64) are always produced as sign-extended + // 32-bit values + // - float encodings are produced as: + // - for F16 instructions: corresponding half-precision float values in + // the LSBs, 0 in the MSBs + // - for UI16 instructions: corresponding single-precision float value + int32_t Signed = static_cast<int32_t>(Literal); + if (Signed >= 0 && Signed <= 64) + return 128 + Signed; + + if (Signed >= -16 && Signed <= -1) + return 192 + std::abs(Signed); + + if (IsFloat) { + // clang-format off + switch (Literal) { + case 0x3800: return 240; // 0.5 + case 0xB800: return 241; // -0.5 + case 0x3C00: return 242; // 1.0 + case 0xBC00: return 243; // -1.0 + case 0x4000: return 244; // 2.0 + case 0xC000: return 245; // -2.0 + case 0x4400: return 246; // 4.0 + case 0xC400: return 247; // -4.0 + case 0x3118: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on + } else { + // clang-format off + switch (Literal) { + case 0x3F000000: return 240; // 0.5 + case 0xBF000000: return 241; // -0.5 + case 0x3F800000: return 242; // 1.0 + case 0xBF800000: return 243; // -1.0 + case 0x40000000: return 244; // 2.0 + case 0xC0000000: return 245; // -2.0 + case 0x40800000: return 246; // 4.0 + case 0xC0800000: return 247; // -4.0 + case 0x3E22F983: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on } - if (!(Literal & 0xffff)) - return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi); - int16_t Lo16 = static_cast<int16_t>(Literal); - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); + return {}; } -bool isInlinableIntLiteralV216(int32_t Literal) { - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_IU16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal) { + return getInlineEncodingV216(false, Literal); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return isInlinableIntLiteral(Hi16); - return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_F16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) { + return getInlineEncodingV216(true, Literal); } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) { +// Whether the given literal can be inlined for a V_PK_* instruction. +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + return getInlineEncodingV216(false, Literal).has_value(); case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - return isInlinableLiteralV216(Literal, HasInv2Pi); + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return getInlineEncodingV216(true, Literal).has_value(); default: - return isInlinableIntLiteralV216(Literal); + llvm_unreachable("bad packed operand type"); } } -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return true; +// Whether the given literal can be inlined for a V_PK_*_IU16 instruction. 
+bool isInlinableLiteralV2I16(uint32_t Literal) { + return getInlineEncodingV2I16(Literal).has_value(); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return true; - return Lo16 == Hi16; +// Whether the given literal can be inlined for a V_PK_*_F16 instruction. +bool isInlinableLiteralV2F16(uint32_t Literal) { + return getInlineEncodingV2F16(Literal).has_value(); } bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3c9f330..50c7417 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1291,16 +1291,19 @@ LLVM_READNONE bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal); LLVM_READNONE -bool isInlinableIntLiteralV216(int32_t Literal); +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType); +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType); LLVM_READNONE -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); +bool isInlinableLiteralV2I16(uint32_t Literal); + +LLVM_READNONE +bool isInlinableLiteralV2F16(uint32_t Literal); LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7f52501..985b77b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -125,15 +125,6 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2 let SubtargetPredicate = HasVOP3PInsts in { -// Undo sub x, c -> add x, -c canonicalization since c is more likely -// an inline immediate than -c. -// The constant will be emitted as a mov, and folded later. -// TODO: We could directly encode the immediate now -def : GCNPat< - (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1), - (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) ->; - // Integer operations with clamp bit set. class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat< (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), |