diff options
39 files changed, 1522 insertions, 1729 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index bffea82..48ee0d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -317,26 +317,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() { } } -bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N, - bool Negated) const { +bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { if (N->isUndef()) return true; const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (Negated) { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(-C->getAPIntValue()); + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) + return TII->isInlineConstant(C->getAPIntValue()); - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt()); - - } else { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(C->getAPIntValue()); - - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); - } + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) + return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 374108a..df4a211 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -50,15 +50,13 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { } // TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, - bool Negate = false) { +static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); uint32_t LHSVal, RHSVal; if 
(getConstantValue(N->getOperand(0), LHSVal) && getConstantValue(N->getOperand(1), RHSVal)) { SDLoc SL(N); - uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16) - : (LHSVal & 0xffff) | (RHSVal << 16); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); } @@ -66,9 +64,6 @@ static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, return nullptr; } -static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - return packConstantV2I16(N, DAG, true); -} } // namespace /// AMDGPU specific code to select AMDGPU machine instructions for @@ -110,10 +105,7 @@ protected: private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; - bool isInlineImmediate(const SDNode *N, bool Negated = false) const; - bool isNegInlineImmediate(const SDNode *N) const { - return isInlineImmediate(N, true); - } + bool isInlineImmediate(const SDNode *N) const; bool isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm()); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5f2b7c0..b7f0438 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1865,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: return &APFloat::IEEEsingle(); @@ -1879,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case 
AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); @@ -2033,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // We allow fp literals with f16x2 operands assuming that the specified // literal goes into the lower half and the upper half is zero. We also // require that the literal may be losslessly converted to f16. - MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : - (type == MVT::v2i16)? MVT::i16 : - (type == MVT::v2f32)? MVT::f32 : type; + // + // For i16x2 operands, we assume that the specified literal is encoded as a + // single-precision float. This is pretty odd, but it matches SP3 and what + // happens in hardware. + MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16 + : (type == MVT::v2i16) ? MVT::f32 + : (type == MVT::v2f32) ? 
MVT::f32 + : type; APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); @@ -3401,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16) - return AMDGPU::isInlinableIntLiteralV216(Val); + return AMDGPU::isInlinableLiteralV2I16(Val); if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteralV2F16(Val); return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 67be7b0..9dff3f6 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \ false, ImmWidth) +#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \ + DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth) + // Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc) // and decode using 'enum10' from decodeSrcOp. 
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \ @@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16) + DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 91a7093..b85eb76 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1096,7 +1096,7 @@ public: bool hasDstSelForwardingHazard() const { return GFX940Insts; } // Cannot use op_sel with v_dot instructions. - bool hasDOTOpSelHazard() const { return GFX940Insts; } + bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } // Does not have HW interlocs for VALU writing and then reading SGPRs. bool hasVDecCoExecHazard() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ef1b85f..6c7977e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, } } -void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - int16_t SImm = static_cast<int16_t>(Imm); - if (isInlinableIntLiteral(SImm)) { - O << SImm; - return; - } - +// This must accept a 32-bit immediate value to correctly handle packed 16-bit +// operations. 
+static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == 0x3C00) - O<< "1.0"; + O << "1.0"; else if (Imm == 0xBC00) - O<< "-1.0"; + O << "-1.0"; else if (Imm == 0x3800) - O<< "0.5"; + O << "0.5"; else if (Imm == 0xB800) - O<< "-0.5"; + O << "-0.5"; else if (Imm == 0x4000) - O<< "2.0"; + O << "2.0"; else if (Imm == 0xC000) - O<< "-2.0"; + O << "-2.0"; else if (Imm == 0x4400) - O<< "4.0"; + O << "4.0"; else if (Imm == 0xC400) - O<< "-4.0"; - else if (Imm == 0x3118 && - STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) { + O << "-4.0"; + else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; - } else { - uint64_t Imm16 = static_cast<uint16_t>(Imm); - O << formatHex(Imm16); - } -} + else + return false; -void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - printImmediate16(Lo16, STI, O); + return true; } -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, +void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { + int16_t SImm = static_cast<int16_t>(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + uint16_t HImm = static_cast<uint16_t>(Imm); + if (printImmediateFloat16(HImm, STI, O)) + return; + + uint64_t Imm16 = static_cast<uint16_t>(Imm); + O << formatHex(Imm16); +} + +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, + raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); - if (SImm >= -16 && SImm <= 64) { + if (isInlinableIntLiteral(SImm)) { O << SImm; return; } + switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + if (printImmediateFloat32(Imm, STI, O)) + return; + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case 
AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + if (isUInt<16>(Imm) && + printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O)) + return; + break; + default: + llvm_unreachable("bad operand type"); + } + + O << formatHex(static_cast<uint64_t>(Imm)); +} + +bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == llvm::bit_cast<uint32_t>(0.0f)) O << "0.0"; else if (Imm == llvm::bit_cast<uint32_t>(1.0f)) @@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; else - O << formatHex(static_cast<uint64_t>(Imm)); + return false; + + return true; +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int32_t SImm = static_cast<int32_t>(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + if (printImmediateFloat32(Imm, STI, O)) + return; + + O << formatHex(static_cast<uint64_t>(Imm)); } void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, @@ -755,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, break; case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: - if (!isUInt<16>(Op.getImm()) && - STI.hasFeature(AMDGPU::FeatureVOP3Literal)) { - printImmediate32(Op.getImm(), STI, O); - break; - } - - // Deal with 16-bit FP inline immediates not working. 
- if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) { - printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O); - break; - } - [[fallthrough]]; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O); - break; case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - printImmediateV216(Op.getImm(), STI, O); + printImmediateV216(Op.getImm(), OpTy, STI, O); break; case MCOI::OPERAND_UNKNOWN: case MCOI::OPERAND_PCREL: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index f2f985f..e3958f8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -88,8 +88,10 @@ private: raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); - void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O); + void printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, raw_ostream &O); + bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index b403d69..de1abaf 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, // which does not have f16 support? 
return getLit16Encoding(static_cast<uint16_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: { - if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) - return getLit32Encoding(static_cast<uint32_t>(Imm), STI); - if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return getLit16Encoding(static_cast<uint16_t>(Imm), STI); - [[fallthrough]]; - } case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); + return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm)) + .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - uint32_t Encoding = getLit16Encoding(Lo16, STI); - return Encoding; - } + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm)) + .value_or(255); case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: return MO.getImm(); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 709de61..aa7639a 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || - (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) || - isUInt<16>(Fold.ImmToFold) || - !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm())) + (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; unsigned Opcode = MI->getOpcode(); @@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); unsigned Opcode = MI->getOpcode(); 
int OpNo = MI->getOperandNo(&Old); + uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; + + // If the literal can be inlined as-is, apply it and short-circuit the + // tests below. The main motivation for this is to avoid unintuitive + // uses of opsel. + if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } - // Set op_sel/op_sel_hi on this operand or bail out if op_sel is - // already set. + // Refer to op_sel/op_sel_hi and check if we can change the immediate and + // op_sel in a way that allows an inline constant. int ModIdx = -1; - if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) + unsigned SrcIdx = ~0; + if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) { ModIdx = AMDGPU::OpName::src0_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) + SrcIdx = 0; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) { ModIdx = AMDGPU::OpName::src1_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) + SrcIdx = 1; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) { ModIdx = AMDGPU::OpName::src2_modifiers; + SrcIdx = 2; + } assert(ModIdx != -1); ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); MachineOperand &Mod = MI->getOperand(ModIdx); - unsigned Val = Mod.getImm(); - if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) + unsigned ModVal = Mod.getImm(); + + uint16_t ImmLo = static_cast<uint16_t>( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0)); + uint16_t ImmHi = static_cast<uint16_t>( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0)); + uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo; + unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + + // Helper function that attempts to inline the given value with a newly + // chosen opsel pattern. 
+ auto tryFoldToInline = [&](uint32_t Imm) -> bool { + if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(Imm); + return true; + } + + // Try to shuffle the halves around and leverage opsel to get an inline + // constant. + uint16_t Lo = static_cast<uint16_t>(Imm); + uint16_t Hi = static_cast<uint16_t>(Imm >> 16); + if (Lo == Hi) { + if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(Lo); + return true; + } + + if (static_cast<int16_t>(Lo) < 0) { + int32_t SExt = static_cast<int16_t>(Lo); + if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(SExt); + return true; + } + } + + // This check is only useful for integer instructions + if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 || + OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) { + if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16); + return true; + } + } + } else { + uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi; + if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0); + Old.ChangeToImmediate(Swapped); + return true; + } + } + return false; + }; - // Only apply the following transformation if that operand requires - // a packed immediate. - // If upper part is all zero we do not need op_sel_hi. 
- if (!(Fold.ImmToFold & 0xffff)) { - MachineOperand New = - MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + if (tryFoldToInline(Imm)) return true; + + // Replace integer addition by subtraction and vice versa if it allows + // folding the immediate to an inline constant. + // + // We should only ever get here for SrcIdx == 1 due to canonicalization + // earlier in the pipeline, but we double-check here to be safe / fully + // general. + bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16; + bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16; + if (SrcIdx == 1 && (IsUAdd || IsUSub)) { + unsigned ClampIdx = + AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp); + bool Clamp = MI->getOperand(ClampIdx).getImm() != 0; + + if (!Clamp) { + uint16_t NegLo = -static_cast<uint16_t>(Imm); + uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16); + uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo; + + if (tryFoldToInline(NegImm)) { + unsigned NegOpcode = + IsUAdd ? 
AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16; + MI->setDesc(TII->get(NegOpcode)); + return true; + } + } } - MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); - return true; + + return false; } bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { @@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); assert(Old.isReg()); - if (Fold.isImm() && canUseImmWithOpSel(Fold)) - return tryFoldImmWithOpSel(Fold); + if (Fold.isImm() && canUseImmWithOpSel(Fold)) { + if (tryFoldImmWithOpSel(Fold)) + return true; + + // We can't represent the candidate as an inline constant. Try as a literal + // with the original opsel, checking constant bus limitations. + MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold); + int OpNo = MI->getOperandNo(&Old); + if (!TII->isOperandLegal(*MI, OpNo, &New)) + return false; + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 396d22c..6799292 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4153,15 +4153,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return (isInt<16>(Imm) || isUInt<16>(Imm)) && - AMDGPU::isInlinableIntLiteral((int16_t)Imm); + return AMDGPU::isInlinableLiteralV2I16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return 
AMDGPU::isInlinableLiteralV2F16(Imm); case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 50724fd..f07b8fa0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -860,23 +860,6 @@ def ShiftAmt32Imm : ImmLeaf <i32, [{ return Imm < 32; }]>; -def getNegV2I16Imm : SDNodeXForm<build_vector, [{ - return SDValue(packNegConstantV2I16(N, *CurDAG), 0); -}]>; - -def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ - assert(N->getNumOperands() == 2); - assert(N->getOperand(0).getValueType().getSizeInBits() == 16); - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - if (Src0 == Src1) - return isNegInlineImmediate(Src0.getNode()); - - return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) || - (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); -}], getNegV2I16Imm>; - - def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ return fp16SrcZerosHighBits(N->getOpcode()); }]>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c94b894..1d197dc 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1152,11 +1152,11 @@ class RegOrF32 <string RegisterClass, string OperandTypePrefix> class RegOrV2B16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16", - !subst("_v2b16", "V2B16", NAME), 
"_Imm16">; + !subst("_v2b16", "V2B16", NAME), "_ImmV2I16">; class RegOrV2F16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP16", - !subst("_v2f16", "V2F16", NAME), "_Imm16">; + !subst("_v2f16", "V2F16", NAME), "_ImmV2F16">; class RegOrF64 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64", diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a91d771..26ba257 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2506,53 +2506,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { Val == 0x3118; // 1/2pi } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - if (isInt<16>(Literal) || isUInt<16>(Literal)) { - int16_t Trunc = static_cast<int16_t>(Literal); - return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi); +std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) { + // Unfortunately, the Instruction Set Architecture Reference Guide is + // misleading about how the inline operands work for (packed) 16-bit + // instructions. In a nutshell, the actual HW behavior is: + // + // - integer encodings (-16 .. 
64) are always produced as sign-extended + // 32-bit values + // - float encodings are produced as: + // - for F16 instructions: corresponding half-precision float values in + // the LSBs, 0 in the MSBs + // - for UI16 instructions: corresponding single-precision float value + int32_t Signed = static_cast<int32_t>(Literal); + if (Signed >= 0 && Signed <= 64) + return 128 + Signed; + + if (Signed >= -16 && Signed <= -1) + return 192 + std::abs(Signed); + + if (IsFloat) { + // clang-format off + switch (Literal) { + case 0x3800: return 240; // 0.5 + case 0xB800: return 241; // -0.5 + case 0x3C00: return 242; // 1.0 + case 0xBC00: return 243; // -1.0 + case 0x4000: return 244; // 2.0 + case 0xC000: return 245; // -2.0 + case 0x4400: return 246; // 4.0 + case 0xC400: return 247; // -4.0 + case 0x3118: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on + } else { + // clang-format off + switch (Literal) { + case 0x3F000000: return 240; // 0.5 + case 0xBF000000: return 241; // -0.5 + case 0x3F800000: return 242; // 1.0 + case 0xBF800000: return 243; // -1.0 + case 0x40000000: return 244; // 2.0 + case 0xC0000000: return 245; // -2.0 + case 0x40800000: return 246; // 4.0 + case 0xC0800000: return 247; // -4.0 + case 0x3E22F983: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on } - if (!(Literal & 0xffff)) - return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi); - int16_t Lo16 = static_cast<int16_t>(Literal); - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); + return {}; } -bool isInlinableIntLiteralV216(int32_t Literal) { - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_IU16 instruction +// or nullopt. 
+std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal) { + return getInlineEncodingV216(false, Literal); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return isInlinableIntLiteral(Hi16); - return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_F16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) { + return getInlineEncodingV216(true, Literal); } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) { +// Whether the given literal can be inlined for a V_PK_* instruction. +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + return getInlineEncodingV216(false, Literal).has_value(); case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - return isInlinableLiteralV216(Literal, HasInv2Pi); + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return getInlineEncodingV216(true, Literal).has_value(); default: - return isInlinableIntLiteralV216(Literal); + llvm_unreachable("bad packed operand type"); } } -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return true; +// Whether the given literal can be inlined for a V_PK_*_IU16 instruction. +bool isInlinableLiteralV2I16(uint32_t Literal) { + return getInlineEncodingV2I16(Literal).has_value(); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return true; - return Lo16 == Hi16; +// Whether the given literal can be inlined for a V_PK_*_F16 instruction. 
+bool isInlinableLiteralV2F16(uint32_t Literal) { + return getInlineEncodingV2F16(Literal).has_value(); } bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3c9f330..50c7417 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1291,16 +1291,19 @@ LLVM_READNONE bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal); LLVM_READNONE -bool isInlinableIntLiteralV216(int32_t Literal); +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType); +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType); LLVM_READNONE -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); +bool isInlinableLiteralV2I16(uint32_t Literal); + +LLVM_READNONE +bool isInlinableLiteralV2F16(uint32_t Literal); LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7f52501..985b77b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -125,15 +125,6 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2 let SubtargetPredicate = HasVOP3PInsts in { -// Undo sub x, c -> add x, -c canonicalization since c is more likely -// an inline immediate than -c. -// The constant will be emitted as a mov, and folded later. 
-// TODO: We could directly encode the immediate now -def : GCNPat< - (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1), - (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) ->; - // Integer operations with clamp bit set. class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat< (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index e4cabab..496ee9f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -172,8 +172,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 64 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -188,7 +187,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v0, 0xffc0, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 64 op_sel_hi:[1,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %a, <i16 -64, i16 -64> ret <2 x i16> %add @@ -609,3 +608,65 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha %cast = bitcast <2 x i16> %add to i32 ret i32 %cast } + +define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) { +; GFX7-LABEL: add_inline_imm_neg1_0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_inline_imm_neg1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: add_inline_imm_neg1_0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_add_u16_e32 v0, -1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: add_inline_imm_neg1_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %y = add <2 x i16> %x, <i16 -1, i16 0> + ret <2 x i16> %y +} + +define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) { +; GFX7-LABEL: add_inline_imm_1_0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_inline_imm_1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: add_inline_imm_1_0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_add_u16_e32 v0, 1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: add_inline_imm_1_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %y = add <2 x i16> %x, <i16 1, i16 0> + ret <2 x i16> %y +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll index aa7aa6b..5613501 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll @@ -156,13 +156,13 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; 
GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xffc0ffc0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_splat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xffc0ffc0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sub = sub <2 x i16> %a, <i16 -64, i16 -64> ret <2 x i16> %sub diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index b90d68a..7cf58a2 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, -1 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -449,7 +449,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, -1 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -460,7 +460,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, -1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
@@ -566,8 +566,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s2, 1.0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -579,7 +578,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -590,7 +589,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -990,6 +989,66 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ret void } +define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) { +; VI-LABEL: add_inline_imm_neg1_0: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, -1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_inline_imm_neg1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: add_inline_imm_neg1_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: 
v_pk_sub_u16 v0, v0, 1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_inline_imm_neg1_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %y = add <2 x i16> %x, <i16 -1, i16 0> + ret <2 x i16> %y +} + +define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) { +; VI-LABEL: add_inline_imm_1_0: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, 1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_inline_imm_1_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: add_inline_imm_1_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_inline_imm_1_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %y = add <2 x i16> %x, <i16 1, i16 0> + ret <2 x i16> %y +} + declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index cb89841b..d63ebde 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -431,7 +431,7 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { ; ; GFX11-LABEL: ps_mesa_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -468,7 +468,7 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> 
inreg %arg0) { ; ; GFX11-LABEL: ps_mesa_inreg_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_pk_sub_u16 v0, s0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 329f0a2..dfc8361 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -597,7 +597,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_read_u16_d16_hi v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GCN-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GCN-NEXT: ds_read_u16_d16 v1, v0 offset:2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 @@ -608,7 +608,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_u16_d16_hi v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v1 @@ -619,7 +619,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_u16_d16_hi v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX11-NEXT: ds_load_u16_d16 v1, v0 offset:2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 @@ -643,7 +643,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; GFX900-NEXT: ds_read_u16_d16_hi 
v0, v0 ; GFX900-NEXT: s_mov_b32 s4, 0xffff ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -654,7 +654,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0 ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v1, v0 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -664,7 +664,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -674,7 +674,7 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; GFX11-NEXT: ds_load_u16 v1, v0 offset:2 ; GFX11-NEXT: ds_load_u16_d16_hi v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -694,7 +694,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 ; GFX900-NEXT: 
s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, v1 @@ -705,7 +705,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 @@ -716,7 +716,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) -; GFX10_DEFAULT-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX10_DEFAULT-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX10_DEFAULT-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1 @@ -727,7 +727,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) -; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; FLATSCR_GFX10-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 @@ -738,7 +738,7 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] ; GFX11-NEXT: 
scratch_load_d16_b16 v1, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v1 @@ -762,7 +762,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_mov_b32 s4, 0xffff -; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -774,7 +774,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; FLATSCR-NEXT: global_load_short_d16_hi v0, v[0:1], off glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff -; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -785,7 +785,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -796,7 +796,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -820,7 +820,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_mov_b32 s4, 0xffff ; GFX900-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -833,7 +833,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -846,7 +846,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -857,7 +857,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 7894f6b..e12de1d 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -255,8 +255,8 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX10-GISEL: ; %bb.0: ; %bb ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX10-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e, v0 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: v_pk_fma_f16 
v0, 0x291e, v0, v1 op_sel_hi:[0,1,1] +; GFX10-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 ; GFX10-GISEL-NEXT: v_cmp_gt_f16_e64 s4, 0, v0 @@ -288,9 +288,9 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e, v0, v1 op_sel_hi:[0,1,1] +; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 0ff5ea6..3e658c6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -77,11 +77,29 @@ define <2 x half> @v_mul_42_v2f16(<2 x half> %x) { ; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_42_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_pk_mul_f16 v0, 0x5140, v0 op_sel_hi:[0,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_mul_42_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0x5140, v0 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_42_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: 
v_pk_mul_f16 v0, 0x51405140, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_42_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, 0x5140, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_42_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, 0x51405140, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x half> %x, <half 42.0, half 42.0> ret <2 x half> %mul } @@ -3192,11 +3210,29 @@ define <2 x half> @v_mul_16_v2f16(<2 x half> %x) { ; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_16_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_mul_16_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_16_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0x4c004c00, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_16_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_16_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, 0x4c004c00, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x half> %x, <half 16.0, half 16.0> ret <2 x half> %mul } @@ -3216,11 +3252,29 @@ define <2 x half> 
@v_mul_neg16_v2f16(<2 x half> %x) { ; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_neg16_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_pk_mul_f16 v0, 0xcc00, v0 op_sel_hi:[0,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_mul_neg16_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0xcc00, v0 op_sel_hi:[0,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_neg16_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0xcc00cc00, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_neg16_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, 0xcc00, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_neg16_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, 0xcc00cc00, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x half> %x, <half -16.0, half -16.0> ret <2 x half> %mul } @@ -3242,12 +3296,33 @@ define <2 x half> @v_mul_fabs_16_v2f16(<2 x half> %x) { ; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_fabs_16_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX1011-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_mul_fabs_16_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] +; 
GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_fabs_16_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, 0x4c004c00, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_fabs_16_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_fabs_16_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, 0x4c004c00, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %mul = fmul <2 x half> %x.fabs, <half 16.0, half 16.0> ret <2 x half> %mul @@ -3268,11 +3343,29 @@ define <2 x half> @v_fma_mul_add_32_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v0, v2, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_fma_mul_add_32_v2f16: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_pk_fma_f16 v0, 0x5000, v0, v1 op_sel_hi:[0,1,1] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_fma_mul_add_32_v2f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, 0x5000, v0, v1 op_sel_hi:[0,1,1] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_fma_mul_add_32_v2f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, 0x50005000, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_fma_mul_add_32_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x5000, v0, v1 op_sel_hi:[0,1,1] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_fma_mul_add_32_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x50005000, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul contract <2 x half> %x, <half 32.0, half 32.0> %fma = fadd contract <2 x half> %mul, %y ret <2 x half> %fma diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 3afcc7d..afb3a02 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -480,7 +480,7 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 0xbc00, s4 +; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], -1.0, s4 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -492,7 +492,7 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0xbc00, s2 +; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index 8c33004..b66ca71 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -580,7 +580,7 @@ define amdgpu_kernel void @add_inline_imm_64_v2f16(ptr addrspace(1) %out, <2 x h ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x38003800 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; 
encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0x38003800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x38] define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0.5, half 0.5> to <2 x i16>) ret <2 x i16> %y @@ -590,7 +590,7 @@ define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xb800b800 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0xb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0xb800b800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0xb8] define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> <half -0.5, half -0.5> to <2 x i16>) ret <2 x i16> %y @@ -600,7 +600,7 @@ define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0x3c003c00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x3c] define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> <half 1.0, half 1.0> to <2 x i16>) ret <2 x i16> %y @@ -610,27 +610,25 @@ define <2 x i16> 
@mul_inline_imm_1.0_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00bc00 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0xbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0xbc00bc00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0xbc] define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> <half -1.0, half -1.0> to <2 x i16>) ret <2 x i16> %y } ; GCN-LABEL: {{^}}shl_inline_imm_2.0_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40004000 -; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]] +; GFX9: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] -; GFX10: v_pk_lshlrev_b16 v0, v0, 0x4000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x40,0x00,0x00] +; GFX10: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xe9,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}}] define <2 x i16> @shl_inline_imm_2.0_v2i16(<2 x i16> %x) { %y = shl <2 x i16> bitcast (<2 x half> <half 2.0, half 2.0> to <2 x i16>), %x ret <2 x i16> %y } ; GCN-LABEL: {{^}}shl_inline_imm_neg_2.0_v2i16: -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc000c000 -; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]] +; GFX9: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] -; GFX10: v_pk_lshlrev_b16 v0, v0, 0xc000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc0,0x00,0x00] +; GFX10: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xeb,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}}] 
define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) { %y = shl <2 x i16> bitcast (<2 x half> <half -2.0, half -2.0> to <2 x i16>), %x ret <2 x i16> %y @@ -640,7 +638,7 @@ define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004400 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0x44004400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x44] define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> <half 4.0, half 4.0> to <2 x i16>) ret <2 x i16> %y @@ -651,7 +649,7 @@ define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc400c400 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0xc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0x00] +; GFX10: v_pk_mul_lo_u16 v0, 0xc400c400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0xc4] define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> <half -4.0, half -4.0> to <2 x i16>) ret <2 x i16> %y @@ -661,7 +659,7 @@ define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) { ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x31183118 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] -; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00] +; GFX10: 
v_pk_mul_lo_u16 v0, 0x31183118, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x18,0x31] define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) { %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3118, half 0xH3118> to <2 x i16>) ret <2 x i16> %y diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index e2a3749..8874240 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -473,89 +473,47 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: clpeak_imad_pat_v2i16: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: clpeak_imad_pat_v2i16: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i16: -; 
GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i16: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: clpeak_imad_pat_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; 
GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: clpeak_imad_pat_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: clpeak_imad_pat_v2i16: +; GFX11: ; %bb.0: ; %entry 
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i16> %x, <i16 1, i16 1> %add = mul <2 x i16> %y18, %y @@ -733,18 +691,18 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 ; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v4, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v3, v5, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: 
v_pk_mul_lo_u16 v0, v0, v5 @@ -775,18 +733,18 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX10-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -817,8 +775,8 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX11-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 @@ -828,11 +786,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -1130,18 +1088,18 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 ; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v4, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v3, v5, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; 
GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -1172,18 +1130,18 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX10-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -1214,8 +1172,8 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-SDAG-LABEL: clpeak_imad_pat_v4i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 
op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 @@ -1225,11 +1183,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -1555,89 +1513,47 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: clpeak_umad_pat_v2i16: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX9-GISEL-LABEL: clpeak_umad_pat_v2i16: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: clpeak_umad_pat_v2i16: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: clpeak_umad_pat_v2i16: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: clpeak_umad_pat_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; 
GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_umad_pat_v2i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: clpeak_umad_pat_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: clpeak_umad_pat_v2i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: clpeak_umad_pat_v2i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i16> %x, <i16 1, i16 1> %add = mul <2 x i16> %y18, %y @@ -1815,18 +1731,18 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 ; GFX9-SDAG-NEXT: v_pk_add_u16 v1, 
v5, v1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v4, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v3, v5, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -1857,18 +1773,18 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX10-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -1899,8 +1815,8 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x 
i16> %x, <3 x i16> %y) { ; GFX11-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 @@ -1910,11 +1826,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 +; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -2212,18 +2128,18 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX9-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 
v4, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3 ; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1 ; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v4, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v3, v5, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -2254,18 +2170,18 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX10-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 ; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] +; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; 
GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 @@ -2296,8 +2212,8 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-SDAG-LABEL: clpeak_umad_pat_v4i16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 @@ -2307,11 +2223,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -7277,143 +7193,74 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, 
v1, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: s_setpc_b64 
s[30:31] -; -; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: s_setpc_b64 
s[30:31] +; GFX9-LABEL: clpeak_imad_pat_v2i16_x2: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) 
| instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: clpeak_imad_pat_v2i16_x2: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v0, v2 +; GFX10-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: clpeak_imad_pat_v2i16_x2: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX11-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX11-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i16> %x, <i16 1, i16 1> %add = mul <2 x i16> %y38, %y @@ -7654,143 +7501,74 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; 
GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; 
GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: clpeak_umad_pat_v2i16_x2: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: clpeak_umad_pat_v2i16_x2: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v0, v2 +; GFX10-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v2, v0 +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX10-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] 
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: clpeak_umad_pat_v2i16_x2: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v0, v2, v0 +; GFX11-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX11-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1 +; GFX11-NEXT: v_pk_add_u16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i16> %x, <i16 1, i16 1> %add = mul <2 x i16> %y38, %y @@ -8373,6 +8151,24 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) { ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: mul_u24_add64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: mul_u24_add64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1 +; GFX11-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y) %add = add i64 %mul, %z ret i64 
%add @@ -8410,6 +8206,15 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) { ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: mul_u24_zext_add64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y) %mul.zext = zext i32 %mul to i64 %add = add i64 %mul.zext, %z diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index 54bd78e..66f159f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -75,26 +75,15 @@ entry: ; Make sure we do not violate constant bus restriction with 3 scalar inputs and seemingly inlinable literal. 
define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis( -; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis: -; SDAG-GFX11: ; %bb.0: ; %entry -; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, s1 -; SDAG-GFX11-NEXT: s_mov_b32 s1, 0x10001 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; SDAG-GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, s1, v2 -; SDAG-GFX11-NEXT: global_store_b16 v[0:1], v2, off -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; SDAG-GFX11-NEXT: s_endpgm -; -; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis: -; GISEL-GFX11: ; %bb.0: ; %entry -; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, 0x10001 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, v2, s1 -; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GISEL-GFX11-NEXT: s_endpgm +; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, 0x10001, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, <2 x i16> inreg %a, i16 inreg %c) { diff --git a/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll b/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll index 81918f5..e96570d 100644 --- a/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll +++ b/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll @@ -23,7 +23,7 @@ bb: %tmp1 = zext i32 %tmp to i64 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4 - %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH3C00, half 0xH0000>) + %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 
0xH3C00, half 0xH0000>) store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4 ret void } @@ -96,7 +96,7 @@ bb: ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_41c8: ; GFX9: s_mov_b32 [[C:s[0-9]+]], 0x41c80000 ; GFX9: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, [[C]]{{$}} -; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c8, v{{[0-9]+}} op_sel:[1,0] op_sel_hi:[0,1]{{$}} +; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c80000, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_pk_max_f16_literal_0_41c8(ptr addrspace(1) nocapture %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index 536b2d0..3c654e9 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -1622,14 +1622,14 @@ define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX9-LABEL: v_mul_add_1_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_add_1_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %y, <i16 1, i16 1> @@ -1665,14 +1665,14 @@ define <2 x i16> @v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) { ; GFX9-LABEL: v_mul_add_1_v2i16_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_add_1_v2i16_commute: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %y, <i16 1, i16 1> @@ -1886,14 +1886,14 @@ define <2 x i16> @v_mul_add_2_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX9-LABEL: v_mul_add_2_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v1, v1, -2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_add_2_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v1, v1, -2 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %y, <i16 2, i16 2> @@ -2929,14 +2929,14 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, 5 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_5_add_1_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, 5 op_sel_hi:[1,0] -; GFX10-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = mul <2 x i16> %arg, <i16 5, i16 5> %add = add <2 x i16> %mul, <i16 1, i16 1> diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index a8ae8c0..73f2834 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -2399,7 
+2399,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -2410,7 +2410,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2534,7 +2534,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x44000000 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -2545,7 +2545,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x44000000 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2645,76 +2645,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffe0ffe0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 
32 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -2803,76 +2767,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffe00000 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX10-GISEL: ; %bb.0: -; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_0_neg32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_0_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -2963,76 +2891,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffe0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; 
GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg32_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3128,75 +3020,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; 
GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; 
GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3285,75 +3142,40 @@ define amdgpu_kernel void 
@v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: 
s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_0_neg16: +; GFX11: ; %bb.0: 
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3444,75 +3266,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX10-SDAG-NEXT: 
global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, -16 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3613,9 +3400,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x3c003c00 +; GFX9-SDAG-NEXT: s_mov_b32 s2, 0xc400c400 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, s2 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 ; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; @@ -3631,53 +3418,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: 
v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc400, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc400, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3778,9 +3541,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0xbc00bc00 +; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x44004400 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, s2 +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 ; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; @@ -3796,53 +3559,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, 0x44004400, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, 0x44004400, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3937,77 +3676,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0xc000c000 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, s2 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x40004000 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0xc000 
op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x4000, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: -; GFX11-GISEL: ; %bb.0: -; 
GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x4000, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -4102,77 +3804,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x40004000 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, s2 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc000c000 -; 
GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc000, v1 op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: 
global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc000, v1 op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -4260,76 +3925,40 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX9-SDAG: ; %bb.0: -; 
GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffe00000 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1] -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1] -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -4455,7 +4084,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1] +; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; @@ -4479,7 +4108,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffe0, v1 op_sel_hi:[0,1] +; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 9a6851c..b237703 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -6,7 +6,7 @@ ; GFX9: s_load_dword [[VAL:s[0-9]+]] ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] -; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] ; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 ; CIVI: s_sub_i32 @@ -30,7 +30,7 @@ define amdgpu_kernel void 
@s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 ; GFX9: global_load_dword [[VAL:v[0-9]+]] ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] -; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] ; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; VI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, @@ -70,7 +70,7 @@ define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9: s_load_dword [[VAL:s[0-9]+]] ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] -; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) #0 { %z0 = insertelement <2 x i16> undef, i16 0, i16 0 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 @@ -88,7 +88,7 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) ; GFX9: global_load_dword [[VAL:v[0-9]+]] ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] -; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 { %z0 = insertelement <2 x i16> undef, i16 0, i16 0 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 @@ -111,8 +111,8 @@ define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[#LOAD + 3]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[#LOAD + 2]], [[SUB0]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[#LOAD + 3]], [[SUB1]] -; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0] -; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0] +; 
GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0] +; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 { %z0 = insertelement <4 x i16> undef, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 @@ -135,11 +135,11 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]] -; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0] +; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0] ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]] -; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0] +; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 { %z0 = insertelement <4 x i16> undef, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index aedf06d..a2712ec 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -460,7 +460,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: 
v_pk_sub_i16 v0, v0, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -473,7 +473,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -562,13 +562,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -600,7 +599,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -613,7 +612,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_sub_i16 v0, v0, 1.0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll 
b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index e46992c..819e5e8 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -369,13 +369,13 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { ; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: vec_smax_smin: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_i16 v0, v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: vec_smax_smin: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: vec_smax_smin: ; GISEL-VI: ; %bb.0: @@ -396,6 +396,14 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff00ff ; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: vec_smax_smin: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> <i16 0, i16 0>) %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> <i16 255, i16 255>) ret <2 x i16> %src.clamp @@ -548,13 +556,13 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) { ; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: vec_smin_smax: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_i16 v0, v0, 0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: vec_smin_smax: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: vec_smin_smax: ; GISEL-VI: ; %bb.0: @@ -575,7 +583,17 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) { ; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v0, v1 ; GISEL-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: vec_smin_smax: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] %src.min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src, <2 x i16> <i16 255, i16 255>) %src.clamp = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src.min, <2 x i16> <i16 0, i16 0>) ret <2 x i16> %src.clamp } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s index 45a320a..829b0eb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p.s @@ -463,7 +463,7 @@ v_pk_add_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18] v_pk_add_i16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x02,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18] v_pk_add_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x18] @@ -477,9 +477,12 @@ v_pk_add_i16 v5, null, exec_lo v_pk_add_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x02,0xcc,0xc1,0xfe,0x00,0x00] -v_pk_add_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +v_pk_add_i16 v5, 0x3800, m0 op_sel:[0,0] op_sel_hi:[1,1] // GFX11: [0x05,0x40,0x02,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +v_pk_add_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX11: [0x05,0x40,0x02,0xcc,0xf0,0xfa,0x00,0x18] + v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x10] @@ -508,7 +511,7 @@ v_pk_add_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18] v_pk_add_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x0a,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18] v_pk_add_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x18] @@ -523,7 +526,7 @@ v_pk_add_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x0a,0xcc,0xc1,0xfe,0x00,0x00] v_pk_add_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x0a,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0a,0xcc,0xf0,0xfa,0x00,0x18] v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x10] @@ -553,7 +556,7 @@ v_pk_ashrrev_i16 v5, ttmp15, src_scc // GFX11: 
[0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18] v_pk_ashrrev_i16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x06,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18] v_pk_ashrrev_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x18] @@ -568,7 +571,7 @@ v_pk_ashrrev_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x06,0xcc,0xc1,0xfe,0x00,0x00] v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x06,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x06,0xcc,0xf0,0xfa,0x00,0x18] v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x10] @@ -643,7 +646,7 @@ v_pk_lshlrev_b16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18] v_pk_lshlrev_b16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x04,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18] v_pk_lshlrev_b16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x18] @@ -658,7 +661,7 @@ v_pk_lshlrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x04,0xcc,0xc1,0xfe,0x00,0x00] v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x04,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x04,0xcc,0xf0,0xfa,0x00,0x18] v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x04,0xcc,0xfd,0xd4,0x00,0x10] @@ -688,7 +691,7 @@ v_pk_lshrrev_b16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18] v_pk_lshrrev_b16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x05,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18] v_pk_lshrrev_b16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x18] @@ -703,7 +706,7 @@ v_pk_lshrrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x05,0xcc,0xc1,0xfe,0x00,0x00] 
v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x05,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x05,0xcc,0xf0,0xfa,0x00,0x18] v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x10] @@ -733,7 +736,7 @@ v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 // GFX11: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] v_pk_mad_i16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] -// GFX11: [0x05,0x00,0x00,0xcc,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x00,0x00,0xcc,0x7d,0xe0,0xf5,0x01] v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] // GFX11: [0x05,0x40,0x00,0xcc,0x7e,0x82,0xad,0x01] @@ -748,7 +751,7 @@ v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] // GFX11: [0x05,0x40,0x00,0xcc,0xc1,0xfe,0xf4,0x1b] v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] -// GFX11: [0x05,0x48,0x00,0xcc,0xff,0xfa,0xfc,0x13,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] // GFX11: [0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] @@ -778,7 +781,7 @@ v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 // GFX11: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] v_pk_mad_u16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] -// GFX11: [0x05,0x00,0x09,0xcc,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x00,0x09,0xcc,0x7d,0xe0,0xf5,0x01] v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] // GFX11: [0x05,0x40,0x09,0xcc,0x7e,0x82,0xad,0x01] @@ -793,7 +796,7 @@ v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] // GFX11: [0x05,0x40,0x09,0xcc,0xc1,0xfe,0xf4,0x1b] v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] -// GFX11: [0x05,0x48,0x09,0xcc,0xff,0xfa,0xfc,0x13,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] // GFX11: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] @@ 
-868,7 +871,7 @@ v_pk_max_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18] v_pk_max_i16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x07,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18] v_pk_max_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x18] @@ -883,7 +886,7 @@ v_pk_max_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x07,0xcc,0xc1,0xfe,0x00,0x00] v_pk_max_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x07,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x07,0xcc,0xf0,0xfa,0x00,0x18] v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x10] @@ -913,7 +916,7 @@ v_pk_max_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18] v_pk_max_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x0c,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18] v_pk_max_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x18] @@ -928,7 +931,7 @@ v_pk_max_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x0c,0xcc,0xc1,0xfe,0x00,0x00] v_pk_max_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x0c,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0c,0xcc,0xf0,0xfa,0x00,0x18] v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x10] @@ -1003,7 +1006,7 @@ v_pk_min_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18] v_pk_min_i16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x08,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18] v_pk_min_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x18] @@ -1018,7 +1021,7 @@ v_pk_min_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x08,0xcc,0xc1,0xfe,0x00,0x00] 
v_pk_min_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x08,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x08,0xcc,0xf0,0xfa,0x00,0x18] v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x10] @@ -1048,7 +1051,7 @@ v_pk_min_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18] v_pk_min_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x0d,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18] v_pk_min_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x18] @@ -1063,7 +1066,7 @@ v_pk_min_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x0d,0xcc,0xc1,0xfe,0x00,0x00] v_pk_min_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x0d,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0d,0xcc,0xf0,0xfa,0x00,0x18] v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x10] @@ -1138,7 +1141,7 @@ v_pk_mul_lo_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18] v_pk_mul_lo_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x01,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18] v_pk_mul_lo_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x18] @@ -1153,7 +1156,7 @@ v_pk_mul_lo_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x01,0xcc,0xc1,0xfe,0x00,0x00] v_pk_mul_lo_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x01,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x01,0xcc,0xf0,0xfa,0x00,0x18] v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x10] @@ -1183,7 +1186,7 @@ v_pk_sub_i16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18] v_pk_sub_i16 v5, m0, 0.5 -// GFX11: 
[0x05,0x40,0x03,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18] v_pk_sub_i16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x18] @@ -1198,7 +1201,7 @@ v_pk_sub_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x03,0xcc,0xc1,0xfe,0x00,0x00] v_pk_sub_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x03,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x03,0xcc,0xf0,0xfa,0x00,0x18] v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x10] @@ -1228,7 +1231,7 @@ v_pk_sub_u16 v5, ttmp15, src_scc // GFX11: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18] v_pk_sub_u16 v5, m0, 0.5 -// GFX11: [0x05,0x40,0x0b,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18] v_pk_sub_u16 v5, exec_lo, -1 // GFX11: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x18] @@ -1243,7 +1246,7 @@ v_pk_sub_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX11: [0x05,0x58,0x0b,0xcc,0xc1,0xfe,0x00,0x00] v_pk_sub_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX11: [0x05,0x40,0x0b,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX11: [0x05,0x40,0x0b,0xcc,0xf0,0xfa,0x00,0x18] v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX11: [0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x10] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s index 9a21f7a..a8347fb 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s @@ -463,7 +463,7 @@ v_pk_add_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18] v_pk_add_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x02,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18] v_pk_add_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x18] @@ -478,7 +478,7 @@ v_pk_add_i16 v5, -1, exec_hi op_sel:[1,1] 
op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x02,0xcc,0xc1,0xfe,0x00,0x00] v_pk_add_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x02,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x02,0xcc,0xf0,0xfa,0x00,0x18] v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x10] @@ -508,7 +508,7 @@ v_pk_add_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18] v_pk_add_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x0a,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18] v_pk_add_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x18] @@ -523,7 +523,7 @@ v_pk_add_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x0a,0xcc,0xc1,0xfe,0x00,0x00] v_pk_add_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x0a,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0a,0xcc,0xf0,0xfa,0x00,0x18] v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x10] @@ -553,7 +553,7 @@ v_pk_ashrrev_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18] v_pk_ashrrev_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x06,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18] v_pk_ashrrev_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x18] @@ -568,7 +568,7 @@ v_pk_ashrrev_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x06,0xcc,0xc1,0xfe,0x00,0x00] v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x06,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x06,0xcc,0xf0,0xfa,0x00,0x18] v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x10] @@ -643,7 +643,7 @@ v_pk_lshlrev_b16 v5, ttmp15, src_scc // GFX12: 
[0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18] v_pk_lshlrev_b16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x04,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18] v_pk_lshlrev_b16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x18] @@ -658,6 +658,9 @@ v_pk_lshlrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x04,0xcc,0xc1,0xfe,0x00,0x00] v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] +// GFX12: [0x05,0x40,0x04,0xcc,0xf0,0xfa,0x00,0x18] + +v_pk_lshlrev_b16 v5, 0x3800, m0 op_sel:[0,0] op_sel_hi:[1,1] // GFX12: [0x05,0x40,0x04,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] @@ -688,7 +691,7 @@ v_pk_lshrrev_b16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18] v_pk_lshrrev_b16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x05,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18] v_pk_lshrrev_b16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x18] @@ -703,7 +706,7 @@ v_pk_lshrrev_b16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x05,0xcc,0xc1,0xfe,0x00,0x00] v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x05,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x05,0xcc,0xf0,0xfa,0x00,0x18] v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x10] @@ -733,7 +736,7 @@ v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 // GFX12: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] v_pk_mad_i16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] -// GFX12: [0x05,0x00,0x00,0xcc,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x00,0x00,0xcc,0x7d,0xe0,0xf5,0x01] v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] // GFX12: [0x05,0x40,0x00,0xcc,0x7e,0x82,0xad,0x01] @@ -748,7 +751,7 @@ v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] 
// GFX12: [0x05,0x40,0x00,0xcc,0xc1,0xfe,0xf4,0x1b] v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] -// GFX12: [0x05,0x48,0x00,0xcc,0xff,0xfa,0xfc,0x13,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] // GFX12: [0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] @@ -778,7 +781,7 @@ v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 // GFX12: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] v_pk_mad_u16 v5, m0, 0.5, m0 op_sel_hi:[0,0,0] -// GFX12: [0x05,0x00,0x09,0xcc,0x7d,0xfe,0xf5,0x01,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x00,0x09,0xcc,0x7d,0xe0,0xf5,0x01] v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,1] // GFX12: [0x05,0x40,0x09,0xcc,0x7e,0x82,0xad,0x01] @@ -793,7 +796,7 @@ v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[0,0,0] op_sel_hi:[1,1,1] // GFX12: [0x05,0x40,0x09,0xcc,0xc1,0xfe,0xf4,0x1b] v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] -// GFX12: [0x05,0x48,0x09,0xcc,0xff,0xfa,0xfc,0x13,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] // GFX12: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] @@ -868,7 +871,7 @@ v_pk_max_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18] v_pk_max_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x07,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18] v_pk_max_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x18] @@ -883,7 +886,7 @@ v_pk_max_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x07,0xcc,0xc1,0xfe,0x00,0x00] v_pk_max_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x07,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x07,0xcc,0xf0,0xfa,0x00,0x18] v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x10] @@ -913,7 +916,7 @@ 
v_pk_max_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18] v_pk_max_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x0c,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18] v_pk_max_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x18] @@ -928,7 +931,7 @@ v_pk_max_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x0c,0xcc,0xc1,0xfe,0x00,0x00] v_pk_max_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x0c,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0c,0xcc,0xf0,0xfa,0x00,0x18] v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x10] @@ -1003,7 +1006,7 @@ v_pk_min_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18] v_pk_min_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x08,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18] v_pk_min_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x18] @@ -1018,7 +1021,7 @@ v_pk_min_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x08,0xcc,0xc1,0xfe,0x00,0x00] v_pk_min_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x08,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x08,0xcc,0xf0,0xfa,0x00,0x18] v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x10] @@ -1048,7 +1051,7 @@ v_pk_min_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18] v_pk_min_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x0d,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18] v_pk_min_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x18] @@ -1063,7 +1066,7 @@ v_pk_min_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x0d,0xcc,0xc1,0xfe,0x00,0x00] v_pk_min_u16 v5, 0.5, 
m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x0d,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0d,0xcc,0xf0,0xfa,0x00,0x18] v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x10] @@ -1138,7 +1141,7 @@ v_pk_mul_lo_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18] v_pk_mul_lo_u16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x01,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18] v_pk_mul_lo_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x18] @@ -1153,7 +1156,7 @@ v_pk_mul_lo_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x01,0xcc,0xc1,0xfe,0x00,0x00] v_pk_mul_lo_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x01,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x01,0xcc,0xf0,0xfa,0x00,0x18] v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x10] @@ -1183,7 +1186,7 @@ v_pk_sub_i16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18] v_pk_sub_i16 v5, m0, 0.5 -// GFX12: [0x05,0x40,0x03,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18] v_pk_sub_i16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x18] @@ -1198,7 +1201,7 @@ v_pk_sub_i16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x03,0xcc,0xc1,0xfe,0x00,0x00] v_pk_sub_i16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x03,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x03,0xcc,0xf0,0xfa,0x00,0x18] v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x10] @@ -1228,7 +1231,7 @@ v_pk_sub_u16 v5, ttmp15, src_scc // GFX12: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18] v_pk_sub_u16 v5, m0, 0.5 -// GFX12: 
[0x05,0x40,0x0b,0xcc,0x7d,0xfe,0x01,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18] v_pk_sub_u16 v5, exec_lo, -1 // GFX12: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x18] @@ -1243,7 +1246,7 @@ v_pk_sub_u16 v5, -1, exec_hi op_sel:[1,1] op_sel_hi:[0,0] // GFX12: [0x05,0x58,0x0b,0xcc,0xc1,0xfe,0x00,0x00] v_pk_sub_u16 v5, 0.5, m0 op_sel:[0,0] op_sel_hi:[1,1] -// GFX12: [0x05,0x40,0x0b,0xcc,0xff,0xfa,0x00,0x18,0x00,0x38,0x00,0x00] +// GFX12: [0x05,0x40,0x0b,0xcc,0xf0,0xfa,0x00,0x18] v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] // GFX12: [0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x10] diff --git a/llvm/test/MC/AMDGPU/literalv216.s b/llvm/test/MC/AMDGPU/literalv216.s index 5b1c7a76..c695bc3 100644 --- a/llvm/test/MC/AMDGPU/literalv216.s +++ b/llvm/test/MC/AMDGPU/literalv216.s @@ -113,6 +113,10 @@ v_pk_add_f16 v1, 0x0001, v2 // GFX10: v_pk_add_f16 v1, 1, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0x81,0x04,0x02,0x18] v_pk_add_f16 v1, 0xffff, v2 +// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported +// GFX10: v_pk_add_f16 v1, 0xffff, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xff,0x04,0x02,0x18,0xff,0xff,0x00,0x00] + +v_pk_add_f16 v1, 0xffffffff, v2 // GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x40,0x8f,0xd3,0xc1,0x04,0x02,0x18] // GFX10: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xc1,0x04,0x02,0x18] @@ -153,6 +157,10 @@ v_pk_add_f16 v1, 0x3118, v2 // GFX10: v_pk_add_f16 v1, 0.15915494, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xf8,0x04,0x02,0x18] v_pk_add_f16 v1, 65535, v2 +// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported +// GFX10: v_pk_add_f16 v1, 0xffff, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xff,0x04,0x02,0x18,0xff,0xff,0x00,0x00] + +v_pk_add_f16 v1, 4294967295, v2 // GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x40,0x8f,0xd3,0xc1,0x04,0x02,0x18] // GFX10: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xc1,0x04,0x02,0x18] @@ -242,7 +250,7 @@ v_pk_add_f16 v5, v1, 
0.1234 v_pk_add_u16 v5, v1, 0.1234 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported -// GFX10: v_pk_add_u16 v5, v1, 0x2fe6 ; encoding: [0x05,0x40,0x0a,0xcc,0x01,0xff,0x01,0x18,0xe6,0x2f,0x00,0x00] +// GFX10: v_pk_add_u16 v5, v1, 0x3dfcb924 ; encoding: [0x05,0x40,0x0a,0xcc,0x01,0xff,0x01,0x18,0x24,0xb9,0xfc,0x3d] v_pk_fma_f16 v5, 0.1234, v2, v3 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported @@ -258,23 +266,23 @@ v_pk_fma_f16 v5, v1, v2, 0.1234 v_pk_mad_i16 v5, 0.1234, v2, v3 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported -// GFX10: v_pk_mad_i16 v5, 0x2fe6, v2, v3 ; encoding: [0x05,0x40,0x00,0xcc,0xff,0x04,0x0e,0x1c,0xe6,0x2f,0x00,0x00] +// GFX10: v_pk_mad_i16 v5, 0x3dfcb924, v2, v3 ; encoding: [0x05,0x40,0x00,0xcc,0xff,0x04,0x0e,0x1c,0x24,0xb9,0xfc,0x3d] v_pk_mad_i16 v5, v1, 0.1234, v3 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported -// GFX10: v_pk_mad_i16 v5, v1, 0x2fe6, v3 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0xff,0x0d,0x1c,0xe6,0x2f,0x00,0x00] +// GFX10: v_pk_mad_i16 v5, v1, 0x3dfcb924, v3 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0xff,0x0d,0x1c,0x24,0xb9,0xfc,0x3d] v_pk_mad_i16 v5, v1, v2, 0.1234 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported -// GFX10: v_pk_mad_i16 v5, v1, v2, 0x2fe6 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0x05,0xfe,0x1b,0xe6,0x2f,0x00,0x00] +// GFX10: v_pk_mad_i16 v5, v1, v2, 0x3dfcb924 ; encoding: [0x05,0x40,0x00,0xcc,0x01,0x05,0xfe,0x1b,0x24,0xb9,0xfc,0x3d] v_pk_add_f16 v5, v1, 123456.0 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction // NOGFX10: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction v_pk_add_u16 v5, v1, 123456.0 -// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction -// NOGFX10: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: literal operands are not supported +// 
GFX10: v_pk_add_u16 v5, v1, 0x47f12000 ; encoding: [0x05,0x40,0x0a,0xcc,0x01,0xff,0x01,0x18,0x00,0x20,0xf1,0x47] //===----------------------------------------------------------------------===// // Packed VOP2 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt index e42d0de..a022c79 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt @@ -79,7 +79,7 @@ # GFX10: v_pk_fma_f16 v5, -1, -2, -3 ; encoding: [0x05,0x40,0x0e,0xcc,0xc1,0x84,0x0d,0x1b] 0x05,0x40,0x0e,0xcc,0xc1,0x84,0x0d,0x1b -# GFX10: v_pk_mad_i16 v5, 0x3c00, 0x4000, 0x4400 ; encoding: [0x05,0x40,0x00,0xcc,0xff,0xfe,0xfd,0x1b,0x00,0x3c,0x00,0x00] +# GFX10: v_pk_mad_i16 v5, 1.0, 2.0, 4.0 ; encoding: [0x05,0x40,0x00,0xcc,0xf2,0xe8,0xd9,0x1b] 0x05,0x40,0x00,0xcc,0xf2,0xe8,0xd9,0x1b # GFX10: v_pk_mad_u16 v5, -1, -2, -3 ; encoding: [0x05,0x40,0x09,0xcc,0xc1,0x84,0x0d,0x1b] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt index bc2cb5f..838e6e0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p.txt @@ -466,7 +466,7 @@ # GFX11: v_pk_add_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_add_i16 v5, m0, 0x3800 +# GFX11: v_pk_add_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_add_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x18] @@ -481,7 +481,7 @@ # GFX11: v_pk_add_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x02,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x02,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_add_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_add_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: 
[0x05,0x58,0x02,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x02,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x10] @@ -511,7 +511,7 @@ # GFX11: v_pk_add_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_add_u16 v5, m0, 0x3800 +# GFX11: v_pk_add_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_add_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x18] @@ -526,7 +526,7 @@ # GFX11: v_pk_add_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0a,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0a,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_add_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_add_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0a,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0a,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x10] @@ -556,7 +556,7 @@ # GFX11: v_pk_ashrrev_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_ashrrev_i16 v5, m0, 0x3800 +# GFX11: v_pk_ashrrev_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_ashrrev_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x18] @@ -571,7 +571,7 @@ # GFX11: v_pk_ashrrev_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x06,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x06,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_ashrrev_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x06,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x06,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_ashrrev_i16 v5, src_scc, vcc_lo 
op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x10] @@ -646,7 +646,7 @@ # GFX11: v_pk_lshlrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_lshlrev_b16 v5, m0, 0x3800 +# GFX11: v_pk_lshlrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_lshlrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x18] @@ -661,7 +661,7 @@ # GFX11: v_pk_lshlrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x04,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x04,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_lshlrev_b16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x04,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x04,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x04,0xcc,0xfd,0xd4,0x00,0x10] @@ -691,7 +691,7 @@ # GFX11: v_pk_lshrrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_lshrrev_b16 v5, m0, 0x3800 +# GFX11: v_pk_lshrrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_lshrrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x18] @@ -706,7 +706,7 @@ # GFX11: v_pk_lshrrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x05,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x05,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_lshrrev_b16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x05,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x05,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x10] @@ -736,7 +736,7 
@@ # GFX11: v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] 0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19 -# GFX11: v_pk_mad_i16 v5, m0, 0x3800, m0 +# GFX11: v_pk_mad_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x00,0xcc,0x7d,0xe0,0xf5,0x19] 0x05,0x40,0x00,0xcc,0x7d,0xe0,0xf5,0x19 # GFX11: v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x00,0xcc,0x7e,0x82,0xad,0x01] @@ -751,7 +751,7 @@ # GFX11: v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x00,0xcc,0xc1,0xfe,0xf4,0x0b] 0x05,0x38,0x00,0xcc,0xc1,0xfe,0xf4,0x0b -# GFX11: v_pk_mad_i16 v5, 0x3800, m0, 0x3800 op_sel:[1,0,0] op_sel_hi:[0,1,1] +# GFX11: v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] 0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13 # GFX11: v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] @@ -781,7 +781,7 @@ # GFX11: v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] 0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19 -# GFX11: v_pk_mad_u16 v5, m0, 0x3800, m0 +# GFX11: v_pk_mad_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x09,0xcc,0x7d,0xe0,0xf5,0x19] 0x05,0x40,0x09,0xcc,0x7d,0xe0,0xf5,0x19 # GFX11: v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x09,0xcc,0x7e,0x82,0xad,0x01] @@ -796,7 +796,7 @@ # GFX11: v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x09,0xcc,0xc1,0xfe,0xf4,0x0b] 0x05,0x38,0x09,0xcc,0xc1,0xfe,0xf4,0x0b -# GFX11: v_pk_mad_u16 v5, 0x3800, m0, 0x3800 op_sel:[1,0,0] op_sel_hi:[0,1,1] +# GFX11: v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] 0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13 # GFX11: v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; 
encoding: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] @@ -871,7 +871,7 @@ # GFX11: v_pk_max_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_max_i16 v5, m0, 0x3800 +# GFX11: v_pk_max_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_max_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x18] @@ -886,7 +886,7 @@ # GFX11: v_pk_max_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x07,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x07,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_max_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_max_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x07,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x07,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x10] @@ -916,7 +916,7 @@ # GFX11: v_pk_max_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_max_u16 v5, m0, 0x3800 +# GFX11: v_pk_max_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_max_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x18] @@ -931,7 +931,7 @@ # GFX11: v_pk_max_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0c,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0c,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_max_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_max_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0c,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0c,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x10] @@ -1006,7 +1006,7 @@ # GFX11: v_pk_min_i16 v5, ttmp15, src_scc ; encoding: 
[0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_min_i16 v5, m0, 0x3800 +# GFX11: v_pk_min_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_min_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x18] @@ -1021,7 +1021,7 @@ # GFX11: v_pk_min_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x08,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x08,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_min_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_min_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x08,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x08,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x10] @@ -1051,7 +1051,7 @@ # GFX11: v_pk_min_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_min_u16 v5, m0, 0x3800 +# GFX11: v_pk_min_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_min_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x18] @@ -1066,7 +1066,7 @@ # GFX11: v_pk_min_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0d,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0d,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_min_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_min_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0d,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0d,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x10] @@ -1141,7 +1141,7 @@ # GFX11: v_pk_mul_lo_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_mul_lo_u16 v5, m0, 0x3800 +# GFX11: v_pk_mul_lo_u16 v5, 
m0, 0.5 ; encoding: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_mul_lo_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x18] @@ -1156,7 +1156,7 @@ # GFX11: v_pk_mul_lo_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x01,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x01,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_mul_lo_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_mul_lo_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x01,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x01,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x10] @@ -1186,7 +1186,7 @@ # GFX11: v_pk_sub_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_sub_i16 v5, m0, 0x3800 +# GFX11: v_pk_sub_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_sub_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x18] @@ -1201,7 +1201,7 @@ # GFX11: v_pk_sub_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x03,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x03,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_sub_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_sub_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x03,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x03,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x10] @@ -1231,7 +1231,7 @@ # GFX11: v_pk_sub_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18 -# GFX11: v_pk_sub_u16 v5, m0, 0x3800 +# GFX11: v_pk_sub_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18 # GFX11: v_pk_sub_u16 v5, exec_lo, -1 ; 
encoding: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x18] @@ -1246,7 +1246,7 @@ # GFX11: v_pk_sub_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0b,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0b,0xcc,0xc1,0xfe,0x00,0x18 -# GFX11: v_pk_sub_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX11: v_pk_sub_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0b,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0b,0xcc,0xf0,0xfa,0x00,0x00 # GFX11: v_pk_sub_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x10] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt index 373cd71..44d8995 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt @@ -463,7 +463,7 @@ # GFX12: v_pk_add_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x02,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_add_i16 v5, m0, 0x3800 +# GFX12: v_pk_add_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x02,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_add_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x02,0xcc,0x7e,0x82,0x01,0x18] @@ -478,7 +478,7 @@ # GFX12: v_pk_add_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x02,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x02,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_add_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_add_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x02,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x02,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_add_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x02,0xcc,0xfd,0xd4,0x00,0x10] @@ -508,7 +508,7 @@ # GFX12: v_pk_add_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0a,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_add_u16 v5, m0, 0x3800 +# GFX12: v_pk_add_u16 v5, m0, 0.5 ; encoding: 
[0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0a,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_add_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x82,0x01,0x18] @@ -523,7 +523,7 @@ # GFX12: v_pk_add_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0a,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0a,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_add_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_add_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0a,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0a,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_add_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0a,0xcc,0xfd,0xd4,0x00,0x10] @@ -553,7 +553,7 @@ # GFX12: v_pk_ashrrev_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x06,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_ashrrev_i16 v5, m0, 0x3800 +# GFX12: v_pk_ashrrev_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x06,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_ashrrev_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x06,0xcc,0x7e,0x82,0x01,0x18] @@ -568,7 +568,7 @@ # GFX12: v_pk_ashrrev_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x06,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x06,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_ashrrev_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_ashrrev_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x06,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x06,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_ashrrev_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x06,0xcc,0xfd,0xd4,0x00,0x10] @@ -643,7 +643,7 @@ # GFX12: v_pk_lshlrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x04,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_lshlrev_b16 v5, m0, 0x3800 +# GFX12: v_pk_lshlrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x04,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_lshlrev_b16 v5, exec_lo, 
-1 ; encoding: [0x05,0x40,0x04,0xcc,0x7e,0x82,0x01,0x18] @@ -658,7 +658,7 @@ # GFX12: v_pk_lshlrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x04,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x04,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_lshlrev_b16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_lshlrev_b16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x04,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x04,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_lshlrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x04,0xcc,0xfd,0xd4,0x00,0x10] @@ -688,7 +688,7 @@ # GFX12: v_pk_lshrrev_b16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x05,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_lshrrev_b16 v5, m0, 0x3800 +# GFX12: v_pk_lshrrev_b16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x05,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_lshrrev_b16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x05,0xcc,0x7e,0x82,0x01,0x18] @@ -703,7 +703,7 @@ # GFX12: v_pk_lshrrev_b16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x05,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x05,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_lshrrev_b16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_lshrrev_b16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x05,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x05,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_lshrrev_b16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x05,0xcc,0xfd,0xd4,0x00,0x10] @@ -733,7 +733,7 @@ # GFX12: v_pk_mad_i16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19] 0x05,0x40,0x00,0xcc,0x7b,0xfa,0xed,0x19 -# GFX12: v_pk_mad_i16 v5, m0, 0x3800, m0 +# GFX12: v_pk_mad_i16 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x00,0xcc,0x7d,0xe0,0xf5,0x19] 0x05,0x40,0x00,0xcc,0x7d,0xe0,0xf5,0x19 # GFX12: v_pk_mad_i16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x00,0xcc,0x7e,0x82,0xad,0x01] @@ -748,7 +748,7 
@@ # GFX12: v_pk_mad_i16 v5, -1, exec_hi, src_scc op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x00,0xcc,0xc1,0xfe,0xf4,0x0b] 0x05,0x38,0x00,0xcc,0xc1,0xfe,0xf4,0x0b -# GFX12: v_pk_mad_i16 v5, 0x3800, m0, 0x3800 op_sel:[1,0,0] op_sel_hi:[0,1,1] +# GFX12: v_pk_mad_i16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13] 0x05,0x48,0x00,0xcc,0xf0,0xfa,0xc0,0x13 # GFX12: v_pk_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x00,0xcc,0xfd,0xd4,0x04,0x0b] @@ -778,7 +778,7 @@ # GFX12: v_pk_mad_u16 v5, ttmp15, src_scc, ttmp15 ; encoding: [0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19] 0x05,0x40,0x09,0xcc,0x7b,0xfa,0xed,0x19 -# GFX12: v_pk_mad_u16 v5, m0, 0x3800, m0 +# GFX12: v_pk_mad_u16 v5, m0, 0.5, m0 ; encoding: [0x05,0x40,0x09,0xcc,0x7d,0xe0,0xf5,0x19] 0x05,0x40,0x09,0xcc,0x7d,0xe0,0xf5,0x19 # GFX12: v_pk_mad_u16 v5, exec_lo, -1, vcc_hi op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x09,0xcc,0x7e,0x82,0xad,0x01] @@ -793,7 +793,7 @@ # GFX12: v_pk_mad_u16 v5, -1, exec_hi, src_scc op_sel:[1,1,1] op_sel_hi:[1,0,0] ; encoding: [0x05,0x38,0x09,0xcc,0xc1,0xfe,0xf4,0x0b] 0x05,0x38,0x09,0xcc,0xc1,0xfe,0xf4,0x0b -# GFX12: v_pk_mad_u16 v5, 0x3800, m0, 0x3800 op_sel:[1,0,0] op_sel_hi:[0,1,1] +# GFX12: v_pk_mad_u16 v5, 0.5, m0, 0.5 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; encoding: [0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13] 0x05,0x48,0x09,0xcc,0xf0,0xfa,0xc0,0x13 # GFX12: v_pk_mad_u16 v5, src_scc, vcc_lo, -1 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; encoding: [0x05,0x50,0x09,0xcc,0xfd,0xd4,0x04,0x0b] @@ -868,7 +868,7 @@ # GFX12: v_pk_max_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x07,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_max_i16 v5, m0, 0x3800 +# GFX12: v_pk_max_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x07,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_max_i16 v5, exec_lo, -1 ; encoding: 
[0x05,0x40,0x07,0xcc,0x7e,0x82,0x01,0x18] @@ -883,7 +883,7 @@ # GFX12: v_pk_max_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x07,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x07,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_max_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_max_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x07,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x07,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_max_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x07,0xcc,0xfd,0xd4,0x00,0x10] @@ -913,7 +913,7 @@ # GFX12: v_pk_max_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0c,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_max_u16 v5, m0, 0x3800 +# GFX12: v_pk_max_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0c,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_max_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0c,0xcc,0x7e,0x82,0x01,0x18] @@ -928,7 +928,7 @@ # GFX12: v_pk_max_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0c,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0c,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_max_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_max_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0c,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0c,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_max_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0c,0xcc,0xfd,0xd4,0x00,0x10] @@ -1003,7 +1003,7 @@ # GFX12: v_pk_min_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x08,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_min_i16 v5, m0, 0x3800 +# GFX12: v_pk_min_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x08,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_min_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x08,0xcc,0x7e,0x82,0x01,0x18] @@ -1018,7 +1018,7 @@ # GFX12: v_pk_min_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x08,0xcc,0xc1,0xfe,0x00,0x18] 
0x05,0x40,0x08,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_min_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_min_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x08,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x08,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_min_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x08,0xcc,0xfd,0xd4,0x00,0x10] @@ -1048,7 +1048,7 @@ # GFX12: v_pk_min_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0d,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_min_u16 v5, m0, 0x3800 +# GFX12: v_pk_min_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0d,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_min_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0d,0xcc,0x7e,0x82,0x01,0x18] @@ -1063,7 +1063,7 @@ # GFX12: v_pk_min_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0d,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0d,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_min_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_min_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0d,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0d,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_min_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0d,0xcc,0xfd,0xd4,0x00,0x10] @@ -1138,7 +1138,7 @@ # GFX12: v_pk_mul_lo_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x01,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_mul_lo_u16 v5, m0, 0x3800 +# GFX12: v_pk_mul_lo_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x01,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_mul_lo_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x01,0xcc,0x7e,0x82,0x01,0x18] @@ -1153,7 +1153,7 @@ # GFX12: v_pk_mul_lo_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x01,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x01,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_mul_lo_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_mul_lo_u16 v5, 
0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x01,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x01,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_mul_lo_u16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x01,0xcc,0xfd,0xd4,0x00,0x10] @@ -1183,7 +1183,7 @@ # GFX12: v_pk_sub_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x03,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_sub_i16 v5, m0, 0x3800 +# GFX12: v_pk_sub_i16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x03,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_sub_i16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x03,0xcc,0x7e,0x82,0x01,0x18] @@ -1198,7 +1198,7 @@ # GFX12: v_pk_sub_i16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x03,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x03,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_sub_i16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_sub_i16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x03,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x03,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_sub_i16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x03,0xcc,0xfd,0xd4,0x00,0x10] @@ -1228,7 +1228,7 @@ # GFX12: v_pk_sub_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18] 0x05,0x40,0x0b,0xcc,0x7b,0xfa,0x01,0x18 -# GFX12: v_pk_sub_u16 v5, m0, 0x3800 +# GFX12: v_pk_sub_u16 v5, m0, 0.5 ; encoding: [0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18] 0x05,0x40,0x0b,0xcc,0x7d,0xe0,0x01,0x18 # GFX12: v_pk_sub_u16 v5, exec_lo, -1 ; encoding: [0x05,0x40,0x0b,0xcc,0x7e,0x82,0x01,0x18] @@ -1243,7 +1243,7 @@ # GFX12: v_pk_sub_u16 v5, -1, exec_hi ; encoding: [0x05,0x40,0x0b,0xcc,0xc1,0xfe,0x00,0x18] 0x05,0x40,0x0b,0xcc,0xc1,0xfe,0x00,0x18 -# GFX12: v_pk_sub_u16 v5, 0x3800, m0 op_sel:[1,1] op_sel_hi:[0,0] +# GFX12: v_pk_sub_u16 v5, 0.5, m0 op_sel:[1,1] op_sel_hi:[0,0] ; encoding: [0x05,0x58,0x0b,0xcc,0xf0,0xfa,0x00,0x00] 0x05,0x58,0x0b,0xcc,0xf0,0xfa,0x00,0x00 # GFX12: v_pk_sub_u16 
v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x05,0x48,0x0b,0xcc,0xfd,0xd4,0x00,0x10] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3p.txt index 215453d..003ece9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3p.txt @@ -42,10 +42,10 @@ # CHECK: v_pk_mad_i16 v5, -1, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xc1,0x04,0x0e,0x1c] 0x05,0x40,0x80,0xd3,0xc1,0x04,0x0e,0x1c -# CHECK: v_pk_mad_i16 v5, 0x3800, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xff,0x04,0x0e,0x1c] +# CHECK: v_pk_mad_i16 v5, 0.5, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xf0,0x04,0x0e,0x1c] 0x05,0x40,0x80,0xd3,0xf0,0x04,0x0e,0x1c -# CHECK: v_pk_mad_i16 v5, 0xc400, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xff,0x04,0x0e,0x1c] +# CHECK: v_pk_mad_i16 v5, -4.0, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xf7,0x04,0x0e,0x1c] 0x05,0x40,0x80,0xd3,0xf7,0x04,0x0e,0x1c # CHECK: v_pk_mad_i16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c] @@ -84,10 +84,10 @@ # CHECK: v_pk_mad_i16 v5, v1, -1, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x83,0x0d,0x1c] 0x05,0x40,0x80,0xd3,0x01,0x83,0x0d,0x1c -# CHECK: v_pk_mad_i16 v5, v1, 0x3800, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0d,0x1c] +# CHECK: v_pk_mad_i16 v5, v1, 0.5, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xe1,0x0d,0x1c] 0x05,0x40,0x80,0xd3,0x01,0xe1,0x0d,0x1c -# CHECK: v_pk_mad_i16 v5, v1, 0xc400, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0d,0x1c] +# CHECK: v_pk_mad_i16 v5, v1, -4.0, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xef,0x0d,0x1c] 0x05,0x40,0x80,0xd3,0x01,0xef,0x0d,0x1c # CHECK: v_pk_mad_i16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f] @@ -126,10 +126,10 @@ # CHECK: v_pk_mad_i16 v5, v1, v2, -1 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x06,0x1b] 0x05,0x40,0x80,0xd3,0x01,0x05,0x06,0x1b -# CHECK: v_pk_mad_i16 v5, v1, v2, 0x3800 ; encoding: 
[0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1b] +# CHECK: v_pk_mad_i16 v5, v1, v2, 0.5 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xc2,0x1b] 0x05,0x40,0x80,0xd3,0x01,0x05,0xc2,0x1b -# CHECK: v_pk_mad_i16 v5, v1, v2, 0xc400 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1b] +# CHECK: v_pk_mad_i16 v5, v1, v2, -4.0 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xde,0x1b] 0x05,0x40,0x80,0xd3,0x01,0x05,0xde,0x1b # CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c] @@ -201,10 +201,10 @@ # CHECK: v_pk_mul_lo_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x81,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_mul_lo_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_mul_lo_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x81,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_mul_lo_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_mul_lo_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x81,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x81,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_mul_lo_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xff,0x03,0x18] @@ -243,10 +243,10 @@ # CHECK: v_pk_mul_lo_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x81,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_mul_lo_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_mul_lo_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x81,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_mul_lo_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_mul_lo_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x81,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x81,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_mul_lo_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x81,0xd3,0x01,0x05,0x02,0x18] @@ -309,10 +309,10 @@ # CHECK: v_pk_add_i16 v5, -1, v2 ; encoding: 
[0x05,0x40,0x82,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x82,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_add_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_add_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x82,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_add_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_add_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x82,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x82,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_add_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xff,0x03,0x18] @@ -351,10 +351,10 @@ # CHECK: v_pk_add_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x82,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_add_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_add_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x82,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_add_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_add_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x82,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x82,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_add_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x82,0xd3,0x01,0x05,0x02,0x18] @@ -420,10 +420,10 @@ # CHECK: v_pk_sub_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x83,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_sub_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_sub_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x83,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_sub_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_sub_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x83,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x83,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_sub_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xff,0x03,0x18] @@ -462,10 
+462,10 @@ # CHECK: v_pk_sub_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x83,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_sub_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_sub_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x83,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_sub_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_sub_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x83,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x83,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_sub_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x83,0xd3,0x01,0x05,0x02,0x18] @@ -531,10 +531,10 @@ # CHECK: v_pk_lshlrev_b16 v5, -1, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x84,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_lshlrev_b16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_lshlrev_b16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x84,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_lshlrev_b16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_lshlrev_b16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x84,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x84,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_lshlrev_b16 v5, v1, v255 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xff,0x03,0x18] @@ -573,10 +573,10 @@ # CHECK: v_pk_lshlrev_b16 v5, v1, -1 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x84,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_lshlrev_b16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_lshlrev_b16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x84,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_lshlrev_b16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_lshlrev_b16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x84,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x84,0xd3,0x01,0xef,0x01,0x18 # 
CHECK: v_pk_lshlrev_b16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x84,0xd3,0x01,0x05,0x02,0x18] @@ -639,10 +639,10 @@ # CHECK: v_pk_lshrrev_b16 v5, -1, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x85,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_lshrrev_b16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_lshrrev_b16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x85,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_lshrrev_b16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_lshrrev_b16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x85,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x85,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_lshrrev_b16 v5, v1, v255 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xff,0x03,0x18] @@ -681,10 +681,10 @@ # CHECK: v_pk_lshrrev_b16 v5, v1, -1 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x85,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_lshrrev_b16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_lshrrev_b16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x85,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_lshrrev_b16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_lshrrev_b16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x85,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x85,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_lshrrev_b16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x85,0xd3,0x01,0x05,0x02,0x18] @@ -747,10 +747,10 @@ # CHECK: v_pk_ashrrev_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x86,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_ashrrev_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_ashrrev_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x86,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_ashrrev_i16 v5, 0xc400, v2 ; encoding: 
[0x05,0x40,0x86,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_ashrrev_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x86,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x86,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_ashrrev_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xff,0x03,0x18] @@ -789,10 +789,10 @@ # CHECK: v_pk_ashrrev_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x86,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_ashrrev_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_ashrrev_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x86,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_ashrrev_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_ashrrev_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x86,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x86,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_ashrrev_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x86,0xd3,0x01,0x05,0x02,0x18] @@ -855,10 +855,10 @@ # CHECK: v_pk_max_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x87,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_max_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_max_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x87,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_max_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_max_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x87,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x87,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_max_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xff,0x03,0x18] @@ -897,10 +897,10 @@ # CHECK: v_pk_max_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x87,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_max_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_max_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xe1,0x01,0x18] 
0x05,0x00,0x87,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_max_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_max_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x87,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x87,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_max_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x87,0xd3,0x01,0x05,0x02,0x18] @@ -963,10 +963,10 @@ # CHECK: v_pk_min_i16 v5, -1, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x88,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_min_i16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_min_i16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x88,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_min_i16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_min_i16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x88,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x88,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_min_i16 v5, v1, v255 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xff,0x03,0x18] @@ -1005,10 +1005,10 @@ # CHECK: v_pk_min_i16 v5, v1, -1 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x88,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_min_i16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_min_i16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x88,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_min_i16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_min_i16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x88,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x88,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_min_i16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x88,0xd3,0x01,0x05,0x02,0x18] @@ -1071,10 +1071,10 @@ # CHECK: v_pk_mad_u16 v5, -1, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xc1,0x04,0x0e,0x1c] 0x05,0x40,0x89,0xd3,0xc1,0x04,0x0e,0x1c -# CHECK: v_pk_mad_u16 v5, 0x3800, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xff,0x04,0x0e,0x1c] +# CHECK: 
v_pk_mad_u16 v5, 0.5, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xf0,0x04,0x0e,0x1c] 0x05,0x40,0x89,0xd3,0xf0,0x04,0x0e,0x1c -# CHECK: v_pk_mad_u16 v5, 0xc400, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xff,0x04,0x0e,0x1c] +# CHECK: v_pk_mad_u16 v5, -4.0, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xf7,0x04,0x0e,0x1c] 0x05,0x40,0x89,0xd3,0xf7,0x04,0x0e,0x1c # CHECK: v_pk_mad_u16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c] @@ -1113,10 +1113,10 @@ # CHECK: v_pk_mad_u16 v5, v1, -1, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x83,0x0d,0x1c] 0x05,0x40,0x89,0xd3,0x01,0x83,0x0d,0x1c -# CHECK: v_pk_mad_u16 v5, v1, 0x3800, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0d,0x1c] +# CHECK: v_pk_mad_u16 v5, v1, 0.5, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xe1,0x0d,0x1c] 0x05,0x40,0x89,0xd3,0x01,0xe1,0x0d,0x1c -# CHECK: v_pk_mad_u16 v5, v1, 0xc400, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0d,0x1c] +# CHECK: v_pk_mad_u16 v5, v1, -4.0, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xef,0x0d,0x1c] 0x05,0x40,0x89,0xd3,0x01,0xef,0x0d,0x1c # CHECK: v_pk_mad_u16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f] @@ -1155,10 +1155,10 @@ # CHECK: v_pk_mad_u16 v5, v1, v2, -1 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x06,0x1b] 0x05,0x40,0x89,0xd3,0x01,0x05,0x06,0x1b -# CHECK: v_pk_mad_u16 v5, v1, v2, 0x3800 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1b] +# CHECK: v_pk_mad_u16 v5, v1, v2, 0.5 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xc2,0x1b] 0x05,0x40,0x89,0xd3,0x01,0x05,0xc2,0x1b -# CHECK: v_pk_mad_u16 v5, v1, v2, 0xc400 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1b] +# CHECK: v_pk_mad_u16 v5, v1, v2, -4.0 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xde,0x1b] 0x05,0x40,0x89,0xd3,0x01,0x05,0xde,0x1b # CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c] @@ -1230,10 +1230,10 @@ # CHECK: v_pk_add_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xc1,0x04,0x02,0x18] 
0x05,0x00,0x8a,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_add_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_add_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x8a,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_add_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_add_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x8a,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x8a,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_add_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xff,0x03,0x18] @@ -1272,10 +1272,10 @@ # CHECK: v_pk_add_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x8a,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_add_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_add_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x8a,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_add_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_add_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x8a,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x8a,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_add_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x8a,0xd3,0x01,0x05,0x02,0x18] @@ -1341,10 +1341,10 @@ # CHECK: v_pk_sub_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x8b,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_sub_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_sub_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x8b,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_sub_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_sub_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x8b,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x8b,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_sub_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xff,0x03,0x18] @@ -1383,10 +1383,10 @@ # CHECK: v_pk_sub_u16 v5, 
v1, -1 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x8b,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_sub_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_sub_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x8b,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_sub_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_sub_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x8b,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x8b,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x8b,0xd3,0x01,0x05,0x02,0x18] @@ -1452,10 +1452,10 @@ # CHECK: v_pk_max_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x8c,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_max_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_max_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x8c,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_max_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_max_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x8c,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x8c,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_max_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xff,0x03,0x18] @@ -1494,10 +1494,10 @@ # CHECK: v_pk_max_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x8c,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_max_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_max_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x8c,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_max_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_max_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x8c,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x8c,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_max_u16 v5, v1, v2 op_sel:[1,0] ; encoding: 
[0x05,0x48,0x8c,0xd3,0x01,0x05,0x02,0x18] @@ -1560,10 +1560,10 @@ # CHECK: v_pk_min_u16 v5, -1, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xc1,0x04,0x02,0x18] 0x05,0x00,0x8d,0xd3,0xc1,0x04,0x02,0x18 -# CHECK: v_pk_min_u16 v5, 0x3800, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_min_u16 v5, 0.5, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xf0,0x04,0x02,0x18] 0x05,0x00,0x8d,0xd3,0xf0,0x04,0x02,0x18 -# CHECK: v_pk_min_u16 v5, 0xc400, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xff,0x04,0x02,0x18] +# CHECK: v_pk_min_u16 v5, -4.0, v2 ; encoding: [0x05,0x40,0x8d,0xd3,0xf7,0x04,0x02,0x18] 0x05,0x00,0x8d,0xd3,0xf7,0x04,0x02,0x18 # CHECK: v_pk_min_u16 v5, v1, v255 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xff,0x03,0x18] @@ -1602,10 +1602,10 @@ # CHECK: v_pk_min_u16 v5, v1, -1 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0x83,0x01,0x18] 0x05,0x00,0x8d,0xd3,0x01,0x83,0x01,0x18 -# CHECK: v_pk_min_u16 v5, v1, 0x3800 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_min_u16 v5, v1, 0.5 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xe1,0x01,0x18] 0x05,0x00,0x8d,0xd3,0x01,0xe1,0x01,0x18 -# CHECK: v_pk_min_u16 v5, v1, 0xc400 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xff,0x01,0x18] +# CHECK: v_pk_min_u16 v5, v1, -4.0 ; encoding: [0x05,0x40,0x8d,0xd3,0x01,0xef,0x01,0x18] 0x05,0x00,0x8d,0xd3,0x01,0xef,0x01,0x18 # CHECK: v_pk_min_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x48,0x8d,0xd3,0x01,0x05,0x02,0x18] |