Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp        | 38
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp       |  8
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp            | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp            | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp       |  6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp               | 67
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h                 | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp      | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h        |  2
10 files changed, 203 insertions, 10 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index fb83388..9d6584a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3212,6 +3212,44 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
     Src = Src.getOperand(0);
   }
 
+  if (Mods != SISrcMods::NONE)
+    return true;
+
+  // Convert various sign-bit masks on integers to src mods. Currently disabled
+  // for 16-bit types as the codegen replaces the operand without adding a
+  // srcmod. This is intentionally finding the cases where we are performing
+  // float neg and abs on int types; the goal is not to obtain two's complement
+  // neg or abs. Limit conversion to select operands via the non-canonicalizing
+  // pattern.
+  // TODO: Add 16-bit support.
+  if (IsCanonicalizing)
+    return true;
+
+  unsigned Opc = Src->getOpcode();
+  EVT VT = Src.getValueType();
+  if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
+      (VT != MVT::i32 && VT != MVT::i64))
+    return true;
+
+  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
+  if (!CRHS)
+    return true;
+
+  // Recognise (xor a, 0x80000000) as NEG SrcMod.
+  // Recognise (and a, 0x7fffffff) as ABS SrcMod.
+  // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
+  if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
+    Mods |= SISrcMods::NEG;
+    Src = Src.getOperand(0);
+  } else if (Opc == ISD::AND && AllowAbs &&
+             CRHS->getAPIntValue().isMaxSignedValue()) {
+    Mods |= SISrcMods::ABS;
+    Src = Src.getOperand(0);
+  } else if (Opc == ISD::OR && AllowAbs &&
+             CRHS->getAPIntValue().isSignMask()) {
+    Mods |= SISrcMods::ABS | SISrcMods::NEG;
+    Src = Src.getOperand(0);
+  }
+
   return true;
 }
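Note: the combine above rests on a pure bit-level identity of IEEE-754 floats: negation, absolute value, and negated absolute value are exactly a sign-bit xor, and, and or. A standalone C++20 sketch of that identity (illustration only, not part of the patch):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  float F = -1.5f;
  uint32_t Bits = std::bit_cast<uint32_t>(F);
  // xor with the sign mask flips the sign: the NEG source modifier.
  assert(std::bit_cast<float>(Bits ^ 0x80000000u) == 1.5f);
  // and with ~sign-mask clears the sign: the ABS source modifier.
  assert(std::bit_cast<float>(Bits & 0x7fffffffu) == 1.5f);
  // or with the sign mask forces the sign: NEG applied after ABS.
  assert(std::bit_cast<float>(Bits | 0x80000000u) == -1.5f);
  return 0;
}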
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a6e4a63..40d960e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5879,8 +5879,12 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
   const LLT S32 = LLT::scalar(32);
   MachineRegisterInfo &MRI = *B.getMRI();
 
-  std::tie(BaseReg, ImmOffset) =
-      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
+  // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+  // being added, so we can only safely match a 32-bit addition with no
+  // unsigned overflow.
+  bool CheckNUW = AMDGPU::isGFX1250(ST);
+  std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
+      MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
 
   // If BaseReg is a pointer, convert it to int.
   if (MRI.getType(BaseReg).isPointer())
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ff8efd2..0d2feeb 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4933,6 +4933,43 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
       return false;
   }
 
+  // Packed math FP32 instructions typically accept SGPRs or VGPRs as source
+  // operands. On gfx12+, if a source operand uses SGPRs, the HW can only read
+  // the first SGPR and use it for both the low and high operations.
+  if (isPackedFP32Inst(Opc) && isGFX12Plus()) {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+    int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+    int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+
+    const MCOperand &Src0 = Inst.getOperand(Src0Idx);
+    const MCOperand &Src1 = Inst.getOperand(Src1Idx);
+    unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+    unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+
+    const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+    auto VerifyOneSGPR = [OpSel, OpSelHi](unsigned Index) -> bool {
+      unsigned Mask = 1U << Index;
+      return ((OpSel & Mask) == 0) && ((OpSelHi & Mask) == 0);
+    };
+
+    if (Src0.isReg() && isSGPR(Src0.getReg(), TRI) &&
+        !VerifyOneSGPR(/*Index=*/0))
+      return false;
+    if (Src1.isReg() && isSGPR(Src1.getReg(), TRI) &&
+        !VerifyOneSGPR(/*Index=*/1))
+      return false;
+
+    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+    if (Src2Idx != -1) {
+      const MCOperand &Src2 = Inst.getOperand(Src2Idx);
+      if (Src2.isReg() && isSGPR(Src2.getReg(), TRI) &&
+          !VerifyOneSGPR(/*Index=*/2))
+        return false;
+    }
+  }
+
   return true;
 }
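Note: the new validateOpSel clause rejects exactly the encodings where op_sel or op_sel_hi asks for a half-selection on an SGPR source that the HW will broadcast from its first register anyway. A standalone sketch of the per-source test (plain C++, hypothetical bit values; only the lambda's logic is taken from the patch):

#include <cassert>

// Mirrors the parser's VerifyOneSGPR lambda: an SGPR source is valid only
// if neither op_sel nor op_sel_hi has that source's bit set.
static bool verifyOneSGPR(unsigned OpSel, unsigned OpSelHi, unsigned Index) {
  unsigned Mask = 1u << Index;
  return (OpSel & Mask) == 0 && (OpSelHi & Mask) == 0;
}

int main() {
  // op_sel_hi = 0b010 selects the high half of src1; if src1 is an SGPR,
  // the HW cannot honour that, so the assembler rejects the instruction.
  assert(verifyOneSGPR(/*OpSel=*/0b000, /*OpSelHi=*/0b010, /*Index=*/0));
  assert(!verifyOneSGPR(/*OpSel=*/0b000, /*OpSelHi=*/0b010, /*Index=*/1));
  return 0;
}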
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 0c653b1..962c276 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -2081,7 +2081,9 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
   case AMDGPU::V_MAX_F16_fake16_e64:
   case AMDGPU::V_MAX_F64_e64:
   case AMDGPU::V_MAX_NUM_F64_e64:
-  case AMDGPU::V_PK_MAX_F16: {
+  case AMDGPU::V_PK_MAX_F16:
+  case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+  case AMDGPU::V_PK_MAX_NUM_BF16: {
     if (MI.mayRaiseFPException())
       return nullptr;
@@ -2108,8 +2110,10 @@ SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
 
     // Having a 0 op_sel_hi would require swizzling the output in the source
     // instruction, which we can't do.
-    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
-                                                      : 0u;
+    unsigned UnsetMods =
+        (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
+            ? SISrcMods::OP_SEL_1
+            : 0u;
     if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
       return nullptr;
     return Src0;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8f44c03..5b327fb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6106,6 +6106,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
   case MVT::f64:
     return true;
   case MVT::f16:
+  case MVT::bf16:
    return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
   default:
     break;
@@ -10877,6 +10878,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   }
 }
 
+// Return whether the operation has the NoUnsignedWrap property.
+static bool isNoUnsignedWrap(SDValue Addr) {
+  return (Addr.getOpcode() == ISD::ADD &&
+          Addr->getFlags().hasNoUnsignedWrap()) ||
+         Addr->getOpcode() == ISD::OR;
+}
+
 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
                                               EVT PtrVT) const {
   return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
@@ -10898,8 +10906,14 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
     N0 = SDValue();
   else if (DAG.isBaseWithConstantOffset(N0)) {
-    C1 = cast<ConstantSDNode>(N0.getOperand(1));
-    N0 = N0.getOperand(0);
+    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
+    // being added, so we can only safely match a 32-bit addition with no
+    // unsigned overflow.
+    bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
+    if (!CheckNUW || isNoUnsignedWrap(N0)) {
+      C1 = cast<ConstantSDNode>(N0.getOperand(1));
+      N0 = N0.getOperand(0);
+    }
   }
 
   if (C1) {
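Note: isNoUnsignedWrap also accepts a plain ISD::OR because isBaseWithConstantOffset only forms such an or when the operand bits are disjoint, so it acts as an add that cannot carry. The nuw requirement itself reflects the GFX1250 behavior change: voffset and immoffset are zero-extended and summed in wider arithmetic, so a 32-bit addition that wraps computes a different address. A standalone illustration (plain C++, values chosen for the example):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Base = 0xFFFFFFF0u; // voffset
  uint32_t Imm = 0x20u;        // immoffset
  // What a wrapping 32-bit add computes if we split base+imm blindly.
  uint32_t Wrapped = Base + Imm; // 0x10
  // What GFX1250 computes: each part zero-extended, then added.
  uint64_t HwSum = uint64_t(Base) + uint64_t(Imm); // 0x100000010
  // The two disagree, hence the split is legal only with nuw.
  printf("wrapped=0x%x hw=0x%llx\n", Wrapped, (unsigned long long)HwSum);
  return 0;
}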
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index d8fe850..0a68512 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -51,7 +51,7 @@ static cl::opt<unsigned>
 namespace {
 
 enum HardClauseType {
-  // For GFX10:
+  // For GFX10 and GFX1250:
 
   // Texture, buffer, global or scratch memory instructions.
   HARDCLAUSE_VMEM,
@@ -102,7 +102,8 @@ public:
 
   HardClauseType getHardClauseType(const MachineInstr &MI) {
     if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
-      if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
+      if (ST->getGeneration() == AMDGPUSubtarget::GFX10 ||
+          ST->hasGFX1250Insts()) {
         if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
             SIInstrInfo::isSegmentSpecificFLAT(MI)) {
           if (ST->hasNSAClauseBug()) {
@@ -115,7 +116,6 @@ public:
         if (SIInstrInfo::isFLAT(MI))
           return HARDCLAUSE_FLAT;
       } else {
-        assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11);
         if (SIInstrInfo::isMIMG(MI)) {
           const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
           const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f20b22d..19e6bcf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -18,6 +18,7 @@
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -5534,6 +5535,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
   }
 
+  // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
+  // information.
+  if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
+    for (unsigned I = 0; I < 3; ++I) {
+      if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
+        return false;
+    }
+  }
+
   return true;
 }
@@ -6005,6 +6015,21 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
   const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
   unsigned Opc = MI.getOpcode();
 
+  // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
+  // information.
+  if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
+      MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
+    constexpr const AMDGPU::OpName OpNames[] = {
+        AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
+
+    for (auto [I, OpName] : enumerate(OpNames)) {
+      int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
+      if (static_cast<unsigned>(SrcIdx) == OpIdx &&
+          !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
+        return false;
+    }
+  }
+
   if (!isLegalRegOperand(MRI, OpInfo, MO))
     return false;
@@ -6053,6 +6078,39 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
   return true;
 }
 
+bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
+    const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
+    const MachineOperand *MO) const {
+  constexpr const unsigned NumOps = 3;
+  constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
+      AMDGPU::OpName::src0,           AMDGPU::OpName::src1,
+      AMDGPU::OpName::src2,           AMDGPU::OpName::src0_modifiers,
+      AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
+
+  assert(SrcN < NumOps);
+
+  if (!MO) {
+    int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
+    if (SrcIdx == -1)
+      return true;
+    MO = &MI.getOperand(SrcIdx);
+  }
+
+  if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
+    return true;
+
+  int ModsIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
+  if (ModsIdx == -1)
+    return true;
+
+  unsigned Mods = MI.getOperand(ModsIdx).getImm();
+  bool OpSel = Mods & SISrcMods::OP_SEL_0;
+  bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
+
+  return !OpSel && !OpSelHi;
+}
+
 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                  const MachineOperand *MO) const {
   const MachineFunction &MF = *MI.getParent()->getParent();
@@ -6390,6 +6448,15 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
   if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
       !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
     legalizeOpWithMove(MI, VOP3Idx[2]);
+
+  // Fix the register class of packed FP32 instructions on gfx12+. See
+  // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
+  if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
+    for (unsigned I = 0; I < 3; ++I) {
+      if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
+        legalizeOpWithMove(MI, VOP3Idx[I]);
+    }
+  }
 }
 
 Register SIInstrInfo::readlaneVGPRToSGPR(
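Note: all three SIInstrInfo hooks reduce to one predicate: an SGPR source of a packed FP32 instruction is legal iff its source-modifier immediate has no op_sel bits set. A standalone sketch of that core (the SISrcMods bit positions are assumed local constants here, mirroring SIDefines.h):

#include <cassert>

namespace SISrcMods {
// Assumed encodings, per llvm/lib/Target/AMDGPU/SIDefines.h.
constexpr unsigned OP_SEL_0 = 1u << 2;
constexpr unsigned OP_SEL_1 = 1u << 3;
} // namespace SISrcMods

// The core of isLegalGFX12PlusPackedMathFP32Operand once the operand is
// known to be an SGPR: no op_sel half-selection may be requested.
static bool isLegalSGPRPackedFP32Mods(unsigned Mods) {
  return !(Mods & SISrcMods::OP_SEL_0) && !(Mods & SISrcMods::OP_SEL_1);
}

int main() {
  assert(isLegalSGPRPackedFP32Mods(0));
  assert(!isLegalSGPRPackedFP32Mods(SISrcMods::OP_SEL_0));
  assert(!isLegalSGPRPackedFP32Mods(SISrcMods::OP_SEL_1));
  return 0;
}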
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e042b59..6b9403f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1287,6 +1287,19 @@ public:
                          const MachineOperand &MO) const;
   bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
                          const MachineOperand &MO) const;
+
+  /// Check if \p MO would be a legal operand for gfx12+ packed math FP32
+  /// instructions. Packed math FP32 instructions typically accept SGPRs or
+  /// VGPRs as source operands. On gfx12+, if a source operand uses SGPRs, the
+  /// HW can only read the first SGPR and use it for both the low and high
+  /// operations.
+  /// \p SrcN can be 0, 1, or 2, representing src0, src1, and src2,
+  /// respectively. If \p MO is nullptr, the operand corresponding to SrcN will
+  /// be used.
+  bool isLegalGFX12PlusPackedMathFP32Operand(
+      const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
+      const MachineOperand *MO = nullptr) const;
+
   /// Legalize operands in \p MI by either commuting it or inserting a
   /// copy of src1.
   void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 00dcb9b..1e3e9a2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3318,6 +3318,20 @@ unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
   return 128;
 }
 
+bool isPackedFP32Inst(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_ADD_F32_gfx12:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_MUL_F32_gfx12:
+  case AMDGPU::V_PK_FMA_F32:
+  case AMDGPU::V_PK_FMA_F32_gfx12:
+    return true;
+  default:
+    return false;
+  }
+}
+
 } // namespace AMDGPU
 
 raw_ostream &operator<<(raw_ostream &OS,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1252e35..1bcd36c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1709,6 +1709,8 @@ bool isArgPassedInSGPR(const Argument *Arg);
 
 bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);
 
+LLVM_READONLY bool isPackedFP32Inst(unsigned Opc);
+
 LLVM_READONLY
 bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
                                       int64_t EncodedOffset);
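Note: isPackedFP32Inst lists both the pseudo opcodes (V_PK_ADD_F32) and the gfx12 real opcodes (V_PK_ADD_F32_gfx12), so opcode-only clients such as the assembler and the machine verifier classify instructions consistently before and after encoding selection. A hypothetical caller (sketch; needsPackedFP32OperandCheck is illustrative and not part of the patch):

// Assumed to match the declaration added to AMDGPUBaseInfo.h.
namespace AMDGPU {
bool isPackedFP32Inst(unsigned Opc);
} // namespace AMDGPU

// Illustrative wrapper: gate the gfx12+ packed-FP32 operand rules on both
// the subtarget generation and the opcode class.
static bool needsPackedFP32OperandCheck(unsigned Opc, bool IsGFX12Plus) {
  return IsGFX12Plus && AMDGPU::isPackedFP32Inst(Opc);
}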