diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 22 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 14 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 63 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 11 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 76 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 22 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3Instructions.td | 31 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 70 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOPInstructions.td | 7 |
14 files changed, 163 insertions, 186 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ddb2381..1a697f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1411,20 +1411,6 @@ def FeatureGloballyAddressableScratch : SubtargetFeature< "FLAT instructions can access scratch memory for any thread in any wave" >; -// FIXME: Remove after all users are migrated to attribute. -def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", - "DynamicVGPR", - "true", - "Enable dynamic VGPR mode" ->; - -// FIXME: Remove after all users are migrated to attribute. -def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32", - "DynamicVGPRBlockSize32", - "true", - "Use a block size of 32 for dynamic VGPR allocation (default is 16)" ->; - // Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and // restoring the callee-saved registers. def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr", @@ -1462,10 +1448,10 @@ def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records "The buffer resource (V#) supports 45-bit num_records" >; -def FeatureCluster : SubtargetFeature< "cluster", - "HasCluster", +def FeatureClusters : SubtargetFeature< "clusters", + "HasClusters", "true", - "Has cluster support" + "Has clusters of workgroups support" >; // Dummy feature used to disable assembler instructions. @@ -2134,7 +2120,7 @@ def FeatureISAVersion12_50 : FeatureSet< Feature45BitNumRecordsBufferResource, FeatureSupportsXNACK, FeatureXNACK, - FeatureCluster, + FeatureClusters, ]>; def FeatureISAVersion12_51 : FeatureSet< diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 848d9a5..557d87f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5043,6 +5043,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. // FIXME: Should we eventually add an alternative mapping with AGPR src @@ -5051,29 +5054,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // vdst, srcA, srcB, srcC const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); OpdsMapping[0] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); OpdsMapping[0] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a67a7be..d0c0822 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1944,6 +1944,7 @@ public: void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); void cvtVINTERP(MCInst &Inst, const OperandVector &Operands); + void cvtOpSelHelper(MCInst &Inst, unsigned OpSel); bool parseDimId(unsigned &Encoding); ParseStatus parseDim(OperandVector &Operands); @@ -9239,6 +9240,33 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { MCOI::OperandConstraint::TIED_TO) == -1; } +void AMDGPUAsmParser::cvtOpSelHelper(MCInst &Inst, unsigned OpSel) { + unsigned Opc = Inst.getOpcode(); + constexpr AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2}; + constexpr AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); + if (OpIdx == -1) + // Some instructions, e.g. v_interp_p2_f16 in GFX9, have src0, src2, but + // no src1. So continue instead of break. + continue; + + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); + + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + // op_sel[3] is encoded in src0_modifiers. + if (ModOps[J] == AMDGPU::OpName::src0_modifiers && (OpSel & (1 << 3)) != 0) + ModVal |= SISrcMods::DST_OP_SEL; + + Inst.getOperand(ModIdx).setImm(ModVal); + } +} + void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; @@ -9275,6 +9303,16 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod)) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + + // Some v_interp instructions use op_sel[3] for dst. + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOpSel); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + cvtOpSelHelper(Inst, OpSel); + } } void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) @@ -9310,31 +9348,10 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) if (OpSelIdx == -1) return; - const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, - AMDGPU::OpName::src2}; - const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers}; - unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); - - for (int J = 0; J < 3; ++J) { - int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); - if (OpIdx == -1) - break; - - int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); - uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); - - if ((OpSel & (1 << J)) != 0) - ModVal |= SISrcMods::OP_SEL_0; - if (ModOps[J] == AMDGPU::OpName::src0_modifiers && - (OpSel & (1 << 3)) != 0) - ModVal |= SISrcMods::DST_OP_SEL; - - Inst.getOperand(ModIdx).setImm(ModVal); - } + cvtOpSelHelper(Inst, OpSel); } + void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 7b94ea3..f291e37 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -541,7 +541,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { unsigned GCNSubtarget::getBaseMaxNumVGPRs( const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const { - const auto &[Min, Max] = NumVGPRBounds; + const auto [Min, Max] = NumVGPRBounds; // Check if maximum number of VGPRs was explicitly requested using // "amdgpu-num-vgpr" attribute. diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 879bf5a..c2e6078 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -288,7 +288,7 @@ protected: bool Has45BitNumRecordsBufferResource = false; - bool HasCluster = false; + bool HasClusters = false; // Dummy feature to use for assembler in tablegen. bool FeatureDisable = false; @@ -1839,7 +1839,7 @@ public: } /// \returns true if the subtarget supports clusters of workgroups. - bool hasClusters() const { return HasCluster; } + bool hasClusters() const { return HasClusters; } /// \returns true if the subtarget requires a wait for xcnt before atomic /// flat/global stores & rmw. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index d3b5718..3563caa 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1280,6 +1280,17 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + // Some instructions, e.g. v_interp_p2_f16 in GFX9, have src0, src2, but no + // src1. + if (NumOps == 1 && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src2) && + !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1)) { + Ops[NumOps++] = DefaultValue; // Set src1_modifiers to default. + int Mod2Idx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers); + assert(Mod2Idx != -1); + Ops[NumOps++] = MI->getOperand(Mod2Idx).getImm(); + } + const bool HasDst = (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst) != -1) || (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst) != -1); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e233457..1a686a9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17346,74 +17346,24 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MachineFunction *MF = MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); - // Prefer VGPRs over AGPRs in mAI instructions where possible. - // This saves a chain-copy of registers and better balance register - // use between vgpr and agpr as agpr tuples tend to be big. - if (!MI.getDesc().operands().empty()) { - unsigned Opc = MI.getOpcode(); - bool HasAGPRs = Info->mayNeedAGPRs(); - const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - for (auto I : - {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { - if (I == -1) - break; - if ((I == Src2Idx) && (HasAGPRs)) - break; - MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || !Op.getReg().isVirtual()) - continue; - auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); - if (!TRI->hasAGPRs(RC)) - continue; - auto *Src = MRI.getUniqueVRegDef(Op.getReg()); - if (!Src || !Src->isCopy() || - !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) - continue; - auto *NewRC = TRI->getEquivalentVGPRClass(RC); - // All uses of agpr64 and agpr32 can also accept vgpr except for - // v_accvgpr_read, but we do not produce agpr reads during selection, - // so no use checks are needed. - MRI.setRegClass(Op.getReg(), NewRC); - } - - if (TII->isMAI(MI)) { - // The ordinary src0, src1, src2 were legalized above. - // - // We have to also legalize the appended v_mfma_ld_scale_b32 operands, - // as a separate instruction. - int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::scale_src0); - if (Src0Idx != -1) { - int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::scale_src1); - if (TII->usesConstantBus(MRI, MI, Src0Idx) && - TII->usesConstantBus(MRI, MI, Src1Idx)) - TII->legalizeOpWithMove(MI, Src1Idx); - } - } - - if (!HasAGPRs) - return; - - // Resolve the rest of AV operands to AGPRs. - if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { - if (Src2->isReg() && Src2->getReg().isVirtual()) { - auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); - if (TRI->isVectorSuperClass(RC)) { - auto *NewRC = TRI->getEquivalentAGPRClass(RC); - MRI.setRegClass(Src2->getReg(), NewRC); - if (Src2->isTied()) - MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); - } - } + if (TII->isMAI(MI)) { + // The ordinary src0, src1, src2 were legalized above. + // + // We have to also legalize the appended v_mfma_ld_scale_b32 operands, + // as a separate instruction. + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::scale_src0); + if (Src0Idx != -1) { + int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::scale_src1); + if (TII->usesConstantBus(MRI, MI, Src0Idx) && + TII->usesConstantBus(MRI, MI, Src1Idx)) + TII->legalizeOpWithMove(MI, Src1Idx); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 908d856..b398db4 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -33,17 +33,20 @@ using namespace llvm; // optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases // where it is better to produce the VGPR form (e.g. if there are VGPR users // of the MFMA result). -static cl::opt<bool> MFMAVGPRForm( - "amdgpu-mfma-vgpr-form", cl::Hidden, +static cl::opt<bool, true> MFMAVGPRFormOpt( + "amdgpu-mfma-vgpr-form", cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If " "unspecified, default to compiler heuristics"), - cl::init(false)); + cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false), + cl::Hidden); const GCNTargetMachine &getTM(const GCNSubtarget *STI) { const SITargetLowering *TLI = STI->getTargetLowering(); return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine()); } +bool SIMachineFunctionInfo::MFMAVGPRForm = false; + SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI) : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)), @@ -81,14 +84,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts(); if (ST.hasGFX90AInsts()) { - // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection - // should be separated from availability of AGPRs - if (MFMAVGPRForm || - (ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() && - !mayUseAGPRs(F))) - MayNeedAGPRs = false; // We will select all MAI with VGPR operands. + // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the + // allocation granule and clamping. + auto [MinNumAGPRAttr, MaxNumAGPRAttr] = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u}, + /*OnlyFirstRequired=*/true); + MinNumAGPRs = MinNumAGPRAttr; } if (AMDGPU::isChainCC(CC)) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 4560615..b7dbb59 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -509,7 +509,9 @@ private: // user arguments. This is an offset from the KernargSegmentPtr. bool ImplicitArgPtr : 1; - bool MayNeedAGPRs : 1; + /// Minimum number of AGPRs required to allocate in the function. Only + /// relevant for gfx90a-gfx950. For gfx908, this should be infinite. + unsigned MinNumAGPRs = ~0u; // The hard-wired high half of the address of the global information table // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since @@ -537,6 +539,8 @@ private: void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override; public: + static bool MFMAVGPRForm; + struct VGPRSpillToAGPR { SmallVector<MCPhysReg, 32> Lanes; bool FullyAllocated = false; @@ -1196,9 +1200,7 @@ public: unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; } - bool mayNeedAGPRs() const { - return MayNeedAGPRs; - } + unsigned getMinNumAGPRs() const { return MinNumAGPRs; } // \returns true if a function has a use of AGPRs via inline asm or // has a call which may use it. diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 3c2dd42..3115579 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1118,12 +1118,7 @@ SIRegisterInfo::getPointerRegClass(unsigned Kind) const { const TargetRegisterClass * SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { - if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) - return getEquivalentVGPRClass(RC); - if (RC == &AMDGPU::SCC_CLASSRegClass) - return getWaveMaskRegClass(); - - return RC; + return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC; } static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 20fa141..f7f4d46 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1353,11 +1353,6 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, if (DynamicVGPRBlockSize != 0) return DynamicVGPRBlockSize; - // Temporarily check the subtarget feature, until we fully switch to using - // attributes. - if (STI->getFeatureBits().test(FeatureDynamicVGPR)) - return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16; - bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); @@ -1412,10 +1407,7 @@ unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, if (Features.test(FeatureGFX90AInsts)) return 512; - // Temporarily check the subtarget feature, until we fully switch to using - // attributes. - if (DynamicVGPRBlockSize != 0 || - STI->getFeatureBits().test(FeatureDynamicVGPR)) + if (DynamicVGPRBlockSize != 0) // On GFX12 we can allocate at most 8 blocks of VGPRs. return 8 * getVGPRAllocGranule(STI, DynamicVGPRBlockSize); return getAddressableNumArchVGPRs(STI); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 4a2b54d..42ec8ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -97,6 +97,7 @@ class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> : VOP3_Pseudo<OpName, P, pattern> { let AsmMatchConverter = "cvtVOP3Interp"; let mayRaiseFPException = 0; + let VOP3_OPSEL = P.HasOpSel; } def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> { @@ -119,16 +120,17 @@ def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> { let HasSrc0Mods = 0; } -class getInterp16Asm <bit HasSrc2, bit HasOMod> { +class getInterp16Asm <bit HasSrc2, bit HasOMod, bit OpSel> { string src2 = !if(HasSrc2, ", $src2_modifiers", ""); string omod = !if(HasOMod, "$omod", ""); + string opsel = !if(OpSel, "$op_sel", ""); string ret = - " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod; + " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod#opsel; } class getInterp16Ins <bit HasSrc2, bit HasOMod, - Operand Src0Mod, Operand Src2Mod> { - dag ret = !if(HasSrc2, + Operand Src0Mod, Operand Src2Mod, bit OpSel> { + dag ret1 = !if(HasSrc2, !if(HasOMod, (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, InterpAttr:$attr, InterpAttrChan:$attrchan, @@ -143,19 +145,22 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod, InterpAttr:$attr, InterpAttrChan:$attrchan, highmod:$high, Clamp0:$clamp, omod0:$omod) ); + dag ret2 = !if(OpSel, (ins op_sel0:$op_sel), (ins)); + dag ret = !con(ret1, ret2); } -class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { +class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> { let IsSingle = 1; let HasOMod = !ne(DstVT.Value, f16.Value); let HasHigh = 1; + let HasOpSel = OpSel; let Src0Mod = FPVRegInputMods; let Src2Mod = FPVRegInputMods; let Outs64 = (outs DstRC.RegClass:$vdst); - let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret; - let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret; + let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod, OpSel>.ret; + let Asm64 = getInterp16Asm<HasSrc2, HasOMod, OpSel>.ret; } //===----------------------------------------------------------------------===// @@ -480,7 +485,7 @@ let SubtargetPredicate = isGFX9Plus in { defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>; defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>; let OtherPredicates = [isNotGFX90APlus] in -def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; +def V_INTERP_P2_F16_opsel : VOP3Interp <"v_interp_p2_f16_opsel", VOP3_INTERP16<[f16, f32, i32, f32], /*OpSel*/ 1>>; } // End SubtargetPredicate = isGFX9Plus // This predicate should only apply to the selection pattern. The @@ -2676,6 +2681,14 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> } } +multiclass VOP3Interp_F16_OpSel_Real_gfx9<bits<10> op, string OpName, string AsmName> { + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, + VOP3Interp_OpSel_gfx9 <op, !cast<VOP3_Pseudo>(OpName).Pfl> { + VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName); + let AsmString = AsmName # ps.AsmOperands; + } +} + multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> { def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>, VOP3e_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> { @@ -2788,7 +2801,7 @@ defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">; defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">; -defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">; +defm V_INTERP_P2_F16_opsel : VOP3Interp_F16_OpSel_Real_gfx9 <0x277, "V_INTERP_P2_F16_opsel", "v_interp_p2_f16">; defm V_ADD_I32 : VOP3_Real_vi <0x29c>; defm V_SUB_I32 : VOP3_Real_vi <0x29d>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 5daf860..3a0cc35 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -67,7 +67,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3P_Mix_Profile<P, Features, 0> { let IsTrue16 = 1; - let IsRealTrue16 = 1; + let IsRealTrue16 = 1; let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret; } @@ -950,7 +950,7 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> : } // Currently assumes scaled instructions never have abid -class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag < +class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : PatFrag < !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp, node:$src0_modifiers, node:$scale_src0, node:$src1_modifiers, node:$scale_src1), @@ -959,37 +959,30 @@ class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = (ops node:$blgp))), !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), - (Op $src0, $src1, $src2, $cbsz, $blgp))), - pred ->; - -defvar MayNeedAGPRs = [{ - return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; - -defvar MayNeedAGPRs_gisel = [{ - return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; + (Op $src0, $src1, $src2, $cbsz, $blgp)))>; -defvar MayNotNeedAGPRs = [{ - return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; +class CanUseAGPR_MAI<ValueType vt> { + code PredicateCode = [{ + return !Subtarget->hasGFX90AInsts() || + (!SIMachineFunctionInfo::MFMAVGPRForm && + MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + }] # !srl(vt.Size, 5) # ");"; -defvar MayNotNeedAGPRs_gisel = [{ - return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; + code GISelPredicateCode = [{ + return !Subtarget->hasGFX90AInsts() || + (!SIMachineFunctionInfo::MFMAVGPRForm && + MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + }] # !srl(vt.Size, 5) # ");"; +} -class AgprMAIFrag<SDPatternOperator Op, bit HasAbid = true, +class AgprMAIFrag<SDPatternOperator Op, ValueType vt, bit HasAbid = true, bit Scaled = false> : - MAIFrag<Op, MayNeedAGPRs, HasAbid, Scaled> { - let GISelPredicateCode = MayNeedAGPRs_gisel; -} + MAIFrag<Op, HasAbid, Scaled>, + CanUseAGPR_MAI<vt>; class VgprMAIFrag<SDPatternOperator Op, bit HasAbid = true, - bit Scaled = false> : - MAIFrag<Op, MayNotNeedAGPRs, HasAbid, Scaled> { - let GISelPredicateCode = MayNotNeedAGPRs_gisel; -} + bit Scaled = false> : + MAIFrag<Op, HasAbid, Scaled>; let isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>; @@ -1037,16 +1030,19 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, bit HasAbid = true, bit Scaled = false> { defvar NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap; + defvar ProfileAGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P); + defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"); + let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { - def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), - !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>, + def _e64 : MAIInst<OpName, ProfileAGPR, + !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>, MFMATable<0, "AGPR", NAME # "_e64">; let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in - def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), + def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", ProfileVGPR, !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>, MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">; } @@ -1055,12 +1051,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = OpName in { - def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), - !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>, + def "_mac_e64" : MAIInst<OpName # "_mac", ProfileAGPR, + !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>, MFMATable<1, "AGPR", NAME # "_e64", NAME # "_mac_e64">; let OtherPredicates = [isGFX90APlus] in - def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), + def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", ProfileVGPR, !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>, MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">; } @@ -1074,11 +1070,11 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper defvar UnscaledOpName = UnscaledOpName_#VariantSuffix; defvar HasAbid = false; - - defvar NoDstOverlap = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl).NoDstOverlap; + defvar Profile = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl); + defvar NoDstOverlap = Profile.NoDstOverlap; def _e64 : ScaledMAIInst<OpName, - !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, HasAbid, true>)>, + !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, Profile.DstVT, HasAbid, true>)>, MFMATable<0, "AGPR", NAME # "_e64">; def _vgprcd_e64 : ScaledMAIInst<OpName # "_vgprcd", @@ -1090,7 +1086,7 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = UnscaledOpName_ in { def _mac_e64 : ScaledMAIInst<OpName # "_mac", - !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, HasAbid, true>>, + !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, Profile.DstVT, HasAbid, true>>, MFMATable<1, "AGPR", NAME # "_e64">; def _mac_vgprcd_e64 : ScaledMAIInst<OpName # " _mac_vgprcd", diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 631f0f3..8325c62 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -419,6 +419,13 @@ class VOP3a_ScaleSel_gfx1250<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, let Inst{14-11} = scale_sel; } +class VOP3Interp_OpSel_gfx9<bits<10> op, VOPProfile p> : VOP3Interp_vi<op, p> { + let Inst{11} = src0_modifiers{2}; + // There's no src1 + let Inst{13} = src2_modifiers{2}; + let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0); +} + class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { bits<6> attr; bits<2> attrchan; |