Diffstat (limited to 'llvm/lib/Target/AMDGPU')
26 files changed, 582 insertions, 351 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 502a8e8..ea32748 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2772,6 +2772,9 @@ def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">, AssemblerPredicate<(all_of FeatureMAIInsts)>; +def NotHasMAIInsts : Predicate<"!Subtarget->hasMAIInsts()">, + AssemblerPredicate<(all_of (not FeatureMAIInsts))>; + def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">, AssemblerPredicate<(all_of FeatureSMemRealTime)>; @@ -2946,6 +2949,20 @@ def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic( def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>; +def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; + +//===----------------------------------------------------------------------===// +// HwModes +//===----------------------------------------------------------------------===// + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; + +// gfx1250, has alignment requirement but no AGPRs. +def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; + + // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 65d049e..ef58004 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1211,16 +1211,81 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP, llvm_unreachable("AAAMDWavesPerEU is only valid for function position"); } -static bool inlineAsmUsesAGPRs(const InlineAsm *IA) { - for (const auto &CI : IA->ParseConstraints()) { +/// Compute the minimum number of AGPRs required to allocate the inline asm. +static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA, + const CallBase &Call) { + unsigned ArgNo = 0; + unsigned ResNo = 0; + unsigned AGPRDefCount = 0; + unsigned AGPRUseCount = 0; + unsigned MaxPhysReg = 0; + const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout(); + + // TODO: Overestimates due to not accounting for tied operands + for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { + Type *Ty = nullptr; + switch (CI.Type) { + case InlineAsm::isOutput: { + Ty = Call.getType(); + if (auto *STy = dyn_cast<StructType>(Ty)) + Ty = STy->getElementType(ResNo); + ++ResNo; + break; + } + case InlineAsm::isInput: { + Ty = Call.getArgOperand(ArgNo++)->getType(); + break; + } + case InlineAsm::isLabel: + continue; + case InlineAsm::isClobber: + // Parse the physical register reference. + break; + } + for (StringRef Code : CI.Codes) { - Code.consume_front("{"); - if (Code.starts_with("a")) - return true; + unsigned RegCount = 0; + if (Code.starts_with("a")) { + // Virtual register, compute number of registers based on the type. + // + // We ought to be going through TargetLowering to get the number of + // registers, but we should avoid the dependence on CodeGen here. 
+ RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32); + } else { + // Physical register reference + auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code); + if (Kind == 'a') { + RegCount = NumRegs; + MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u)); + } + + continue; + } + + if (CI.Type == InlineAsm::isOutput) { + // Apply tuple alignment requirement + // + // TODO: This is more conservative than necessary. + AGPRDefCount = alignTo(AGPRDefCount, RegCount); + + AGPRDefCount += RegCount; + if (CI.isEarlyClobber) { + AGPRUseCount = alignTo(AGPRUseCount, RegCount); + AGPRUseCount += RegCount; + } + } else { + AGPRUseCount = alignTo(AGPRUseCount, RegCount); + AGPRUseCount += RegCount; + } } } - return false; + unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount); + + // TODO: This is overly conservative. If there are any physical registers, + // allocate any virtual registers after them so we don't have to solve optimal + // packing. + return std::min(MaxVirtReg + MaxPhysReg, 256u); } // TODO: Migrate to range merge of amdgpu-agpr-alloc. @@ -1259,14 +1324,29 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> { const Function *Callee = dyn_cast<Function>(CalleeOp); if (!Callee) { if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp)) - return !inlineAsmUsesAGPRs(IA); + return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0; return false; } - // Some intrinsics may use AGPRs, but if we have a choice, we are not - // required to use AGPRs. - if (Callee->isIntrinsic()) + switch (Callee->getIntrinsicID()) { + case Intrinsic::not_intrinsic: + break; + case Intrinsic::write_register: + case Intrinsic::read_register: + case Intrinsic::read_volatile_register: { + const MDString *RegName = cast<MDString>( + cast<MDNode>( + cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata()) + ->getOperand(0)); + auto [Kind, RegIdx, NumRegs] = + AMDGPU::parseAsmPhysRegName(RegName->getString()); + return Kind != 'a'; + } + default: + // Some intrinsics may use AGPRs, but if we have a choice, we are not + // required to use AGPRs. 
return true; + } // TODO: Handle callsite attributes const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 2192a72..e4d328a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -393,12 +393,13 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, switch (N->getMachineOpcode()) { default: { - const MCInstrDesc &Desc = - Subtarget->getInstrInfo()->get(N->getMachineOpcode()); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + const MCInstrDesc &Desc = TII->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) return nullptr; - int RegClass = Desc.operands()[OpIdx].RegClass; + + int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]); if (RegClass == -1) return nullptr; @@ -4353,7 +4354,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { if (!RC || SIRI->isSGPRClass(RC)) return false; - if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) { + if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass && + RC != &AMDGPU::VS_64_Align2RegClass) { AllUsesAcceptSReg = false; SDNode *User = U->getUser(); if (User->isMachineOpcode()) { @@ -4367,7 +4369,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { const TargetRegisterClass *CommutedRC = getOperandRegClass(U->getUser(), CommutedOpNo); if (CommutedRC == &AMDGPU::VS_32RegClass || - CommutedRC == &AMDGPU::VS_64RegClass) + CommutedRC == &AMDGPU::VS_64RegClass || + CommutedRC == &AMDGPU::VS_64_Align2RegClass) AllUsesAcceptSReg = true; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 723d07e..c7a91f4c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -929,7 +929,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { ThinOrFullLTOPhase Phase) { if (Level != OptimizationLevel::O0) { if (!isLTOPreLink(Phase)) { - if (getTargetTriple().isAMDGCN()) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opts; MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); } @@ -966,7 +966,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(InternalizePass(mustPreserveGV)); PM.addPass(GlobalDCEPass()); } - if (EnableAMDGPUAttributor) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opt; if (HasClosedWorldAssumption) Opt.IsClosedWorld = true; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d0c0822..a8140c3 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1347,6 +1347,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ForcedDPP = false; bool ForcedSDWA = false; KernelScopeInfo KernelScope; + const unsigned HwMode; /// @name Auto-generated Match Functions /// { @@ -1356,6 +1357,13 @@ class AMDGPUAsmParser : public MCTargetAsmParser { /// } + /// Get size of register operand + unsigned getRegOperandSize(const MCInstrDesc &Desc, unsigned OpNo) const { + assert(OpNo < Desc.NumOperands); + int16_t RCID = MII.getOpRegClassID(Desc.operands()[OpNo], HwMode); + return getRegBitWidth(RCID) / 8; + } + private: void 
createConstantSymbol(StringRef Id, int64_t Val); @@ -1442,9 +1450,9 @@ public: using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>; AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI, MII), Parser(_Parser) { + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI, MII), Parser(_Parser), + HwMode(STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)) { MCAsmParserExtension::Initialize(Parser); setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); @@ -4107,7 +4115,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, SMLoc IDLoc) { if ((DMaskIdx == -1 || TFEIdx == -1) && isGFX10_AEncoding()) // intersect_ray return true; - unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); + unsigned VDataSize = getRegOperandSize(Desc, VDataIdx); unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0; unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; if (DMask == 0) @@ -4171,8 +4179,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) { const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); bool IsNSA = SrsrcIdx - VAddr0Idx > 1; unsigned ActualAddrSize = - IsNSA ? SrsrcIdx - VAddr0Idx - : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4; + IsNSA ? SrsrcIdx - VAddr0Idx : getRegOperandSize(Desc, VAddr0Idx) / 4; unsigned ExpectedAddrSize = AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16()); @@ -4182,8 +4189,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) { ExpectedAddrSize > getNSAMaxSize(Desc.TSFlags & SIInstrFlags::VSAMPLE)) { int VAddrLastIdx = SrsrcIdx - 1; - unsigned VAddrLastSize = - AMDGPU::getRegOperandSize(getMRI(), Desc, VAddrLastIdx) / 4; + unsigned VAddrLastSize = getRegOperandSize(Desc, VAddrLastIdx) / 4; ActualAddrSize = VAddrLastIdx - VAddr0Idx + VAddrLastSize; } @@ -4429,7 +4435,8 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, return true; const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (TRI->getRegClass(Desc.operands()[0].RegClass).getSizeInBits() <= 128) + if (TRI->getRegClass(MII.getOpRegClassID(Desc.operands()[0], HwMode)) + .getSizeInBits() <= 128) return true; if (TRI->regsOverlap(Src2Reg, DstReg)) { @@ -5000,7 +5007,7 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) && - AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) { + AMDGPU::isDPALU_DPP(MII.get(Opc), MII, getSTI())) { // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share // only on GFX12. 
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); @@ -5523,7 +5530,8 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, unsigned Fmt = Inst.getOperand(FmtIdx).getImm(); int SrcIdx = AMDGPU::getNamedOperandIdx(Opc, SrcOp); unsigned RegSize = - TRI->getRegClass(Desc.operands()[SrcIdx].RegClass).getSizeInBits(); + TRI->getRegClass(MII.getOpRegClassID(Desc.operands()[SrcIdx], HwMode)) + .getSizeInBits(); if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32) return true; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 09a66d7..b97b738 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -417,10 +417,10 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> { } class getMUBUFInsDA<list<RegisterOperand> vdataList, - list<RegisterClass> vaddrList, bit isTFE, bit hasRestrictedSOffset> { + list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> { RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); - RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); - RegisterOperand vdata_op = getBUFVDataRegisterOperandForOp<vdataClass, isTFE>.ret; + RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); @@ -453,8 +453,8 @@ class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, (ins)))))); } @@ -677,8 +677,8 @@ class MUBUF_Pseudo_Store_Lds<string opName> } class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset, - list<RegisterClass> vaddrList=[]> { - RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + list<RegisterClassLike> vaddrList=[]> { + RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata)); dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr))); @@ -702,9 +702,9 @@ class getMUBUFAtomicIns<int addrKind, !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, !if(!eq(addrKind, BUFAddrKind.BothEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, !if(!eq(addrKind, BUFAddrKind.Addr64), - 
getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, (ins)))))); } @@ -1568,11 +1568,12 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string # !if(!eq(RtnMode, "ret"), "", "_noret") # "_" # vt); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); - defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass; + defvar data_op = getVregSrcForVT<data_vt>.ret; + defvar data_vt_RC = getVregClassForVT<data_vt>.ret; let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) - data_vt_RC:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, + data_op:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset); def : GCNPat< (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)), @@ -1583,7 +1584,7 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string >; defvar Addr64ResDag = (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) - data_vt_RC:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, + data_op:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset); def : GCNPat< (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), @@ -1832,7 +1833,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri (extract_cpol_set_glc $auxiliary), (extract_cpol $auxiliary)); defvar SrcRC = getVregSrcForVT<vt>.ret; - defvar DataRC = getVregSrcForVT<data_vt>.ret.RegClass; + defvar DataRC = getVregClassForVT<data_vt>.ret; defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1); defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3); @@ -2088,7 +2089,7 @@ defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>; multiclass MUBUFScratchStorePat_Common <string Instr, ValueType vt, PatFrag st, - RegisterClass rc = VGPR_32> { + RegisterClassLike rc = VGPR_32> { def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset)), @@ -2104,7 +2105,7 @@ multiclass MUBUFScratchStorePat_Common <string Instr, multiclass MUBUFScratchStorePat <string Instr, ValueType vt, PatFrag st, - RegisterClass rc = VGPR_32> { + RegisterClassLike rc = VGPR_32> { let SubtargetPredicate = HasUnrestrictedSOffset in { defm : MUBUFScratchStorePat_Common<Instr, vt, st, rc>; } diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 18582ed..d0ad120 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -904,7 +904,7 @@ let SubtargetPredicate = isGFX1250Plus in { let WaveSizePredicate = isWave32, mayStore = 0 in { let OtherPredicates = [HasTransposeLoadF4F6Insts] in { defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VGPROp_64>; -defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96>; +defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96_Align1>; } // End OtherPredicates = [HasTransposeLoadF4F6Insts] defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VGPROp_64>; defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VGPROp_128>; @@ -934,7 +934,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", AVLdSt_64>; defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", AVLdSt_64>; defm DS_READ_B64_TR_B16 : 
DS_1A_RET_NoM0<"ds_read_b64_tr_b16", AVLdSt_64>; - defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96>; + defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96_Align1>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 2120bf8..f11b373 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -57,7 +57,9 @@ static int64_t getInlineImmVal64(unsigned Imm); AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, MCInstrInfo const *MCII) : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), - MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)), + MAI(*Ctx.getAsmInfo()), + HwModeRegClass(STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)), + TargetMaxInstBytes(MAI.getMaxInstLength(&STI)), CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) { // ToDo: AMDGPUDisassembler supports only VI ISA. if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus()) @@ -825,7 +827,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) { + const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); + if (Desc.TSFlags & SIInstrFlags::MIMG) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); int RsrcIdx = @@ -838,7 +841,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, for (unsigned i = 0; i < NSAArgs; ++i) { const unsigned VAddrIdx = VAddr0Idx + 1 + i; auto VAddrRCID = - MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass; + MCII->getOpRegClassID(Desc.operands()[VAddrIdx], HwModeRegClass); MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i])); } Bytes = Bytes.slice(4 * NSAWords); @@ -1311,7 +1314,8 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { // Widen the register to the correct number of enabled channels. MCRegister NewVdata; if (DstSize != Info->VDataDwords) { - auto DataRCID = MCII->get(NewOpcode).operands()[VDataIdx].RegClass; + auto DataRCID = MCII->getOpRegClassID( + MCII->get(NewOpcode).operands()[VDataIdx], HwModeRegClass); // Get first subregister of VData MCRegister Vdata0 = MI.getOperand(VDataIdx).getReg(); @@ -1338,7 +1342,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { MCRegister VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0); VAddrSA = VAddrSubSA ? 
VAddrSubSA : VAddrSA; - auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass; + auto AddrRCID = MCII->getOpRegClassID( + MCII->get(NewOpcode).operands()[VAddrSAIdx], HwModeRegClass); + const MCRegisterClass &NewRC = MRI.getRegClass(AddrRCID); NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &NewRC); NewVAddrSA = CheckVGPROverflow(NewVAddrSA, NewRC, MRI); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 935c383..2751857 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -41,6 +41,7 @@ private: std::unique_ptr<MCInstrInfo const> const MCII; const MCRegisterInfo &MRI; const MCAsmInfo &MAI; + const unsigned HwModeRegClass; const unsigned TargetMaxInstBytes; mutable ArrayRef<uint8_t> Bytes; mutable uint32_t Literal; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index e86816d..6de59be 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -229,7 +229,7 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> { class FLAT_Load_Pseudo< string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let OutOperandList = (outs vdata_op:$vdst); @@ -268,7 +268,7 @@ multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> { + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let InOperandList = !con( (ins VaddrRC:$vaddr, vdataClass:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), @@ -385,7 +385,7 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy (outs ), !con( !if(IsAsync, (ins VGPR_32:$vdst), (ins)), - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins flat_offset:$offset, CPol_0:$cpol)), !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let LGKM_CNT = !not(IsAsync); @@ -417,7 +417,7 @@ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_P opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins VGPR_32:$vdata), (ins flat_offset:$offset, CPol_0:$cpol)), " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let VM_CNT = 0; @@ -511,7 +511,7 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } -class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> : +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$vaddr), string asm = " $vaddr"> : FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { let has_vdst = 0; let has_data = 0; @@ 
-533,7 +533,7 @@ multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { multiclass FLAT_Global_Prefetch_Pseudo<string opName> { let is_flat_global = 1, has_saddr = 1 in { - def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">, + def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64_AlignTarget:$vaddr), " $vaddr, off">, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, GlobalSaddrTable<1, opName> { @@ -754,7 +754,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< RegisterOperand data_op = vdst_op> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), " $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName> { let FPAtomic = data_vt.isFP; @@ -786,7 +786,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_vgpr:$vdst), - (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; @@ -811,7 +811,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_agpr:$vdst), - (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VReg_64_AlignTarget:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn_agpr"> { let FPAtomic = data_vt.isFP; @@ -837,7 +837,7 @@ class FLAT_Global_Atomic_Pseudo_NO_RTN< ValueType data_vt = vt, RegisterOperand data_op = vdst_op, bit EnableSaddr = false, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_AtomicNoRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName> { let InOperandList = !con( (ins VaddrRC:$vaddr, data_op:$vdata), @@ -867,7 +867,7 @@ class FLAT_Global_Atomic_Pseudo_RTN< RegisterOperand data_op = vdst_op, bit EnableSaddr = false, bit IsVGPR = false, - RegisterClass VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64)> + RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_AtomicRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName#"_rtn"#!if(IsVGPR, "", "_agpr")> { defvar vdst_rc= !if(IsVGPR, getEquivalentVGPROperand<vdst_op>.ret, getEquivalentAGPROperand<vdst_op>.ret); @@ -1321,7 +1321,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { } let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in { - defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96>; + defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96_Align1>; defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VGPROp_64>; } @@ -1539,7 +1539,7 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; 
let OtherPredicates = inst.OtherPredicates; } @@ -1568,7 +1568,7 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, defvar rtnNode = !cast<SDPatternOperator>(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1612,7 +1612,7 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt, class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 8821558..464cbec 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -722,7 +722,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) && - AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) { + AMDGPU::isDPALU_DPP(TII->get(OrigOp), *TII, *ST)) { LLVM_DEBUG(dbgs() << " " << OrigMI << " failed: not valid 64-bit DPP control value\n"); break; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 1d9a427..a911e7e 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -869,7 +869,7 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); int VDataRCID = -1; if (VDataIdx != -1) - VDataRCID = Desc.operands()[VDataIdx].RegClass; + VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]); if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) { // There is no hazard if the instruction does not use vector regs @@ -893,8 +893,8 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. if (TII->isMIMG(MI)) { int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); - assert(SRsrcIdx != -1 && - AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256); + assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID( + Desc.operands()[SRsrcIdx])) == 256); (void)SRsrcIdx; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 3563caa..e82f998 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -788,9 +788,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Check if operand register class contains register used. // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. 
- int RCID = Desc.operands()[OpNo].RegClass; + const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); if (RCID != -1) { - const MCRegisterClass RC = MRI.getRegClass(RCID); + const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) @@ -1025,7 +1027,7 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCInstrDesc &Desc = MII.get(MI->getOpcode()); if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) && - AMDGPU::isDPALU_DPP(Desc, STI)) { + AMDGPU::isDPALU_DPP(Desc, MII, STI)) { O << " /* DP ALU dpp only supports " << (isGFX12(STI) ? "row_share" : "row_newbcast") << " */"; return; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 291c03a..64e34db 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1516,7 +1516,8 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> { int num_addrs = !if(isBVH8, 11, !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11))); RegisterOperand RegClass = MIMGAddrSize<num_addrs, 0>.RegClass; - defvar Size = !cast<SIRegisterClass>(RegClass.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RegClass.RegClass).Size; + int VAddrDwords = !srl(Size, 5); int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 90c828b..6616b30 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1077,7 +1077,7 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - int16_t RCID = Desc.operands()[UseOpIdx].RegClass; + int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]); if (RCID == -1) return false; @@ -1299,10 +1299,8 @@ void SIFoldOperandsImpl::foldOperand( AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO, AMDGPU::AV_MOV_B64_IMM_PSEUDO}) { const MCInstrDesc &MovDesc = TII->get(MovOp); - assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1); - const TargetRegisterClass *MovDstRC = - TRI->getRegClass(MovDesc.operands()[0].RegClass); + TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0])); // Fold if the destination register class of the MOV instruction (ResRC) // is a superclass of (or equal to) the destination register class of the @@ -1312,7 +1310,8 @@ void SIFoldOperandsImpl::foldOperand( const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 
2 : 1; const TargetRegisterClass *MovSrcRC = - TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass); + TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx])); + if (MovSrcRC) { if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 46757cf..ec5c5bb3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5029,9 +5029,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } - int RegClass = Desc.operands()[i].RegClass; - const MCOperandInfo &OpInfo = Desc.operands()[i]; + int16_t RegClass = getOpRegClassID(OpInfo); + switch (OpInfo.OperandType) { case MCOI::OPERAND_REGISTER: if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { @@ -5635,7 +5635,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && !AMDGPU::isLegalDPALU_DPPControl(ST, DC) && - AMDGPU::isDPALU_DPP(Desc, ST)) { + AMDGPU::isDPALU_DPP(Desc, *this, ST)) { ErrInfo = "Invalid dpp_ctrl value: " "DP ALU dpp only support row_newbcast"; return false; @@ -6031,48 +6031,17 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); } -static const TargetRegisterClass * -adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, - const MCInstrDesc &TID, unsigned RCID) { - if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) { - switch (RCID) { - case AMDGPU::AV_32RegClassID: - RCID = AMDGPU::VGPR_32RegClassID; - break; - case AMDGPU::AV_64RegClassID: - RCID = AMDGPU::VReg_64RegClassID; - break; - case AMDGPU::AV_96RegClassID: - RCID = AMDGPU::VReg_96RegClassID; - break; - case AMDGPU::AV_128RegClassID: - RCID = AMDGPU::VReg_128RegClassID; - break; - case AMDGPU::AV_160RegClassID: - RCID = AMDGPU::VReg_160RegClassID; - break; - case AMDGPU::AV_512RegClassID: - RCID = AMDGPU::VReg_512RegClassID; - break; - default: - break; - } - } - - return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); -} - +// FIXME: This should not be an overridable function. All subtarget dependent +// operand modifications should go through isLookupRegClassByHwMode in the +// generic handling. const TargetRegisterClass * SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const { if (OpNum >= TID.getNumOperands()) return nullptr; - auto RegClass = TID.operands()[OpNum].RegClass; - // Special pseudos have no alignment requirement. 
- if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID)) - return RI.getRegClass(RegClass); - - return adjustAllocatableRegClass(ST, RI, TID, RegClass); + const MCOperandInfo &OpInfo = TID.operands()[OpNum]; + int16_t RegClass = getOpRegClassID(OpInfo); + return RI.getRegClass(RegClass); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, @@ -6090,8 +6059,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, return RI.getPhysRegBaseClass(Reg); } - unsigned RCID = Desc.operands()[OpNo].RegClass; - return adjustAllocatableRegClass(ST, RI, Desc, RCID); + return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo])); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6099,7 +6067,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock *MBB = MI.getParent(); MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; + unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]); const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Size = RI.getRegSizeInBits(*RC); unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO @@ -6168,7 +6136,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, Register Reg = MO.getReg(); - const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); + const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo)); if (Reg.isPhysical()) return DRC->contains(Reg); @@ -6293,8 +6261,9 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; + int64_t RegClass = getOpRegClassID(OpInfo); const TargetRegisterClass *DefinedRC = - OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + RegClass != -1 ? RI.getRegClass(RegClass) : nullptr; if (!MO) MO = &MI.getOperand(OpIdx); @@ -7619,7 +7588,7 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx, if (!RI.isVGPRClass(CurrRC)) return; - unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; + int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]); const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID); if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) { Op.setSubReg(AMDGPU::lo16); @@ -9323,7 +9292,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, // Is this operand statically required to be an SGPR based on the operand // constraints? const TargetRegisterClass *OpRC = - RI.getRegClass(Desc.operands()[Idx].RegClass); + RI.getRegClass(getOpRegClassID(Desc.operands()[Idx])); bool IsRequiredSGPR = RI.isSGPRClass(OpRC); if (IsRequiredSGPR) return MO.getReg(); @@ -9804,7 +9773,7 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { if (Idx == -1) // e.g. 
s_memtime return false; - const auto RCID = MI.getDesc().operands()[Idx].RegClass; + const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]); return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index cc59acf..e979eeb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1298,7 +1298,7 @@ public: return 4; } - return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; + return RI.getRegSizeInBits(*RI.getRegClass(getOpRegClassID(OpInfo))) / 8; } /// This form should usually be preferred since it handles operands diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 18a5393..b7f63ec 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1151,6 +1151,7 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } +// FIXME: Should change class based on hasSDWAScalar to exclude SGPRs class SDWASrc<ValueType vt> : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; string Type = !if(vt.isFP, "FP", "INT"); @@ -1807,13 +1808,13 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16, VOPDstOperand_t16Lo128), VOPDstOperand<VGPR_32>); - RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>, - !eq(VT.Size, 512) : VOPDstOperand<VReg_512>, - !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, - !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, - !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, - !eq(VT.Size, 96) : VOPDstOperand<VReg_96>, - !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, + RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024_AlignTarget>, + !eq(VT.Size, 512) : VOPDstOperand<VReg_512_AlignTarget>, + !eq(VT.Size, 256) : VOPDstOperand<VReg_256_AlignTarget>, + !eq(VT.Size, 192) : VOPDstOperand<VReg_192_AlignTarget>, + !eq(VT.Size, 128) : VOPDstOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : VOPDstOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : VOPDstOperand<VReg_64_AlignTarget>, !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, !eq(VT.Size, 16) : op16, 1 : VOPDstS64orS32); // else VT == i1 @@ -1821,8 +1822,8 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { class getVALUDstForVT_fake16<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, - !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, - !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, + !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128_AlignTarget>, + !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64_AlignTarget>, !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32_Lo128>, VOPDstS64orS32)))); // else VT == i1 } @@ -1890,21 +1891,38 @@ class getSOPSrcForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32); } -// Returns the vreg register class to use for source operand given VT +// Returns the vreg register operand to use for source operand given VT. +// This should only be used for a target instruction's ins list. 
class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> { RegisterOperand ret = - !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512>, - !eq(VT.Size, 192) : RegisterOperand<VReg_192>, - !eq(VT.Size, 128) : RegisterOperand<VReg_128>, - !eq(VT.Size, 96) : RegisterOperand<VReg_96>, - !eq(VT.Size, 64) : RegisterOperand<VReg_64>, - !eq(VT.Size, 48) : RegisterOperand<VReg_64>, + !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512_AlignTarget>, + !eq(VT.Size, 192) : RegisterOperand<VReg_192_AlignTarget>, + !eq(VT.Size, 128) : RegisterOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : RegisterOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : RegisterOperand<VReg_64_AlignTarget>, + !eq(VT.Size, 48) : RegisterOperand<VReg_64_AlignTarget>, !eq(VT.Size, 16) : !if(IsTrue16, !if(IsFake16, VGPROp_32_Lo128, VGPROp_16_Lo128), RegisterOperand<VGPR_32>), 1 : RegisterOperand<VGPR_32>); } +// Returns a concrete vgpr register class to use for a value type VT, +// which exists separately from a real instruction use. +class getVregClassForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> { + RegisterClass ret = + !cond(!eq(VT.Size, 512) : VReg_512, + !eq(VT.Size, 192) : VReg_192, + !eq(VT.Size, 128) : VReg_128, + !eq(VT.Size, 96) : VReg_96, + !eq(VT.Size, 64) : VReg_64, + !eq(VT.Size, 48) : VReg_64, + !eq(VT.Size, 16) : !if(IsTrue16, + !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128), + VGPR_32), + 1 : VGPR_32); +} + class getSDWASrcForVT <ValueType VT> { RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); @@ -2638,7 +2656,7 @@ class getAlign2RegOp<RegisterOperand RC> { } class getEquivalentAGPROperand<RegisterOperand RC> { - defvar Size = !cast<RegisterClass>(RC.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RC.RegClass).Size; RegisterOperand ret = !cond(!eq(Size, 32) : RegisterOperand<AGPR_32>, !eq(Size, 64) : RegisterOperand<AReg_64>, @@ -2649,16 +2667,33 @@ class getEquivalentAGPROperand<RegisterOperand RC> { } class getEquivalentVGPROperand<RegisterOperand RC> { - defvar Size = !cast<RegisterClass>(RC.RegClass).Size; + defvar Size = !cast<SIRegisterClassLike>(RC.RegClass).Size; RegisterOperand ret = - !cond(!eq(Size, 32) : RegisterOperand<VGPR_32>, - !eq(Size, 64) : RegisterOperand<VReg_64>, - !eq(Size, 96) : RegisterOperand<VReg_96>, - !eq(Size, 128) : RegisterOperand<VReg_128>, - !eq(Size, 160) : RegisterOperand<VReg_160>, - !eq(Size, 1024) : RegisterOperand<VReg_1024>); + !cond( + !eq(RC, VGPROp_32) : VGPROp_32, + !eq(RC, VGPROp_64) : VGPROp_64, + + !eq(RC, AVLdSt_32) : VGPROp_32, + !eq(RC, AVLdSt_64) : VGPROp_64, + !eq(RC, AVLdSt_96) : VGPROp_96, + !eq(RC, AVLdSt_128) : VGPROp_128, + !eq(RC, AVLdSt_160) : VGPROp_160, + !eq(RC, AVLdSt_1024) : VGPROp_1024, + + !eq(RC, AVLdSt_64_Align2) : VGPROp_64_Align2, + !eq(RC, AVLdSt_96_Align2) : VGPROp_96_Align2, + !eq(RC, AVLdSt_128_Align2) : VGPROp_128_Align2, + !eq(RC, AVLdSt_160_Align2) : VGPROp_160_Align2, + !eq(RC, AVLdSt_1024_Align2) : VGPROp_1024_Align2, + + !eq(RC, AVLdSt_64_Align1) : VGPROp_64_Align1, + !eq(RC, AVLdSt_96_Align1) : VGPROp_96_Align1, + !eq(RC, AVLdSt_128_Align1) : VGPROp_128_Align1, + !eq(RC, AVLdSt_160_Align1) : VGPROp_160_Align1, + !eq(RC, AVLdSt_1024_Align1) : VGPROp_1024_Align1); } + class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32, ValueType Src2VT = i32> { bit ret = !if(!eq(DstVT.Size, 64), @@ -3190,7 +3225,7 @@ class Commutable_REV <string revOp, bit isOrig> { // 
Interpolation opcodes //===----------------------------------------------------------------------===// -class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">; +class VINTRPDstOperand <RegisterClassLike rc> : RegisterOperand <rc, "printVINTRPDst">; class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : VINTRPCommon <outs, ins, "", pattern>, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index be084a9..eac9fd4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -120,6 +120,8 @@ def ATOMIC_FENCE : SPseudoInstSI< let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns +// No align needed as it will be decomposed anyway +// TODO: Remove alignment requirement from sources def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> { let isPseudo = 1; @@ -129,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), // 64-bit vector move instruction. This is mainly used by the // SIFoldOperands pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), +def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0)> { let isReMaterializable = 1; let isAsCheapAsAMove = 1; @@ -163,9 +165,6 @@ def AV_MOV_B32_IMM_PSEUDO // 64-bit materialize immediate which supports AGPR or VGPR. This has // an unusual operand restriction which requires the two halves of the // immediate to each be 32-bit inline immediate values. -// -// FIXME: This unnecessarily has the even aligned vector register -// requirement applied. def AV_MOV_B64_IMM_PSEUDO : VPseudoInstSI<(outs AV_64:$vdst), (ins AV_64_PSEUDO_IMM:$src0)> { let isReMaterializable = 1; @@ -381,13 +380,13 @@ foreach Op = Operations in { let usesCustomInserter = 1, Defs = [VCC] in { def V_ADD_U64_PSEUDO : VPseudoInstSI < - (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), - [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))] + (outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), + [(set VReg_64_AlignTarget:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))] >; def V_SUB_U64_PSEUDO : VPseudoInstSI < - (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), - [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))] + (outs VReg_64_AlignTarget:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), + [(set VReg_64_AlignTarget:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))] >; } // End usesCustomInserter = 1, Defs = [VCC] @@ -1142,7 +1141,7 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst), // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register // needs to be used and an extra instruction to move between VGPR and AGPR. // UsesTmp adds to the total size of an expanded spill in this case. -multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, +multiclass SI_SPILL_VGPR <SIRegisterClassLike vgpr_class, bit UsesTmp = 0, bit HasMask = 0> { let UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM] in { @@ -1177,21 +1176,25 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, } // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM] } +// TODO: Technically the AlignTarget register class constraint is +// overly conservative for gfx90a. 
There is an alignment requirement, +// but the underlying spill will be lowered to 32-bit accesses. + defm SI_SPILL_V16 : SI_SPILL_VGPR <VGPR_16>; defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; -defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; -defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; -defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; -defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; -defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>; -defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>; -defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; -defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>; -defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>; -defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>; -defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>; -defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; -defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64_AlignTarget>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96_AlignTarget>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128_AlignTarget>; +defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160_AlignTarget>; +defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192_AlignTarget>; +defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224_AlignTarget>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256_AlignTarget>; +defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288_AlignTarget>; +defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320_AlignTarget>; +defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352_AlignTarget>; +defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384_AlignTarget>; +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512_AlignTarget>; +defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024_AlignTarget>; let Defs = [M0] in { // Spills a block of 32 VGPRs. M0 will contain a mask describing which @@ -1200,34 +1203,34 @@ let Defs = [M0] in { } defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>; -defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>; -defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>; -defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>; -defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>; -defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>; -defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>; -defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; -defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>; -defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>; -defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>; -defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>; -defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; -defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; +defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64_AlignTarget, 1>; +defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96_AlignTarget, 1>; +defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128_AlignTarget, 1>; +defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160_AlignTarget, 1>; +defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192_AlignTarget, 1>; +defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224_AlignTarget, 1>; +defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256_AlignTarget, 1>; +defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288_AlignTarget, 1>; +defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320_AlignTarget, 1>; +defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352_AlignTarget, 1>; +defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384_AlignTarget, 1>; +defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512_AlignTarget, 1>; +defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024_AlignTarget, 1>; defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>; -defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>; -defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>; -defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>; -defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>; -defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>; 
-defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>; -defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>; -defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>; -defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>; -defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>; -defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>; -defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; -defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; +defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64_AlignTarget, 1>; +defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96_AlignTarget, 1>; +defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128_AlignTarget, 1>; +defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160_AlignTarget, 1>; +defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192_AlignTarget, 1>; +defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224_AlignTarget, 1>; +defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256_AlignTarget, 1>; +defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288_AlignTarget, 1>; +defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320_AlignTarget, 1>; +defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352_AlignTarget, 1>; +defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384_AlignTarget, 1>; +defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512_AlignTarget, 1>; +defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024_AlignTarget, 1>; let isConvergent = 1 in { defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>; @@ -2383,18 +2386,24 @@ let True16Predicate = UseRealTrue16Insts in { } } -// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit -// immediate and wil be expanded as needed, but we will only use these patterns -// for values which can be encoded. -def : GCNPat < - (VGPRImm<(i64 imm)>:$imm), - (V_MOV_B64_PSEUDO imm:$imm) ->; +/// FIXME: Increasing the priority of VGPRImm over the scalar forms as +/// a workaround for a phase ordering problem caused by overly +/// conservative MachineCSE. If we end up with an s_mov_b64 + copy to +/// vgpr pattern, MachineCSE will not perform the CSE which occurs +/// after operand folding. +let AddedComplexity = 1 in { + // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit + // immediate and wil be expanded as needed, but we will only use these patterns + // for values which can be encoded. 
+ def : GCNPat < + (VGPRImm<(i64 imm)>:$imm), + (V_MOV_B64_PSEUDO imm:$imm)>; -def : GCNPat < - (VGPRImm<(f64 fpimm)>:$imm), - (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm))) ->; + def : GCNPat < + (VGPRImm<(f64 fpimm)>:$imm), + (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm))) + >; +} // End let AddedComplexity = 2 def : GCNPat < (i64 imm:$imm), diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index afe76e1..bfac639 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1338,8 +1338,9 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, continue; unsigned I = Op.getOperandNo(); - if (Desc.operands()[I].RegClass == -1 || - !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass))) + + int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); + if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) continue; if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 3115579..be1c883 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -328,7 +328,8 @@ struct SGPRSpillBuilder { SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(), ST.getAMDGPUDwarfFlavour(), - /*PC=*/0, ST.getHwMode()), + /*PC=*/0, + ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)), ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 82fc240..fc8f46a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -91,16 +91,23 @@ class SIReg <string n, bits<10> regIdx = 0, bit isVGPR = 0, int Index = !cast<int>(regIdx); } -// For register classes that use TSFlags. -class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> - : RegisterClass <n, rTypes, Align, rList> { +class SIRegisterClassLike<int BW = 0, bit V = false, + bit A = false, + bit S = false> { + // Bitwidth of the register + field int Size = BW; + // For vector register classes. - field bit HasVGPR = 0; - field bit HasAGPR = 0; + field bit HasVGPR = V; + field bit HasAGPR = A; // For scalar register classes. - field bit HasSGPR = 0; + field bit HasSGPR = S; +} +// For register classes that use TSFlags. +class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> + : RegisterClass <n, rTypes, Align, rList>, SIRegisterClassLike { // Alignment of the first register in tuple (in 32-bit units). field int RegTupleAlignUnits = 1; @@ -991,7 +998,8 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let HasVGPR = 1, BaseClassPriority = 1 in { + let HasVGPR = 1, BaseClassPriority = 1, + DecoderMethod = "DecodeVReg_"#!mul(numRegs, 32)#"RegisterClass" in { // Define the regular class. 
def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1031,7 +1039,8 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1, + DecoderMethod = "DecodeAReg_"#!mul(numRegs, 32)#"RegisterClass" in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1197,15 +1206,87 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 } //===----------------------------------------------------------------------===// -// Register operands +// +// AlignTarget classes. Artifical classes to swap between +// even-aligned and any-aligned classes depending on subtarget. +// //===----------------------------------------------------------------------===// +def AV_LdSt_32_Target : RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + let DecoderMethod = "decodeAVLdSt"; +} + +foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { + def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; + } + + def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [!cast<RegisterClass>("AReg_"#RegSize), + !cast<RegisterClass>("AReg_"#RegSize#_Align2) + /*Unused combination*/]> { + let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; + } + + def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; + } + + def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "decodeAVLdSt"; + } + + def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { + let DecoderMethod = "decodeAVLdSt"; + } + + def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>, + RegClassByHwMode< + [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize)]> { + let DecoderMethod = "decodeAVLdSt"; + } +} + +def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, + RegClassByHwMode< + 
[DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [VS_64, VS_64_Align2, VS_64_Align2]> { + let DecoderMethod = "decodeSrcRegOrImm9"; +} + class RegImmMatcher<string name> : AsmOperandClass { let Name = name; let RenderMethod = "addRegOrImmOperands"; } -class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> +class RegOrImmOperand <RegisterClassLike RegClass, string OperandTypeName> : RegisterOperand<RegClass> { let OperandNamespace = "AMDGPU"; let OperandType = OperandTypeName; @@ -1213,14 +1294,18 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> } //===----------------------------------------------------------------------===// +// Register operands +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR, a 32-bit immediate, or 64-bit immediate // if supported by target. //===----------------------------------------------------------------------===// -class SrcRegOrImm9<RegisterClass regClass, string operandType> +class SrcRegOrImm9<RegisterClassLike regClass, string operandType> : RegOrImmOperand<regClass, operandType> { string DecoderMethodName = "decodeSrcRegOrImm9"; - let DecoderMethod = DecoderMethodName # "<" # regClass.Size # ">"; + let DecoderMethod = DecoderMethodName # "<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } class SrcRegOrImm9_t16<string operandType, RegisterClass regClass = VS_16> @@ -1277,12 +1362,12 @@ def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">; def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">; def VSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2BF16">; def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">; -def VSrc_b64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_INT64">; -def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> { +def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">; +def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> { let DecoderMethod = "decodeOperand_VSrc_f64"; } -def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">; -def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">; +def VSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2INT32">; +def VSrc_v2f32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2FP32">; def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; @@ -1292,19 +1377,19 @@ def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16 // This is for operands with the enum(9), VSrc encoding restriction, // but only allows VGPRs. 
-class SrcReg9<RegisterClass regClass> : RegisterOperand<regClass> { - let DecoderMethod = "decodeSrcReg9<" # regClass.Size # ">"; +class SrcReg9<RegisterClassLike regClass> : RegisterOperand<regClass> { + let DecoderMethod = "decodeSrcReg9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } def VRegSrc_32 : SrcReg9<VGPR_32>; -def VRegSrc_64 : SrcReg9<VReg_64>; -def VRegSrc_96 : SrcReg9<VReg_96>; -def VRegSrc_128 : SrcReg9<VReg_128>; -def VRegSrc_192 : SrcReg9<VReg_192>; -def VRegSrc_256 : SrcReg9<VReg_256>; -def VRegSrc_384 : SrcReg9<VReg_384>; -def VRegSrc_512 : SrcReg9<VReg_512>; -def VRegSrc_1024 : SrcReg9<VReg_1024>; +def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>; +def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>; +def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>; +def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>; +def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>; +def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>; +def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>; +def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>; // True 16 Operands @@ -1325,23 +1410,23 @@ class VGPROp<RegisterClass regClass> : RegisterOperand<regClass> { class VGPROp_Align2<RegisterClass regClass> : RegisterOperand<!cast<RegisterClass>(regClass#_Align2)> { let DecoderMethod = "Decode" # regClass # "RegisterClass"; } -multiclass VGPROp_Aligned<RegisterClass regClass> { - def _Align1 : VGPROp<regClass>; - def _Align2 : VGPROp_Align2<regClass>; -} // TODO: These cases should use default target alignment def VGPROp_16 : VGPROp<VGPR_16> { let EncoderMethod = "getMachineOpValueT16"; } + def VGPROp_32 : VGPROp<VGPR_32>; foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "320", "352", "384", "512", "1024"] in { - def VGPROp_#size : VGPROp<!cast<RegisterClass>("VReg_"#size)>; -} + // Target default alignment + def VGPROp_#size : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size#_AlignTarget)>; + + // No alignment requirement + def VGPROp_#size#_Align1 : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size)>; -foreach size = ["64", "96", "128", "160", "256", "1024"] in { - defm VGPROp_#size : VGPROp_Aligned<!cast<RegisterClass>("VReg_"#size)>; + // Always even alignment requirement + def VGPROp_#size#_Align2 : RegisterOperand<!cast<RegisterClassLike>("VReg_"#size#_Align2)>; } def VGPROp_16_Lo128 : RegisterOperand<VGPR_16_Lo128> { @@ -1357,9 +1442,9 @@ def VGPROp_32_Lo128 : RegisterOperand<VGPR_32_Lo128> { // ASrc_* Operands with an AccVGPR //===----------------------------------------------------------------------===// -class AVOperand<RegisterClass regClass, string decoder> +class AVOperand<RegisterClassLike regClass, string decoder> : RegisterOperand<regClass> { - let DecoderMethod = decoder # "<" # regClass.Size # ">"; + let DecoderMethod = decoder # "<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; let EncoderMethod = "getAVOperandEncoding"; } @@ -1374,13 +1459,13 @@ def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">; def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">; def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">; def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">; -def VCSrc_b64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_INT64">; -def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">; +def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; def VCSrc_v2b16 : 
SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">; -def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">; def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">; // True 16 Operands @@ -1391,73 +1476,80 @@ def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">; // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// -def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_64_f16 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_64_b32 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_64_f64 : SrcRegOrImm9 <VReg_64, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_128_f16 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_128_b32 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_128_f32 : SrcRegOrImm9 <VReg_128, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_b32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_512_f64 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_64_b32 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; //===----------------------------------------------------------------------===// // AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR 
//===----------------------------------------------------------------------===// -class AVSrcOperand<RegisterClass regClass> +class AVSrcOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeSrcAV10">; def AVSrc_32 : AVSrcOperand<AV_32>; -def AVSrc_64 : AVSrcOperand<AV_64>; -def AVSrc_128 : AVSrcOperand<AV_128>; -def AVSrc_192 : AVSrcOperand<AV_192>; -def AVSrc_256 : AVSrcOperand<AV_256>; +def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>; +def AVSrc_128 : AVSrcOperand<AV_128_AlignTarget>; +def AVSrc_192 : AVSrcOperand<AV_192_AlignTarget>; +def AVSrc_256 : AVSrcOperand<AV_256_AlignTarget>; -class AVDstOperand<RegisterClass regClass> +def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>; +def AVSrc_128_Align2 : AVSrcOperand<AV_128_Align2>; +def AVSrc_192_Align2 : AVSrcOperand<AV_192_Align2>; +def AVSrc_256_Align2 : AVSrcOperand<AV_256_Align2>; + +class AVDstOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeAV10">; def AVDst_128 : AVDstOperand<AV_128>; def AVDst_256 : AVDstOperand<AV_256>; def AVDst_512 : AVDstOperand<AV_512>; -class AVLdStOperand<RegisterClass regClass> +def AVDst_128_Align2 : AVDstOperand<AV_128_Align2>; +def AVDst_256_Align2 : AVDstOperand<AV_256_Align2>; +def AVDst_512_Align2 : AVDstOperand<AV_512_Align2>; + +class AVLdStOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeAVLdSt">; -def AVLdSt_32 : AVLdStOperand<AV_32>; +def AVLdSt_32 : AVLdStOperand<AV_LdSt_32_Target>; foreach size = ["64", "96", "128", "160", "256", "1024" ] in { - // TODO: These cases should use target align variant - def AVLdSt_#size : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>; - - def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>; - def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClass>("AV_"#size#_Align2)>; + def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>; + def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align1)>; + def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>; } //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// -class SrcRegOrImmA9<RegisterClass regClass, string operandType> +class SrcRegOrImmA9<RegisterClassLike regClass, string operandType> : RegOrImmOperand<regClass, operandType> { - let DecoderMethod = "decodeSrcRegOrImmA9<" # regClass.Size # ">"; + let DecoderMethod = "decodeSrcRegOrImmA9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } -def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_256_f64 : 
SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; //===----------------------------------------------------------------------===// // Tablegen programming utilities @@ -1467,10 +1559,10 @@ def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">; /// instruction's operand list, which may be a RegisterOperand or a /// direct RegisterClass reference. class getRegClassFromOp<DAGOperand Op> { - SIRegisterClass ret = !if( + SIRegisterClassLike ret = !if( !isa<RegisterOperand>(Op), - !cast<SIRegisterClass>(!cast<RegisterOperand>(Op).RegClass), - !cast<SIRegisterClass>(Op)); + !cast<SIRegisterClassLike>(!cast<RegisterOperand>(Op).RegClass), + !cast<SIRegisterClassLike>(Op)); } /// Check if the operand will use an AV_* class. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index f7f4d46..76023d2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1569,12 +1569,7 @@ static bool isValidRegPrefix(char C) { return C == 'v' || C == 's' || C == 'a'; } -std::tuple<char, unsigned, unsigned> -parseAsmConstraintPhysReg(StringRef Constraint) { - StringRef RegName = Constraint; - if (!RegName.consume_front("{") || !RegName.consume_back("}")) - return {}; - +std::tuple<char, unsigned, unsigned> parseAsmPhysRegName(StringRef RegName) { char Kind = RegName.front(); if (!isValidRegPrefix(Kind)) return {}; @@ -1601,6 +1596,14 @@ parseAsmConstraintPhysReg(StringRef Constraint) { return {}; } +std::tuple<char, unsigned, unsigned> +parseAsmConstraintPhysReg(StringRef Constraint) { + StringRef RegName = Constraint; + if (!RegName.consume_front("{") || !RegName.consume_back("}")) + return {}; + return parseAsmPhysRegName(RegName); +} + std::pair<unsigned, unsigned> getIntegerPairAttribute(const Function &F, StringRef Name, std::pair<unsigned, unsigned> Default, @@ -2927,13 +2930,6 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) { return getRegBitWidth(RC.getID()); } -unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, - unsigned OpNo) { - assert(OpNo < Desc.NumOperands); - unsigned RCID = Desc.operands()[OpNo].RegClass; - return getRegBitWidth(RCID) / 8; -} - bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) { if (isInlinableIntLiteral(Literal)) return true; @@ -3499,14 +3495,18 @@ bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) { return false; } -bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { +bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST) { for (auto OpName : {OpName::vdst, OpName::src0, OpName::src1, OpName::src2}) { int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName); if (Idx == -1) continue; - if (OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64RegClassID || - OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64_Align2RegClassID) + const MCOperandInfo &OpInfo = OpDesc.operands()[Idx]; + int16_t RegClass = MII.getOpRegClassID( + OpInfo, ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); + if (RegClass == AMDGPU::VReg_64RegClassID || + RegClass == 
AMDGPU::VReg_64_Align2RegClassID) return true; } @@ -3533,14 +3533,15 @@ bool isDPALU_DPP32BitOpc(unsigned Opc) { } } -bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) { +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST) { if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP)) return false; if (isDPALU_DPP32BitOpc(OpDesc.getOpcode())) return ST.hasFeature(AMDGPU::FeatureGFX1250Insts); - return hasAny64BitVGPROperands(OpDesc); + return hasAny64BitVGPROperands(OpDesc, MII, ST); } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 2b9c063..49b4d02 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1014,6 +1014,13 @@ bool isReadOnlySegment(const GlobalValue *GV); bool shouldEmitConstantsToTextSection(const Triple &TT); /// Returns a valid charcode or 0 in the first entry if this is a valid physical +/// register name. Followed by the start register number, and the register +/// width. Does not validate the number of registers exists in the class. Unlike +/// parseAsmConstraintPhysReg, this does not expect the name to be wrapped in +/// "{}". +std::tuple<char, unsigned, unsigned> parseAsmPhysRegName(StringRef TupleString); + +/// Returns a valid charcode or 0 in the first entry if this is a valid physical /// register constraint. Followed by the start register number, and the register /// width. Does not validate the number of registers exists in the class. std::tuple<char, unsigned, unsigned> @@ -1620,10 +1627,6 @@ unsigned getRegBitWidth(unsigned RCID); /// Get the size in bits of a register from the register class \p RC. unsigned getRegBitWidth(const MCRegisterClass &RC); -/// Get size of register operand -unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, - unsigned OpNo); - LLVM_READNONE inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { switch (OpInfo.OperandType) { @@ -1780,13 +1783,15 @@ inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) { } /// \returns true if an instruction may have a 64-bit VGPR operand. -bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc); +bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST); /// \returns true if an instruction is a DP ALU DPP without any 64-bit operands. bool isDPALU_DPP32BitOpc(unsigned Opc); /// \returns true if an instruction is a DP ALU DPP.
-bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST); +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, + const MCSubtargetInfo &ST); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 30dab55..d87d250 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -405,7 +405,7 @@ class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> { field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm), !if(!eq(vt.Size, 64), - (ins VSrc_f64:$src0, VReg_64:$src1, ImmOpType:$imm), + (ins VSrc_f64:$src0, VReg_64_AlignTarget:$src1, ImmOpType:$imm), (ins VSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm))); field dag InsVOPDX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$imm); let InsVOPDX_immX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$immX); @@ -474,10 +474,10 @@ def VOP_MADMK_F64 : VOP_MADMK <f64>; // given VT. class getVOP3VRegForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> { RegisterOperand ret = - !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>, - !eq(VT.Size, 96) : RegisterOperand<VReg_96>, - !eq(VT.Size, 64) : RegisterOperand<VReg_64>, - !eq(VT.Size, 48) : RegisterOperand<VReg_64>, + !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128_AlignTarget>, + !eq(VT.Size, 96) : RegisterOperand<VReg_96_AlignTarget>, + !eq(VT.Size, 64) : RegisterOperand<VReg_64_AlignTarget>, + !eq(VT.Size, 48) : RegisterOperand<VReg_64_AlignTarget>, !eq(VT.Size, 16) : !if(IsTrue16, !if(IsFake16, RegisterOperand<VGPR_32>, RegisterOperand<VGPR_16>), diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 3a0cc35..7cfd059 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -705,16 +705,16 @@ foreach Type = ["U", "I"] in (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; def ADst_32 : VOPDstOperand<AGPR_32>; -def ADst_64 : VOPDstOperand<AReg_64>; -def ADst_128 : VOPDstOperand<AReg_128>; -def ADst_256 : VOPDstOperand<AReg_256>; -def ADst_512 : VOPDstOperand<AReg_512>; -def ADst_1024 : VOPDstOperand<AReg_1024>; -def VDst_64 : VOPDstOperand<VReg_64>; -def VDst_128 : VOPDstOperand<VReg_128>; -def VDst_256 : VOPDstOperand<VReg_256>; -def VDst_512 : VOPDstOperand<VReg_512>; -def VDst_1024 : VOPDstOperand<VReg_1024>; +def ADst_64 : VOPDstOperand<AReg_64_AlignTarget>; +def ADst_128 : VOPDstOperand<AReg_128_AlignTarget>; +def ADst_256 : VOPDstOperand<AReg_256_AlignTarget>; +def ADst_512 : VOPDstOperand<AReg_512_AlignTarget>; +def ADst_1024 : VOPDstOperand<AReg_1024_AlignTarget>; +def VDst_64 : VOPDstOperand<VReg_64_AlignTarget>; +def VDst_128 : VOPDstOperand<VReg_128_AlignTarget>; +def VDst_256 : VOPDstOperand<VReg_256_AlignTarget>; +def VDst_512 : VOPDstOperand<VReg_512_AlignTarget>; +def VDst_1024 : VOPDstOperand<VReg_1024_AlignTarget>; def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> { let Src0RC64 = ARegSrc_32; @@ -811,23 +811,23 @@ def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>; def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>; -def VOPProfileSMFMAC_F32_16X16X32_F16 
: VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; -def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>; - -def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128, AVSrc_128, AVSrc_256>; -def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512, AVSrc_128, AVSrc_256>; +def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_F16 : VOPProfileSMFMAC<VOP_V16F32_V8F16_V16F16_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_BF16 : VOPProfileSMFMAC<VOP_V4F32_V8BF16_V16BF16_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_BF16 : VOPProfileSMFMAC<VOP_V16F32_V8BF16_V16BF16_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; +def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512_Align2, AVSrc_64_Align2, AVSrc_128_Align2>; 
+def VOPProfileSMFMAC_I32_16X16X128_I8 : VOPProfileSMFMAC<VOP_V4I32_V4I32_V8I32_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_I32_32X32X64_I8 : VOPProfileSMFMAC<VOP_V16I32_V4I32_V8I32_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; + +def VOPProfileSMFMAC_F32_16X16X128_F8 : VOPProfileSMFMAC<VOP_V4F32_V4I32_V8I32_I32, AVDst_128_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; +def VOPProfileSMFMAC_F32_32X32X64_F8 : VOPProfileSMFMAC<VOP_V16F32_V4I32_V8I32_I32, AVDst_512_Align2, AVSrc_128_Align2, AVSrc_256_Align2>; def VOPProfileMAI_F32_V8F16_X32 : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, AISrc_128_f32, ADst_128, AVSrc_128>; def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32, VISrc_128_f32, VDst_128, AVSrc_128>; |
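Editor's note (not part of the patch): the recurring pattern in this change is that an operand's register class is no longer a fixed ID read straight from MCOperandInfo, but is resolved per subtarget through the register-info HwMode, as the SIPeepholeSDWA.cpp and AMDGPUBaseInfo.cpp hunks above do via getOpRegClassID(). The following is a minimal sketch of that lookup, assuming the MC headers and the getOpRegClassID() helper referenced in those hunks; it is illustrative only.

// Sketch: resolve the register class ID of operand OpNo of Desc for the
// currently selected hardware mode. Assumes MCInstrInfo::getOpRegClassID()
// and MCSubtargetInfo::HwMode_RegInfo as used in the hunks above.
#include <cstdint>
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"

using namespace llvm;

static int16_t getOperandRegClassID(const MCInstrInfo &MII,
                                    const MCSubtargetInfo &STI,
                                    const MCInstrDesc &Desc, unsigned OpNo) {
  const MCOperandInfo &OpInfo = Desc.operands()[OpNo];
  // Returns -1 if the operand has no register class.
  return MII.getOpRegClassID(OpInfo,
                             STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo));
}

Where DefaultMode is selected this resolves to the same classes as before; where an aligned mode applies, the same operand resolves to the corresponding *_Align2 class, which is what lets a single pseudo such as SI_SPILL_AV64 use AV_64_AlignTarget across subtargets.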
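Editor's note (not part of the patch): parseAsmPhysRegName() declared above is the brace-free counterpart of parseAsmConstraintPhysReg(); the latter strips the surrounding "{}" of an inline asm constraint and then defers to the former. A hedged usage sketch follows; the inputs and the in-tree include path are illustrative, and the expected tuple contents are taken from the doc comments above.

// Illustrative usage only; assumes the AMDGPU target headers from this
// change are on the include path (in-tree builds only).
#include <tuple>
#include "llvm/ADT/StringRef.h"
#include "AMDGPUBaseInfo.h" // llvm/lib/Target/AMDGPU/Utils

using namespace llvm;

void parsePhysRegExamples() {
  // Inline asm constraint form: braces are required and stripped before
  // the common parser runs.
  auto [CKind, CStart, CWidth] = AMDGPU::parseAsmConstraintPhysReg("{a[4:7]}");

  // Bare register name form: same syntax, no braces expected.
  auto [NKind, NStart, NWidth] = AMDGPU::parseAsmPhysRegName("a[4:7]");

  // Per the doc comments, a first element of 0 signals an invalid name;
  // otherwise the tuple is (register kind, start register number, width).
  (void)CKind; (void)CStart; (void)CWidth;
  (void)NKind; (void)NStart; (void)NWidth;
}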