Diffstat (limited to 'llvm/lib/Target/AMDGPU')
56 files changed, 1687 insertions, 510 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 2a36f3d..8a0c4ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
   "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
 >;

+def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
+  "HasFmaMixBF16Insts",
+  "true",
+  "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
+>;
+
 def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
   "HasIEEEMinimumMaximumInsts",
   "true",
@@ -262,12 +268,30 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
   "S_INST_PREFETCH instruction causes shader to hang"
 >;

+def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts",
+  "HasVmemPrefInsts",
+  "true",
+  "Has flat_prefetch_b8 and global_prefetch_b8 instructions"
+>;
+
 def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
   "HasSafeSmemPrefetch",
   "true",
   "SMEM prefetches do not fail on illegal address"
 >;

+def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
+  "HasSafeCUPrefetch",
+  "true",
+  "VMEM CU scope prefetches do not fail on illegal address"
+>;
+
+def FeatureCUStores : SubtargetFeature<"cu-stores",
+  "HasCUStores",
+  "true",
+  "Whether SCOPE_CU stores can be used on GFX12.5"
+>;
+
 def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
   "HasVcmpxExecWARHazard",
   "true",
@@ -1365,6 +1389,9 @@ def FeatureAddSubU64Insts : SubtargetFeature<"add-sub-u64-insts",
   "HasAddSubU64Insts", "true",
   "Has v_add_u64 and v_sub_u64 instructions">;

+def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
+  "true", "Has v_mad_u32 instruction">;
+
 def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
   "HasVMemToLDSLoad",
   "true",
@@ -1970,6 +1997,7 @@ def FeatureISAVersion12 : FeatureSet<
 def FeatureISAVersion12_50 : FeatureSet<
   [FeatureGFX12,
    FeatureGFX1250Insts,
+   FeatureCUStores,
    FeatureCuMode,
    Feature64BitLiterals,
    FeatureLDSBankCount32,
@@ -2007,6 +2035,7 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureBF16ConversionInsts,
    FeatureBF16PackedInsts,
    FeatureCvtPkF16F32Inst,
+   FeatureFmaMixBF16Insts,
    FeatureMin3Max3PKF16,
    FeatureMinimum3Maximum3PKF16,
    FeaturePrngInst,
@@ -2020,8 +2049,10 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureFlatBufferGlobalAtomicFaddF64Inst,
    FeatureMemoryAtomicFAddF32DenormalSupport,
    FeatureKernargPreload,
+   FeatureVmemPrefInsts,
    FeatureLshlAddU64Inst,
    FeatureAddSubU64Insts,
+   FeatureMadU32Inst,
    FeatureLdsBarrierArriveAtomic,
    FeatureSetPrioIncWgInst,
   ]>;
@@ -2402,7 +2433,7 @@ def HasAtomicFMinFMaxF64FlatInsts :

 def HasLdsAtomicAddF64 :
   Predicate<"Subtarget->hasLdsAtomicAddF64()">,
-  AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+  AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>;

 def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
   AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>;
@@ -2545,6 +2576,10 @@ def HasFmaakFmamkF64Insts :
   Predicate<"Subtarget->hasFmaakFmamkF64Insts()">,
   AssemblerPredicate<(any_of FeatureGFX1250Insts)>;

+def HasAddMinMaxInsts :
+  Predicate<"Subtarget->hasAddMinMaxInsts()">,
+  AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
 def HasPkAddMinMaxInsts :
   Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
   AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
@@ -2599,6 +2634,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">,

 def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
   AssemblerPredicate<(all_of FeatureFmaMixInsts)>;

+def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
+  AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
+
 def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
   AssemblerPredicate<(all_of FeatureDLInsts)>;
@@ -2797,6 +2835,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
 def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
   AssemblerPredicate<(all_of FeatureXF32Insts)>;

+def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">,
+  AssemblerPredicate<(all_of FeatureVmemPrefInsts)>;
+
 def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
   AssemblerPredicate<(all_of FeatureAshrPkInsts)>;

@@ -2806,6 +2847,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,

 def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
   AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;

+def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">,
+  AssemblerPredicate<(all_of FeatureMadU32Inst)>;
+
 def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
   AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
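Each SubtargetFeature above becomes a boolean member of GCNSubtarget ("HasVmemPrefInsts", "HasCUStores", "HasSafeCUPrefetch", ...) with a generated accessor, and the Has* Predicate records tie those accessors to instruction availability. A minimal sketch of the C++ side, using only accessor names that appear elsewhere in this patch; the helper function itself is hypothetical and not part of the change:

// Hypothetical helper, for illustration only: shows how the new gfx1250
// feature bits defined above are queried from C++.
static bool canEmitVmemPrefetch(const GCNSubtarget &ST, bool WantCUScope) {
  if (!ST.hasVmemPrefInsts())   // FeatureVmemPrefInsts: flat/global_prefetch_b8
    return false;
  // CU-scope prefetches may fault unless FeatureSafeCUPrefetch is present.
  return !WantCUScope || ST.hasSafeCUPrefetch();
}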
Predicate<"Subtarget->hasFmaMixInsts()">, AssemblerPredicate<(all_of FeatureFmaMixInsts)>; +def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">, + AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>; + def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<(all_of FeatureDLInsts)>; @@ -2797,6 +2835,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, AssemblerPredicate<(all_of FeatureXF32Insts)>; +def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">, + AssemblerPredicate<(all_of FeatureVmemPrefInsts)>; + def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, AssemblerPredicate<(all_of FeatureAshrPkInsts)>; @@ -2806,6 +2847,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; +def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">, + AssemblerPredicate<(all_of FeatureMadU32Inst)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 4b3dc37..6681393 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (MF.getSubtarget<GCNSubtarget>().isWave32()) { + if (ST.isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } + if (isGFX1250(ST) && ST.hasCUStores()) { + KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; + } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // un-evaluatable at this point so it cannot be conditionally checked here. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index dedee46..59cc1df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,7 +13,6 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Target/TargetMachine.h" @@ -1383,7 +1382,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, - &AAIndirectCallInfo::ID, &AAInstanceInfo::ID}); + &AAIndirectCallInfo::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 5f19837..a9278c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -89,10 +89,6 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); -static bool hasUnsafeFPMath(const Function &F) { - return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); -} - class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { public: @@ -104,7 +100,6 @@ public: const DominatorTree *DT; const UniformityInfo &UA; const DataLayout &DL; - const bool HasUnsafeFPMath; const bool HasFP32DenormalFlush; bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; @@ -117,7 +112,6 @@ public: const DominatorTree *DT, const UniformityInfo &UA) : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC), DT(DT), UA(UA), DL(F.getDataLayout()), - HasUnsafeFPMath(hasUnsafeFPMath(F)), HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals == DenormalMode::getPreserveSign()) {} @@ -637,8 +631,7 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, return false; // v_rsq_f32 gives 1ulp - return SqrtFMF.approxFunc() || HasUnsafeFPMath || - SqrtOp->getFPAccuracy() >= 1.0f; + return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f; } Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( @@ -664,7 +657,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( IRBuilder<>::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(DivFMF | SqrtFMF); - if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath || + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || canIgnoreDenormalInput(Den, CtxI)) { Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); // -1.0 / sqrt(x) -> fneg(rsq(x)) @@ -680,7 +673,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( // Optimize fdiv with rcp: // // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is -// allowed with unsafe-fp-math or afn. +// allowed with afn. // // a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0 Value * @@ -803,9 +796,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( // // With rcp: // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is -// allowed with unsafe-fp-math or afn. +// allowed with afn. // -// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. +// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn. 
// // With fdiv.fast: // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. @@ -843,7 +836,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { RsqOp = SqrtOp->getOperand(0); } - // Inaccurate rcp is allowed with unsafe-fp-math or afn. + // Inaccurate rcp is allowed with afn. // // Defer to codegen to handle this. // @@ -852,7 +845,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { // expansion of afn to codegen. The current interpretation is so aggressive we // don't need any pre-consideration here when we have better information. A // more conservative interpretation could use handling here. - const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); + const bool AllowInaccurateRcp = DivFMF.approxFunc(); if (!RsqOp && AllowInaccurateRcp) return false; @@ -2026,7 +2019,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { // We're trying to handle the fast-but-not-that-fast case only. The lowering // of fast llvm.sqrt will give the raw instruction anyway. - if (SqrtFMF.approxFunc() || HasUnsafeFPMath) + if (SqrtFMF.approxFunc()) return false; const float ReqdAccuracy = FPOp->getFPAccuracy(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 891d362..992572f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -137,9 +137,15 @@ def gi_global_offset : def gi_global_saddr : GIComplexOperandMatcher<s64, "selectGlobalSAddr">, GIComplexPatternEquiv<GlobalSAddr>; +def gi_global_saddr_cpol : + GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">, + GIComplexPatternEquiv<GlobalSAddrCPol>; def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; +def gi_global_saddr_no_ioffset : + GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">, + GIComplexPatternEquiv<GlobalSAddrNoIOffset>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, @@ -446,5 +452,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, GISDNodeXFormEquiv<as_hw_round_mode>; +def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">, + GISDNodeXFormEquiv<PrefetchLoc>; + def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">, GISDNodeXFormEquiv<MFMALdScaleXForm>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 2991778..19b8757 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -204,7 +204,7 @@ MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const { for (auto &Op : Node->operands()) Dims.push_back(Dims.getDocument()->getNode( - uint64_t(mdconst::extract<ConstantInt>(Op)->getZExtValue()))); + mdconst::extract<ConstantInt>(Op)->getZExtValue())); return Dims; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 5a2416de..39b4200 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1); if 
(Subtarget->hasMADIntraFwdBug()) Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else if (UseNoCarry) + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; else Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp }; + + if (UseNoCarry) { + MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops); + ReplaceUses(SDValue(N, 0), SDValue(Mad, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } @@ -2020,6 +2031,22 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, @@ -2033,6 +2060,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); @@ -3861,58 +3906,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +// Match lowered fpext from bf16 to f32. This is a bit operation extending +// a 16-bit value with 16-bit of zeroes at LSB: +// +// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val))))) +// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true +// 3. 
(f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false +static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) { + if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST) + return SDValue(); + Op = Op.getOperand(0); + + IsExtractHigh = false; + if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) { + auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0)); + if (!Low16 || !Low16->isZero()) + return SDValue(); + Op = stripBitcast(Op.getOperand(1)); + if (Op.getValueType() != MVT::bf16) + return SDValue(); + return Op; + } + + if (Op.getValueType() != MVT::i32) + return SDValue(); + + if (Op.getOpcode() == ISD::AND) { + if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Mask->getZExtValue() == 0xffff0000) { + IsExtractHigh = true; + return Op.getOperand(0); + } + } + return SDValue(); + } + + if (Op.getOpcode() == ISD::SHL) { + if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Amt->getZExtValue() == 16) + return Op.getOperand(0); + } + } + + return SDValue(); +} + // The return value is not whether the match is possible (which it always is), // but whether or not it a conversion is really used. bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const { + unsigned &Mods, + MVT VT) const { Mods = 0; SelectVOP3ModsImpl(In, Src, Mods); + bool IsExtractHigh = false; if (Src.getOpcode() == ISD::FP_EXTEND) { Src = Src.getOperand(0); - assert(Src.getValueType() == MVT::f16); - Src = stripBitcast(Src); + } else if (VT == MVT::bf16) { + SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh); + if (!B16) + return false; + Src = B16; + } else + return false; + + if (Src.getValueType() != VT && + (VT != MVT::bf16 || Src.getValueType() != MVT::i32)) + return false; - // Be careful about folding modifiers if we already have an abs. fneg is - // applied last, so we don't want to apply an earlier fneg. - if ((Mods & SISrcMods::ABS) == 0) { - unsigned ModsTmp; - SelectVOP3ModsImpl(Src, Src, ModsTmp); + Src = stripBitcast(Src); - if ((ModsTmp & SISrcMods::NEG) != 0) - Mods ^= SISrcMods::NEG; + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + SelectVOP3ModsImpl(Src, Src, ModsTmp); - if ((ModsTmp & SISrcMods::ABS) != 0) - Mods |= SISrcMods::ABS; - } + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; - // op_sel/op_sel_hi decide the source type and source. - // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. - // If the sources's op_sel is set, it picks the high half of the source - // register. + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } - Mods |= SISrcMods::OP_SEL_1; - if (isExtractHiElt(Src, Src)) { - Mods |= SISrcMods::OP_SEL_0; + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from + // fp16. If the sources's op_sel is set, it picks the high half of the source + // register. - // TODO: Should we try to look for neg/abs here? 
- } + Mods |= SISrcMods::OP_SEL_1; + if (IsExtractHigh || + (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) { + Mods |= SISrcMods::OP_SEL_0; - // Prevent unnecessary subreg COPY to VGPR_16 - if (Src.getOpcode() == ISD::TRUNCATE && - Src.getOperand(0).getValueType() == MVT::i32) { - Src = Src.getOperand(0); - } - return true; + // TODO: Should we try to look for neg/abs here? } - return false; + // Prevent unnecessary subreg COPY to VGPR_16 + if (Src.getOpcode() == ISD::TRUNCATE && + Src.getOperand(0).getValueType() == MVT::i32) { + Src = Src.getOperand(0); + } + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - if (!SelectVOP3PMadMixModsImpl(In, Src, Mods)) + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16)) return false; SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; @@ -3921,7 +4022,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - SelectVOP3PMadMixModsImpl(In, Src, Mods); + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16)) + return false; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16); SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 6123d75..983f1aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -19,6 +19,7 @@ #include "SIModeRegisterDefaults.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -167,9 +168,14 @@ private: bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, @@ -254,11 +260,15 @@ private: bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const; + bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods, + MVT VT) const; bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool 
SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, SDValue &Tbl) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e3ca09e..31c4f62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Library functions. These default to Expand, but we have instructions // for them. setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, - ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, - MVT::f32, Legal); + ISD::FROUNDEVEN, ISD::FTRUNC}, + {MVT::f16, MVT::f32}, Legal); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); @@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); - if (Subtarget->has16BitInsts()) + if (Subtarget->has16BitInsts()) { setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - else { + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); + } else { setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); } @@ -2632,7 +2634,7 @@ bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, if (Flags.hasApproximateFuncs()) return true; auto &Options = DAG.getTarget().Options; - return Options.UnsafeFPMath || Options.ApproxFuncFPMath; + return Options.ApproxFuncFPMath; } bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, @@ -2755,7 +2757,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, const auto &Options = getTargetMachine().Options; if (VT == MVT::f16 || Flags.hasApproximateFuncs() || - Options.ApproxFuncFPMath || Options.UnsafeFPMath) { + Options.ApproxFuncFPMath) { if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { // Log and multiply in f32 is good enough for f16. @@ -3583,7 +3585,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con if (N0.getValueType() == MVT::f32) return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); - if (getTargetMachine().Options.UnsafeFPMath) { + if (Op->getFlags().hasApproximateFuncs()) { // There is a generic expand for FP_TO_FP16 with unsafe fast math. return SDValue(); } @@ -4844,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -// Detect when CMP and SELECT use the same constant and fold them to avoid -// loading the constant twice. Specifically handles patterns like: -// %cmp = icmp eq i32 %val, 4242 -// %sel = select i1 %cmp, i32 4242, i32 %other -// It can be optimized to reuse %val instead of 4242 in select. -static SDValue -foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AMDGPUSubtarget *ST) { - SDValue Cond = N->getOperand(0); - SDValue TrueVal = N->getOperand(1); - SDValue FalseVal = N->getOperand(2); - - // Check if condition is a comparison. 
- if (Cond.getOpcode() != ISD::SETCC) - return SDValue(); - - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); - bool isInteger = LHS.getValueType().isInteger(); - - // Handle simple floating-point and integer types only. - if (!isFloatingPoint && !isInteger) - return SDValue(); - - bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); - bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); - if (!isEquality && !isNonEquality) - return SDValue(); - - SDValue ArgVal, ConstVal; - if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || - (isInteger && isa<ConstantSDNode>(RHS))) { - ConstVal = RHS; - ArgVal = LHS; - } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || - (isInteger && isa<ConstantSDNode>(LHS))) { - ConstVal = LHS; - ArgVal = RHS; - } else { - return SDValue(); - } - - // Check if constant should not be optimized - early return if not. - if (isFloatingPoint) { - const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); - const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); - - // Only optimize normal floating-point values (finite, non-zero, and - // non-subnormal as per IEEE 754), skip optimization for inlinable - // floating-point constants. - if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) - return SDValue(); - } else { - int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); - - // Skip optimization for inlinable integer immediates. - // Inlinable immediates include: -16 to 64 (inclusive). - if (IntVal >= -16 && IntVal <= 64) - return SDValue(); - } - - // For equality and non-equality comparisons, patterns: - // select (setcc x, const), const, y -> select (setcc x, const), x, y - // select (setccinv x, const), y, const -> select (setccinv x, const), y, x - if (!(isEquality && TrueVal == ConstVal) && - !(isNonEquality && FalseVal == ConstVal)) - return SDValue(); - - SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; - SDValue SelectRHS = - (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; - return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, - SelectLHS, SelectRHS); -} - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; - // Try to fold CMP + SELECT patterns with shared constants (both FP and - // integer). - if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) - return Folded; - SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 877c3ac..b0d3b12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && + MRI->use_nodbg_empty(I.getOperand(1).getReg()); unsigned Opc; if (Subtarget->hasMADIntraFwdBug()) Opc = IsUnsigned ? 
AMDGPU::V_MAD_U64_U32_gfx11_e64 : AMDGPU::V_MAD_I64_I32_gfx11_e64; + else if (UseNoCarry) + Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64 + : AMDGPU::V_MAD_NC_I64_I32_e64; else Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; + + if (UseNoCarry) + I.removeOperand(1); + I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); @@ -3995,6 +4004,9 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const { } unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64; + if (!IsB32 && STI.hasTrue16BitInsts()) + Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64 + : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64; unsigned CBL = STI.getConstantBusLimit(Opc); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -5774,11 +5786,32 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset( + MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol, false); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); Register PtrBase; @@ -6961,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : (int64_t)SISrcMods::DST_OP_SEL); } @@ -6976,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? 
(int64_t)SISrcMods::OP_SEL_0 : 0);
 }

 void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
   assert(OpIdx >= 0 && "expected to match an immediate operand");
-  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
+  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
                  ? (int64_t)(SISrcMods::OP_SEL_0)
                  : 0);
 }

@@ -7011,8 +7044,9 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
 void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
   assert(OpIdx >= 0 && "expected to match an immediate operand");
-  MIB.addImm(
-      (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
+  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
+                 ? (int64_t)SISrcMods::DST_OP_SEL
+                 : 0);
 }

 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
@@ -7068,6 +7102,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
   MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
 }

+void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
+                                                  const MachineInstr &MI,
+                                                  int OpIdx) const {
+  uint32_t V = MI.getOperand(2).getImm();
+  V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
+      << AMDGPU::CPol::SCOPE_SHIFT;
+  if (!Subtarget->hasSafeCUPrefetch())
+    V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+  MIB.addImm(V);
+}
+
 /// Convert from 2-bit value to enum values used for op_sel* source modifiers.
 void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
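The renderPrefetchLoc hunk in AMDGPUInstructionSelector.cpp above converts the locality operand of a prefetch (0 = lowest temporal locality, 3 = highest) into a gfx1250 cache-scope value by inverting it, then widens anything that would end up at CU scope to SE scope when the subtarget lacks safe-cu-prefetch. A self-contained sketch of that mapping with placeholder scope values; the real encoding lives in AMDGPU::CPol and is not restated here:

#include <algorithm>

// Placeholder scope levels, smallest to largest; stand-ins for the real
// AMDGPU::CPol scope encoding.
enum Scope : unsigned { ScopeCU = 0, ScopeSE = 1, ScopeDEV = 2, ScopeSYS = 3 };

// locality 3 -> CU (closest), 2 -> SE, 1 -> DEV, 0 -> SYS, matching the
// SCOPE_MASK - (loc & SCOPE_MASK) computation in renderPrefetchLoc.
static Scope prefetchLocalityToScope(unsigned Locality, bool SafeCUPrefetch) {
  unsigned S = 3u - (Locality & 3u);
  if (!SafeCUPrefetch)
    S = std::max(S, unsigned(ScopeSE)); // CU-scope prefetch may fault: widen to SE
  return static_cast<Scope>(S);
}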
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 5f7f05c..140e753 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -261,7 +261,11 @@ private:
   InstructionSelector::ComplexRendererFns
   selectGlobalSAddr(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
+  selectGlobalSAddrCPol(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
   selectGlobalSAddrGLC(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
+  selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
   selectScratchSAddr(MachineOperand &Root) const;

@@ -414,6 +418,10 @@ private:

   void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
+
+  void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                         int OpIdx) const;
+
   void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
                                        const MachineInstr &MI,
                                        int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 7a50923..511fc69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -94,7 +94,6 @@ def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode()
 def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
 def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
 def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
-def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
 }

 def FMA : Predicate<"Subtarget->hasFMA()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fedfa3f..1fdf272 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .scalarize(0);

   if (ST.hasVOP3PInsts()) {
-    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
-      .legalFor({S32, S16, V2S16})
-      .clampMaxNumElements(0, S16, 2)
-      .minScalar(0, S16)
-      .widenScalarToNextPow2(0)
-      .scalarize(0)
-      .lower();
+    getActionDefinitionsBuilder(G_ABS)
+      .legalFor({S32, S16, V2S16})
+      .clampMaxNumElements(0, S16, 2)
+      .minScalar(0, S16)
+      .widenScalarToNextPow2(0)
+      .scalarize(0)
+      .lower();
+    if (ST.hasIntMinMax64()) {
+      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+        .legalFor({S32, S16, S64, V2S16})
+        .clampMaxNumElements(0, S16, 2)
+        .minScalar(0, S16)
+        .widenScalarToNextPow2(0)
+        .scalarize(0)
+        .lower();
+    } else {
+      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+        .legalFor({S32, S16, V2S16})
+        .clampMaxNumElements(0, S16, 2)
+        .minScalar(0, S16)
+        .widenScalarToNextPow2(0)
+        .scalarize(0)
+        .lower();
+    }
   } else {
     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
       .legalFor({S32, S16})
@@ -1682,7 +1699,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
   if (ST.hasFlatAtomicFaddF32Inst())
     Atomic.legalFor({{S32, FlatPtr}});

-  if (ST.hasGFX90AInsts()) {
+  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
     // These are legal with some caveats, and should have undergone expansion in
     // the IR in most situations
     // TODO: Move atomic expansion into legalizer
@@ -2295,8 +2312,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
       LLT::scalar(32), commonAlignment(Align(64), Offset));

   // Pointer address
-  B.buildPtrAdd(LoadAddr, KernargPtrReg,
-                B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+  B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
+                         B.buildConstant(LLT::scalar(64), Offset).getReg(0));
   // Load address
   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
 }
@@ -2317,8 +2334,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
       MachineMemOperand::MOInvariant,
       LLT::scalar(32), commonAlignment(Align(64), StructOffset));

-  B.buildPtrAdd(LoadAddr, QueuePtr,
-                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
+  B.buildObjectPtrOffset(
+      LoadAddr, QueuePtr,
+      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
 }

@@ -3326,7 +3344,7 @@ static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
   if (Flags & MachineInstr::FmAfn)
     return true;
   const auto &Options = MF.getTarget().Options;
-  return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
+  return Options.ApproxFuncFPMath;
 }

 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
@@ -3432,7 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
-      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
+      TM.Options.ApproxFuncFPMath) {
     if (Ty == F16 && !ST.has16BitInsts()) {
       Register LogVal = MRI.createGenericVirtualRegister(F32);
       auto PromoteSrc = B.buildFPExt(F32, X);
@@ -4500,8 +4518,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
     llvm_unreachable("failed to find kernarg segment ptr");

   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
-  // TODO:
Should get nuw - return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); + return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0); } /// Legalize a value that's loaded from kernel arguments. This is only used by @@ -4860,9 +4877,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, uint16_t Flags = MI.getFlags(); LLT ResTy = MRI.getType(Res); - const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || - MF.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn); if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) { if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) @@ -4922,9 +4937,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, uint16_t Flags = MI.getFlags(); LLT ResTy = MRI.getType(Res); - const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || - MI.getFlag(MachineInstr::FmAfn); + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn); if (!AllowInaccurateRcp) return false; @@ -5676,8 +5689,8 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) return false; - // FIXME: This should be nuw - B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + B.buildObjectPtrOffset(DstReg, KernargPtrReg, + B.buildConstant(IdxTy, Offset).getReg(0)); return true; } @@ -7019,8 +7032,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( // Pointer address Register LoadAddr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); B.buildCopy(SGPR01, Temp); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 8767208..aa75534 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -53,8 +53,6 @@ private: using FuncInfo = llvm::AMDGPULibFunc; - bool UnsafeFPMath = false; - // -fuse-native. 
bool AllNative = false; @@ -117,7 +115,6 @@ private: bool AllowStrictFP = false); protected: - bool isUnsafeMath(const FPMathOperator *FPOp) const; bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const; bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const; @@ -415,23 +412,17 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName, return AMDGPULibFunc::parse(FMangledName, FInfo); } -bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const { - return UnsafeFPMath || FPOp->isFast(); -} - bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const { - return UnsafeFPMath || - (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs()); + return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs(); } bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold( const FPMathOperator *FPOp) const { // TODO: Refine to approxFunc or contract - return isUnsafeMath(FPOp); + return FPOp->isFast(); } void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { - UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool(); AC = &FAM.getResult<AssumptionAnalysis>(F); TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index f471881..b45627d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -294,7 +294,8 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, BasePlusOffset = Base; } else { auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); - BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); + BasePlusOffset = + B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0); } auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 9b05f7c..6bca2fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3501,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyMappingMAD_64_32(B, OpdMapper); return; case AMDGPU::G_PREFETCH: { - if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) { + if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) { MI.eraseFromParent(); return; } Register PtrReg = MI.getOperand(0).getReg(); unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); - if (PtrBank == AMDGPU::VGPRRegBankID) { + if (PtrBank == AMDGPU::VGPRRegBankID && + (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) { + // Cannot do I$ prefetch with divergent pointer. 
MI.eraseFromParent(); return; } unsigned AS = MRI.getType(PtrReg).getAddressSpace(); - if (!AMDGPU::isFlatGlobalAddrSpace(AS) && - AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + if ((!AMDGPU::isFlatGlobalAddrSpace(AS) && + AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) || + (!Subtarget.hasSafeSmemPrefetch() && + (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + !MI.getOperand(3).getImm() /* I$ prefetch */))) { MI.eraseFromParent(); return; } @@ -4004,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: - case AMDGPU::G_SMIN: - case AMDGPU::G_SMAX: - case AMDGPU::G_UMIN: - case AMDGPU::G_UMAX: case AMDGPU::G_ABS: case AMDGPU::G_SHUFFLE_VECTOR: case AMDGPU::G_SBFX: @@ -4017,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: + if (isSALUMapping(MI)) { + // There are no scalar 64-bit min and max, use vector instruction instead. + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 && + Subtarget.hasIntMinMax64()) + return getDefaultMappingVOP(MI); + return getDefaultMappingSOP(MI); + } + return getDefaultMappingVOP(MI); case AMDGPU::G_FADD: case AMDGPU::G_FSUB: case AMDGPU::G_FMUL: @@ -4561,8 +4574,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pknorm_u16: case Intrinsic::amdgcn_cvt_pk_i16: case Intrinsic::amdgcn_cvt_pk_u16: + case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32: case Intrinsic::amdgcn_cvt_pk_f16_fp8: case Intrinsic::amdgcn_cvt_pk_f16_bf8: + case Intrinsic::amdgcn_cvt_pk_fp8_f16: + case Intrinsic::amdgcn_cvt_pk_bf8_f16: + case Intrinsic::amdgcn_cvt_sr_fp8_f16: + case Intrinsic::amdgcn_cvt_sr_bf8_f16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: @@ -5175,6 +5193,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_ds_load_tr16_b128: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr6_b96: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: @@ -5353,6 +5377,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); @@ -5437,6 +5469,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), 
MRI, *TRI); break; } + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: + return getDefaultMappingVOP(MI); default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index a8e1967..f580f43 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -159,7 +159,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // If the inputs are tied and the same register, we can shortcut and // directly replace the register. - if (Src2->getReg() != CopySrcReg) { + if (!Src2->isReg() || Src2->getReg() != CopySrcReg || + Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { LLVM_DEBUG( dbgs() << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c865082..c1f1703 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -104,7 +104,9 @@ #include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/InferAddressSpaces.h" +#include "llvm/Transforms/Scalar/LICM.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" #include "llvm/Transforms/Scalar/Sink.h" @@ -836,8 +838,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // When we are not using -fgpu-rdc, we can run accelerator code // selection relatively early, but still after linking to prevent // eager removal of potentially reachable symbols. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } PM.addPass(AMDGPUPrintfRuntimeBindingPass()); } @@ -916,8 +920,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // selection after linking to prevent, otherwise we end up removing // potentially reachable symbols that were exported as external in other // modules. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. @@ -2062,7 +2068,12 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { // TODO: May want to move later or split into an early and late one. addPass(AMDGPUCodeGenPreparePass(TM)); - // TODO: LICM + // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may + // have expanded. 
+ if (TM.getOptLevel() > CodeGenOptLevel::Less) { + addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), + /*UseMemorySSA=*/true)); + } } Base::addIRPasses(addPass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 24f4df2..a0c99b0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -597,7 +597,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( // Estimate all types may be fused with contract/unsafe flags const TargetOptions &Options = TLI->getTargetMachine().Options; if (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || (FAdd->hasAllowContract() && CxtI->hasAllowContract())) return TargetTransformInfo::TCC_Free; } @@ -650,8 +649,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( return LT.first * Cost * NElts; } - if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) || - TLI->getTargetMachine().Options.UnsafeFPMath)) { + if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) { // Fast unsafe fdiv lowering: // f32 rcp // f32 fmul diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 421fc42..a4ea8cf 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -689,6 +689,8 @@ public: bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); } + bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); } + bool isVISrcB32() const { return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32); } @@ -2036,6 +2038,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); case AMDGPU::OPERAND_REG_IMM_BF16: @@ -2405,6 +2408,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_KIMM32: @@ -2456,6 +2460,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo setImmKindConst(); return; } + [[fallthrough]]; + + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: Inst.addOperand(MCOperand::createImm(Lo_32(Val))); setImmKindLiteral(); @@ -3761,6 +3768,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16) return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm()); + if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16) + return false; + llvm_unreachable("invalid operand type"); } default: @@ -6066,6 +6076,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; + } else if (ID == ".amdhsa_uses_cu_stores") { + if (!isGFX1250()) + return Error(IDRange.Start, "directive requires gfx12.5", IDRange); + + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) @@ -9415,7 +9431,19 @@ void 
AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) { + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) { Inst.addOperand(Inst.getOperand(0)); } diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index f99e716..1956a15 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> { } //===----------------------------------------------------------------------===// -// MUBUF - GFX11, GFX12. +// MUBUF - GFX11, GFX12, GFX1250. //===----------------------------------------------------------------------===// // gfx11 instruction that accept both old and new assembler name. @@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op, def : Mnem_gfx12<gfx11_name, gfx12_name>; } +multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> : + MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>, + MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> { + def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>; +} + defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>; defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>; @@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>; defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>; +defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>; +defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">; +defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">; + //===----------------------------------------------------------------------===// // MUBUF - GFX10. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 42edec0..c466f9c 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -199,6 +199,7 @@ add_llvm_target(AMDGPUCodeGen Instrumentation MC MIRParser + ObjCARC Passes Scalar SelectionDAG diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 319cc9d..3ff675d 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>; defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>; +defm DS_ADD_F64 : DS_Real_gfx12<0x054>; +defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>; + let AssemblerPredicate = HasLdsBarrierArriveAtomic in { defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>; defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 5c1989b..ffe6b06 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + if (isGFX1250()) + PRINT_DIRECTIVE(".amdhsa_uses_cu_stores", + KERNEL_CODE_PROPERTY_USES_CU_STORES); if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0, diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 679c55d..d5d1074 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -11,8 +11,10 @@ let WantsRoot = true in { def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>; def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>; + def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>; def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; + def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>; def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>; } @@ -368,31 +370,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> { } } -class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< +// Async loads, introduced in gfx1250, will store directly +// to a DS address in vdst (they will not use M0 for DS address).
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), - " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = 1; + !if(IsAsync, (ins VGPR_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let LGKM_CNT = !not(IsAsync); + let VM_CNT = !not(IsAsync); + let ASYNC_CNT = IsAsync; let is_flat_global = 1; let lds = 1; let has_data = 0; + let has_vdst = IsAsync; // vdst for ds address with IsAsync + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]); + let Defs = !if(IsAsync, [ASYNCcnt], []); + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + GlobalSaddrTable<1, opName>; +} + +class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< + opName, + (outs ), + !con( + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let VM_CNT = 0; + let ASYNC_CNT = 1; + let is_flat_global = 1; + let lds = 1; + let has_data = 1; // vdata for ds address let has_vdst = 0; let mayLoad = 1; let mayStore = 1; let has_saddr = 1; let enabled_saddr = EnableSaddr; let VALU = 1; - let Uses = [M0, EXEC]; + let Uses = [EXEC, ASYNCcnt]; + let Defs = [ASYNCcnt]; let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName>, +multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> { + def "" : FLAT_Global_STORE_LDS_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>, + def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>, GlobalSaddrTable<1, opName>; } @@ -464,6 +503,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> : + FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { + let has_vdst = 0; + let has_data = 0; + let mayLoad = 1; + let mayStore = 1; + let VM_CNT = 0; + let LGKM_CNT = 0; +} + +multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { + def "" : FLAT_Prefetch_Pseudo<opName>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let enabled_saddr = 1; + } +} + +multiclass FLAT_Global_Prefetch_Pseudo<string opName> { + let is_flat_global = 1, has_saddr = 1 in { + def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " 
$vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let enabled_saddr = 1; + } + } +} + class FlatScratchInst <string sv_op, string mode> { string SVOp = sv_op; string Mode = mode; @@ -1124,6 +1194,15 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">; + def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">; } // End SubtargetPredicate = isGFX1250Plus @@ -1162,6 +1241,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">; defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; +let SubtargetPredicate = isGFX125xOnly in { +defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>; +defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>; +defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>; + +defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>; +defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>; +defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>; +} // End SubtargetPredicate = isGFX125xOnly + let SubtargetPredicate = isGFX12Plus in { let Uses = [EXEC, M0] in { defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; @@ -1218,6 +1307,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in "global_atomic_pk_add_f16", VGPR_32, v2f16 >; +let SubtargetPredicate = HasVmemPrefInsts in { + defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">; + defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">; +} + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -1228,6 +1322,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN (inst $vaddr, $offset) >; +class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) @@ -1249,8 +1348,8 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value >; class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, 
SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), - (inst $saddr, $voffset, $offset, 0, $in) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1263,9 +1362,29 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $dsaddr, $vaddr, $offset, $cpol) +>; + +class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $dsaddr, $saddr, $voffset, $offset, $cpol) +>; + +class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $vaddr, $dsaddr, $offset, $cpol) +>; + +class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $saddr, $voffset, $dsaddr, $offset, $cpol) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), - (inst $saddr, $voffset, $offset, (i32 0)) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (inst $saddr, $voffset, $offset, $cpol) >; class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1278,6 +1397,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + +class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))), + (inst $saddr, $voffset, $offset, $cpol) +>; + class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), @@ -1459,10 +1588,30 @@ class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT >; class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), - (inst $vaddr, $saddr, $offset, 0) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (inst $vaddr, $saddr, $offset, $cpol) >; +multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatLoadLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : 
GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + +multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatStoreLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadSignedPat <inst, node, vt> { let AddedComplexity = 10; @@ -1473,6 +1622,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp } } +multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadSignedPat_CPOL<inst, node, vt> { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatSignedLoadPat_D16 <inst, node, vt> { let AddedComplexity = 10; @@ -2009,6 +2168,28 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>; } +let OtherPredicates = [isGFX125xOnly] in { + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>; + + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; +} // End SubtargetPredicate = isGFX125xOnly + +let OtherPredicates = [isGFX1250Plus] in { + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>; + + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>; +} + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -2138,6 +2319,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f } // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch] +def PrefetchLoc: SDNodeXForm<timm, [{ + uint32_t V = N->getZExtValue(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, 
(uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32); +}]>; + +def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; + }]; +} + +def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS || + ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); + }]; +} + +multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> { + def : GCNPat < + (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25); + } + + def : GCNPat < + (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30); + } +} + +multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> { + def : GCNPat < + (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol) + >; + + def : GCNPat < + (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> { + let AddedComplexity = 11; + } +} + +let SubtargetPredicate = HasVmemPrefInsts in { + defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>; + + // Patterns for forced vector prefetch with rw = 1. 
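A minimal standalone sketch of the locality-to-scope mapping performed by the PrefetchLoc transform above, assuming the CPol encoding from SIDefines.h (SCOPE_SHIFT = 3, SCOPE_MASK = 0x3); the names below are local to this sketch, not the in-tree API:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// ISD::PREFETCH locality (0 = none ... 3 = high) is inverted into a CPol
// cache scope: high locality prefetches close (CU), low locality far (SYS).
constexpr uint32_t ScopeShift = 3;
constexpr uint32_t ScopeMask  = 0x3;
constexpr uint32_t ScopeSE    = 1u << ScopeShift;

uint32_t prefetchLocToScope(uint32_t Locality, bool HasSafeCUPrefetch) {
  uint32_t V = (ScopeMask - (Locality & ScopeMask)) << ScopeShift;
  if (!HasSafeCUPrefetch)
    V = std::max(V, ScopeSE); // CU-scope prefetches may fault on bad addresses.
  return V;
}

int main() {
  for (uint32_t Loc = 0; Loc <= 3; ++Loc)
    std::printf("locality %u -> cpol 0x%x (safe CU) / 0x%x (clamped)\n", Loc,
                prefetchLocToScope(Loc, true), prefetchLocToScope(Loc, false));
  return 0;
}

For locality 3 this yields SCOPE_CU, which the clamp widens to SCOPE_SE on targets without FeatureSafeCUPrefetch; the rw = 1 patterns that follow reuse the same transform.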
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>; + + + // Patterns for target intrinsics + defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>; + defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>; +} // End SubtargetPredicate = HasVmemPrefInsts + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -3210,12 +3462,40 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>; defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>; +defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; +defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; + +defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + +defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; +defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; + +defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">; +defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">; + def True16D16Table : GenericTable { let FilterClass = "True16D16Table"; let CppTypeName = "True16D16Info"; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7d6723a..334afd3 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI) { - return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR); + return STI->isSGPRClass(RC) + ? SGPR + : (STI->isAGPRClass(RC) + ? AGPR + : (STI->isVectorSuperClass(RC) ? 
AVGPR : VGPR)); } void GCNRegPressure::inc(unsigned Reg, diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 3749b6d..ea33a22 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -29,43 +29,57 @@ class raw_ostream; class SlotIndex; struct GCNRegPressure { - enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS }; + enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; GCNRegPressure() { clear(); } - bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; } + bool empty() const { + return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; + } void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } - /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p - /// UnifiedVGPRFile + /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure + /// dependent upon \p UnifiedVGPRFile unsigned getVGPRNum(bool UnifiedVGPRFile) const { if (UnifiedVGPRFile) { - return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR]) - : Value[VGPR]; + return Value[AGPR] + ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR]) + : Value[VGPR] + Value[AVGPR]; } - return std::max(Value[VGPR], Value[AGPR]); + // AVGPR assignment priority is based on the width of the register. Account + // AVGPR pressure as VGPR. + return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]); } /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs - /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file. + /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified + /// VGPR file. inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs, - unsigned NumAGPRs) { - return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + + unsigned NumAGPRs, + unsigned NumAVGPRs) { + + // Assume AVGPRs will be assigned as VGPRs. + return alignTo(NumArchVGPRs + NumAVGPRs, + AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + NumAGPRs; } - /// \returns the ArchVGPR32 pressure - unsigned getArchVGPRNum() const { return Value[VGPR]; } + /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be + /// allocated as VGPR + unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; } /// \returns the AccVGPR32 pressure unsigned getAGPRNum() const { return Value[AGPR]; } + /// \returns the AVGPR32 pressure + unsigned getAVGPRNum() const { return Value[AVGPR]; } unsigned getVGPRTuplesWeight() const { - return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]); + return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR], + Value[TOTAL_KINDS + AGPR]); } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index a655308..ce1ce68 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1911,14 +1911,12 @@ void PreRARematStage::rematerialize() { for (auto &[DefMI, Remat] : Rematerializations) { MachineBasicBlock::iterator InsertPos(Remat.UseMI); Register Reg = DefMI->getOperand(0).getReg(); - unsigned SubReg = DefMI->getOperand(0).getSubReg(); unsigned DefRegion = MIRegion.at(DefMI); // Rematerialize DefMI to its use block. 
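The AVGPR accounting added to GCNRegPressure above can be summarized with a small standalone model; alignTo and the allocation granule (4 here) are illustrative stand-ins for the AMDGPU::IsaInfo helpers:

#include <algorithm>
#include <cstdio>

// AVGPRs (pseudo registers that may land in either file) are counted with the
// ArchVGPRs, since the pressure model assumes they will be assigned as VGPRs.
static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

static unsigned getVGPRNum(unsigned VGPR, unsigned AGPR, unsigned AVGPR,
                           bool UnifiedVGPRFile, unsigned Granule = 4) {
  if (UnifiedVGPRFile)
    return AGPR ? alignTo(VGPR + AVGPR, Granule) + AGPR : VGPR + AVGPR;
  return std::max(VGPR + AVGPR, AGPR); // Separate files: take the larger one.
}

int main() {
  // 10 ArchVGPRs, 3 AGPRs, 6 AVGPRs.
  std::printf("unified:  %u\n", getVGPRNum(10, 3, 6, true));  // alignTo(16,4)+3 = 19
  std::printf("separate: %u\n", getVGPRNum(10, 3, 6, false)); // max(16,3) = 16
  return 0;
}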
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI, - *DAG.TRI); + TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, + AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); Remat.RematMI = &*std::prev(InsertPos); - Remat.RematMI->getOperand(0).setSubReg(SubReg); DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); // Update region boundaries in regions we sinked from (remove defining MI) @@ -2064,14 +2062,13 @@ void PreRARematStage::finalizeGCNSchedStage() { MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); MachineBasicBlock *MBB = RegionBB[DefRegion]; Register Reg = RematMI.getOperand(0).getReg(); - unsigned SubReg = RematMI.getOperand(0).getSubReg(); // Re-rematerialize MI at the end of its original region. Note that it may // not be rematerialized exactly in the same position as originally within // the region, but it should not matter much. - TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI); + TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, + *DAG.TRI); MachineInstr *NewMI = &*std::prev(InsertPos); - NewMI->getOperand(0).setSubReg(SubReg); DAG.LIS->InsertMachineInstrInMaps(*NewMI); auto UseRegion = MIRegion.find(Remat.UseMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 9a2bab1..0a0a107 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return getMaxNumVGPRs(MF.getFunction()); } +std::pair<unsigned, unsigned> +GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { + const unsigned MaxVectorRegs = getMaxNumVGPRs(F); + + unsigned MaxNumVGPRs = MaxVectorRegs; + unsigned MaxNumAGPRs = 0; + + // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, + // a wave may have up to 512 total vector registers combining together both + // VGPRs and AGPRs. Hence, in an entry function without calls and without + // AGPRs used within it, it is possible to use the whole vector register + // budget for VGPRs. + // + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. + if (hasGFX90AInsts()) { + unsigned MinNumAGPRs = 0; + const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); + const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; + + // TODO: The lower bound should probably force the number of required + // registers up, overriding amdgpu-waves-per-eu. + std::tie(MinNumAGPRs, MaxNumAGPRs) = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR, + /*OnlyFirstRequired=*/true); + + if (MinNumAGPRs == DefaultNumAGPR.first) { + // Default to splitting half the registers if AGPRs are required. + MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; + } else { + // Align to accum_offset's allocation granularity. + MinNumAGPRs = alignTo(MinNumAGPRs, 4); + + MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); + } + + // Clamp values to be inbounds of our limits, and ensure min <= max. 
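Worked numbers for the VGPR/AGPR budget split computed in getMaxNumVectorRegs above, under simplifying assumptions (256 registers per file, a 4-register accum_offset granule, and an attribute that supplies only a lower bound); the clamping statements that follow enforce min <= max and keep both counts inside the budget:

#include <algorithm>
#include <cstdio>
#include <utility>

// Illustrative model only. MinAGPRReq < 0 stands for "no amdgpu-agpr-alloc
// attribute"; otherwise it is treated purely as a lower bound, which is an
// assumption of this sketch rather than the exact attribute semantics.
static unsigned alignTo4(unsigned V) { return (V + 3) & ~3u; }

static std::pair<unsigned, unsigned> splitVectorRegs(unsigned MaxVectorRegs,
                                                     int MinAGPRReq) {
  const unsigned TotalAGPRs = 256, TotalVGPRs = 256;
  unsigned MinAGPRs, MaxAGPRs;
  if (MinAGPRReq < 0) {
    MinAGPRs = MaxAGPRs = MaxVectorRegs / 2;          // Default: half and half.
  } else {
    MinAGPRs = std::min(alignTo4((unsigned)MinAGPRReq), TotalAGPRs);
    MaxAGPRs = MaxVectorRegs;
  }
  // Same clamping as in the patch: min <= max, everything within the budget.
  MaxAGPRs = std::min(std::max(MinAGPRs, MaxAGPRs), MaxVectorRegs);
  MinAGPRs = std::min(std::min(MinAGPRs, TotalAGPRs), MaxAGPRs);
  unsigned MaxVGPRs = std::min(MaxVectorRegs - MinAGPRs, TotalVGPRs);
  MaxAGPRs = std::min(MaxVectorRegs - MaxVGPRs, MaxAGPRs);
  return {MaxVGPRs, MaxAGPRs};
}

int main() {
  auto [V0, A0] = splitVectorRegs(128, -1);  // no attribute -> 64 / 64
  auto [V1, A1] = splitVectorRegs(128, 10);  // min 10 AGPRs -> 116 / 12
  std::printf("default: %u VGPRs, %u AGPRs\n", V0, A0);
  std::printf("min 10 : %u VGPRs, %u AGPRs\n", V1, A1);
  return 0;
}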
+ + MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); + MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); + + MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); + MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); + + assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && + MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && + "invalid register counts"); + } else if (hasMAIInsts()) { + // On gfx908 the number of AGPRs always equals the number of VGPRs. + MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; + } + + return std::pair(MaxNumVGPRs, MaxNumAGPRs); +} + void GCNSubtarget::adjustSchedDependency( SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 0435e7f..bdd900d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -123,6 +123,7 @@ protected: bool HasSMemRealTime = false; bool HasIntClamp = false; bool HasFmaMixInsts = false; + bool HasFmaMixBF16Insts = false; bool HasMovrel = false; bool HasVGPRIndexMode = false; bool HasScalarDwordx3Loads = false; @@ -244,7 +245,10 @@ protected: bool HasVMEMtoScalarWriteHazard = false; bool HasSMEMtoVectorWriteHazard = false; bool HasInstFwdPrefetchBug = false; + bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; + bool HasSafeCUPrefetch = false; + bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -269,6 +273,7 @@ protected: bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; bool HasAddSubU64Insts = false; + bool HasMadU32Inst = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -462,6 +467,8 @@ public: return HasFmaMixInsts; } + bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; } + bool hasCARRY() const { return true; } @@ -709,7 +716,9 @@ public: bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } // DS_ADD_F64/DS_ADD_RTN_F64 - bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } + bool hasLdsAtomicAddF64() const { + return hasGFX90AInsts() || hasGFX1250Insts(); + } bool hasMultiDwordFlatScratchAddressing() const { return getGeneration() >= GFX9; @@ -987,8 +996,14 @@ public: bool hasPrefetch() const { return GFX12Insts; } + bool hasVmemPrefInsts() const { return HasVmemPrefInsts; } + bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; } + bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + + bool hasCUStores() const { return HasCUStores; } + // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1507,9 +1522,22 @@ public: // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + // \returns true if the target has V_MAD_U32 instruction. + bool hasMadU32Inst() const { return HasMadU32Inst; } + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. bool hasVectorMulU64() const { return GFX1250Insts; } + // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 + // instructions. + bool hasMadU64U32NoCarry() const { return GFX1250Insts; } + + // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions. 
+ bool hasIntMinMax64() const { return GFX1250Insts; } + + // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions. + bool hasAddMinMaxInsts() const { return GFX1250Insts; } + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } @@ -1658,6 +1686,10 @@ public: return getMaxNumVGPRs(F); } + /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number + /// of waves per execution unit required for the function \p MF. + std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const; + /// \returns Maximum number of VGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of VGPRs explicitly /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 11b072e..15088ac 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O)) return; break; + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + break; default: llvm_unreachable("bad operand type"); } @@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index c49ad79..f358084 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm)) .value_or(255); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return 255; + case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: case AMDGPU::OPERAND_KIMM64: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 10f6d33..43ca548 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); + if (isGFX1250(STI)) + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, + ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 3902d4c..c564145 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -208,6 +208,7 @@ enum OperandType : unsigned { OPERAND_REG_IMM_V2BF16, 
OPERAND_REG_IMM_V2FP16, OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_NOINLINE_V2FP16, OPERAND_REG_IMM_V2INT32, OPERAND_REG_IMM_V2FP32, @@ -392,11 +393,13 @@ enum CPol { TH_ATOMIC_CASCADE = 4, // Cascading vs regular // Scope - SCOPE = 0x3 << 3, // All Scope bits - SCOPE_CU = 0 << 3, - SCOPE_SE = 1 << 3, - SCOPE_DEV = 2 << 3, - SCOPE_SYS = 3 << 3, + SCOPE_SHIFT = 3, + SCOPE_MASK = 0x3, + SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits + SCOPE_CU = 0 << SCOPE_SHIFT, + SCOPE_SE = 1 << SCOPE_SHIFT, + SCOPE_DEV = 2 << SCOPE_SHIFT, + SCOPE_SYS = 3 << SCOPE_SHIFT, NV = 1 << 5, // Non-volatile bit diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e5d1eaa..e934152 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: @@ -1062,9 +1063,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat( switch (OpTy) { case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0); break; case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1); break; default: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f1a8ee1..ad26757 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -882,7 +882,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); - if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch()) + if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts()) setOperationAction(ISD::PREFETCH, MVT::Other, Custom); if (Subtarget->hasIEEEMinimumMaximumInsts()) { @@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, Custom); } + if (Subtarget->hasIntMinMax64()) + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64, + Legal); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, @@ -1061,10 +1065,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const { // where this is OK to use. bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { - return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || - (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && - SrcVT.getScalarType() == MVT::f16 && + return DestVT.getScalarType() == MVT::f32 && + ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + SrcVT.getScalarType() == MVT::f16) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() && + SrcVT.getScalarType() == MVT::bf16)) && // TODO: This probably only requires no input flushing? 
denormalModeIsFlushAllF32(DAG.getMachineFunction()); } @@ -1254,6 +1260,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { return AMDGPUTargetLowering::getPointerMemTy(DL, AS); } +static unsigned getIntrMemWidth(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + return 8; + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + return 32; + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + return 64; + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + return 128; + default: + llvm_unreachable("Unknown width"); + } +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1475,6 +1500,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1519,6 +1550,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(1); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { Info.opc = ISD::INTRINSIC_VOID; @@ -1548,7 +1599,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_s_prefetch_data: { + case Intrinsic::amdgcn_s_prefetch_data: + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); @@ -1599,18 +1652,32 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: case 
Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default: @@ -4225,7 +4292,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = BaseAddr.getValue(1); Align StackAlign = TFL->getStackAlign(); if (Alignment > StackAlign) { - uint64_t ScaledAlignment = (uint64_t)Alignment.value() + uint64_t ScaledAlignment = Alignment.value() << Subtarget->getWavefrontSizeLog2(); uint64_t StackAlignMask = ScaledAlignment - 1; SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, @@ -4440,19 +4507,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, } SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { - if (Op->isDivergent()) + if (Op->isDivergent() && + (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4))) + // Cannot do I$ prefetch with divergent pointer. return SDValue(); switch (cast<MemSDNode>(Op)->getAddressSpace()) { case AMDGPUAS::FLAT_ADDRESS: case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS_32BIT: break; + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + if (Subtarget->hasSafeSmemPrefetch()) + break; + [[fallthrough]]; default: return SDValue(); } + // I$ prefetch + if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4)) + return SDValue(); + return Op; } @@ -7123,7 +7199,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } - if (getTargetMachine().Options.UnsafeFPMath) { + if (Op->getFlags().hasApproximateFuncs()) { SDValue Flags = Op.getOperand(1); SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags); return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags); @@ -11218,8 +11294,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateRcp = - Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateRcp = Flags.hasApproximateFuncs(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { // Without !fpmath accuracy information, we can't do more because we don't @@ -11238,7 +11313,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, // 1.0 / sqrt(x) -> rsq(x) - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // XXX - Is afn sufficient to do this for f64? 
The maximum ULP // error seems really high at 2^29 ULP. // 1.0 / x -> rcp(x) return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); @@ -11272,8 +11347,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateDiv = - Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateDiv = Flags.hasApproximateFuncs(); if (!AllowInaccurateDiv) return SDValue(); @@ -14154,6 +14228,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || (VT == MVT::f16 && Subtarget->has16BitInsts()) || + (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) || + (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) || (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) @@ -14523,7 +14599,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; - if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || (N0->getFlags().hasAllowContract() && N1->getFlags().hasAllowContract())) && isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { @@ -15646,9 +15722,9 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, // regardless of the denorm mode setting. Therefore, - // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. + // fp-contract is sufficient to allow generating fdot2. const TargetOptions &Options = DAG.getTarget().Options; - if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + if (Options.AllowFPOpFusion == FPOpFusion::Fast || (N->getFlags().hasAllowContract() && FMA->getFlags().hasAllowContract())) { Op1 = Op1.getOperand(0); @@ -15869,6 +15945,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +SDValue SITargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + // Detect when CMP and SELECT use the same constant and fold them to avoid + // loading the constant twice. Specifically handles patterns like: + // %cmp = icmp eq i32 %val, 4242 + // %sel = select i1 %cmp, i32 4242, i32 %other + // It can be optimized to reuse %val instead of 4242 in select. + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? 
ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Skip optimization for inlinable immediates. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + if (AMDGPU::isInlinableIntLiteral( + cast<ConstantSDNode>(ConstVal)->getSExtValue())) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -15917,6 +16065,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMulCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); + case ISD::SELECT: + if (auto Res = performSelectCombine(N, DCI)) + return Res; + break; case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158..dedd9ae 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -211,6 +211,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 9faf497..4b48fc4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -552,7 +552,7 @@ public: (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { // FLAT and SCRATCH instructions may access scratch. Other VMEM // instructions do not. 
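The shared-constant select fold added in performSelectCombine a few hunks above can be stated on plain integers: once the comparison proves x == C, the constant C in the matching select operand can be replaced with x itself, so the literal only has to be materialized once. A small self-check of that equivalence (4242 is simply a non-inlinable example constant, mirroring the comment in the combine):

#include <cassert>

static int selectBefore(int x, int other) { return x == 4242 ? 4242 : other; }
static int selectAfter(int x, int other)  { return x == 4242 ? x : other; }

int main() {
  const int Vals[] = {0, 4242, -7};
  for (int x : Vals)
    assert(selectBefore(x, 99) == selectAfter(x, 99)); // identical results
  return 0;
}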
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratchThroughFlat(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } @@ -565,7 +565,6 @@ public: bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -1381,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Modified = true; } else WaitcntInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + assert(ST->hasVMemToLDSLoad()); + LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II + << "Before: " << Wait.LoadCnt << '\n';); + ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + + // It is possible (but unlikely) that this is the only wait instruction, + // in which case, we exit this loop without a WaitcntInstr to consume + // `Wait`. But that works because `Wait` was passed in by reference, and + // the callee eventually calls createNewWaitcnt on it. We test this + // possibility in an artificial MIR test since such a situation cannot be + // recreated by running the memory legalizer. + II.eraseFromParent(); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); @@ -1552,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + // Architectures higher than GFX10 do not have direct loads to + // LDS, so no work required here yet. + II.eraseFromParent(); + continue; } else { std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); assert(CT.has_value()); @@ -2108,8 +2126,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { assert(TII->isFLAT(MI)); - // All flat instructions use the VMEM counter. - assert(TII->usesVM_CNT(MI)); + // All flat instructions use the VMEM counter except prefetch. + if (!TII->usesVM_CNT(MI)) + return false; // If there are no memory operands then conservatively assume the flat // operation may access VMEM. @@ -2159,32 +2178,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -// This is a flat memory operation. Check to see if it has memory tokens for -// either scratch or FLAT. -bool SIInsertWaitcnts::mayAccessScratchThroughFlat( - const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // SCRATCH instructions always access scratch. - if (TII->isFLATScratch(MI)) - return true; - - // GLOBAL instructions never access scratch. - if (TII->isFLATGlobal(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access scratch. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves scratch.
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { - unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; - }); -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); @@ -2295,9 +2288,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } - // A Flat memory operation must access at least one address space. - assert(FlatASCount); - // This is a flat memory operation that access both VMEM and LDS, so note it // - it will require that both the VM and LGKM be flushed to zero if it is // pending when a VM or LGKM dependency occurs. @@ -2444,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + Opcode == AMDGPU::S_WAITCNT_lds_direct || counterTypeForInstr(Opcode).has_value(); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 40e6871..044a681 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(DstHi); } break; + + case AMDGPU::V_MAX_BF16_PSEUDO_e64: + assert(ST.hasBF16PackedInsts()); + MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16)); + MI.addOperand(MachineOperand::CreateImm(0)); // op_sel + MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo + MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi + auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); + Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1); + auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1); + break; } + return true; } @@ -2733,49 +2746,47 @@ static MachineInstr *swapImmOperands(MachineInstr &MI, } bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, - const MachineOperand *MO0, unsigned OpIdx1, - const MachineOperand *MO1) const { + unsigned OpIdx1) const { const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0]; const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1]; - const TargetRegisterClass *DefinedRC1 = - OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr; - const TargetRegisterClass *DefinedRC0 = - OpInfo1.RegClass != -1 ? 
RI.getRegClass(OpInfo0.RegClass) : nullptr; unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + const MachineOperand &MO0 = MI.getOperand(OpIdx0); + const MachineOperand &MO1 = MI.getOperand(OpIdx1); + // Swap doesn't breach constant bus or literal limits // It may move literal to position other than src0, this is not allowed // pre-gfx10 However, most test cases need literals in Src0 for VOP // FIXME: After gfx9, literal can be in place other than Src0 if (isVALU(MI)) { - if ((int)OpIdx0 == Src0Idx && !MO0->isReg() && - !isInlineConstant(*MO0, OpInfo1)) + if ((int)OpIdx0 == Src0Idx && !MO0.isReg() && + !isInlineConstant(MO0, OpInfo1)) return false; - if ((int)OpIdx1 == Src0Idx && !MO1->isReg() && - !isInlineConstant(*MO1, OpInfo0)) + if ((int)OpIdx1 == Src0Idx && !MO1.isReg() && + !isInlineConstant(MO1, OpInfo0)) return false; } - if ((int)OpIdx1 != Src0Idx && MO0->isReg()) { - if (!DefinedRC1) + if ((int)OpIdx1 != Src0Idx && MO0.isReg()) { + if (OpInfo1.RegClass == -1) return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx1, *MO0) && - (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1)); + return isLegalRegOperand(MI, OpIdx1, MO0) && + (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1)); } - if ((int)OpIdx0 != Src0Idx && MO1->isReg()) { - if (!DefinedRC0) + if ((int)OpIdx0 != Src0Idx && MO1.isReg()) { + if (OpInfo0.RegClass == -1) return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; - return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) && - isLegalRegOperand(MI, OpIdx0, *MO1); + return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) && + isLegalRegOperand(MI, OpIdx0, MO1); } // No need to check 64-bit literals since swapping does not bring new // 64-bit literals into current instruction to fold to 32-bit - return isImmOperandLegal(MI, OpIdx1, *MO0); + return isImmOperandLegal(MI, OpIdx1, MO0); } MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, @@ -2797,12 +2808,12 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, static_cast<int>(Src1Idx) && "inconsistency with findCommutedOpIndices"); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - MachineOperand &Src1 = MI.getOperand(Src1Idx); - if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) { + if (!isLegalToSwap(MI, Src0Idx, Src1Idx)) return nullptr; - } + MachineInstr *CommutedMI = nullptr; + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); if (Src0.isReg() && Src1.isReg()) { // Be sure to copy the source modifiers to the right place. CommutedMI = @@ -4238,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } +bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { + if (!isFLAT(MI) || isFLATGlobal(MI)) + return false; + + // If scratch is not initialized, we can never access it. + if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) + return false; + + // SCRATCH instructions always access scratch. + if (isFLATScratch(MI)) + return true; + + // If there are no memory operands then conservatively assume the flat + // operation may access scratch. + if (MI.memoperands_empty()) + return true; + + // TODO (?): Does this need to be taught how to read noalias.addrspace ? + + // See if any memory operand specifies an address space that involves scratch. 
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { + unsigned AS = Memop->getAddrSpace(); + return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + }); +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an @@ -4401,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { @@ -9244,6 +9283,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { default: if (MI.isMetaInstruction()) return 0; + + // If D16 Pseudo inst, get correct MC code size + const auto *D16Info = AMDGPU::getT16D16Helper(Opc); + if (D16Info) { + // Assume d16_lo/hi inst are always in same size + unsigned LoInstOpcode = D16Info->LoOp; + const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode); + DescSize = Desc.getSize(); + } + return DescSize; } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 800ea9a..e042b59 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -197,8 +197,7 @@ protected: AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const; bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, - const MachineOperand *fromMO, unsigned toIdx, - const MachineOperand *toMO) const; + unsigned toIdx) const; MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; @@ -679,6 +678,12 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with + /// SCRATCH_ memory operands. + /// Conservatively correct; will return true if \p MI cannot be proven + /// to not hit scratch. 
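/// For example (an illustrative summary of the checks added in SIInstrInfo.cpp
/// above, not additional behavior): a FLAT_ access with no memory operands, or
/// with a memory operand in the flat or private address space, is reported as
/// possibly accessing scratch, while GLOBAL_ instructions and functions marked
/// "amdgpu-no-flat-scratch-init" are never considered to access scratch.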
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 485ca78..efcc88e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">; def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; +def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">; +def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">; def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">; def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">; @@ -2857,15 +2859,18 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>; def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>; +def VOP_I16_V2F16 : VOPProfile<[i16, v2f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>; +def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>; def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>; @@ -2917,10 +2922,12 @@ def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp= def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; +def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>; def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>; +def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d05be8f..54fa192 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in def : ClampPat<V_MAX_F16_t16_e64, f16>; let SubtargetPredicate = UseFakeTrue16Insts in def : ClampPat<V_MAX_F16_fake16_e64, f16>; +// FIXME-TRUE16: Pseudo expansion of this won't work with True16. 
+let True16Predicate = UseFakeTrue16Insts in +def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>; let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -1903,6 +1906,13 @@ def : GCNPat < >; } +let SubtargetPredicate = HasBF16PackedInsts in { +def : GCNPat < + (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))), + (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0, + $src0_modifiers, $src0, DSTCLAMP.ENABLE) +>; +} // End SubtargetPredicate = HasBF16PackedInsts /********** ================================ **********/ /********** Floating point absolute/negative **********/ diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 9f61bf8..9509199 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, MachineRegisterInfo &MRI = MF.getRegInfo(); BitVector ReservedRegs = TRI->getReservedRegs(MF); BitVector NonWwmAllocMask(TRI->getNumRegs()); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); // FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future // to have a balanced allocation between WWM values and per-thread vector @@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, NumRegs = std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs); - auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); // Try to use the highest available registers for now. Later after // vgpr-regalloc, they can be shifted to the lowest range. unsigned I = 0; @@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, // Reserve an arbitrary register and report the error. TRI->markSuperRegs(RegMask, AMDGPU::VGPR0); MF.getFunction().getContext().emitError( - "can't find enough VGPRs for wwm-regalloc"); + "cannot find enough VGPRs for wwm-regalloc"); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index f0be204..9a1448f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -81,11 +81,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm; - if (!MFMAVGPRForm && ST.hasGFX90AInsts() && - ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && - !mayUseAGPRs(F)) - MayNeedAGPRs = false; // We will select all MAI with VGPR operands. + MayNeedAGPRs = ST.hasMAIInsts(); + if (ST.hasGFX90AInsts()) { + // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection + // should be separated from availability of AGPRs + if (MFMAVGPRForm || + (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && + !mayUseAGPRs(F))) + MayNeedAGPRs = false; // We will select all MAI with VGPR operands. 
+ } if (AMDGPU::isChainCC(CC)) { // Chain functions don't receive an SP from their caller, but are free to diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3212060..53f554e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,7 +321,7 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { + virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -602,7 +602,7 @@ public: bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + bool finalizeStore(MachineInstr &MI, bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -704,16 +704,16 @@ void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning)); } -/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA. -/// If this tag isn't present, or if it has no meaningful values, returns \p -/// Default. Otherwise returns all the address spaces concerned by the MMRA. -static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, - SIAtomicAddrSpace Default) { - static constexpr StringLiteral FenceASPrefix = "amdgpu-as"; +/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA. +/// If this tag isn't present, or if it has no meaningful values, returns +/// \p none, otherwise returns the address spaces specified by the MD. +static std::optional<SIAtomicAddrSpace> +getSynchronizeAddrSpaceMD(const MachineInstr &MI) { + static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as"; auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); if (!MMRA) - return Default; + return std::nullopt; SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; for (const auto &[Prefix, Suffix] : MMRA) { @@ -726,7 +726,10 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, diagnoseUnknownMMRAASName(MI, Suffix); } - return (Result != SIAtomicAddrSpace::NONE) ? Result : Default; + if (Result == SIAtomicAddrSpace::NONE) + return std::nullopt; + + return Result; } } // end anonymous namespace @@ -903,12 +906,19 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = *ScopeOrNone; - if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || - ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) { + // We currently expect refineOrderingAS to be the only place that + // can refine the AS ordered by the fence. + // If that changes, we need to review the semantics of that function + // in case it needs to preserve certain address spaces. 
reportUnsupported(MI, "Unsupported atomic address space"); return std::nullopt; } + auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI); + if (SynchronizeAS) + OrderingAddrSpace = *SynchronizeAS; + return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); } @@ -1160,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (Pos == Position::AFTER) --MI; @@ -2068,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) @@ -2526,9 +2556,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); - if (Op == SIMemOp::STORE) - Changed |= insertWaitsBeforeSystemScopeStore(MI); - // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2541,11 +2568,26 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::expandSystemScopeStore( - MachineBasicBlock::iterator &MI) const { - MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); - if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) - return insertWaitsBeforeSystemScopeStore(MI); +bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) + return false; + + const unsigned Scope = CPol->getImm() & CPol::SCOPE; + + // GFX12.0 only: Extra waits needed before system scope stores. + if (!ST.hasGFX1250Insts()) { + if (!Atomic && Scope == CPol::SCOPE_SYS) + return insertWaitsBeforeSystemScopeStore(MI); + return false; + } + + // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address + // space. + // We also require SCOPE_SE minimum if we do not have the "cu-stores" feature. + if (Scope == CPol::SCOPE_CU && + (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) + return setScope(MI, CPol::SCOPE_SE); return false; } @@ -2648,6 +2690,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + // FIXME: Necessary hack because iterator can lose track of the store.
+ MachineInstr &StoreMI = *MI; if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Monotonic || @@ -2664,6 +2708,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; } @@ -2676,7 +2721,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->expandSystemScopeStore(MI); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); return Changed; } @@ -2687,11 +2732,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, AtomicPseudoMIs.push_back(MI); bool Changed = false; - // Refine fenced address space based on MMRAs. - // - // TODO: Should we support this MMRA on other atomic operations? - auto OrderingAddrSpace = - getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace()); + const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace(); if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 84cfa87..f3acc5c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); } -std::pair<unsigned, unsigned> -SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { - const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF); - - unsigned MaxNumVGPRs = MaxVectorRegs; - unsigned MaxNumAGPRs = 0; - - // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, - // a wave may have up to 512 total vector registers combining together both - // VGPRs and AGPRs. Hence, in an entry function without calls and without - // AGPRs used within it, it is possible to use the whole vector register - // budget for VGPRs. - // - // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split - // register file accordingly. - if (ST.hasGFX90AInsts()) { - unsigned MinNumAGPRs = 0; - const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); - const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - - const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; - - // TODO: Move this logic into subtarget on IR function - // - // TODO: The lower bound should probably force the number of required - // registers up, overriding amdgpu-waves-per-eu. - std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute( - MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR, - /*OnlyFirstRequired=*/true); - - if (MinNumAGPRs == DefaultNumAGPR.first) { - // Default to splitting half the registers if AGPRs are required. - MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; - } else { - // Align to accum_offset's allocation granularity. - MinNumAGPRs = alignTo(MinNumAGPRs, 4); - - MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); - } - - // Clamp values to be inbounds of our limits, and ensure min <= max. 
- - MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); - MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); - - MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); - MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); - - assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && - MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && - "invalid register counts"); - } else if (ST.hasMAIInsts()) { - // On gfx908 the number of AGPRs always equals the number of VGPRs. - MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; - } - - return std::pair(MaxNumVGPRs, MaxNumAGPRs); -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::MODE); @@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve VGPRs/AGPRs. // - auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); for (const TargetRegisterClass *RC : regclasses()) { if (RC->isBaseClass() && isVGPRClass(RC)) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 0008e5f..5508f07 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -90,11 +90,6 @@ public: /// spilling is needed. MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; - /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number - /// of waves per execution unit required for the function \p MF. - std::pair<unsigned, unsigned> - getMaxNumVectorRegs(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 0039d2f..36d1a3b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> let TSFlags{2} = HasVGPR; let TSFlags{3} = HasAGPR; let TSFlags{4} = HasSGPR; + + // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block) + // to decide which registers to try to assign first. Usually, this RegisterClass priority is given + // very high priority, if not the highest priority, when considering which VirtReg to allocate next. + // + // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to + // assign more constrained RegisterClasses first. As a result, we prioritize register classes with + // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32). + // + // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs. + // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained + // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the + // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor + // is used for scaling of the bit (i.e. 1 << 4). 
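// As a worked example of the encoding described above (a sketch derived from the
// formulas added below, with BaseClassScaleFactor = 16): VGPR_32 gets priority
// 0 + 1*16 = 16, VReg_512 (numRegs = 16, SizePriority = 14) gets 14 + 1*16 = 30,
// and the AV tuple of the same size (BaseClassPriority = 0) gets 14 + 0*16 = 14,
// so VGPR/AGPR tuples are allocated before the corresponding AV tuples, and
// larger tuples before smaller ones within each group.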
+ field int BaseClassPriority = 1; + field int BaseClassScaleFactor = 16; + } multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1, @@ -575,7 +592,7 @@ let HasVGPR = 1 in { def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 255), (sequence "VGPR%u_HI16", 0, 255)))> { - let AllocationPriority = 2; + let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; @@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types // Identical to VGPR_32 except it only contains the low 128 (Lo128) registers. def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 127))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // AccVGPR 32-bit registers def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add (sequence "AGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; - let AllocationPriority = !sub(numRegs, 1); + + // Since we only have 5 bits for the RegisterClass Allocation Priority, and since we use the + // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result + // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for + // registers with numRegs 17+ we give SizePriority of 15. In practice, there is only one + // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512}, + // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing. + defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15)); + + let AllocationPriority = !add(SizePriority, !mul(BaseClassPriority, BaseClassScaleFactor)); let Weight = numRegs; } // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let HasVGPR = 1 in { + let HasVGPR = 1, BaseClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in { // Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6 def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; + let BaseClassPriority = 0; let Size = 32; } } // End GeneratePressureSet = 0 @@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3 // aligned base register. multiclass AVRegClass<int numRegs, list<ValueType> regTypes, dag vregList, dag aregList> { - let HasVGPR = 1, HasAGPR = 1 in { + let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>; @@ -1191,6 +1218,8 @@ def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> { def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">; def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">; +def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; + //===----------------------------------------------------------------------===// // VRegSrc_* Operands with a VGPR //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 38cc51b..4bda51d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -856,9 +856,9 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>; def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), (prefetch node:$ptr, node:$rw, node:$loc, node:$type), - [{ return !N->getOperand(1)->isDivergent();}]> { + [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> { let GISelPredicateCode = [{ - return isInstrUniform(MI); + return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch(); }]; } @@ -1152,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> { } defm : SMPrefetchPat<"INST", i32imm_zero>; +let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case. defm : SMPrefetchPat<"DATA", i32imm_one>; let SubtargetPredicate = isGFX12Plus in { diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc..8303410 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +// Represents the point at which a wave must wait for all outstanding direct loads to LDS. +// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. + +def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { + let hasSideEffects = 0; +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b5b3cc9..5827f18 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) { } bool isAsyncStore(unsigned Opc) { - return false; // placeholder before async store implementation. 
+ return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250; } bool isTensorStore(unsigned Opc) { @@ -2652,6 +2659,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: @@ -3016,6 +3024,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return isInlinableLiteralV2BF16(Literal); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; default: llvm_unreachable("bad packed operand type"); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index c09a9d6..74d59f4 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1636,6 +1636,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: return 2; default: diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 550ec9d..9de7d6d 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1344,6 +1344,8 @@ def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">; } // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit] let SubtargetPredicate = HasPkFmacF16Inst in { +// FIXME: V_PK_FMAC_F16 is currently not used in instruction selection. +// If this changes, ensure the DPP variant is not used for GFX11+. 
defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; } // End SubtargetPredicate = HasPkFmacF16Inst @@ -1904,7 +1906,7 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName, VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> : - VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>; + VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>; multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> : VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index b6f9568..96fe503 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -32,9 +32,10 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { let HasExtDPP = 0; } -let HasExt64BitDPP = 1 in { -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; +def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; @@ -44,6 +45,10 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; } +let HasExt64BitDPP = 1 in { +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; + class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { let HasExtVOP3DPP = 0; let HasExtDPP = 0; @@ -52,10 +57,13 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>; def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>; - -def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { +def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> { let HasExtVOP3DPP = 0; - let HasExtDPP = 0; + let HasExt64BitDPP = 1; +} +def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>; +def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> { + let HasClamp = 1; } } // End HasExt64BitDPP = 1; @@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32 defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">; defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; +let SchedRW = [WriteIntMul] in { + let SubtargetPredicate = HasMadU32Inst in + defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>; + let SubtargetPredicate = isGFX1250Plus in { + defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>; + defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>; + } +} + let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; @@ -185,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f } // End SchedRW = [WriteDoubleAdd] } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1 +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in { +defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>; +defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>; +defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>; +defm 
V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>; +} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] + } // End isReMaterializable = 1 let Uses = [MODE, VCC, EXEC] in { @@ -722,6 +746,13 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { defm V_MAXIMUM3_F16 : VOP3Inst_t16 <"v_maximum3_f16", VOP_F16_F16_F16_F16, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 +let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; +} + defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>; @@ -848,6 +879,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>; def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>; def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>; +let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in + def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>; + def : GCNPat< (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; @@ -858,6 +892,13 @@ def : GCNPat< (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +let SubtargetPredicate = HasAddMinMaxInsts in { +def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>; +def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>; +def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>; +def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>; +} + def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>; @@ -972,10 +1013,12 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim unsigned Val = N->getZExtValue(); unsigned New = 0; if (}] # modifier_idx # [{ == 0) { - New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) - : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); - } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) { - New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) + : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); + } else if (}] # modifier_idx # [{== 1) { + New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + } if (}] # modifier_idx # [{== 2) { + New = (Val & 0x1) ? 
SISrcMods::OP_SEL_0 : SISrcMods::NONE; } return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); }]>; @@ -1427,34 +1470,72 @@ let SubtargetPredicate = isGFX12Plus in { } // End SubtargetPredicate = isGFX12Plus -let SubtargetPredicate = HasBitOp3Insts in { +let HasClamp = 0, HasModifiers = 1 in { +def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>; +def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>; +def BitOp3_B16_fake16_Profile : VOP3_Profile_Fake16<BitOp3_B16_Profile>; +} + +let OtherPredicates = [HasBitOp3Insts] in { let isReMaterializable = 1 in { - defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", - VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>>; + let SubtargetPredicate = isGFX940Plus in + defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", BitOp3_B16_Profile>; + let SubtargetPredicate = isGFX1250Plus in + defm V_BITOP3_B16_gfx1250 : VOP3Inst_t16_with_profiles <"v_bitop3_b16_gfx1250", BitOp3_B16_Profile, + BitOp3_B16_t16_Profile, BitOp3_B16_fake16_Profile>; defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32", VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>, VOPD_Component<0x12, "v_bitop2_b32">; } + def : GCNPat< (i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; def : GCNPat< - (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; - - def : GCNPat< (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; - def : GCNPat< - (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; -} // End SubtargetPredicate = HasBitOp3Insts + let SubtargetPredicate = isGFX940Plus in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } // End SubtargetPredicate = isGFX940Plus + + let SubtargetPredicate = isGFX1250Plus in { + let True16Predicate = UseFakeTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } + let True16Predicate = UseRealTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + } + } // End SubtargetPredicate = isGFX1250Plus + +} // End OtherPredicates = [HasBitOp3Insts] class 
DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat< (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), @@ -1531,6 +1612,7 @@ def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true; let SubtargetPredicate = HasBF16ConversionInsts in { let ReadsModeReg = 0 in { defm V_CVT_PK_BF16_F32 : VOP3Inst<"v_cvt_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32>>; + defm V_CVT_SR_PK_BF16_F32 : VOP3Inst<"v_cvt_sr_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_bf16_f32>; } def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)), (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>; @@ -1541,6 +1623,53 @@ let SubtargetPredicate = HasBF16ConversionInsts in { (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>; } +let Src0RC64 = VSrc_NoInline_v2f16 in { +def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; +def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; +def VOP3_CVT_PK_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_PK_F8_F16_Profile>; +} + +let ReadsModeReg = 0, IsPacked = 0, SubtargetPredicate = isGFX125xOnly in { + defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f16_gfx1250", + VOP3_CVT_PK_F8_F16_Profile, + VOP3_CVT_PK_F8_F16_True16_Profile, + VOP3_CVT_PK_F8_F16_Fake16_Profile, + int_amdgcn_cvt_pk_fp8_f16>; + defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f16_gfx1250", + VOP3_CVT_PK_F8_F16_Profile, + VOP3_CVT_PK_F8_F16_True16_Profile, + VOP3_CVT_PK_F8_F16_Fake16_Profile, + int_amdgcn_cvt_pk_bf8_f16>; +} + +let HasClamp = 0, HasOpSel = 1 in { +def VOP3_CVT_SR_F8_F16_Profile : VOP3_CVT_SR_F8_ByteSel_Profile<f16>; +def VOP3_CVT_SR_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_SR_F8_F16_Profile>; +def VOP3_CVT_SR_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_SR_F8_F16_Profile>; +} + +let SubtargetPredicate = isGFX1250Plus in { + let ReadsModeReg = 0 in { + // These instructions have non-standard use of op_sel. They are using bits 2 and 3 of opsel + // to select a byte in the vdst. Bits 0 and 1 are unused. 
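// As a sketch of the convention just described (assuming op_sel{3:2} directly hold
// the 2-bit destination byte index): byte 0 of $vdst corresponds to op_sel = 0b0000,
// byte 1 to 0b0100, byte 2 to 0b1000 and byte 3 to 0b1100, with op_sel{1:0} always 0.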
+ let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { + defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile, + VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; + defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile, + VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; + } + } // End ReadsModeReg = 0 + + let True16Predicate = UseRealTrue16Insts in { + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>; + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>; + } + let True16Predicate = UseFakeTrue16Insts in { + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_fake16_e64, f16>; + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_fake16_e64, f16>; + } +} // End SubtargetPredicate = isGFX1250Plus + class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat< (DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)), (inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in) @@ -1746,6 +1875,21 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">; +defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>; + +defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>; +defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>; +defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>; +defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>; +defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>; +defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>; +defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>; +defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>; +defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>; +defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>; +defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>; + defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">; defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">; defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >; @@ -1918,7 +2062,15 @@ let AssemblerPredicate = isGFX11Plus in { // These instructions differ from GFX12 variant by supporting DPP: defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; +defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; +defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; +defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; +defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>; +defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x372, "v_cvt_pk_fp8_f16">; +defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x373, "v_cvt_pk_bf8_f16">; +defm V_CVT_SR_FP8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x374>; +defm V_CVT_SR_BF8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x375>; //===----------------------------------------------------------------------===// // GFX10. 
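The V_MAD_U32 and V_ADD_MAX/V_ADD_MIN patterns added to VOP3Instructions.td above match simple two-operation IR shapes. A minimal sketch of the kind of input they are intended to cover (hypothetical test IR, not taken from this patch; selection to v_mad_u32 / v_add_max_u32 assumes a gfx1250 subtarget and divergent operands, since these are VALU patterns):

define i32 @mad_u32(i32 %a, i32 %b, i32 %c) {
  ; (add (mul a, b), c) is the shape matched by ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>
  %m = mul i32 %a, %b
  %r = add i32 %m, %c
  ret i32 %r
}

define i32 @add_max_u32(i32 %a, i32 %b, i32 %c) {
  ; (umax (add a, b), c) is the shape matched by ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>
  %s = add i32 %a, %b
  %r = call i32 @llvm.umax.i32(i32 %s, i32 %c)
  ret i32 %r
}

declare i32 @llvm.umax.i32(i32, i32)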
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index ea14c77..95fcd4a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> { bit UseTiedOutput = useTiedOutput; + defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret; + defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret; + defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret; + dag srcs = - (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, - FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + (ins FP16InputMods:$src0_modifiers, Src0RC:$src0, + FP16InputMods:$src1_modifiers, Src1RC:$src1, + FP16InputMods:$src2_modifiers, Src2RC:$src2); dag dpp_srcs = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + FP16InputMods:$src2_modifiers, Src2RC:$src2); // FIXME: Clamp0 misbehaves with the non-default vdst_in // following it. For now workaround this by requiring clamp @@ -161,38 +165,42 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_ multiclass MadFmaMixPats<SDPatternOperator fma_like, Instruction mix_inst, Instruction mixlo_inst, - Instruction mixhi_inst> { + Instruction mixhi_inst, + ValueType VT = f16, + ValueType vecVT = v2f16> { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); + defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. // TODO: Could we use a predicate to inspect src1/2/3 instead? 
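// As an illustration (a sketch, not from this patch): with VT = f16 these patterns
// cover an f32 fma where at least one operand is an fpextend of an f16 value, e.g.
// fma(fpext(half %a), fpext(half %b), float %c), which selects v_fma_mix_f32 with
// op_sel marking the 16-bit sources; instantiating the multiclass with VT = bf16
// (the HasFmaMixBF16Insts block further down) makes the same shapes select
// v_fma_mix_f32_bf16 / v_fma_mixlo_bf16 / v_fma_mixhi_bf16.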
 def : GCNPat <
-  (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
-                 (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
-                 (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
+  (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+                 (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)),
+                 (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))),
   (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
             DSTCLAMP.NONE)>;
 
 def : GCNPat <
-  (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
-                 (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
-                 (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
+  (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+                 (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)),
+                 (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))),
   (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
             DSTCLAMP.NONE)>;
 
 def : GCNPat <
-  (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
-                 (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
-                 (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
+  (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+                 (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)),
+                 (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
   (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
             DSTCLAMP.NONE)>;
 
 def : GCNPat <
   (AMDGPUclamp (build_vector
-    (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
-                            (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
-                            (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
-    (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
-                            (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
-                            (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
-  (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+    (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+                           (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+                           (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+    (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+                           (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+                           (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+  (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0,
                      $hi_src1_modifiers, $hi_src1,
                      $hi_src2_modifiers, $hi_src2,
                      DSTCLAMP.ENABLE,
@@ -204,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 >;
 
 def : GCNPat <
-  (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
-                      (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+  (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+                     (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
   (mixlo_inst $src0_modifiers, $src0,
               $src1_modifiers, $src1,
               (i32 0), (i32 0),
@@ -214,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 >;
 
 def : GCNPat <
-  (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
-                                               (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
-  (v2f16 (mixhi_inst $src0_modifiers, $src0,
+  (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+                                             (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))),
+  (vecVT (mixhi_inst $src0_modifiers, $src0,
                      $src1_modifiers, $src1,
                      (i32 0), (i32 0),
                      DSTCLAMP.NONE,
@@ -224,9 +232,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 >;
 
 def : GCNPat <
-  (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                          (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                          (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+  (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                         (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                         (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
   (mixlo_inst $src0_modifiers, $src0,
               $src1_modifiers, $src1,
               $src2_modifiers, $src2,
@@ -241,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 
 let True16Predicate = p in {
 def : GCNPat <
-  (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                                   (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                                   (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
-  (v2f16 (mixhi_inst $src0_modifiers, $src0,
+  (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                                 (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                                 (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+  (vecVT (mixhi_inst $src0_modifiers, $src0,
                      $src1_modifiers, $src1,
                      $src2_modifiers, $src2,
                      DSTCLAMP.NONE,
@@ -253,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 
 def : GCNPat <
   (build_vector
-    f16:$elt0,
-    (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                         (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                         (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
-  (v2f16 (mixhi_inst $src0_modifiers, $src0,
+    VT:$elt0,
+    (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                        (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                        (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+  (vecVT (mixhi_inst $src0_modifiers, $src0,
                      $src1_modifiers, $src1,
                      $src2_modifiers, $src2,
                      DSTCLAMP.ENABLE,
@@ -268,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 
 let True16Predicate = UseRealTrue16Insts in {
 def : GCNPat <
-  (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                        (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                        (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
-  (v2f16 (mixlo_inst $src0_modifiers, $src0,
+  (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                       (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                       (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
+  (vecVT (mixlo_inst $src0_modifiers, $src0,
                      $src1_modifiers, $src1,
                      $src2_modifiers, $src2,
                      DSTCLAMP.NONE,
-                     (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+                     (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
 >;
 
 def : GCNPat <
-  (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                                   (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                                   (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
-  (v2f16 (mixhi_inst $src0_modifiers, $src0,
+  (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                                 (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                                 (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+  (vecVT (mixhi_inst $src0_modifiers, $src0,
                      $src1_modifiers, $src1,
                      $src2_modifiers, $src2,
                      DSTCLAMP.NONE,
-                     (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+                     (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
 >;
 
 def : GCNPat <
   (build_vector
-    f16:$elt0,
-    (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                         (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                         (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
-  (v2f16 (mixhi_inst $src0_modifiers, $src0,
+    VT:$elt0,
+    (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                        (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                        (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+  (vecVT (mixhi_inst $src0_modifiers, $src0,
                      $src1_modifiers, $src1,
                      $src2_modifiers, $src2,
                      DSTCLAMP.ENABLE,
-                     (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+                     (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
 >;
 } // end True16Predicate
 }
@@ -360,6 +368,24 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
 defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
 }
 
+let SubtargetPredicate = HasFmaMixBF16Insts in {
+let isCommutable = 1 in {
+
+let isReMaterializable = 1 in
+defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>;
+
+let FPDPRounding = 1 in {
+defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+
+let ClampLo = 0, ClampHi = 1 in {
+defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+}
+} // End FPDPRounding = 1
+} // End isCommutable = 1
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+} // End SubtargetPredicate = HasFmaMixBF16Insts
+
 def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
   let HasModifiers = 0;
 }
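The hunks above rewrite the MadFmaMixPats pattern bodies against the type parameters VT and vecVT and against the VT-aware VOP3PMadMixModsPat / VOP3PMadMixModsExtPat fragments, so one multiclass now covers both the F16 mix instructions and the new BF16 ones. The multiclass header itself falls outside the hunks shown here; a minimal sketch of its presumed shape (the parameter names and the f16/v2f16 defaults are inferred from the two defm instantiations in the hunk above, they are not shown by the patch):

multiclass MadFmaMixPats<SDPatternOperator fma_like,
                         Instruction mix_inst,
                         Instruction mixlo_inst,
                         Instruction mixhi_inst,
                         ValueType VT = f16,        // assumed default: scalar element type
                         ValueType vecVT = v2f16> { // assumed default: packed result type
  // ... pattern bodies as in the hunks above, written against VT/vecVT ...
}

// Existing f16 instantiation keeps the defaults; the bf16 one passes them explicitly
// (both defm lines appear verbatim in the hunk above):
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;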
@@ -1210,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in {
 
 defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
 defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
 defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+
+// Scalar pseudo used to emulate AMDGPUClamp.
+// Expanded to V_PK_MAX_NUM_BF16 with unused high half.
+// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+let True16Predicate = UseFakeTrue16Insts in
+defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
 }
 } // End isCommutable = 1, isReMaterializable = 1
@@ -2247,6 +2279,13 @@ defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>;
 defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>;
 defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>;
 
+defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
+defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
+defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
+
+let AssemblerPredicate = isGFX1250Plus in
+def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
+
 defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
 defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
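This ends the VOP3PInstructions.td portion of the patch: the three Realtriple lines give the BF16 mix pseudos VOP3P opcodes 0x3d-0x3f on GFX1250, and the mnemonic alias gives the existing F32/F16 mix instruction a matching explicit spelling. Assuming AMDGPUMnemonicAlias behaves like a standard TableGen MnemonicAlias (the first mnemonic is rewritten to the second at parse time), the intended assembler-level effect is roughly as follows; the operand list is invented for the illustration:

// On gfx1250+, source written as
//   v_fma_mix_f32_f16 v0, v1, v2, v3
// should be accepted and assemble to the canonical
//   v_fma_mix_f32     v0, v1, v2, v3
// so the F16-sourced mix op is named in parallel with the new
// v_fma_mix_f32_bf16, while encoding and disassembly stay unchanged.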
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c21e2d3..a029376 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -401,6 +401,19 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
   let Inst{49-41} = src0;
 }
 
+class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> {
+  bits<8> bitop3;
+
+  let Inst{60-59} = bitop3{7-6};
+  let Inst{10-8} = bitop3{5-3};
+  let Inst{63-61} = bitop3{2-0};
+
+  let Inst{11} = !if(p.HasOpSel, src0_modifiers{2}, 0);
+  let Inst{12} = !if(p.HasOpSel, src1_modifiers{2}, 0);
+  let Inst{13} = !if(p.HasOpSel, src2_modifiers{2}, 0);
+  let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0);
+}
+
 class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
   bits<6> attr;
   bits<2> attrchan;
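VOP3a_BITOP3_gfx12 scatters the 8-bit bitop3 immediate across non-contiguous fields of the 64-bit VOP3 word (the slots otherwise used for omod, abs and neg in the plain VOP3 encoding), while bits 14-11 keep carrying op_sel when the profile has it. A worked example of the split, written as a throwaway TableGen record purely for illustration (the record name is not part of the patch):

// For bitop3 = 0xD2 (binary 1101'0010) the class above produces:
//   bitop3{7-6} = 0b11  -> Inst{60-59}
//   bitop3{5-3} = 0b010 -> Inst{10-8}
//   bitop3{2-0} = 0b010 -> Inst{63-61}
def BitOp3ScatterExample {
  bits<8> bitop3 = 0xD2;
  bits<2> hi  = bitop3{7-6}; // 0b11,  would land in Inst{60-59}
  bits<3> mid = bitop3{5-3}; // 0b010, would land in Inst{10-8}
  bits<3> lo  = bitop3{2-0}; // 0b010, would land in Inst{63-61}
}

The same field layout is repeated for the DPP16 and DPP8 variants below, and the VOP3_BITOP3_Real_* multiclasses further down stamp it onto the GFX1250 real encodings.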
@@ -1506,6 +1519,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
   let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
   let HasFP8DstByteSel = P.HasFP8DstByteSel;
   let HasOMod = P.HasOMod;
+  let HasBitOp3 = P.HasBitOp3;
 
   let HasModifiers =
       !if (Features.IsMAI, 0,
@@ -1525,6 +1539,7 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
   let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
   let HasFP8DstByteSel = P.HasFP8DstByteSel;
   let HasOMod = P.HasOMod;
+  let HasBitOp3 = P.HasBitOp3;
 
   let HasModifiers =
       !if (Features.IsMAI, 0,
@@ -1540,6 +1555,7 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
   let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
   let HasFP8DstByteSel = P.HasFP8DstByteSel;
   let HasOMod = P.HasOMod;
+  let HasBitOp3 = P.HasBitOp3;
 
   let HasModifiers =
       !if (Features.IsMAI, 0,
@@ -1723,6 +1739,34 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
   let Inst{14 - 8} = sdst;
 }
 
+class VOP3_BITOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo p, GFXGen Gen, string asmName>
+    : VOP3_DPP16_Gen_t16<op, p, Gen, asmName> {
+  bits<8> bitop3;
+
+  let Inst{60-59} = bitop3{7-6};
+  let Inst{10-8} = bitop3{5-3};
+  let Inst{63-61} = bitop3{2-0};
+
+  let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0);
+  let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0);
+  let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0);
+  let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0);
+}
+
+class VOP3_BITOP3_DPP8<bits<10> op, VOP_Pseudo p, string asmName>
+    : Base_VOP3_DPP8_t16<op, p, asmName> {
+  bits<8> bitop3;
+
+  let Inst{60-59} = bitop3{7-6};
+  let Inst{10-8} = bitop3{5-3};
+  let Inst{63-61} = bitop3{2-0};
+
+  let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0);
+  let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0);
+  let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0);
+  let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0);
+}
+
 class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
   : Base_VOP3_DPP8<op, ps, opName> {
   bits<8> sdst;
@@ -1943,6 +1987,29 @@ multiclass VOP3be_Realtriple<
 multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> :
   VOP3be_Realtriple<Gen, op, 1>;
 
+multiclass VOP3_BITOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName> {
+  def _e64_dpp#Gen.Suffix :
+    VOP3_BITOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(NAME#"_e64"#"_dpp"), Gen, asmName>;
+}
+
+multiclass VOP3_BITOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName> {
+  defvar ps = !cast<VOP3_Pseudo>(NAME#"_e64");
+  def _e64_dpp8#Gen.Suffix : VOP3_BITOP3_DPP8<op, ps, asmName> {
+    let DecoderNamespace =
+        Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16");
+    let AssemblerPredicate = Gen.AssemblerPredicate;
+  }
+}
+
+multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> {
+  defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+  let IsSingle = ps.Pfl.IsSingle, AsmString = asmName # ps.AsmOperands in {
+    def _e64#Gen.Suffix :
+      VOP3_Real_Gen<ps, Gen>,
+      VOP3a_BITOP3_gfx12<op, ps.Pfl>;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // VOP3 GFX11
 //===----------------------------------------------------------------------===//
@@ -2004,6 +2071,15 @@ multiclass VOP3Only_Real_Base_gfx1250<bits<10> op> :
 multiclass VOP3Only_Realtriple_gfx1250<bits<10> op, bit isSingle = 0> :
   VOP3_Realtriple<GFX1250Gen, op, isSingle>;
 
+multiclass VOP3Only_Realtriple_with_name_gfx1250<bits<10> op, string opName,
+    string asmName, string pseudo_mnemonic = "",
+    bit isSingle = 0> :
+  VOP3_Realtriple_with_name<GFX1250Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3Only_Realtriple_t16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+    string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3Only_Realtriple_with_name_gfx1250<op, opName, asmName, pseudo_mnemonic, isSingle>;
+
 multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME,
     string pseudo_mnemonic = "", bit isSingle = 0> :
   VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
@@ -2024,6 +2100,13 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName,
   defm _fake16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic>;
 }
 
+multiclass VOP3Only_Realtriple_t16_and_fake16_gfx1250<bits<10> op,
+    string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+    string opName = NAME, string pseudo_mnemonic = ""> {
+  defm _t16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic>;
+  defm _fake16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic>;
+}
+
 multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
                                        string asmName, bit isSingle = 0> {
   defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -2046,6 +2129,16 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
   VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
   VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
 
+multiclass VOP3_Real_BITOP3_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> :
+  VOP3_BITOP3_Real_Base<GFX1250Gen, op, asmName>,
+  VOP3_BITOP3_Real_dpp_Base<GFX1250Gen, op, asmName>,
+  VOP3_BITOP3_Real_dpp8_Base<GFX1250Gen, op, asmName>;
+
+multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> {
+  defm _t16 : VOP3_Real_BITOP3_gfx1250<op, asmName>;
+  defm _fake16: VOP3_Real_BITOP3_gfx1250<op, asmName>;
+}
+
 multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0,
                                           string opName = NAME> :
   VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>,