Diffstat (limited to 'llvm/lib/Target/AMDGPU')
45 files changed, 1968 insertions, 563 deletions
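A recurring theme across these changes is gfx1250 scaled-offset addressing: when the byte offset feeding a global, scratch, or SMEM access is the element index multiplied by the access size (or shifted left by its log2), the new SelectScaleOffset/selectScaleOffset helpers strip that multiply and set the SCAL cache-policy bit so the hardware scales the offset itself. The following is a minimal standalone C++ sketch of that matching rule only; Expr and matchScaledIndex are invented for illustration and are not LLVM APIs.

#include <bit>
#include <cstdint>
#include <memory>
#include <optional>

// Toy expression node: a leaf index, (LHS * RHSConst), or (LHS << RHSConst).
struct Expr {
  enum Kind { Mul, Shl, Leaf } K = Leaf;
  std::shared_ptr<Expr> LHS; // null for Leaf
  uint64_t RHSConst = 0;     // constant operand for Mul/Shl
};

// Returns the unscaled index when Off is index*Size, or index<<log2(Size) for
// power-of-two sizes; otherwise nullopt (keep the multiply, do not set SCAL).
std::optional<std::shared_ptr<Expr>>
matchScaledIndex(const std::shared_ptr<Expr> &Off, uint64_t Size) {
  if (Off->K == Expr::Mul && Off->RHSConst == Size)
    return Off->LHS;
  if (Off->K == Expr::Shl && std::has_single_bit(Size) &&
      Off->RHSConst == static_cast<uint64_t>(std::countr_zero(Size)))
    return Off->LHS;
  return std::nullopt;
}

The real selectors additionally have to prove that the index survives sign- or zero-extension to 64 bits (the matchExtFromI32orI32 and matchExtendFromS32OrS32 helpers in the diffs below), which is why both the SelectionDAG and GlobalISel paths gain signedness-aware extend matchers in this patch.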
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 6076ac4..8b8fc8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; +def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts", + "HasFmaMixBF16Insts", + "true", + "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions" +>; + def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts", "HasIEEEMinimumMaximumInsts", "true", @@ -167,6 +173,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16", "Has v_minimum3_f16 and v_maximum3_f16 instructions" >; +def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16", + "HasMin3Max3PKF16", + "true", + "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions" +>; + def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16", "HasMinimum3Maximum3PKF16", "true", @@ -256,12 +268,24 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", "S_INST_PREFETCH instruction causes shader to hang" >; +def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts", + "HasVmemPrefInsts", + "true", + "Has flat_prefetch_b8 and global_prefetch_b8 instructions" +>; + def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch", "HasSafeSmemPrefetch", "true", "SMEM prefetches do not fail on illegal address" >; +def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", + "HasSafeCUPrefetch", + "true", + "VMEM CU scope prefetches do not fail on illegal address" +>; + def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -559,6 +583,12 @@ def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts", "Has bf16 conversion instructions" >; +def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts", + "HasBF16PackedInsts", + "true", + "Has bf16 packed instructions (fma, add, mul, max, min)" +>; + def FeatureVOP3P : SubtargetFeature<"vop3p", "HasVOP3PInsts", "true", @@ -1349,6 +1379,10 @@ def FeatureLshlAddU64Inst : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true", "Has v_lshl_add_u64 instruction">; +def FeatureAddSubU64Insts + : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", + "Has v_add_u64 and v_sub_u64 instructions">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -1989,7 +2023,10 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, + FeatureBF16PackedInsts, FeatureCvtPkF16F32Inst, + FeatureFmaMixBF16Insts, + FeatureMin3Max3PKF16, FeatureMinimum3Maximum3PKF16, FeaturePrngInst, FeaturePermlane16Swap, @@ -2002,7 +2039,9 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureMemoryAtomicFAddF32DenormalSupport, FeatureKernargPreload, + FeatureVmemPrefInsts, FeatureLshlAddU64Inst, + FeatureAddSubU64Insts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2349,6 +2388,10 @@ def HasMinimum3Maximum3F16 : Predicate<"Subtarget->hasMinimum3Maximum3F16()">, AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>; +def HasMin3Max3PKF16 : + Predicate<"Subtarget->hasMin3Max3PKF16()">, + AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>; + def HasMinimum3Maximum3PKF16 : 
Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">, AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>; @@ -2472,6 +2515,9 @@ def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">, AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>; +def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">, + AssemblerPredicate<(all_of FeatureBF16PackedInsts)>; + def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; @@ -2519,6 +2565,14 @@ def HasFmaakFmamkF64Insts : Predicate<"Subtarget->hasFmaakFmamkF64Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; +def HasPkAddMinMaxInsts : + Predicate<"Subtarget->hasPkAddMinMaxInsts()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + +def HasPkMinMax3Insts : + Predicate<"Subtarget->hasPkMinMax3Insts()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, AssemblerPredicate<(all_of FeatureImageInsts)>; @@ -2565,6 +2619,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">, def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, AssemblerPredicate<(all_of FeatureFmaMixInsts)>; +def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">, + AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>; + def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<(all_of FeatureDLInsts)>; @@ -2763,12 +2820,18 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, AssemblerPredicate<(all_of FeatureXF32Insts)>; +def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">, + AssemblerPredicate<(all_of FeatureVmemPrefInsts)>; + def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, AssemblerPredicate<(all_of FeatureAshrPkInsts)>; def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>; +def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, + AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index dedee46..49d8b44 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1383,7 +1383,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, - &AAIndirectCallInfo::ID, &AAInstanceInfo::ID}); + &AAIndirectCallInfo::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 891d362..c01e5d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -137,6 +137,9 @@ def gi_global_offset : def gi_global_saddr : GIComplexOperandMatcher<s64, "selectGlobalSAddr">, GIComplexPatternEquiv<GlobalSAddr>; +def gi_global_saddr_cpol : + GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">, + GIComplexPatternEquiv<GlobalSAddrCPol>; def gi_global_saddr_glc : 
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; @@ -446,5 +449,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, GISDNodeXFormEquiv<as_hw_round_mode>; +def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">, + GISDNodeXFormEquiv<PrefetchLoc>; + def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">, GISDNodeXFormEquiv<MFMALdScaleXForm>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 00979f4..f36935d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) { return LLT::scalar(32); } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI); - -static void unmergeReadAnyLane(MachineIRBuilder &B, - SmallVectorImpl<Register> &SgprDstParts, - LLT UnmergeTy, Register VgprSrc, - const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static Register buildReadLane(MachineIRBuilder &, Register, + const RegisterBankInfo &, ReadLaneFnTy); + +template <typename ReadLaneFnTy> +static void +unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl<Register> &SgprDstParts, + LLT UnmergeTy, Register VgprSrc, const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID); auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc); for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { - SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI)); + SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL)); } } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc, + const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { LLT Ty = B.getMRI()->getType(VgprSrc); const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID); if (Ty.getSizeInBits() == 32) { - return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc}) - .getReg(0); + Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty}); + return BuildRL(B, SgprDst, VgprSrc).getReg(0); } SmallVector<Register, 8> SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildRL); return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0); } -void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, - Register VgprSrc, const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static void buildReadLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI, + ReadLaneFnTy BuildReadLane) { LLT Ty = B.getMRI()->getType(VgprSrc); if (Ty.getSizeInBits() == 32) { - B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); + BuildReadLane(B, SgprDst, VgprSrc); return; } SmallVector<Register, 8> SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildReadLane); B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0); } + +void 
AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI) { + return buildReadLane( + B, SgprDst, VgprSrc, RBI, + [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) { + return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); + }); +} + +void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI) { + return buildReadLane( + B, SgprDst, VgprSrc, RBI, + [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) { + return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst) + .addReg(VgprSrc); + }); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 0c89bb5..5e1000e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -51,6 +51,8 @@ private: void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI); +void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, + const RegisterBankInfo &RBI); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 00c7f0e..dfaa145 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1863,9 +1863,17 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr, SIInstrFlags::FlatScratch); } -// If this matches zero_extend i32:x, return x -static SDValue matchZExtFromI32(SDValue Op) { - if (Op.getOpcode() != ISD::ZERO_EXTEND) +// If this matches *_extend i32:x, return x. +// Otherwise, if the value is already i32, return it. +static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, + const SelectionDAG *DAG) { + if (Op.getValueType() == MVT::i32) + return Op; + + if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) && + Op.getOpcode() != ISD::ANY_EXTEND && + !(DAG->SignBitIsZero(Op) && + Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND))) return SDValue(); SDValue ExtSrc = Op.getOperand(0); @@ -1873,12 +1881,13 @@ static SDValue matchZExtFromI32(SDValue Op) { } // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) -bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, - SDValue Addr, - SDValue &SAddr, - SDValue &VOffset, - SDValue &Offset) const { +// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset) +bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, bool &ScaleOffset, + bool NeedIOffset) const { int64_t ImmOffset = 0; + ScaleOffset = false; // Match the immediate offset first, which canonically is moved as low as // possible. 
@@ -1888,7 +1897,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, + if (NeedIOffset && + TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) { Addr = LHS; ImmOffset = COffsetVal; @@ -1898,11 +1908,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, // saddr + large_offset -> saddr + // (voffset = large_offset & ~MaxOffset) + // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( - COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal; + if (NeedIOffset) { + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + } - if (isUInt<32>(RemainderOffset)) { + if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset) + : isUInt<32>(RemainderOffset)) { SDNode *VMov = CurDAG->getMachineNode( AMDGPU::V_MOV_B32_e32, SL, MVT::i32, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); @@ -1929,21 +1942,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, // Match the variable offset. if (Addr.getOpcode() == ISD::ADD) { LHS = Addr.getOperand(0); - RHS = Addr.getOperand(1); if (!LHS->isDivergent()) { - // add (i64 sgpr), (zero_extend (i32 vgpr)) - if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + // add (i64 sgpr), (*_extend (i32 vgpr)) + RHS = Addr.getOperand(1); + ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtRHS = matchExtFromI32orI32( + RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { SAddr = LHS; - VOffset = ZextRHS; + VOffset = ExtRHS; } } + RHS = Addr.getOperand(1); if (!SAddr && !RHS->isDivergent()) { - // add (zero_extend (i32 vgpr)), (i64 sgpr) - if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + // add (*_extend (i32 vgpr)), (i64 sgpr) + ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtLHS = matchExtFromI32orI32( + LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { SAddr = RHS; - VOffset = ZextLHS; + VOffset = ExtLHS; } } @@ -1953,6 +1971,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, } } + if (Subtarget->hasScaleOffset() && + (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset() + ? 
AMDGPUISD::MAD_I64_I32 + : AMDGPUISD::MAD_U64_U32) || + (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 && + CurDAG->SignBitIsZero(Addr.getOperand(0)))) && + Addr.getOperand(0)->isDivergent() && + isa<ConstantSDNode>(Addr.getOperand(1)) && + !Addr.getOperand(2)->isDivergent()) { + // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr) + unsigned Size = + (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8; + ScaleOffset = Addr.getConstantOperandVal(1) == Size; + if (ScaleOffset) { + SAddr = Addr.getOperand(2); + VOffset = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + return true; + } + } + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Addr)) return false; @@ -1972,10 +2011,28 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const { - if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) return false; - CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); return true; } @@ -1983,10 +2040,11 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const { - if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) return false; - unsigned CPolVal = AMDGPU::CPol::GLC; + unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC; CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32); return true; } @@ -2074,7 +2132,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, - SDValue &Offset) const { + SDValue &Offset, + SDValue &CPol) const { int64_t ImmOffset = 0; SDValue LHS, RHS; @@ -2106,6 +2165,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); + CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); return true; } } @@ -2139,6 +2199,10 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; SAddr = SelectSAddrFI(CurDAG, SAddr); Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); + + bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */); + CPol = CurDAG->getTargetConstant(ScaleOffset ? 
AMDGPU::CPol::SCAL : 0, + SDLoc(), MVT::i32); return true; } @@ -2159,17 +2223,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset, return true; } +// Given \p Offset and load node \p N, check if \p Offset is a multiple of +// the load byte size. If it is, update \p Offset to a pre-scaled value and +// return true. +bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset, + bool IsSigned) const { + bool ScaleOffset = false; + if (!Subtarget->hasScaleOffset() || !Offset) + return false; + + unsigned Size = + (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8; + + SDValue Off = Offset; + if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG)) + Off = Ext; + + if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) { + if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1))) + ScaleOffset = C->getZExtValue() == Log2_32(Size); + } else if (Offset.getOpcode() == ISD::MUL || + (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) || + Offset.getOpcode() == AMDGPUISD::MUL_U24 || + (Offset.isMachineOpcode() && + Offset.getMachineOpcode() == + (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO + : AMDGPU::S_MUL_U64_U32_PSEUDO))) { + if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1))) + ScaleOffset = C->getZExtValue() == Size; + } + + if (ScaleOffset) + Offset = Off.getOperand(0); + + return ScaleOffset; +} + // Match an immediate (if Offset is not null) or an SGPR (if SOffset is // not null) offset. If Imm32Only is true, match only 32-bit immediate // offsets available on CI. -bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only, bool IsBuffer, - bool HasSOffset, - int64_t ImmOffset) const { + bool HasSOffset, int64_t ImmOffset, + bool *ScaleOffset) const { assert((!SOffset || !Offset) && "Cannot match both soffset and offset at the same time!"); + if (ScaleOffset) { + assert(N && SOffset); + + *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */); + } + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); if (!C) { if (!SOffset) return false; @@ -2254,24 +2360,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { // Match a base and an immediate (if Offset is not null) or an SGPR (if // SOffset is not null) or an immediate+SGPR offset. If Imm32Only is // true, match only 32-bit immediate offsets available on CI. 
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, - SDValue *SOffset, SDValue *Offset, - bool Imm32Only, bool IsBuffer, - bool HasSOffset, - int64_t ImmOffset) const { +bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr, + SDValue &SBase, SDValue *SOffset, + SDValue *Offset, bool Imm32Only, + bool IsBuffer, bool HasSOffset, + int64_t ImmOffset, + bool *ScaleOffset) const { if (SOffset && Offset) { assert(!Imm32Only && !IsBuffer); SDValue B; - if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true)) + if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true)) return false; int64_t ImmOff = 0; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) ImmOff = C->getSExtValue(); - return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true, - ImmOff); + return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false, + true, ImmOff, ScaleOffset); } // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -2291,23 +2398,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, if (!N0 || !N1) return false; - if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, - ImmOffset)) { + if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset, ScaleOffset)) { SBase = N0; return true; } - if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, - ImmOffset)) { + if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset, ScaleOffset)) { SBase = N1; return true; } return false; } -bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, +bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, - bool Imm32Only) const { - if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) { + bool Imm32Only, bool *ScaleOffset) const { + if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only, + /* IsBuffer */ false, /* HasSOffset */ false, + /* ImmOffset */ 0, ScaleOffset)) { SBase = Expand32BitAddress(SBase); return true; } @@ -2323,36 +2432,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset); + return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr, + &Offset); } bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const { assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset, - /* Imm32Only */ true); + return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr, + &Offset, /* Imm32Only */ true); } -bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, - SDValue &SOffset) const { - return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr); +bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue &SOffset, SDValue &CPol) const { + bool ScaleOffset; + if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr, + /* Imm32Only */ false, &ScaleOffset)) + return false; + + CPol = CurDAG->getTargetConstant(ScaleOffset ? 
AMDGPU::CPol::SCAL : 0, + SDLoc(N), MVT::i32); + return true; } -bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, - SDValue &SOffset, - SDValue &Offset) const { - return SelectSMRD(Addr, SBase, &SOffset, &Offset); +bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr, + SDValue &SBase, SDValue &SOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset)) + return false; + + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(N), MVT::i32); + return true; } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const { - return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, + return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset, /* Imm32Only */ false, /* IsBuffer */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const { assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, + return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset, /* Imm32Only */ true, /* IsBuffer */ true); } @@ -2361,9 +2485,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, // Match the (soffset + offset) pair as a 32-bit register base and // an immediate offset. return N.getValueType() == MVT::i32 && - SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr, - &Offset, /* Imm32Only */ false, - /* IsBuffer */ true); + SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset, + /* SOffset*/ nullptr, &Offset, + /* Imm32Only */ false, /* IsBuffer */ true); } bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, @@ -3753,58 +3877,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +// Match lowered fpext from bf16 to f32. This is a bit operation extending +// a 16-bit value with 16 bits of zeroes at the LSB: +// +// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val))))) +// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true +// 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false +static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) { + if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST) + return SDValue(); + Op = Op.getOperand(0); + + IsExtractHigh = false; + if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) { + auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0)); + if (!Low16 || !Low16->isZero()) + return SDValue(); + Op = stripBitcast(Op.getOperand(1)); + if (Op.getValueType() != MVT::bf16) + return SDValue(); + return Op; + } + + if (Op.getValueType() != MVT::i32) + return SDValue(); + + if (Op.getOpcode() == ISD::AND) { + if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Mask->getZExtValue() == 0xffff0000) { + IsExtractHigh = true; + return Op.getOperand(0); + } + } + return SDValue(); + } + + if (Op.getOpcode() == ISD::SHL) { + if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Amt->getZExtValue() == 16) + return Op.getOperand(0); + } + } + + return SDValue(); +} + // The return value is not whether the match is possible (which it always is), // but whether or not a conversion is really used. 
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const { + unsigned &Mods, + MVT VT) const { Mods = 0; SelectVOP3ModsImpl(In, Src, Mods); + bool IsExtractHigh = false; if (Src.getOpcode() == ISD::FP_EXTEND) { Src = Src.getOperand(0); - assert(Src.getValueType() == MVT::f16); - Src = stripBitcast(Src); + } else if (VT == MVT::bf16) { + SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh); + if (!B16) + return false; + Src = B16; + } else + return false; - // Be careful about folding modifiers if we already have an abs. fneg is - // applied last, so we don't want to apply an earlier fneg. - if ((Mods & SISrcMods::ABS) == 0) { - unsigned ModsTmp; - SelectVOP3ModsImpl(Src, Src, ModsTmp); + if (Src.getValueType() != VT && + (VT != MVT::bf16 || Src.getValueType() != MVT::i32)) + return false; - if ((ModsTmp & SISrcMods::NEG) != 0) - Mods ^= SISrcMods::NEG; + Src = stripBitcast(Src); - if ((ModsTmp & SISrcMods::ABS) != 0) - Mods |= SISrcMods::ABS; - } + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + SelectVOP3ModsImpl(Src, Src, ModsTmp); - // op_sel/op_sel_hi decide the source type and source. - // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. - // If the source's op_sel is set, it picks the high half of the source - // register. + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; - Mods |= SISrcMods::OP_SEL_1; - if (isExtractHiElt(Src, Src)) { - Mods |= SISrcMods::OP_SEL_0; + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } - // TODO: Should we try to look for neg/abs here? - } + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from + // fp16. If the source's op_sel is set, it picks the high half of the source + // register. - // Prevent unnecessary subreg COPY to VGPR_16 - if (Src.getOpcode() == ISD::TRUNCATE && - Src.getOperand(0).getValueType() == MVT::i32) { - Src = Src.getOperand(0); - } - return true; + Mods |= SISrcMods::OP_SEL_1; + if (IsExtractHigh || + (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) { + Mods |= SISrcMods::OP_SEL_0; + + // TODO: Should we try to look for neg/abs here? 
} - return false; + // Prevent unnecessary subreg COPY to VGPR_16 + if (Src.getOpcode() == ISD::TRUNCATE && + Src.getOperand(0).getValueType() == MVT::i32) { + Src = Src.getOperand(0); + } + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - if (!SelectVOP3PMadMixModsImpl(In, Src, Mods)) + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16)) return false; SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; @@ -3813,7 +3993,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - SelectVOP3PMadMixModsImpl(In, Src, Mods); + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16)) + return false; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16); SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index acbab3d..5636d89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -19,6 +19,7 @@ #include "SIModeRegisterDefaults.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -162,10 +163,14 @@ private: bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, - SDValue &VOffset, SDValue &Offset) const; + SDValue &VOffset, SDValue &Offset, bool &ScaleOffset, + bool NeedIOffset = true) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; @@ -174,24 +179,31 @@ private: bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const; bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &SAddr, SDValue &Offset) const; + SDValue &SAddr, SDValue &Offset, + SDValue &CPol) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, + bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, bool IsBuffer = false, bool HasSOffset = false, - int64_t ImmOffset = 0) const; + int64_t ImmOffset = 0, + bool *ScaleOffset = nullptr) const; SDValue Expand32BitAddress(SDValue Addr) const; - bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, - SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false, bool HasSOffset = false, - int64_t ImmOffset = 0) const; - bool SelectSMRD(SDValue Addr, SDValue 
&SBase, SDValue *SOffset, - SDValue *Offset, bool Imm32Only = false) const; + bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue *SOffset, SDValue *Offset, + bool Imm32Only = false, bool IsBuffer = false, + bool HasSOffset = false, int64_t ImmOffset = 0, + bool *ScaleOffset = nullptr) const; + bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset, + SDValue *Offset, bool Imm32Only = false, + bool *ScaleOffset = nullptr) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; - bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const; - bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset, - SDValue &Offset) const; + bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const; + bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset, + SDValue &CPol) const; + bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase, + SDValue &SOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, @@ -246,11 +258,15 @@ private: bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const; + bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods, + MVT VT) const; bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, SDValue &Tbl) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e3ca09e..f25ce87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Library functions. These default to Expand, but we have instructions // for them. 
setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, - ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, - MVT::f32, Legal); + ISD::FROUNDEVEN, ISD::FTRUNC}, + {MVT::f16, MVT::f32}, Legal); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); @@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); - if (Subtarget->has16BitInsts()) + if (Subtarget->has16BitInsts()) { setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - else { + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); + } else { setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8975486..266dee1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { } /// Match a zero extend from a 32-bit value to 64-bits. -static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { +Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const { Register ZExtSrc; - if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) - return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); + if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc)))) + return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) - const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) return Register(); assert(Def->getNumOperands() == 3 && - MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); - if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) { return Def->getOperand(1).getReg(); } return Register(); } +/// Match a sign extend from a 32-bit value to 64-bits. +Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const { + Register SExtSrc; + if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc)))) + return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register(); + + // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 (G_ASHR %x, 31)) + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return Register(); + + assert(Def->getNumOperands() == 3 && + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + if (mi_match(Def->getOperand(2).getReg(), *MRI, + m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()), + m_SpecificICst(31)))) + return Def->getOperand(1).getReg(); + + if (VT->signBitIsZero(Reg)) + return matchZeroExtendFromS32(Reg); + + return Register(); +} + +/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it +/// is 32-bit. +Register +AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const { + return MRI->getType(Reg) == LLT::scalar(32) ? 
Reg + : matchZeroExtendFromS32(Reg); +} + +/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it +/// is 32-bit. +Register +AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const { + return MRI->getType(Reg) == LLT::scalar(32) ? Reg + : matchSignExtendFromS32(Reg); +} + +Register +AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg, + bool IsSigned) const { + if (IsSigned) + return matchSignExtendFromS32OrS32(Reg); + + return matchZeroExtendFromS32OrS32(Reg); +} + Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const { Register AnyExtSrc; if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc)))) @@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); if (isSGPR(SAddr)) { Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); - if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) { Addr = SAddr; VOffset = Off; } @@ -5223,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const { getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); unsigned Key = 0; - Register S32 = matchZeroExtendFromS32(*MRI, Src); + Register S32 = matchZeroExtendFromS32(Src); if (!S32) S32 = matchAnyExtendFromS32(Src); @@ -5296,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { }}; } +// Given \p Offset and the load specified by the \p Root operand, check if +// \p Offset is a multiple of the load byte size. If it is, update \p Offset +// to a pre-scaled value and return true. +bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root, + Register &Offset, + bool IsSigned) const { + if (!Subtarget->hasScaleOffset()) + return false; + + const MachineInstr &MI = *Root.getParent(); + MachineMemOperand *MMO = *MI.memoperands_begin(); + + if (!MMO->getSize().hasValue()) + return false; + + uint64_t Size = MMO->getSize().getValue(); + + Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned); + if (!OffsetReg) + OffsetReg = Offset; + + if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI)) + OffsetReg = Def->Reg; + + Register Op0; + MachineInstr *Mul; + bool ScaleOffset = + (isPowerOf2_64(Size) && + mi_match(OffsetReg, *MRI, + m_GShl(m_Reg(Op0), + m_any_of(m_SpecificICst(Log2_64(Size)), + m_Copy(m_SpecificICst(Log2_64(Size))))))) || + mi_match(OffsetReg, *MRI, + m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size), + m_Copy(m_SpecificICst(Size))))) || + mi_match( + OffsetReg, *MRI, + m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64, + m_Reg(Op0), m_SpecificICst(Size))) || + // Match G_AMDGPU_MAD_U64_U32 offset, c, 0 + (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) && + (Mul->getOpcode() == (IsSigned ? 
AMDGPU::G_AMDGPU_MAD_I64_I32 + : AMDGPU::G_AMDGPU_MAD_U64_U32) || + (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 && + VT->signBitIsZero(Mul->getOperand(2).getReg()))) && + mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) && + mi_match(Mul->getOperand(3).getReg(), *MRI, + m_GTrunc(m_any_of(m_SpecificICst(Size), + m_Copy(m_SpecificICst(Size))))) && + mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0))); + + if (ScaleOffset) + Offset = Op0; + + return ScaleOffset; +} + bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, - int64_t *Offset) const { + int64_t *Offset, + bool *ScaleOffset) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); @@ -5314,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, const GEPInfo &GEPI = AddrInfo[0]; std::optional<int64_t> EncodedImm; + if (ScaleOffset) + *ScaleOffset = false; + if (SOffset && Offset) { EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, /*HasSOffset=*/true); @@ -5321,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, AddrInfo.size() > 1) { const GEPInfo &GEPI2 = AddrInfo[1]; if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) { - if (Register OffsetReg = - matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) { + Register OffsetReg = GEPI2.SgprParts[1]; + if (ScaleOffset) + *ScaleOffset = + selectScaleOffset(Root, OffsetReg, false /* IsSigned */); + OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg); + if (OffsetReg) { Base = GEPI2.SgprParts[0]; *SOffset = OffsetReg; *Offset = *EncodedImm; @@ -5367,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, } if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) { - if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) { + Register OffsetReg = GEPI.SgprParts[1]; + if (ScaleOffset) + *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */); + OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg); + if (OffsetReg) { Base = GEPI.SgprParts[0]; *SOffset = OffsetReg; return true; @@ -5381,7 +5499,8 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { Register Base; int64_t Offset; - if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset)) + if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset, + /* ScaleOffset */ nullptr)) return std::nullopt; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, @@ -5412,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { Register Base, SOffset; - if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr)) + bool ScaleOffset; + if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr, + &ScaleOffset)) return std::nullopt; + unsigned CPol = ScaleOffset ? 
AMDGPU::CPol::SCAL : 0; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; + [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}}; } InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const { Register Base, SOffset; int64_t Offset; - if (!selectSmrdOffset(Root, Base, &SOffset, &Offset)) + bool ScaleOffset; + if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset)) return std::nullopt; + unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0; return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}}; } std::pair<Register, int> @@ -5490,7 +5616,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, - unsigned CPolBits) const { + unsigned CPolBits, + bool NeedIOffset) const { Register Addr = Root.getReg(); Register PtrBase; int64_t ConstOffset; @@ -5501,7 +5628,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0) { - if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, + if (NeedIOffset && + TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) { Addr = PtrBase; ImmOffset = ConstOffset; @@ -5514,11 +5642,15 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // saddr + large_offset -> saddr + // (voffset = large_offset & ~MaxOffset) + // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( - ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset; + if (NeedIOffset) { + std::tie(SplitImmOffset, RemainderOffset) = + TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal); + } - if (isUInt<32>(RemainderOffset)) { + if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset) + : isUInt<32>(RemainderOffset)) { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); Register HighBits = @@ -5528,12 +5660,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, HighBits) .addImm(RemainderOffset); + if (NeedIOffset) + return {{ + [=](MachineInstrBuilder &MIB) { + MIB.addReg(PtrBase); + }, // saddr + [=](MachineInstrBuilder &MIB) { + MIB.addReg(HighBits); + }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }, + }}; return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }, }}; } @@ -5565,18 +5707,33 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // It's possible voffset is an SGPR here, but the copy to VGPR will be // inserted later. 
- if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset, + Subtarget->hasSignedGVSOffset()); + if (Register VOffset = matchExtendFromS32OrS32( + PtrBaseOffset, Subtarget->hasSignedGVSOffset())) { + if (NeedIOffset) + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(SAddr); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(VOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(ImmOffset); + }, + [=](MachineInstrBuilder &MIB) { // cpol + MIB.addImm(CPolBits | + (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); + }}}; return {{[=](MachineInstrBuilder &MIB) { // saddr MIB.addReg(SAddr); }, [=](MachineInstrBuilder &MIB) { // voffset MIB.addReg(VOffset); }, - [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(ImmOffset); - }, [=](MachineInstrBuilder &MIB) { // cpol - MIB.addImm(CPolBits); + MIB.addImm(CPolBits | + (ScaleOffset ? AMDGPU::CPol::SCAL : 0)); }}}; } } @@ -5597,10 +5754,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) .addImm(0); + if (NeedIOffset) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol + }}; return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol }}; } @@ -5611,6 +5774,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); } @@ -5732,22 +5905,32 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) return std::nullopt; + unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */) + ? 
AMDGPU::CPol::SCAL + : 0; + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { int FI = LHSDef->MI->getOperand(1).getIndex(); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol }}; } if (!isSGPR(LHS)) + if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI)) + LHS = Def->Reg; + + if (!isSGPR(LHS)) return std::nullopt; return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr - [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol }}; } @@ -6895,6 +7078,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); } +void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + uint32_t V = MI.getOperand(2).getImm(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) + << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + MIB.addImm(V); +} + /// Convert from 2-bit value to enum values used for op_sel* source modifiers. void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 34bdf0a..fe9743d0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -232,8 +232,10 @@ private: InstructionSelector::ComplexRendererFns selectVINTERPModsHi(MachineOperand &Root) const; + bool selectScaleOffset(MachineOperand &Root, Register &Offset, + bool IsSigned) const; bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, - int64_t *Offset) const; + int64_t *Offset, bool *ScaleOffset) const; InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -254,10 +256,13 @@ private: selectScratchOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns - selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const; + selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits, + bool NeedIOffset = true) const; InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectGlobalSAddrCPol(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -411,6 +416,10 @@ private: void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + + void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB, const MachineInstr &MI, 
int OpIdx) const; @@ -421,6 +430,19 @@ private: // shift amount operand's `ShAmtBits` bits is unneeded. bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + /// Match a zero extend from a 32-bit value to 64-bits. + Register matchZeroExtendFromS32(Register Reg) const; + /// Match a sign extend from a 32-bit value to 64-bits. + Register matchSignExtendFromS32(Register Reg) const; + /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it + /// is 32-bit. + Register matchZeroExtendFromS32OrS32(Register Reg) const; + /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it + /// is 32-bit. + Register matchSignExtendFromS32OrS32(Register Reg) const; + /// Match either a sign or zero extend, depending on \p IsSigned, from a + /// 32-bit value to 64-bits, or \p Reg itself if it is 32-bit. + Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const; /// Match an any extend from a 32-bit value to 64-bit. Register matchAnyExtendFromS32(Register Reg) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e7bf88d..fedfa3f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, assert(Ty.isScalar()); unsigned Size = Ty.getSizeInBits(); + if (ST.hasVectorMulU64() && Size == 64) + return true; + unsigned NumParts = Size / 32; assert((Size % 32) == 0); assert(NumParts >= 2); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index ba66134..e187959 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -23,6 +23,8 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -115,126 +117,233 @@ public: VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; - bool isLaneMask(Register Reg) { - const RegisterBank *RB = MRI.getRegBankOrNull(Reg); - if (RB && RB->getID() == AMDGPU::VCCRegBankID) - return true; + bool isLaneMask(Register Reg); + std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode); + std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src); + Register getReadAnyLaneSrc(Register Src); + void replaceRegWithOrBuildCopy(Register Dst, Register Src); - const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); - return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); - } + bool tryEliminateReadAnyLane(MachineInstr &Copy); + void tryCombineCopy(MachineInstr &MI); + void tryCombineS1AnyExt(MachineInstr &MI); +}; - void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { - MI.eraseFromParent(); - if (Optional0 && isTriviallyDead(*Optional0, MRI)) - Optional0->eraseFromParent(); - } +bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == AMDGPU::VCCRegBankID) + return true; - std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) { - MachineInstr *MatchMI = MRI.getVRegDef(Src); - if 
(MatchMI->getOpcode() != Opcode) - return {nullptr, Register()}; - return {MatchMI, MatchMI->getOperand(1).getReg()}; - } + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); +} - void tryCombineCopy(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - // Skip copies of physical registers. - if (!Dst.isVirtual() || !Src.isVirtual()) - return; - - // This is a cross bank copy, sgpr S1 to lane mask. - // - // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) - // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) - // -> - // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) - if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { - auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); - assert(Trunc && MRI.getType(TruncS32Src) == S32 && - "sgpr S1 must be result of G_TRUNC of sgpr S32"); - - B.setInstr(MI); - // Ensure that truncated bits in BoolSrc are 0. - auto One = B.buildConstant({SgprRB, S32}, 1); - auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); - B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); - cleanUpAfterCombine(MI, Trunc); - return; - } +std::pair<MachineInstr *, Register> +AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { + MachineInstr *MatchMI = MRI.getVRegDef(Src); + if (MatchMI->getOpcode() != Opcode) + return {nullptr, Register()}; + return {MatchMI, MatchMI->getOperand(1).getReg()}; +} + +std::pair<GUnmerge *, int> +AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { + MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); + if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) + return {nullptr, -1}; + + Register RALSrc = ReadAnyLane->getOperand(1).getReg(); + if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI)) + return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; - // Src = G_AMDGPU_READANYLANE RALSrc - // Dst = COPY Src - // -> - // Dst = RALSrc - if (MRI.getRegBankOrNull(Dst) == VgprRB && - MRI.getRegBankOrNull(Src) == SgprRB) { - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (!RAL) - return; - - assert(MRI.getRegBank(RALSrc) == VgprRB); - MRI.replaceRegWith(Dst, RALSrc); - cleanUpAfterCombine(MI, RAL); - return; + return {nullptr, -1}; +} + +Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { + // Src = G_AMDGPU_READANYLANE RALSrc + auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); + if (RAL) + return RALSrc; + + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc + // LoSgpr = G_AMDGPU_READANYLANE LoVgpr + // HiSgpr = G_AMDGPU_READANYLANE HiVgpr + // Src G_MERGE_VALUES LoSgpr, HiSgpr + auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI); + if (Merge) { + unsigned NumElts = Merge->getNumSources(); + auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); + if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) + return {}; + + // Check if all elements are from same unmerge and there is no shuffling. + for (unsigned i = 1; i < NumElts; ++i) { + auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); + if (UnmergeI != Unmerge || (unsigned)IdxI != i) + return {}; } + return Unmerge->getSourceReg(); } - void tryCombineS1AnyExt(MachineInstr &MI) { - // %Src:sgpr(S1) = G_TRUNC %TruncSrc - // %Dst = G_ANYEXT %Src:sgpr(S1) - // -> - // %Dst = G_... 
%TruncSrc - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - if (MRI.getType(Src) != S1) - return; - - auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); - if (!Trunc) - return; - - LLT DstTy = MRI.getType(Dst); - LLT TruncSrcTy = MRI.getType(TruncSrc); - - if (DstTy == TruncSrcTy) { - MRI.replaceRegWith(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc + // SourceReg G_MERGE_VALUES ..., SrcRegIdx, ... + // ..., Src, ... = G_UNMERGE_VALUES SourceReg + auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI); + if (!UnMerge) + return {}; + + int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); + Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI); + if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources()) + return {}; + + Register SrcRegIdx = Merge->getSourceReg(Idx); + if (MRI.getType(Src) != MRI.getType(SrcRegIdx)) + return {}; + + auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE); + if (RALEl) + return RALElSrc; + + return {}; +} + +void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst, + Register Src) { + if (Dst.isVirtual()) + MRI.replaceRegWith(Dst, Src); + else + B.buildCopy(Dst, Src); +} + +bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( + MachineInstr &Copy) { + Register Dst = Copy.getOperand(0).getReg(); + Register Src = Copy.getOperand(1).getReg(); + + // Skip non-vgpr Dst + if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB) + : !TRI.isVGPR(MRI, Dst)) + return false; + + // Skip physical source registers and source registers with register class + if (!Src.isVirtual() || MRI.getRegClassOrNull(Src)) + return false; + + Register RALDst = Src; + MachineInstr &SrcMI = *MRI.getVRegDef(Src); + if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) + RALDst = SrcMI.getOperand(1).getReg(); + + Register RALSrc = getReadAnyLaneSrc(RALDst); + if (!RALSrc) + return false; + + B.setInstr(Copy); + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { + // Src = READANYLANE RALSrc Src = READANYLANE RALSrc + // Dst = Copy Src $Dst = Copy Src + // -> -> + // Dst = RALSrc $Dst = Copy RALSrc + replaceRegWithOrBuildCopy(Dst, RALSrc); + } else { + // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc + // Src = G_BITCAST RALDst Src = G_BITCAST RALDst + // Dst = Copy Src Dst = Copy Src + // -> -> + // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst + // Dst = NewVgpr $Dst = Copy NewVgpr + auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); + replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0)); + } + + eraseInstr(Copy, MRI); + return true; +} + +void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) { + if (tryEliminateReadAnyLane(MI)) + return; + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + // Skip copies of physical registers. + if (!Dst.isVirtual() || !Src.isVirtual()) + return; + + // This is a cross bank copy, sgpr S1 to lane mask. 
+ // + // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) + // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) + // -> + // %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1 + // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32) + if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { + auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); + assert(Trunc && MRI.getType(TruncS32Src) == S32 && + "sgpr S1 must be result of G_TRUNC of sgpr S32"); B.setInstr(MI); + // Ensure that truncated bits in BoolSrc are 0. + auto One = B.buildConstant({SgprRB, S32}, 1); + auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); + B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); + eraseInstr(MI, MRI); + } +} - if (DstTy == S32 && TruncSrcTy == S64) { - auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); - MRI.replaceRegWith(Dst, Unmerge.getReg(0)); - cleanUpAfterCombine(MI, Trunc); - return; - } +void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) { + // %Src:sgpr(S1) = G_TRUNC %TruncSrc + // %Dst = G_ANYEXT %Src:sgpr(S1) + // -> + // %Dst = G_... %TruncSrc + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + if (MRI.getType(Src) != S1) + return; + + auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); + if (!Trunc) + return; + + LLT DstTy = MRI.getType(Dst); + LLT TruncSrcTy = MRI.getType(TruncSrc); + + if (DstTy == TruncSrcTy) { + MRI.replaceRegWith(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; + } - if (DstTy == S64 && TruncSrcTy == S32) { - B.buildMergeLikeInstr(MI.getOperand(0).getReg(), - {TruncSrc, B.buildUndef({SgprRB, S32})}); - cleanUpAfterCombine(MI, Trunc); - return; - } + B.setInstr(MI); - if (DstTy == S32 && TruncSrcTy == S16) { - B.buildAnyExt(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + if (DstTy == S32 && TruncSrcTy == S64) { + auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); + MRI.replaceRegWith(Dst, Unmerge.getReg(0)); + eraseInstr(MI, MRI); + return; + } - if (DstTy == S16 && TruncSrcTy == S32) { - B.buildTrunc(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + if (DstTy == S64 && TruncSrcTy == S32) { + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), + {TruncSrc, B.buildUndef({SgprRB, S32})}); + eraseInstr(MI, MRI); + return; + } - llvm_unreachable("missing anyext + trunc combine"); + if (DstTy == S32 && TruncSrcTy == S16) { + B.buildAnyExt(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; } -}; + + if (DstTy == S16 && TruncSrcTy == S32) { + B.buildTrunc(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; + } + + llvm_unreachable("missing anyext + trunc combine"); +} // Search through MRI for virtual registers with sgpr register bank and S1 LLT. 
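An aside on why the anyext-of-trunc rewrites above are sound: G_ANYEXT makes no promise about the high bits, so every case only has to preserve the low bits that the s1 value actually carries. A minimal standalone check of that invariant in plain C++ (the values are illustrative; this is not LLVM API):

#include <cassert>
#include <cstdint>

int main() {
  // Pretend %TruncSrc:s64 was truncated to s1 and then any-extended.
  uint64_t TruncSrc = 0x00000000FFFF0001ull;

  // DstTy S32, TruncSrcTy S64: unmerge and keep the low half.
  uint32_t Lo = static_cast<uint32_t>(TruncSrc);
  assert((Lo & 1) == (TruncSrc & 1));

  // DstTy S64, TruncSrcTy S32: merge with an undef high half; any high
  // value is acceptable because anyext leaves those bits unspecified.
  uint64_t Undef = 0xAAAAAAAAull; // stand-in for G_IMPLICIT_DEF
  uint64_t Merged = (Undef << 32) | Lo;
  assert((Merged & 1) == (TruncSrc & 1));

  // DstTy S32, TruncSrcTy S16 (anyext) and the reverse (trunc) likewise
  // keep the low bit intact.
  uint16_t Src16 = 0x8001;
  assert((static_cast<uint32_t>(Src16) & 1) == (Src16 & 1));
  return 0;
}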
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 411159c..f471881 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -33,7 +33,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper( MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules) : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()), - MUI(MUI), RBI(RBI), RBLRules(RBLRules), + MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()), SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} @@ -56,6 +56,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +bool RegBankLegalizeHelper::executeInWaterfallLoop( + MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs) { + // Track use registers which have already been expanded with a readfirstlane + // sequence. This may have multiple uses if moving a sequence. + DenseMap<Register, Register> WaterfalledRegMap; + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction &MF = B.getMF(); + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); + unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg; + if (IsWave32) { + MovExecOpc = AMDGPU::S_MOV_B32; + MovExecTermOpc = AMDGPU::S_MOV_B32_term; + XorTermOpc = AMDGPU::S_XOR_B32_term; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; + ExecReg = AMDGPU::EXEC_LO; + } else { + MovExecOpc = AMDGPU::S_MOV_B64; + MovExecTermOpc = AMDGPU::S_MOV_B64_term; + XorTermOpc = AMDGPU::S_XOR_B64_term; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; + ExecReg = AMDGPU::EXEC; + } + +#ifndef NDEBUG + const int OrigRangeSize = std::distance(Range.begin(), Range.end()); +#endif + + MachineRegisterInfo &MRI = *B.getMRI(); + Register SaveExecReg = MRI.createVirtualRegister(WaveRC); + Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); + + // Don't bother using generic instructions/registers for the exec mask. + B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg); + + Register SavedExec = MRI.createVirtualRegister(WaveRC); + + // To insert the loop we need to split the block. Move everything before + // this point to a new block, and insert a new empty block before this + // instruction. + MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + MF.insert(MBBI, LoopBB); + MF.insert(MBBI, BodyBB); + MF.insert(MBBI, RestoreExecBB); + MF.insert(MBBI, RemainderBB); + + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(RestoreExecBB); + BodyBB->addSuccessor(LoopBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); + + MBB.addSuccessor(LoopBB); + RestoreExecBB->addSuccessor(RemainderBB); + + B.setInsertPt(*LoopBB, LoopBB->end()); + + // +-MBB:------------+ + // | ... 
| + // | %0 = G_INST_1 | + // | %Dst = MI %Vgpr | + // | %1 = G_INST_2 | + // | ... | + // +-----------------+ + // -> + // +-MBB-------------------------------+ + // | ... | + // | %0 = G_INST_1 | + // | %SaveExecReg = S_MOV_B32 $exec_lo | + // +----------------|------------------+ + // | /------------------------------| + // V V | + // +-LoopBB---------------------------------------------------------------+ | + // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | | + // | instead of executing for each lane, see if other lanes had | | + // | same value for %Vgpr and execute for them also. | | + // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | | + // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | | + // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | | + // | exec is active for lanes with the same "CurrentLane value" in Vgpr | | + // +----------------|-----------------------------------------------------+ | + // V | + // +-BodyBB------------------------------------------------------------+ | + // | %Dst = MI %CurrentLaneReg:sgpr(s32) | | + // | executed only for active lanes and written to Dst | | + // | $exec = S_XOR_B32 $exec, %SavedExec | | + // | set active lanes to 0 in SavedExec, lanes that did not write to | | + // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | | + // | SI_WATERFALL_LOOP LoopBB |-----| + // +----------------|--------------------------------------------------+ + // V + // +-RestoreExecBB--------------------------+ + // | $exec_lo = S_MOV_B32_term %SaveExecReg | + // +----------------|-----------------------+ + // V + // +-RemainderBB:----------------------+ + // | %1 = G_INST_2 | + // | ... | + // +---------------------------------- + + + // Move the instruction into the loop body. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); + + // Figure out the iterator range after splicing the instructions. + MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator(); + auto NewEnd = BodyBB->end(); + assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); + + B.setMBB(*LoopBB); + Register CondReg; + + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.all_uses()) { + Register OldReg = Op.getReg(); + if (!SGPROperandRegs.count(OldReg)) + continue; + + // See if we already processed this register in another instruction in + // the sequence. + auto OldVal = WaterfalledRegMap.find(OldReg); + if (OldVal != WaterfalledRegMap.end()) { + Op.setReg(OldVal->second); + continue; + } + + Register OpReg = Op.getReg(); + LLT OpTy = MRI.getType(OpReg); + + // TODO: support for agpr + assert(MRI.getRegBank(OpReg) == VgprRB); + Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy}); + buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI); + + // Build the comparison(s), CurrentLaneReg == OpReg. + unsigned OpSize = OpTy.getSizeInBits(); + unsigned PartSize = (OpSize % 64 == 0) ? 
64 : 32; + LLT PartTy = LLT::scalar(PartSize); + unsigned NumParts = OpSize / PartSize; + SmallVector<Register, 8> OpParts; + SmallVector<Register, 8> CurrentLaneParts; + + if (NumParts == 1) { + OpParts.push_back(OpReg); + CurrentLaneParts.push_back(CurrentLaneReg); + } else { + auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg); + auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg); + for (unsigned i = 0; i < NumParts; ++i) { + OpParts.push_back(UnmergeOp.getReg(i)); + CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i)); + } + } + + for (unsigned i = 0; i < NumParts; ++i) { + Register CmpReg = MRI.createVirtualRegister(VccRB_S1); + B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]); + + if (!CondReg) + CondReg = CmpReg; + else + CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0); + } + + Op.setReg(CurrentLaneReg); + + // Make sure we don't re-process this register again. + WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg())); + } + } + + // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection. + Register CondRegLM = + MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)}); + B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg); + + // Update EXEC, save the original EXEC value to SavedExec. + B.buildInstr(AndSaveExecOpc) + .addDef(SavedExec) + .addReg(CondRegLM, RegState::Kill); + MRI.setSimpleHint(SavedExec, CondRegLM); + + B.setInsertPt(*BodyBB, BodyBB->end()); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1. + B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec); + + // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use + // s_cbranch_scc0? + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. + B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); + + // Save the EXEC mask before the loop. + B.setInsertPt(MBB, MBB.end()); + B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg); + + // Restore the EXEC mask after the loop. + B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin()); + B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg); + + // Set the insert point after the original instruction, so any new + // instructions will be in the remainder. + B.setInsertPt(*RemainderBB, RemainderBB->begin()); + + return true; +} + void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy) { MachineFunction &MF = B.getMF(); @@ -391,7 +609,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, switch (Mapping.LoweringMethod) { case DoNotLower: - return; + break; case VccExtToSel: return lowerVccExtToSel(MI); case UniExtToSel: { @@ -527,7 +745,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } } - // TODO: executeInWaterfallLoop(... 
WaterfallSgprs) + if (!WaterfallSgprs.empty()) { + MachineBasicBlock::iterator I = MI.getIterator(); + executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs); + } } LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { @@ -539,6 +760,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Vgpr16: return LLT::scalar(16); case Sgpr32: + case Sgpr32_WF: case Sgpr32Trunc: case Sgpr32AExt: case Sgpr32AExtBoolInReg: @@ -577,6 +799,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case VgprV2S32: return LLT::fixed_vector(2, 32); case SgprV4S32: + case SgprV4S32_WF: case VgprV4S32: case UniInVgprV4S32: return LLT::fixed_vector(4, 32); @@ -650,6 +873,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { return VccRB; case Sgpr16: case Sgpr32: + case Sgpr32_WF: case Sgpr64: case Sgpr128: case SgprP1: @@ -662,6 +886,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprV2S16: case SgprV2S32: case SgprV4S32: + case SgprV4S32_WF: case SgprB32: case SgprB64: case SgprB96: @@ -923,6 +1148,14 @@ void RegBankLegalizeHelper::applyMappingSrc( } break; } + // sgpr waterfall, scalars and vectors + case Sgpr32_WF: + case SgprV4S32_WF: { + assert(Ty == getTyFromID(MethodIDs[i])); + if (RB != SgprRB) + SgprWaterfallOperandRegs.insert(Reg); + break; + } // sgpr and vgpr scalars with extend case Sgpr32AExt: { // Note: this ext allows S1, and it is meant to be combined away. diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index 08cc7d4..db965d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -32,6 +32,7 @@ class RegBankLegalizeHelper { const MachineUniformityInfo &MUI; const RegisterBankInfo &RBI; const RegBankLegalizeRules &RBLRules; + const bool IsWave32; const RegisterBank *SgprRB; const RegisterBank *VgprRB; const RegisterBank *VccRB; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a60855c..5a6ad40 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -529,7 +529,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ICMP}) .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) - .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}); + .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) + .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}}); addRulesForGOpcs({G_FCMP}) .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}}) @@ -666,11 +667,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, // clang-format off addRulesForGOpcs({G_LOAD}) .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}}) + .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}}) .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}}) .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}}) .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}}) .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}) + .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}}) + .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}}) + .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}}) .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, 
SplitLoad}}) .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}}) @@ -684,6 +689,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads) .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads) .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads) + .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}}) .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}}) .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}}) .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load @@ -698,11 +704,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}}); // clang-format on - addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector) - .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}); + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB) + .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); addRulesForGOpcs({G_STORE}) .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}}) @@ -716,7 +726,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_PTR_ADD}) .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) - .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}); + .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}) + .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}}); addRulesForGOpcs({G_INTTOPTR}) .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 7243d75..1391440 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -188,7 +188,11 @@ enum RegBankLLTMappingApplyID { Sgpr32Trunc, - // Src only modifiers: waterfalls, extends + // Src only modifiers: execute in waterfall loop if divergent + Sgpr32_WF, + SgprV4S32_WF, + + // Src only modifiers: extends Sgpr32AExt, Sgpr32AExtBoolInReg, Sgpr32SExt, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f1caf24..c5a1d9e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Special case for s_mul_u64. There is not a vector equivalent of // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector // multiplications. 
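For reference, the decomposition that applyMappingSMULU64 has to produce can be checked with ordinary integers. A standalone sketch (the helper name is made up; only the arithmetic mirrors the lowering):

#include <cassert>
#include <cstdint>

// Decompose a 64x64->64 multiply into 32-bit pieces:
//   lo(a*b) = lo32(a_lo * b_lo)
//   hi(a*b) = hi32(a_lo * b_lo) + lo32(a_lo * b_hi) + lo32(a_hi * b_lo)
static uint64_t mul64ViaMul32(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint64_t LoLo = uint64_t(ALo) * BLo;                  // full 64-bit product
  uint32_t Hi = uint32_t(LoLo >> 32) + ALo * BHi + AHi * BLo; // wraps mod 2^32
  return (uint64_t(Hi) << 32) | uint32_t(LoLo);
}

int main() {
  assert(mul64ViaMul32(0xDEADBEEFCAFEBABEull, 0x0123456789ABCDEFull) ==
         0xDEADBEEFCAFEBABEull * 0x0123456789ABCDEFull);
  return 0;
}

On targets with a native vector 64-bit multiply (see the hasVectorMulU64() guard added just below) none of this is needed and the 64-bit G_MUL stays intact.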
- if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { + if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL && + DstTy.getSizeInBits() == 64) { applyMappingSMULU64(B, OpdMapper); return; } @@ -3500,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyMappingMAD_64_32(B, OpdMapper); return; case AMDGPU::G_PREFETCH: { - if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) { + if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) { MI.eraseFromParent(); return; } Register PtrReg = MI.getOperand(0).getReg(); unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); - if (PtrBank == AMDGPU::VGPRRegBankID) { + if (PtrBank == AMDGPU::VGPRRegBankID && + (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) { + // Cannot do I$ prefetch with divergent pointer. MI.eraseFromParent(); return; } unsigned AS = MRI.getType(PtrReg).getAddressSpace(); - if (!AMDGPU::isFlatGlobalAddrSpace(AS) && - AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + if ((!AMDGPU::isFlatGlobalAddrSpace(AS) && + AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) || + (!Subtarget.hasSafeSmemPrefetch() && + (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + !MI.getOperand(3).getImm() /* I$ prefetch */))) { MI.eraseFromParent(); return; } @@ -3973,7 +3979,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; } else { - OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); + if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64()) + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + else + OpdsMapping[0] = + getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); @@ -5170,6 +5180,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_ds_load_tr16_b128: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr6_b96: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: @@ -5432,6 +5448,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: + return getDefaultMappingVOP(MI); default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index a8e1967..f580f43 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -159,7 +159,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // If the inputs are tied and the same register, we can shortcut and // directly replace the register. 
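Returning to the G_PREFETCH mapping a few hunks up: the keep-or-erase conditions are easier to audit folded into one predicate. A hedged standalone summary (enum and function names here are illustrative, not the in-tree API):

#include <cassert>

enum AddrSpace { FLAT, GLOBAL, CONSTANT, CONSTANT_32BIT, PRIVATE };

// Mirrors applyMappingImpl: divergent pointers need the new VMEM prefetch
// instructions and must be data prefetches; I$ prefetches and 32-bit
// constant pointers still require the safe SMEM prefetch feature.
bool keepPrefetch(bool HasVmemPref, bool HasSafeSmemPref, bool DivergentPtr,
                  bool IsDataPrefetch, AddrSpace AS) {
  if (!HasSafeSmemPref && !HasVmemPref)
    return false;
  if (DivergentPtr && (!HasVmemPref || !IsDataPrefetch))
    return false; // cannot do I$ prefetch with a divergent pointer
  if (AS != FLAT && AS != GLOBAL && AS != CONSTANT && AS != CONSTANT_32BIT)
    return false;
  if (!HasSafeSmemPref && (AS == CONSTANT_32BIT || !IsDataPrefetch))
    return false;
  return true;
}

int main() {
  // A gfx1250-style target: VMEM prefetch, no safe SMEM prefetch.
  assert(keepPrefetch(true, false, true, true, GLOBAL));   // divergent data: keep
  assert(!keepPrefetch(true, false, true, false, GLOBAL)); // divergent I$: erase
  assert(!keepPrefetch(true, false, false, true, CONSTANT_32BIT));
  return 0;
}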
- if (Src2->getReg() != CopySrcReg) { + if (!Src2->isReg() || Src2->getReg() != CopySrcReg || + Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { LLVM_DEBUG( dbgs() << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 1e44be8..6878744 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -61,6 +61,7 @@ protected: bool EnableRealTrue16Insts = false; bool HasBF16TransInsts = false; bool HasBF16ConversionInsts = false; + bool HasBF16PackedInsts = false; bool HasMadMixInsts = false; bool HasMadMacF32Insts = false; bool HasDsSrc2Insts = false; @@ -209,6 +210,8 @@ public: return HasBF16ConversionInsts; } + bool hasBF16PackedInsts() const { return HasBF16PackedInsts; } + bool hasMadMixInsts() const { return HasMadMixInsts; } diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 1cc717b..7207c25 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -13,8 +13,9 @@ let WantsRoot = true in { def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; + def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>; - def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>; + def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>; } class True16D16Table <string hiOp, string loOp> { @@ -464,6 +465,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> : + FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { + let has_vdst = 0; + let has_data = 0; + let mayLoad = 1; + let mayStore = 1; + let VM_CNT = 0; + let LGKM_CNT = 0; +} + +multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { + def "" : FLAT_Prefetch_Pseudo<opName>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let enabled_saddr = 1; + } +} + +multiclass FLAT_Global_Prefetch_Pseudo<string opName> { + let is_flat_global = 1, has_saddr = 1 in { + def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let enabled_saddr = 1; + } + } +} + class FlatScratchInst <string sv_op, string mode> { string SVOp = sv_op; string Mode = mode; @@ -1162,6 +1194,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">; defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; +let SubtargetPredicate = isGFX125xOnly in { +defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>; +defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>; +defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo 
<"flat_load_monitor_b128", VReg_128>; + +defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>; +defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>; +defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>; +} // End SubtargetPredicate = isGFX125xOnly + let SubtargetPredicate = isGFX12Plus in { let Uses = [EXEC, M0] in { defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; @@ -1218,6 +1260,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in "global_atomic_pk_add_f16", VGPR_32, v2f16 >; +let SubtargetPredicate = HasVmemPrefInsts in { + defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">; + defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">; +} + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -1228,6 +1275,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN (inst $vaddr, $offset) >; +class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) @@ -1249,8 +1301,8 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value >; class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), - (inst $saddr, $voffset, $offset, 0, $in) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1264,8 +1316,8 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT >; class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), - (inst $saddr, $voffset, $offset, (i32 0)) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (inst $saddr, $voffset, $offset, $cpol) >; class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1278,6 +1330,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + +class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))), + (inst $saddr, $voffset, $offset, $cpol) +>; + class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), @@ -1443,24 
+1505,24 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, >; class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), - (inst $vaddr, $saddr, $offset, 0) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (inst $vaddr, $saddr, $offset, $cpol) >; class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset)), - (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset) + (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)), + (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset, $cpol) >; class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset), vt:$in)), - (inst $vaddr, $saddr, $offset, 0, $in) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)), + (inst $vaddr, $saddr, $offset, $cpol, $in) >; class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), - (inst $vaddr, $saddr, $offset, 0) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (inst $vaddr, $saddr, $offset, $cpol) >; multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { @@ -1473,6 +1535,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp } } +multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadSignedPat_CPOL<inst, node, vt> { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatSignedLoadPat_D16 <inst, node, vt> { let AddedComplexity = 10; @@ -2009,6 +2081,16 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>; } +let OtherPredicates = [isGFX125xOnly] in { + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>; + + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; +} // End SubtargetPredicate = isGFX125xOnly + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -2138,6 +2220,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f } // End 
OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch] +def PrefetchLoc: SDNodeXForm<timm, [{ + uint32_t V = N->getZExtValue(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32); +}]>; + +def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; + }]; +} + +def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS || + ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); + }]; +} + +multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> { + def : GCNPat < + (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25); + } + + def : GCNPat < + (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30); + } +} + +multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> { + def : GCNPat < + (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol) + >; + + def : GCNPat < + (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> { + let AddedComplexity = 11; + } +} + +let SubtargetPredicate = HasVmemPrefInsts in { + defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>; + + // Patterns for forced vector prefetch with rw = 1. 
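PrefetchLoc above and renderPrefetchLoc in the instruction selector share one transform: ISD::PREFETCH locality (3 = keep closest, 0 = no locality) is inverted into a cache-policy scope, then clamped away from CU scope when CU-scope prefetches can fault. A standalone sketch using the constants from SIDefines.h (the function name is illustrative):

#include <algorithm>
#include <cassert>
#include <cstdint>

constexpr uint32_t SCOPE_SHIFT = 3, SCOPE_MASK = 0x3;
constexpr uint32_t SCOPE_CU = 0 << SCOPE_SHIFT, SCOPE_SE = 1 << SCOPE_SHIFT,
                   SCOPE_DEV = 2 << SCOPE_SHIFT, SCOPE_SYS = 3 << SCOPE_SHIFT;

// High locality maps to the nearest (CU) scope, no locality to system scope.
uint32_t prefetchLocToCPol(uint32_t Loc, bool HasSafeCUPrefetch) {
  uint32_t V = (SCOPE_MASK - (Loc & SCOPE_MASK)) << SCOPE_SHIFT;
  if (!HasSafeCUPrefetch)
    V = std::max(V, SCOPE_SE); // CU scope is unsafe
  return V;
}

int main() {
  assert(prefetchLocToCPol(3, true) == SCOPE_CU);
  assert(prefetchLocToCPol(3, false) == SCOPE_SE); // clamped
  assert(prefetchLocToCPol(1, true) == SCOPE_DEV);
  assert(prefetchLocToCPol(0, false) == SCOPE_SYS);
  return 0;
}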
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>; + + + // Patterns for target intrinsics + defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>; + defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>; +} // End SubtargetPredicate = HasVmemPrefInsts + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -3210,6 +3363,17 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>; defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>; +defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; +defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; + +defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + +defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7d6723a..334afd3 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI) { - return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR); + return STI->isSGPRClass(RC) + ? SGPR + : (STI->isAGPRClass(RC) + ? AGPR + : (STI->isVectorSuperClass(RC) ? AVGPR : VGPR)); } void GCNRegPressure::inc(unsigned Reg, diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 3749b6d..ea33a22 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -29,43 +29,57 @@ class raw_ostream; class SlotIndex; struct GCNRegPressure { - enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS }; + enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; GCNRegPressure() { clear(); } - bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; } + bool empty() const { + return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; + } void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } - /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p - /// UnifiedVGPRFile + /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure + /// dependent upon \p UnifiedVGPRFile unsigned getVGPRNum(bool UnifiedVGPRFile) const { if (UnifiedVGPRFile) { - return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR]) - : Value[VGPR]; + return Value[AGPR] + ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR]) + : Value[VGPR] + Value[AVGPR]; } - return std::max(Value[VGPR], Value[AGPR]); + // AVGPR assignment priority is based on the width of the register. Account + // AVGPR pressure as VGPR. 
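The max() on the next line and the unified-file helper that follows implement that accounting. Numerically, assuming an ArchVGPR allocation granule of 4 (the real granule is target-dependent):

#include <algorithm>
#include <cassert>

// Round V up to a multiple of A, as llvm::alignTo does for these counts.
constexpr unsigned alignToGranule(unsigned V, unsigned A) {
  return (V + A - 1) / A * A;
}

int main() {
  unsigned ArchVGPRs = 5, AGPRs = 8, AVGPRs = 2;
  // Unified file: AVGPRs are assumed to be allocated as ArchVGPRs, and the
  // ArchVGPR block is granule-aligned before the AGPR block is appended.
  unsigned Unified = alignToGranule(ArchVGPRs + AVGPRs, 4) + AGPRs;
  assert(Unified == 16);
  // Split files: the two blocks do not share budget, so pressure is the
  // larger of the two, again counting AVGPRs on the VGPR side.
  unsigned Split = std::max(ArchVGPRs + AVGPRs, AGPRs);
  assert(Split == 8);
  return 0;
}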
+ return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]); } /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs - /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file. + /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified + /// VGPR file. inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs, - unsigned NumAGPRs) { - return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + + unsigned NumAGPRs, + unsigned NumAVGPRs) { + + // Assume AVGPRs will be assigned as VGPRs. + return alignTo(NumArchVGPRs + NumAVGPRs, + AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + NumAGPRs; } - /// \returns the ArchVGPR32 pressure - unsigned getArchVGPRNum() const { return Value[VGPR]; } + /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be + /// allocated as VGPR + unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; } /// \returns the AccVGPR32 pressure unsigned getAGPRNum() const { return Value[AGPR]; } + /// \returns the AVGPR32 pressure + unsigned getAVGPRNum() const { return Value[AVGPR]; } unsigned getVGPRTuplesWeight() const { - return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]); + return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR], + Value[TOTAL_KINDS + AGPR]); } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index a655308..ce1ce68 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1911,14 +1911,12 @@ void PreRARematStage::rematerialize() { for (auto &[DefMI, Remat] : Rematerializations) { MachineBasicBlock::iterator InsertPos(Remat.UseMI); Register Reg = DefMI->getOperand(0).getReg(); - unsigned SubReg = DefMI->getOperand(0).getSubReg(); unsigned DefRegion = MIRegion.at(DefMI); // Rematerialize DefMI to its use block. - TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI, - *DAG.TRI); + TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, + AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); Remat.RematMI = &*std::prev(InsertPos); - Remat.RematMI->getOperand(0).setSubReg(SubReg); DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); // Update region boundaries in regions we sinked from (remove defining MI) @@ -2064,14 +2062,13 @@ void PreRARematStage::finalizeGCNSchedStage() { MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); MachineBasicBlock *MBB = RegionBB[DefRegion]; Register Reg = RematMI.getOperand(0).getReg(); - unsigned SubReg = RematMI.getOperand(0).getSubReg(); // Re-rematerialize MI at the end of its original region. Note that it may // not be rematerialized exactly in the same position as originally within // the region, but it should not matter much. 
- TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI); + TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, + *DAG.TRI); MachineInstr *NewMI = &*std::prev(InsertPos); - NewMI->getOperand(0).setSubReg(SubReg); DAG.LIS->InsertMachineInstrInMaps(*NewMI); auto UseRegion = MIRegion.find(Remat.UseMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 9a2bab1..0a0a107 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return getMaxNumVGPRs(MF.getFunction()); } +std::pair<unsigned, unsigned> +GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { + const unsigned MaxVectorRegs = getMaxNumVGPRs(F); + + unsigned MaxNumVGPRs = MaxVectorRegs; + unsigned MaxNumAGPRs = 0; + + // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, + // a wave may have up to 512 total vector registers combining together both + // VGPRs and AGPRs. Hence, in an entry function without calls and without + // AGPRs used within it, it is possible to use the whole vector register + // budget for VGPRs. + // + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. + if (hasGFX90AInsts()) { + unsigned MinNumAGPRs = 0; + const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); + const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; + + // TODO: The lower bound should probably force the number of required + // registers up, overriding amdgpu-waves-per-eu. + std::tie(MinNumAGPRs, MaxNumAGPRs) = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR, + /*OnlyFirstRequired=*/true); + + if (MinNumAGPRs == DefaultNumAGPR.first) { + // Default to splitting half the registers if AGPRs are required. + MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; + } else { + // Align to accum_offset's allocation granularity. + MinNumAGPRs = alignTo(MinNumAGPRs, 4); + + MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); + } + + // Clamp values to be inbounds of our limits, and ensure min <= max. + + MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); + MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); + + MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); + MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); + + assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && + MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && + "invalid register counts"); + } else if (hasMAIInsts()) { + // On gfx908 the number of AGPRs always equals the number of VGPRs. 
+ MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; + } + + return std::pair(MaxNumVGPRs, MaxNumAGPRs); +} + void GCNSubtarget::adjustSchedDependency( SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 407d79a..785ede3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -123,6 +123,7 @@ protected: bool HasSMemRealTime = false; bool HasIntClamp = false; bool HasFmaMixInsts = false; + bool HasFmaMixBF16Insts = false; bool HasMovrel = false; bool HasVGPRIndexMode = false; bool HasScalarDwordx3Loads = false; @@ -244,7 +245,9 @@ protected: bool HasVMEMtoScalarWriteHazard = false; bool HasSMEMtoVectorWriteHazard = false; bool HasInstFwdPrefetchBug = false; + bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; + bool HasSafeCUPrefetch = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -265,8 +268,10 @@ protected: bool HasIEEEMinimumMaximumInsts = false; bool HasMinimum3Maximum3F32 = false; bool HasMinimum3Maximum3F16 = false; + bool HasMin3Max3PKF16 = false; bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; + bool HasAddSubU64Insts = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -460,6 +465,8 @@ public: return HasFmaMixInsts; } + bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; } + bool hasCARRY() const { return true; } @@ -985,8 +992,12 @@ public: bool hasPrefetch() const { return GFX12Insts; } + bool hasVmemPrefInsts() const { return HasVmemPrefInsts; } + bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; } + bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1167,6 +1178,9 @@ public: bool hasFlatGVSMode() const { return FlatGVSMode; } + // FLAT GLOBAL VOffset is signed + bool hasSignedGVSOffset() const { return GFX1250Insts; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -1303,7 +1317,7 @@ public: bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } - bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; } + bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } @@ -1384,6 +1398,8 @@ public: return HasMinimum3Maximum3F16; } + bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; } + bool hasTanhInsts() const { return HasTanhInsts; } bool hasAddPC64Inst() const { return GFX1250Insts; } @@ -1497,6 +1513,18 @@ public: bool hasVOPD3() const { return GFX1250Insts; } + // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. + bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. + bool hasVectorMulU64() const { return GFX1250Insts; } + + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. + bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } + + // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions. + bool hasPkMinMax3Insts() const { return GFX1250Insts; } + // \returns true if target has S_SETPRIO_INC_WG instruction. 
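Back in GCNSubtarget::getMaxNumVectorRegs above, the clamping sequence is worth working through once with concrete numbers. A sketch of the default gfx90a case, assuming the full 512-register combined budget and no "amdgpu-agpr-alloc" attribute (attribute parsing and the accum_offset alignment are elided):

#include <algorithm>
#include <cassert>

int main() {
  const unsigned MaxVectorRegs = 512, TotalNumAGPRs = 256, TotalNumVGPRs = 256;
  // Default: split half the combined budget to each side.
  unsigned MinNumAGPRs = MaxVectorRegs / 2, MaxNumAGPRs = MaxVectorRegs / 2;

  // Clamp to be in bounds and keep min <= max, as in the patch.
  MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
  MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
  unsigned MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
  MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);

  // Each file may use its full 256 registers under this default split.
  assert(MaxNumVGPRs == 256 && MaxNumAGPRs == 256);
  return 0;
}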
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } @@ -1639,6 +1667,10 @@ public: return getMaxNumVGPRs(F); } + /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number + /// of waves per execution unit required for the function \p MF. + std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const; + /// \returns Maximum number of VGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of VGPRs explicitly /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 429ce0e0..a33dbfa 100644 --- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -270,5 +270,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); } } + finalizeBundles(MF); return false; } diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index 2a3b42e..eff5b0a 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -138,7 +138,6 @@ void R600PassConfig::addPreSched2() { void R600PassConfig::addPreEmitPass() { addPass(createR600MachineCFGStructurizerPass()); addPass(createR600ExpandSpecialInstrsPass()); - addPass(&FinalizeMachineBundlesID); addPass(createR600Packetizer()); addPass(createR600ControlFlowFinalizer()); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 3902d4c..40b8bcd 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -392,11 +392,13 @@ enum CPol { TH_ATOMIC_CASCADE = 4, // Cascading vs regular // Scope - SCOPE = 0x3 << 3, // All Scope bits - SCOPE_CU = 0 << 3, - SCOPE_SE = 1 << 3, - SCOPE_DEV = 2 << 3, - SCOPE_SYS = 3 << 3, + SCOPE_SHIFT = 3, + SCOPE_MASK = 0x3, + SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits + SCOPE_CU = 0 << SCOPE_SHIFT, + SCOPE_SE = 1 << SCOPE_SHIFT, + SCOPE_DEV = 2 << SCOPE_SHIFT, + SCOPE_SYS = 3 << SCOPE_SHIFT, NV = 1 << 5, // Non-volatile bit diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e5d1eaa..b77da4d 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1062,9 +1062,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat( switch (OpTy) { case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0); break; case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1); break; default: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index bc0fd8d..8d51ec6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -874,13 +874,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); - if (Subtarget->hasScalarSMulU64()) + if (Subtarget->hasVectorMulU64()) + setOperationAction(ISD::MUL, MVT::i64, Legal); + else if (Subtarget->hasScalarSMulU64()) setOperationAction(ISD::MUL, MVT::i64, Custom); if 
(Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); - if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch()) + if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts()) setOperationAction(ISD::PREFETCH, MVT::Other, Custom); if (Subtarget->hasIEEEMinimumMaximumInsts()) { @@ -944,6 +946,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasBF16PackedInsts()) { + setOperationAction( + {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA}, + MVT::v2bf16, Legal); + } + if (Subtarget->hasBF16TransInsts()) { setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal); } @@ -1053,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const { // where this is OK to use. bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { - return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || - (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && - SrcVT.getScalarType() == MVT::f16 && + return DestVT.getScalarType() == MVT::f32 && + ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + SrcVT.getScalarType() == MVT::f16) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() && + SrcVT.getScalarType() == MVT::bf16)) && // TODO: This probably only requires no input flushing? denormalModeIsFlushAllF32(DAG.getMachineFunction()); } @@ -1467,6 +1477,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1540,7 +1556,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_s_prefetch_data: { + case Intrinsic::amdgcn_s_prefetch_data: + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); @@ -1591,10 +1609,16 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: @@ -4432,19 +4456,28 @@ SDValue 
SITargetLowering::lowerSET_ROUNDING(SDValue Op, } SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { - if (Op->isDivergent()) + if (Op->isDivergent() && + (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4))) + // Cannot do I$ prefetch with divergent pointer. return SDValue(); switch (cast<MemSDNode>(Op)->getAddressSpace()) { case AMDGPUAS::FLAT_ADDRESS: case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS_32BIT: break; + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + if (Subtarget->hasSafeSmemPrefetch()) + break; + [[fallthrough]]; default: return SDValue(); } + // I$ prefetch + if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4)) + return SDValue(); + return Op; } @@ -5415,6 +5448,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); + if (ST.hasAddSubU64Insts()) { + auto I = BuildMI(*BB, MI, DL, + TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 + : AMDGPU::V_SUB_U64_e64), + Dest.getReg()) + .add(Src0) + .add(Src1) + .addImm(0); // clamp + TII->legalizeOperands(*I); + MI.eraseFromParent(); + return BB; + } + if (IsAdd && ST.hasLshlAddU64Inst()) { auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), Dest.getReg()) @@ -13633,6 +13679,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_trig_preop: + case Intrinsic::amdgcn_tanh: case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: case Intrinsic::amdgcn_sqrt: @@ -14046,7 +14093,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, case ISD::FMAXIMUMNUM: case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: - return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()); + return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) || + (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16()); case ISD::FMINIMUM: case ISD::FMAXIMUM: return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) || @@ -14131,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || (VT == MVT::f16 && Subtarget->has16BitInsts()) || + (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) || + (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) || (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 9faf497..520c321 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -552,7 +552,7 @@ public: (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { // FLAT and SCRATCH instructions may access scratch. Other VMEM // instructions do not. 
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratchThroughFlat(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } @@ -565,7 +565,6 @@ public: bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -2108,8 +2107,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { assert(TII->isFLAT(MI)); - // All flat instructions use the VMEM counter. - assert(TII->usesVM_CNT(MI)); + // All flat instructions use the VMEM counter except prefetch. + if (!TII->usesVM_CNT(MI)) + return false; // If there are no memory operands then conservatively assume the flat // operation may access VMEM. @@ -2159,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -// This is a flat memory operation. Check to see if it has memory tokens for -// either scratch or FLAT. -bool SIInsertWaitcnts::mayAccessScratchThroughFlat( - const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // SCRATCH instructions always access scratch. - if (TII->isFLATScratch(MI)) - return true; - - // GLOBAL instructions never access scratch. - if (TII->isFLATGlobal(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access scratch. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves scratch. - return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { - unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; - }); -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); @@ -2295,9 +2269,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } - // A Flat memory operation must access at least one address space. - assert(FlatASCount); - // This is a flat memory operation that access both VMEM and LDS, so note it // - it will require that both the VM and LGKM be flushed to zero if it is // pending when a VM or LGKM dependency occurs. 
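The SIDefines.h hunk above replaces the hand-written scope encodings with named SCOPE_SHIFT/SCOPE_MASK constants, so the scope field of a cache-policy (cpol) immediate can be manipulated without magic numbers. A minimal standalone sketch of the bit manipulation this enables; the constants mirror the enum, but the helper names are illustrative, not the in-tree API:

#include <cstdint>

namespace CPol {
// Mirrors the refactored scope bits in SIDefines.h.
constexpr unsigned SCOPE_SHIFT = 3;
constexpr unsigned SCOPE_MASK = 0x3;
constexpr unsigned SCOPE = SCOPE_MASK << SCOPE_SHIFT; // all scope bits
constexpr unsigned SCOPE_CU = 0 << SCOPE_SHIFT;
constexpr unsigned SCOPE_SE = 1 << SCOPE_SHIFT;
} // namespace CPol

// Return the unshifted scope field (0..3) of a cpol immediate.
unsigned getScopeField(uint64_t CPolImm) {
  return (CPolImm >> CPol::SCOPE_SHIFT) & CPol::SCOPE_MASK;
}

// Replace the scope field with a pre-shifted value such as CPol::SCOPE_SE --
// the kind of rewrite the GFX12.5 store finalization later in this patch
// performs when it promotes a CU-scope scratch store to SE scope.
uint64_t setScopeField(uint64_t CPolImm, unsigned PreShiftedScope) {
  return (CPolImm & ~uint64_t(CPol::SCOPE)) | (PreShiftedScope & CPol::SCOPE);
}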
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 571f3ef..2aa6b4e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(DstHi); } break; + + case AMDGPU::V_MAX_BF16_PSEUDO_e64: + assert(ST.hasBF16PackedInsts()); + MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16)); + MI.addOperand(MachineOperand::CreateImm(0)); // op_sel + MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo + MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi + auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); + Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1); + auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1); + break; } + return true; } @@ -2733,49 +2746,47 @@ static MachineInstr *swapImmOperands(MachineInstr &MI, } bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, - const MachineOperand *MO0, unsigned OpIdx1, - const MachineOperand *MO1) const { + unsigned OpIdx1) const { const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0]; const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1]; - const TargetRegisterClass *DefinedRC1 = - OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr; - const TargetRegisterClass *DefinedRC0 = - OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr; unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + const MachineOperand &MO0 = MI.getOperand(OpIdx0); + const MachineOperand &MO1 = MI.getOperand(OpIdx1); + // Swapping doesn't breach the constant bus or literal limits. // It may move a literal to a position other than src0; this is not allowed // pre-gfx10. However, most test cases need literals in Src0 for VOP. // FIXME: After gfx9, a literal can be in a position other than Src0 if (isVALU(MI)) { - if ((int)OpIdx0 == Src0Idx && !MO0->isReg() && - !isInlineConstant(*MO0, OpInfo1)) + if ((int)OpIdx0 == Src0Idx && !MO0.isReg() && + !isInlineConstant(MO0, OpInfo1)) return false; - if ((int)OpIdx1 == Src0Idx && !MO1->isReg() && - !isInlineConstant(*MO1, OpInfo0)) + if ((int)OpIdx1 == Src0Idx && !MO1.isReg() && + !isInlineConstant(MO1, OpInfo0)) return false; } - if ((int)OpIdx1 != Src0Idx && MO0->isReg()) { - if (!DefinedRC1) + if ((int)OpIdx1 != Src0Idx && MO0.isReg()) { + if (OpInfo1.RegClass == -1) return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx1, *MO0) && - (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1)); + return isLegalRegOperand(MI, OpIdx1, MO0) && - (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1)); wait + (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1)); } - if ((int)OpIdx0 != Src0Idx && MO1->isReg()) { - if (!DefinedRC0) + if ((int)OpIdx0 != Src0Idx && MO1.isReg()) { + if (OpInfo0.RegClass == -1) return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; - return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) && - isLegalRegOperand(MI, OpIdx0, *MO1); + return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) && + isLegalRegOperand(MI, OpIdx0, MO1); } // No need to check 64-bit literals since swapping does not bring new // 64-bit literals into current instruction to fold to 32-bit - return isImmOperandLegal(MI, OpIdx1, *MO0); + return isImmOperandLegal(MI, OpIdx1, MO0); } MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, @@ -2797,12 +2808,12 @@ MachineInstr
*SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, static_cast<int>(Src1Idx) && "inconsistency with findCommutedOpIndices"); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - MachineOperand &Src1 = MI.getOperand(Src1Idx); - if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) { + if (!isLegalToSwap(MI, Src0Idx, Src1Idx)) return nullptr; - } + MachineInstr *CommutedMI = nullptr; + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); if (Src0.isReg() && Src1.isReg()) { // Be sure to copy the source modifiers to the right place. CommutedMI = @@ -4238,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } +bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { + if (!isFLAT(MI) || isFLATGlobal(MI)) + return false; + + // If scratch is not initialized, we can never access it. + if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) + return false; + + // SCRATCH instructions always access scratch. + if (isFLATScratch(MI)) + return true; + + // If there are no memory operands then conservatively assume the flat + // operation may access scratch. + if (MI.memoperands_empty()) + return true; + + // TODO: Does this need to be taught how to read noalias.addrspace? + + // See if any memory operand specifies an address space that involves scratch. + return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { + unsigned AS = Memop->getAddrSpace(); + return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + }); +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an @@ -7361,6 +7398,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MUL_U64: + if (ST.hasVectorMulU64()) { + NewOpcode = AMDGPU::V_MUL_U64_e64; + break; + } // Split s_mul_u64 in 32-bit vector multiplications. splitScalarSMulU64(Worklist, Inst, MDT); Inst.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 800ea9a..e042b59 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -197,8 +197,7 @@ protected: AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const; bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, - const MachineOperand *fromMO, unsigned toIdx, - const MachineOperand *toMO) const; + unsigned toIdx) const; MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; @@ -679,6 +678,12 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with + /// SCRATCH_ memory operands. + /// Conservatively correct; will return true if \p MI cannot be proven + /// to not hit scratch.
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index bd4995b..83b0490 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">; def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; +def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">; +def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">; def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">; def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">; @@ -2863,9 +2865,11 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>; +def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>; def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>; @@ -2873,10 +2877,12 @@ def VOP_I16_I32 : VOPProfile <[i16, i32, untyped, untyped]>; def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; +def VOP_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, untyped]>; def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>; def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>; def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; +def VOP_V2BF16_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, v2bf16]>; def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>; def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>; @@ -2912,8 +2918,10 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>; def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>; +def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; +def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>; def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d05be8f..54fa192 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in def : ClampPat<V_MAX_F16_t16_e64, f16>; let SubtargetPredicate = UseFakeTrue16Insts in 
def : ClampPat<V_MAX_F16_fake16_e64, f16>; +// FIXME-TRUE16: Pseudo expansion of this won't work with True16. +let True16Predicate = UseFakeTrue16Insts in +def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>; let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -1903,6 +1906,13 @@ def : GCNPat < >; } +let SubtargetPredicate = HasBF16PackedInsts in { +def : GCNPat < + (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))), + (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0, + $src0_modifiers, $src0, DSTCLAMP.ENABLE) +>; +} // End SubtargetPredicate = HasBF16PackedInsts /********** ================================ **********/ /********** Floating point absolute/negative **********/ diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 9f61bf8..9509199 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, MachineRegisterInfo &MRI = MF.getRegInfo(); BitVector ReservedRegs = TRI->getReservedRegs(MF); BitVector NonWwmAllocMask(TRI->getNumRegs()); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); // FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future // to have a balanced allocation between WWM values and per-thread vector @@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, NumRegs = std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs); - auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); // Try to use the highest available registers for now. Later after // vgpr-regalloc, they can be shifted to the lowest range. unsigned I = 0; @@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, // Reserve an arbitrary register and report the error. TRI->markSuperRegs(RegMask, AMDGPU::VGPR0); MF.getFunction().getContext().emitError( - "can't find enough VGPRs for wwm-regalloc"); + "cannot find enough VGPRs for wwm-regalloc"); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index f0be204..9a1448f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -81,11 +81,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm; - if (!MFMAVGPRForm && ST.hasGFX90AInsts() && - ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && - !mayUseAGPRs(F)) - MayNeedAGPRs = false; // We will select all MAI with VGPR operands. + MayNeedAGPRs = ST.hasMAIInsts(); + if (ST.hasGFX90AInsts()) { + // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection + // should be separated from availability of AGPRs + if (MFMAVGPRForm || + (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && + !mayUseAGPRs(F))) + MayNeedAGPRs = false; // We will select all MAI with VGPR operands. 
+ } if (AMDGPU::isChainCC(CC)) { // Chain functions don't receive an SP from their caller, but are free to diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3212060..607825e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,7 +321,8 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { + virtual bool finalizeStore(MachineBasicBlock::iterator &MI, + bool Atomic) const { return false; }; @@ -602,7 +603,8 @@ public: bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + bool finalizeStore(MachineBasicBlock::iterator &MI, + bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -704,16 +706,16 @@ void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning)); } -/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA. -/// If this tag isn't present, or if it has no meaningful values, returns \p -/// Default. Otherwise returns all the address spaces concerned by the MMRA. -static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, - SIAtomicAddrSpace Default) { - static constexpr StringLiteral FenceASPrefix = "amdgpu-as"; +/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA. +/// If this tag isn't present, or if it has no meaningful values, returns +/// \c std::nullopt; otherwise returns the address spaces specified by the +/// metadata. +static std::optional<SIAtomicAddrSpace> +getSynchronizeAddrSpaceMD(const MachineInstr &MI) { + static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as"; auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); if (!MMRA) - return Default; + return std::nullopt; SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; for (const auto &[Prefix, Suffix] : MMRA) { @@ -726,7 +728,10 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, diagnoseUnknownMMRAASName(MI, Suffix); } - return (Result != SIAtomicAddrSpace::NONE) ? Result : Default; + if (Result == SIAtomicAddrSpace::NONE) + return std::nullopt; + + return Result; } } // end anonymous namespace @@ -903,12 +908,19 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = *ScopeOrNone; - if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || - ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) { + // We currently expect refineOrderingAS to be the only place that + // can refine the AS ordered by the fence. + // If that changes, we need to review the semantics of that function + // in case it needs to preserve certain address spaces.
reportUnsupported(MI, "Unsupported atomic address space"); return std::nullopt; } + auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI); + if (SynchronizeAS) + OrderingAddrSpace = *SynchronizeAS; + return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); } @@ -2541,11 +2553,25 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::expandSystemScopeStore( - MachineBasicBlock::iterator &MI) const { +bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, + bool Atomic) const { MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); - if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) - return insertWaitsBeforeSystemScopeStore(MI); + if (!CPol) + return false; + + const unsigned Scope = CPol->getImm() & CPol::SCOPE; + + // GFX12.0 only: Extra waits needed before system scope stores. + if (!ST.hasGFX1250Insts()) { + if (!Atomic && Scope == CPol::SCOPE_SYS) + return insertWaitsBeforeSystemScopeStore(MI); + return false; + } + + // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address + // space. + if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU) + return setScope(MI, CPol::SCOPE_SE); return false; } @@ -2664,6 +2690,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); + Changed |= CC->finalizeStore(MI, /*Atomic=*/true); return Changed; } @@ -2676,7 +2703,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->expandSystemScopeStore(MI); + Changed |= CC->finalizeStore(MI, /*Atomic=*/false); return Changed; } @@ -2687,11 +2714,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, AtomicPseudoMIs.push_back(MI); bool Changed = false; - // Refine fenced address space based on MMRAs. - // - // TODO: Should we support this MMRA on other atomic operations? - auto OrderingAddrSpace = - getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace()); + const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace(); if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 84cfa87..f3acc5c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); } -std::pair<unsigned, unsigned> -SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { - const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF); - - unsigned MaxNumVGPRs = MaxVectorRegs; - unsigned MaxNumAGPRs = 0; - - // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, - // a wave may have up to 512 total vector registers combining together both - // VGPRs and AGPRs. Hence, in an entry function without calls and without - // AGPRs used within it, it is possible to use the whole vector register - // budget for VGPRs. - // - // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split - // register file accordingly. 
- if (ST.hasGFX90AInsts()) { - unsigned MinNumAGPRs = 0; - const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); - const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - - const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; - - // TODO: Move this logic into subtarget on IR function - // - // TODO: The lower bound should probably force the number of required - // registers up, overriding amdgpu-waves-per-eu. - std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute( - MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR, - /*OnlyFirstRequired=*/true); - - if (MinNumAGPRs == DefaultNumAGPR.first) { - // Default to splitting half the registers if AGPRs are required. - MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; - } else { - // Align to accum_offset's allocation granularity. - MinNumAGPRs = alignTo(MinNumAGPRs, 4); - - MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); - } - - // Clamp values to be inbounds of our limits, and ensure min <= max. - - MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); - MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); - - MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); - MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); - - assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && - MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && - "invalid register counts"); - } else if (ST.hasMAIInsts()) { - // On gfx908 the number of AGPRs always equals the number of VGPRs. - MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; - } - - return std::pair(MaxNumVGPRs, MaxNumAGPRs); -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::MODE); @@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve VGPRs/AGPRs. // - auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); for (const TargetRegisterClass *RC : regclasses()) { if (RC->isBaseClass() && isVGPRClass(RC)) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 0008e5f..5508f07 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -90,11 +90,6 @@ public: /// spilling is needed. MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; - /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number - /// of waves per execution unit required for the function \p MF. - std::pair<unsigned, unsigned> - getMaxNumVectorRegs(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 0039d2f..218841d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> let TSFlags{2} = HasVGPR; let TSFlags{3} = HasAGPR; let TSFlags{4} = HasSGPR; + + // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block) + // to decide which registers to try to assign first. 
Usually, this RegisterClass priority is given + // very high weight, if not the highest, when considering which VirtReg to allocate next. + // + // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to + // assign more constrained RegisterClasses first. As a result, we prioritize register classes with + // more 32-bit tuples (e.g. VReg_512) over classes with fewer tuples (e.g. VGPR_32). + // + // The interesting case is vector registers on architectures which have ARegs, VRegs, and AVRegs. + // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained + // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the + // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor + // is used for scaling of the bit (i.e. 1 << 4). + field int BaseClassPriority = 1; + field int BaseClassScaleFactor = 16; + } multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1, @@ -575,7 +592,7 @@ let HasVGPR = 1 in { def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 255), (sequence "VGPR%u_HI16", 0, 255)))> { - let AllocationPriority = 2; + let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; @@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types // Identical to VGPR_32 except it only contains the low 128 (Lo128) registers. def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 127))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // AccVGPR 32-bit registers def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add (sequence "AGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; - let AllocationPriority = !sub(numRegs, 1); + + // Since we only have 5 bits for the RegisterClass AllocationPriority, and since we use the + // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result + // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for + // registers with numRegs 17+ we give SizePriority of 15. In practice, there is only one + // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512}, + // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
+ defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15)); + + let AllocationPriority = !add(SizePriority, !mul(BaseClassPriority, BaseClassScaleFactor)); let Weight = numRegs; } // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let HasVGPR = 1 in { + let HasVGPR = 1, BaseClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6 def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; + let BaseClassPriority = 0; let Size = 32; } } // End GeneratePressureSet = 0 @@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3 // aligned base register. multiclass AVRegClass<int numRegs, list<ValueType> regTypes, dag vregList, dag aregList> { - let HasVGPR = 1, HasAGPR = 1 in { + let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 0850c41..4bda51d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -856,16 +856,18 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>; def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), (prefetch node:$ptr, node:$rw, node:$loc, node:$type), - [{ return !N->getOperand(1)->isDivergent();}]> { + [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> { let GISelPredicateCode = [{ - return isInstrUniform(MI); + return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch(); }]; } def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; -def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">; +let WantsRoot = true in { + def SMRDSgpr : ComplexPattern<iPTR, 3, "SelectSMRDSgpr", [], [], -3>; + def SMRDSgprImm : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm", [], []>; +} def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">; @@ -906,15 +908,15 @@ multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag, let SubtargetPredicate = isNotGFX9Plus; } def : GCNPat < - (frag (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> { + (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> { let SubtargetPredicate = isGFX9Plus; } // 4.
SGPR+IMM offset def : GCNPat < - (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> { + (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> { let SubtargetPredicate = isGFX9Plus; } @@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val // 2. SGPR offset def : GCNPat < - (node (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{ + (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{ let SubtargetPredicate = isGFX12Plus; } // 3. SGPR+IMM offset def : GCNPat < - (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{ + (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{ let SubtargetPredicate = isGFX12Plus; } @@ -1150,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> { } defm : SMPrefetchPat<"INST", i32imm_zero>; +let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case. defm : SMPrefetchPat<"DATA", i32imm_one>; let SubtargetPredicate = isGFX12Plus in { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 030a6e1..550ec9d 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -925,6 +925,17 @@ let isAdd = 1 in { defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">; } +let isReMaterializable = 1 in { +let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in { +defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>; +// We don't actually have something like V_SUBREV_U64 so V_SUB_U64 can't be treated as commutable. +let isCommutable = 0 in +defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>; +} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in +defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>; +} // End isReMaterializable = 1 + } // End isCommutable = 1 // These are special and do not read the exec mask. @@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName, VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>, VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>; +multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> : + VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>; + multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName, string asmName> { defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>, @@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>; defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>; defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>; +defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>; +defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>; +defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>; //===----------------------------------------------------------------------===// // GFX11. 
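The SIRegisterInfo.td hunks above pack two pieces of information into the 5-bit AllocationPriority field: bit 4 (BaseClassPriority scaled by 16) marks the more constrained VGPR-only and AGPR-only classes, and bits 0-3 carry the compressed size priority. A small host-side model of that encoding, written to the rules stated in the comments (a sketch, not the TableGen itself):

// Compressed size priority for a tuple class covering NumRegs 32-bit regs.
unsigned sizePriority(unsigned NumRegs) {
  if (NumRegs <= 14)
    return NumRegs - 1; // e.g. VReg_64 (NumRegs = 2) -> 1
  if (NumRegs <= 16)
    return 14;          // VReg_512 (NumRegs = 16)
  return 15;            // VReg_1024 (NumRegs = 32)
}

// BaseClassPriority is 1 for VGPR-only/AGPR-only tuple classes and 0 for AV
// classes; BaseClassScaleFactor = 16 places it in bit 4 of the 5-bit field.
unsigned allocationPriority(unsigned NumRegs, unsigned BaseClassPriority) {
  constexpr unsigned BaseClassScaleFactor = 16; // 1 << 4
  return sizePriority(NumRegs) + BaseClassPriority * BaseClassScaleFactor;
}

Under this model VReg_512 gets 14 + 16 = 30 while the less constrained AV_512 gets 14, so the allocator tries the AGPR-only and VGPR-only classes before the combined AV classes, as the comments intend.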
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index aee2f2c..b6f9568 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1918,6 +1918,7 @@ let AssemblerPredicate = isGFX11Plus in { // These instructions differ from GFX12 variant by supporting DPP: defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; +defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 9feea36..95fcd4a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> { bit UseTiedOutput = useTiedOutput; + defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret; + defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret; + defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret; + dag srcs = - (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, - FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + (ins FP16InputMods:$src0_modifiers, Src0RC:$src0, + FP16InputMods:$src1_modifiers, Src1RC:$src1, + FP16InputMods:$src2_modifiers, Src2RC:$src2); dag dpp_srcs = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + FP16InputMods:$src2_modifiers, Src2RC:$src2); // FIXME: Clamp0 misbehaves with the non-default vdst_in // following it. For now workaround this by requiring clamp @@ -144,48 +148,59 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>; def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>; } // End SubtargetPredicate = HasVOP3PInsts -let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in { +let isCommutable = 1, FPDPRounding = 1 in { +let SubtargetPredicate = HasMin3Max3PKF16 in { +defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmin3>; +defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmax3>; +} + +let SubtargetPredicate = HasMinimum3Maximum3PKF16 in { defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>; defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>; } +} // End isCommutable = 1, FPDPRounding = 1 // TODO: Make sure we're doing the right thing with denormals. Note // that FMA and MAD will differ. multiclass MadFmaMixPats<SDPatternOperator fma_like, Instruction mix_inst, Instruction mixlo_inst, - Instruction mixhi_inst> { + Instruction mixhi_inst, + ValueType VT = f16, + ValueType vecVT = v2f16> { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); + defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. // TODO: Could we use a predicate to inspect src1/2/3 instead? 
def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < (AMDGPUclamp (build_vector - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))), - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))), - (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))), + (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0, $hi_src1_modifiers, $hi_src1, $hi_src2_modifiers, $hi_src2, DSTCLAMP.ENABLE, @@ -197,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), @@ -207,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), DSTCLAMP.NONE, @@ -217,9 +232,9 @@ 
multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, @@ -234,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, let True16Predicate = p in { def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, @@ -246,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, @@ -261,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, let True16Predicate = UseRealTrue16Insts in { def : GCNPat < - (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1), - (v2f16 (mixlo_inst $src0_modifiers, $src0, + (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1), + (vecVT (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16))) + (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16))) >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, 
$elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; } // end True16Predicate } @@ -353,6 +368,67 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; } +let SubtargetPredicate = HasFmaMixBF16Insts in { +let isCommutable = 1 in { + +let isReMaterializable = 1 in +defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>; + +let FPDPRounding = 1 in { +defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>; + +let ClampLo = 0, ClampHi = 1 in { +defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>; +} +} // End FPDPRounding = 1 +} // End isCommutable = 1 + +defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>; +} // End SubtargetPredicate = HasFmaMixBF16Insts + +def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> { + let HasModifiers = 0; +} + +let isCommutable = 1, isReMaterializable = 1 in { +let SubtargetPredicate = HasPkAddMinMaxInsts in { +defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>; +} +let SubtargetPredicate = HasPkMinMax3Insts in { +defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>; +defm V_PK_MAX3_U16 : VOP3PInst<"v_pk_max3_u16", PK_ADD_MINMAX_Profile>; +defm V_PK_MIN3_I16 : VOP3PInst<"v_pk_min3_i16", PK_ADD_MINMAX_Profile>; +defm V_PK_MIN3_U16 : VOP3PInst<"v_pk_min3_u16", PK_ADD_MINMAX_Profile>; +} +} // End isCommutable = 1, isReMaterializable = 1 + +// TODO: Extend pattern to select op_sel and op_sel_hi. 
+class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2, + VOP3P_Pseudo inst, + ValueType vt = inst.Pfl.Src0VT, + RegisterOperand RC = getVCSrcForVT<vt>.ret> : GCNPat < + (ThreeOpFrag<op1, op2> vt:$src0, vt:$src1, vt:$src2), + (inst SRCMODS.OP_SEL_1, RC:$src0, SRCMODS.OP_SEL_1, RC:$src1, + SRCMODS.OP_SEL_1, RC:$src2, DSTCLAMP.NONE, 0) +>; + +let SubtargetPredicate = HasPkAddMinMaxInsts in { +def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>; +def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>; +def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>; +def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>; +} + +let SubtargetPredicate = HasPkMinMax3Insts in { +def : ThreeOp_OpSelClampPats<smax, smax, V_PK_MAX3_I16>; +def : ThreeOp_OpSelClampPats<umax, umax, V_PK_MAX3_U16>; +def : ThreeOp_OpSelClampPats<smin, smin, V_PK_MIN3_I16>; +def : ThreeOp_OpSelClampPats<umin, umin, V_PK_MIN3_U16>; +} + // Defines patterns that extract signed 4bit from each Idx[0]. foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src), @@ -1153,6 +1229,20 @@ let isCommutable = 1, isReMaterializable = 1 in { let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; + + let SubtargetPredicate = HasBF16PackedInsts in { + defm V_PK_ADD_BF16 : VOP3PInst<"v_pk_add_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fadd>; + defm V_PK_MUL_BF16 : VOP3PInst<"v_pk_mul_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fmul>; + defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>; + defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>; + defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>; + + // Scalar pseudo used to emulate AMDGPUClamp. + // Expanded to V_PK_MAX_NUM_BF16 with unused high half. + // FIXME-TRUE16: Pseudo expansion of this won't work with True16. 
+ let True16Predicate = UseFakeTrue16Insts in + defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>; + } } // End isCommutable = 1, isReMaterializable = 1 def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; @@ -2157,6 +2247,8 @@ multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op> multiclass VOP3P_Real_gfx12<bits<8> op> : VOP3P_Real_Base<GFX12Gen, op>; +multiclass VOP3P_Real_gfx1250<bits<8> op> : VOP3P_Real_Base<GFX1250Gen, op>; + multiclass VOP3P_Real_with_name_gfx12<bits<8> op, string backing_ps_name = NAME, string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> : @@ -2165,6 +2257,35 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op, defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; +defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>; +defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>; +defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>; + +defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>; +defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>; +defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>; +defm V_PK_ADD_MIN_U16 : VOP3P_Real_gfx1250<0x2e>; +defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>; +defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>; +defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>; +defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>; +defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>; +defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>; +defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>; +defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>; +defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>; +defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>; +defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>; +defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>; +defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>; + +defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; +defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; +defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; + +let AssemblerPredicate = isGFX1250Plus in +def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; + defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
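For reference, the ThreeOp_OpSelClampPats definitions above fold a two-instruction sequence into one packed operation. Assuming the usual ThreeOpFrag convention of op2 applied to the result of op1, each 16-bit lane of the fused instructions computes the following (a plain C++ model of the semantics, not target code):

#include <algorithm>
#include <cstdint>

// Per-lane model of v_pk_add_max_i16 (ThreeOpFrag<add, smax>): a signed max
// of (a + b) against c. The add wraps; the patterns use DSTCLAMP.NONE, so no
// saturation is applied.
int16_t pk_add_max_i16_lane(int16_t a, int16_t b, int16_t c) {
  return std::max<int16_t>(static_cast<int16_t>(a + b), c);
}

// Per-lane model of v_pk_max3_i16 (ThreeOpFrag<smax, smax>): a three-way
// signed maximum.
int16_t pk_max3_i16_lane(int16_t a, int16_t b, int16_t c) {
  return std::max(std::max(a, b), c);
}

A v2i16 operand applies this independently to the low and high 16-bit halves.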