Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 46
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 180
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 99
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 57
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 65
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 63
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 51
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 310
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h | 40
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 57
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 34
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 188
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 57
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 95
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 57
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 61
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 45
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 342
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 147
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 124
57 files changed, 1873 insertions, 533 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 2a36f3d..8a0c4ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
+def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
+ "HasFmaMixBF16Insts",
+ "true",
+ "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
+>;
+
def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
"HasIEEEMinimumMaximumInsts",
"true",
@@ -262,12 +268,30 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
"S_INST_PREFETCH instruction causes shader to hang"
>;
+def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts",
+ "HasVmemPrefInsts",
+ "true",
+ "Has flat_prefect_b8 and global_prefetch_b8 instructions"
+>;
+
def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
"HasSafeSmemPrefetch",
"true",
"SMEM prefetches do not fail on illegal address"
>;
+def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
+ "HasSafeCUPrefetch",
+ "true",
+ "VMEM CU scope prefetches do not fail on illegal address"
+>;
+
+def FeatureCUStores : SubtargetFeature<"cu-stores",
+ "HasCUStores",
+ "true",
+ "Whether SCOPE_CU stores can be used on GFX12.5"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -1365,6 +1389,9 @@ def FeatureAddSubU64Insts
: SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
"Has v_add_u64 and v_sub_u64 instructions">;
+def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
+ "true", "Has v_mad_u32 instruction">;
+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
@@ -1970,6 +1997,7 @@ def FeatureISAVersion12 : FeatureSet<
def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
+ FeatureCUStores,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
@@ -2007,6 +2035,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureBF16ConversionInsts,
FeatureBF16PackedInsts,
FeatureCvtPkF16F32Inst,
+ FeatureFmaMixBF16Insts,
FeatureMin3Max3PKF16,
FeatureMinimum3Maximum3PKF16,
FeaturePrngInst,
@@ -2020,8 +2049,10 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureFlatBufferGlobalAtomicFaddF64Inst,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureKernargPreload,
+ FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
FeatureAddSubU64Insts,
+ FeatureMadU32Inst,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
]>;
@@ -2402,7 +2433,7 @@ def HasAtomicFMinFMaxF64FlatInsts :
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
- AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+ AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>;
def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>;
@@ -2545,6 +2576,10 @@ def HasFmaakFmamkF64Insts :
Predicate<"Subtarget->hasFmaakFmamkF64Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+def HasAddMinMaxInsts :
+ Predicate<"Subtarget->hasAddMinMaxInsts()">,
+ AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
+
def HasPkAddMinMaxInsts :
Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
@@ -2599,6 +2634,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
+def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
+ AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
+
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<(all_of FeatureDLInsts)>;
@@ -2797,6 +2835,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
AssemblerPredicate<(all_of FeatureXF32Insts)>;
+def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">,
+ AssemblerPredicate<(all_of FeatureVmemPrefInsts)>;
+
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
@@ -2806,6 +2847,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
+def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">,
+ AssemblerPredicate<(all_of FeatureMadU32Inst)>;
+
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
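
Each SubtargetFeature above synthesizes a boolean member on GCNSubtarget (its second argument, e.g. "HasVmemPrefInsts") plus a getter that the AssemblerPredicate definitions query. A minimal sketch of consuming one of the new features from target C++ code, assuming only the hasVmemPrefInsts() getter already named in the HasVmemPrefInsts predicate above:

#include "GCNSubtarget.h"

// True when the target was built with +vmem-pref-insts, i.e. the
// flat_prefetch_b8/global_prefetch_b8 instructions are available.
static bool canUseVmemPrefetch(const GCNSubtarget &ST) {
  return ST.hasVmemPrefInsts();
}
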
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 4b3dc37..6681393 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
@@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
}
- if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
+ if (ST.isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
+ if (isGFX1250(ST) && ST.hasCUStores()) {
+ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
+ }
// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
// un-evaluatable at this point so it cannot be conditionally checked here.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index dedee46..59cc1df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,7 +13,6 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
@@ -1383,7 +1382,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
- &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});
+ &AAIndirectCallInfo::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 5f19837..a9278c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -89,10 +89,6 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));
-static bool hasUnsafeFPMath(const Function &F) {
- return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
-}
-
class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
@@ -104,7 +100,6 @@ public:
const DominatorTree *DT;
const UniformityInfo &UA;
const DataLayout &DL;
- const bool HasUnsafeFPMath;
const bool HasFP32DenormalFlush;
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
@@ -117,7 +112,6 @@ public:
const DominatorTree *DT, const UniformityInfo &UA)
: F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
DT(DT), UA(UA), DL(F.getDataLayout()),
- HasUnsafeFPMath(hasUnsafeFPMath(F)),
HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
DenormalMode::getPreserveSign()) {}
@@ -637,8 +631,7 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
return false;
// v_rsq_f32 gives 1ulp
- return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
- SqrtOp->getFPAccuracy() >= 1.0f;
+ return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -664,7 +657,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
IRBuilder<>::FastMathFlagGuard Guard(Builder);
Builder.setFastMathFlags(DivFMF | SqrtFMF);
- if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
+ if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
canIgnoreDenormalInput(Den, CtxI)) {
Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
// -1.0 / sqrt(x) -> fneg(rsq(x))
@@ -680,7 +673,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
-// allowed with unsafe-fp-math or afn.
+// allowed with afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need to provide ULP 1.0
Value *
@@ -803,9 +796,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
//
// With rcp:
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
-// allowed with unsafe-fp-math or afn.
+// allowed with afn.
//
-// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
//
// With fdiv.fast:
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
@@ -843,7 +836,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
RsqOp = SqrtOp->getOperand(0);
}
- // Inaccurate rcp is allowed with unsafe-fp-math or afn.
+ // Inaccurate rcp is allowed with afn.
//
// Defer to codegen to handle this.
//
@@ -852,7 +845,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
// expansion of afn to codegen. The current interpretation is so aggressive we
// don't need any pre-consideration here when we have better information. A
// more conservative interpretation could use handling here.
- const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
+ const bool AllowInaccurateRcp = DivFMF.approxFunc();
if (!RsqOp && AllowInaccurateRcp)
return false;
@@ -2026,7 +2019,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
// We're trying to handle the fast-but-not-that-fast case only. The lowering
// of fast llvm.sqrt will give the raw instruction anyway.
- if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
+ if (SqrtFMF.approxFunc())
return false;
const float ReqdAccuracy = FPOp->getFPAccuracy();
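
The running theme of this file's hunks is replacing the function-wide "unsafe-fp-math" attribute with per-instruction fast-math flags. A minimal sketch of the per-instruction test that now gates these rewrites, using standard LLVM IR APIs (not code from this patch):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"

// An fdiv/sqrt may take the inaccurate rcp/rsq path only if the
// operation itself carries the afn (approximate-functions) flag.
static bool allowsInaccurateRcp(const llvm::Instruction &I) {
  if (const auto *FPOp = llvm::dyn_cast<llvm::FPMathOperator>(&I))
    return FPOp->hasApproxFunc();
  return false;
}
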
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 891d362..992572f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -137,9 +137,15 @@ def gi_global_offset :
def gi_global_saddr :
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
GIComplexPatternEquiv<GlobalSAddr>;
+def gi_global_saddr_cpol :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
+ GIComplexPatternEquiv<GlobalSAddrCPol>;
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
+def gi_global_saddr_no_ioffset :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
+ GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
@@ -446,5 +452,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
GISDNodeXFormEquiv<as_hw_round_mode>;
+def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">,
+ GISDNodeXFormEquiv<PrefetchLoc>;
+
def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
GISDNodeXFormEquiv<MFMALdScaleXForm>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 2991778..19b8757 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -204,7 +204,7 @@ MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const {
for (auto &Op : Node->operands())
Dims.push_back(Dims.getDocument()->getNode(
- uint64_t(mdconst::extract<ConstantInt>(Op)->getZExtValue())));
+ mdconst::extract<ConstantInt>(Op)->getZExtValue()));
return Dims;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5a2416de..39b4200 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
unsigned Opc;
+ bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
if (Subtarget->hasMADIntraFwdBug())
Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
: AMDGPU::V_MAD_U64_U32_gfx11_e64;
+ else if (UseNoCarry)
+ Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
else
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
Clamp };
+
+ if (UseNoCarry) {
+ MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
+ ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
@@ -2020,6 +2031,22 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
@@ -2033,6 +2060,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
+ SDValue &SAddr,
+ SDValue &VOffset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ SDValue DummyOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
+ false))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
@@ -3861,58 +3906,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+// Match lowered fpext from bf16 to f32. This is a bit operation extending
+// a 16-bit value with 16 bits of zeroes at the LSB:
+//
+// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
+// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
+// 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false
+static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
+ if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ Op = Op.getOperand(0);
+
+ IsExtractHigh = false;
+ if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
+ auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ if (!Low16 || !Low16->isZero())
+ return SDValue();
+ Op = stripBitcast(Op.getOperand(1));
+ if (Op.getValueType() != MVT::bf16)
+ return SDValue();
+ return Op;
+ }
+
+ if (Op.getValueType() != MVT::i32)
+ return SDValue();
+
+ if (Op.getOpcode() == ISD::AND) {
+ if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Mask->getZExtValue() == 0xffff0000) {
+ IsExtractHigh = true;
+ return Op.getOperand(0);
+ }
+ }
+ return SDValue();
+ }
+
+ if (Op.getOpcode() == ISD::SHL) {
+ if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (Amt->getZExtValue() == 16)
+ return Op.getOperand(0);
+ }
+ }
+
+ return SDValue();
+}
+
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const {
+ unsigned &Mods,
+ MVT VT) const {
Mods = 0;
SelectVOP3ModsImpl(In, Src, Mods);
+ bool IsExtractHigh = false;
if (Src.getOpcode() == ISD::FP_EXTEND) {
Src = Src.getOperand(0);
- assert(Src.getValueType() == MVT::f16);
- Src = stripBitcast(Src);
+ } else if (VT == MVT::bf16) {
+ SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
+ if (!B16)
+ return false;
+ Src = B16;
+ } else
+ return false;
+
+ if (Src.getValueType() != VT &&
+ (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
+ return false;
- // Be careful about folding modifiers if we already have an abs. fneg is
- // applied last, so we don't want to apply an earlier fneg.
- if ((Mods & SISrcMods::ABS) == 0) {
- unsigned ModsTmp;
- SelectVOP3ModsImpl(Src, Src, ModsTmp);
+ Src = stripBitcast(Src);
- if ((ModsTmp & SISrcMods::NEG) != 0)
- Mods ^= SISrcMods::NEG;
+ // Be careful about folding modifiers if we already have an abs. fneg is
+ // applied last, so we don't want to apply an earlier fneg.
+ if ((Mods & SISrcMods::ABS) == 0) {
+ unsigned ModsTmp;
+ SelectVOP3ModsImpl(Src, Src, ModsTmp);
- if ((ModsTmp & SISrcMods::ABS) != 0)
- Mods |= SISrcMods::ABS;
- }
+ if ((ModsTmp & SISrcMods::NEG) != 0)
+ Mods ^= SISrcMods::NEG;
- // op_sel/op_sel_hi decide the source type and source.
- // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
- // If the sources's op_sel is set, it picks the high half of the source
- // register.
+ if ((ModsTmp & SISrcMods::ABS) != 0)
+ Mods |= SISrcMods::ABS;
+ }
- Mods |= SISrcMods::OP_SEL_1;
- if (isExtractHiElt(Src, Src)) {
- Mods |= SISrcMods::OP_SEL_0;
+ // op_sel/op_sel_hi decide the source type and source.
+ // If the source's op_sel_hi is set, it indicates to do a conversion from
+ // fp16. If the source's op_sel is set, it picks the high half of the source
+ // register.
- // TODO: Should we try to look for neg/abs here?
- }
+ Mods |= SISrcMods::OP_SEL_1;
+ if (IsExtractHigh ||
+ (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
+ Mods |= SISrcMods::OP_SEL_0;
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
- return true;
+ // TODO: Should we try to look for neg/abs here?
}
- return false;
+ // Prevent unnecessary subreg COPY to VGPR_16
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ Src = Src.getOperand(0);
+ }
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
return false;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
@@ -3921,7 +4022,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
- SelectVOP3PMadMixModsImpl(In, Src, Mods);
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
+ return false;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
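
The bf16 matching added above rests on a representational fact: bf16 is the high half of an IEEE f32, so extending bf16 to f32 is exactly a 16-bit left shift of the raw bits (patterns 1-3 in matchBF16FPExtendLike are three lowerings of that shift). A standalone demonstration in plain C++:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float f = 1.5f; // exactly representable in bf16
  uint32_t f32bits;
  std::memcpy(&f32bits, &f, sizeof f32bits);
  // Truncating f32 -> bf16 keeps the high 16 bits...
  uint16_t bf16bits = static_cast<uint16_t>(f32bits >> 16);
  // ...and fpext bf16 -> f32 is the 16-bit shift back up.
  uint32_t extended = static_cast<uint32_t>(bf16bits) << 16;
  float back;
  std::memcpy(&back, &extended, sizeof back);
  std::printf("%g -> 0x%04x -> %g\n", f, static_cast<unsigned>(bf16bits),
              back); // 1.5 -> 0x3fc0 -> 1.5
}
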
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 6123d75..983f1aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -19,6 +19,7 @@
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -167,9 +168,14 @@ private:
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
@@ -254,11 +260,15 @@ private:
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const;
+ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods,
+ MVT VT) const;
bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e3ca09e..31c4f62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
- ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
- MVT::f32, Legal);
+ ISD::FROUNDEVEN, ISD::FTRUNC},
+ {MVT::f16, MVT::f32}, Legal);
+ setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
@@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
- if (Subtarget->has16BitInsts())
+ if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
- else {
+ setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
+ } else {
setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
}
@@ -2632,7 +2634,7 @@ bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
if (Flags.hasApproximateFuncs())
return true;
auto &Options = DAG.getTarget().Options;
- return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
+ return Options.ApproxFuncFPMath;
}
bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
@@ -2755,7 +2757,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
const auto &Options = getTargetMachine().Options;
if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
- Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
+ Options.ApproxFuncFPMath) {
if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
// Log and multiply in f32 is good enough for f16.
@@ -3583,7 +3585,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con
if (N0.getValueType() == MVT::f32)
return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
- if (getTargetMachine().Options.UnsafeFPMath) {
+ if (Op->getFlags().hasApproximateFuncs()) {
// There is a generic expand for FP_TO_FP16 with unsafe fast math.
return SDValue();
}
@@ -4844,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
-// Detect when CMP and SELECT use the same constant and fold them to avoid
-// loading the constant twice. Specifically handles patterns like:
-// %cmp = icmp eq i32 %val, 4242
-// %sel = select i1 %cmp, i32 4242, i32 %other
-// It can be optimized to reuse %val instead of 4242 in select.
-static SDValue
-foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- const AMDGPUSubtarget *ST) {
- SDValue Cond = N->getOperand(0);
- SDValue TrueVal = N->getOperand(1);
- SDValue FalseVal = N->getOperand(2);
-
- // Check if condition is a comparison.
- if (Cond.getOpcode() != ISD::SETCC)
- return SDValue();
-
- SDValue LHS = Cond.getOperand(0);
- SDValue RHS = Cond.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
- bool isInteger = LHS.getValueType().isInteger();
-
- // Handle simple floating-point and integer types only.
- if (!isFloatingPoint && !isInteger)
- return SDValue();
-
- bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
- bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
- if (!isEquality && !isNonEquality)
- return SDValue();
-
- SDValue ArgVal, ConstVal;
- if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
- (isInteger && isa<ConstantSDNode>(RHS))) {
- ConstVal = RHS;
- ArgVal = LHS;
- } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
- (isInteger && isa<ConstantSDNode>(LHS))) {
- ConstVal = LHS;
- ArgVal = RHS;
- } else {
- return SDValue();
- }
-
- // Check if constant should not be optimized - early return if not.
- if (isFloatingPoint) {
- const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
- const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST);
-
- // Only optimize normal floating-point values (finite, non-zero, and
- // non-subnormal as per IEEE 754), skip optimization for inlinable
- // floating-point constants.
- if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val))
- return SDValue();
- } else {
- int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
-
- // Skip optimization for inlinable integer immediates.
- // Inlinable immediates include: -16 to 64 (inclusive).
- if (IntVal >= -16 && IntVal <= 64)
- return SDValue();
- }
-
- // For equality and non-equality comparisons, patterns:
- // select (setcc x, const), const, y -> select (setcc x, const), x, y
- // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
- if (!(isEquality && TrueVal == ConstVal) &&
- !(isNonEquality && FalseVal == ConstVal))
- return SDValue();
-
- SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
- SDValue SelectRHS =
- (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
- return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
- SelectLHS, SelectRHS);
-}
-
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return Folded;
- // Try to fold CMP + SELECT patterns with shared constants (both FP and
- // integer).
- if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget))
- return Folded;
-
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
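
For context, the combine removed above rewrote a select whose compare shares a non-inlinable constant, so the constant only needs to be materialized once. A plain C++ analogue of the rewrite it performed (illustrative only):

#include <cassert>

// Before: the literal 4242 appears in both the compare and the select.
static int selBefore(int val, int other) { return val == 4242 ? 4242 : other; }
// After: when the compare succeeds, val already equals 4242, so reuse it.
static int selAfter(int val, int other) { return val == 4242 ? val : other; }

int main() {
  assert(selBefore(4242, 7) == selAfter(4242, 7)); // both 4242
  assert(selBefore(5, 7) == selAfter(5, 7));       // both 7
}
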
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 877c3ac..b0d3b12 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
+ bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
+ MRI->use_nodbg_empty(I.getOperand(1).getReg());
unsigned Opc;
if (Subtarget->hasMADIntraFwdBug())
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
: AMDGPU::V_MAD_I64_I32_gfx11_e64;
+ else if (UseNoCarry)
+ Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
+ : AMDGPU::V_MAD_NC_I64_I32_e64;
else
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
+
+ if (UseNoCarry)
+ I.removeOperand(1);
+
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addImplicitDefUseOperands(*MF);
@@ -3995,6 +4004,9 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
}
unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
+ if (!IsB32 && STI.hasTrue16BitInsts())
+ Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
+ : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
unsigned CBL = STI.getConstantBusLimit(Opc);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -5774,11 +5786,32 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
+ MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol, false);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
Register PtrBase;
@@ -6961,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
: (int64_t)SISrcMods::DST_OP_SEL);
}
@@ -6976,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
? (int64_t)(SISrcMods::OP_SEL_0)
: 0);
}
@@ -7011,8 +7044,9 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
+ ? (int64_t)SISrcMods::DST_OP_SEL
+ : 0);
}
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
@@ -7068,6 +7102,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}
+void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ uint32_t V = MI.getOperand(2).getImm();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
+ << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ MIB.addImm(V);
+}
+
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
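
renderPrefetchLoc() above maps the intrinsic's locality operand into the cache-policy scope field by inverting it (higher locality selects a nearer, smaller scope) and, on targets without safe CU prefetches, clamps the result up to at least SE scope. A standalone sketch of that arithmetic; the SCOPE_* names mirror AMDGPU::CPol, but their numeric values here are assumptions for illustration:

#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr uint32_t SCOPE_SHIFT = 3;  // assumed field position
constexpr uint32_t SCOPE_MASK = 0x3; // unshifted 2-bit scope field
constexpr uint32_t SCOPE_SE = 1u << SCOPE_SHIFT;

static uint32_t prefetchLoc(uint32_t Locality, bool HasSafeCUPrefetch) {
  uint32_t V = (SCOPE_MASK - (Locality & SCOPE_MASK)) << SCOPE_SHIFT;
  if (!HasSafeCUPrefetch)
    V = std::max(V, SCOPE_SE); // CU scope is unsafe
  return V;
}

int main() {
  for (uint32_t L = 0; L <= 3; ++L)
    std::printf("locality %u -> cpol 0x%02x (safe CU) / 0x%02x (unsafe)\n",
                L, prefetchLoc(L, true), prefetchLoc(L, false));
}
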
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 5f7f05c..140e753 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -261,7 +261,11 @@ private:
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrCPol(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
@@ -414,6 +418,10 @@ private:
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+
+ void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
const MachineInstr &MI, int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 7a50923..511fc69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -94,7 +94,6 @@ def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode()
def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
-def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
}
def FMA : Predicate<"Subtarget->hasFMA()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fedfa3f..1fdf272 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
if (ST.hasVOP3PInsts()) {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElements(0, S16, 2)
- .minScalar(0, S16)
- .widenScalarToNextPow2(0)
- .scalarize(0)
- .lower();
+ getActionDefinitionsBuilder(G_ABS)
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ if (ST.hasIntMinMax64()) {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, S64, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ } else {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ .minScalar(0, S16)
+ .widenScalarToNextPow2(0)
+ .scalarize(0)
+ .lower();
+ }
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16})
@@ -1682,7 +1699,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasFlatAtomicFaddF32Inst())
Atomic.legalFor({{S32, FlatPtr}});
- if (ST.hasGFX90AInsts()) {
+ if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
// TODO: Move atomic expansion into legalizer
@@ -2295,8 +2312,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
LLT::scalar(32), commonAlignment(Align(64), Offset));
// Pointer address
- B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
+ B.buildConstant(LLT::scalar(64), Offset).getReg(0));
// Load address
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
@@ -2317,8 +2334,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
MachineMemOperand::MOInvariant,
LLT::scalar(32), commonAlignment(Align(64), StructOffset));
- B.buildPtrAdd(LoadAddr, QueuePtr,
- B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
+ B.buildObjectPtrOffset(
+ LoadAddr, QueuePtr,
+ B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
@@ -3326,7 +3344,7 @@ static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
if (Flags & MachineInstr::FmAfn)
return true;
const auto &Options = MF.getTarget().Options;
- return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
+ return Options.ApproxFuncFPMath;
}
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
@@ -3432,7 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
- TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
+ TM.Options.ApproxFuncFPMath) {
if (Ty == F16 && !ST.has16BitInsts()) {
Register LogVal = MRI.createGenericVirtualRegister(F32);
auto PromoteSrc = B.buildFPExt(F32, X);
@@ -4500,8 +4518,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
llvm_unreachable("failed to find kernarg segment ptr");
auto COffset = B.buildConstant(LLT::scalar(64), Offset);
- // TODO: Should get nuw
- return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
+ return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
}
/// Legalize a value that's loaded from kernel arguments. This is only used by
@@ -4860,9 +4877,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT ResTy = MRI.getType(Res);
- const MachineFunction &MF = B.getMF();
- bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
- MF.getTarget().Options.UnsafeFPMath;
+ bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
@@ -4922,9 +4937,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT ResTy = MRI.getType(Res);
- const MachineFunction &MF = B.getMF();
- bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
- MI.getFlag(MachineInstr::FmAfn);
+ bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
if (!AllowInaccurateRcp)
return false;
@@ -5676,8 +5689,8 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
return false;
- // FIXME: This should be nuw
- B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+ B.buildObjectPtrOffset(DstReg, KernargPtrReg,
+ B.buildConstant(IdxTy, Offset).getReg(0));
return true;
}
@@ -7019,8 +7032,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
// Pointer address
Register LoadAddr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
+ B.buildConstant(LLT::scalar(64), Offset).getReg(0));
// Load address
Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
B.buildCopy(SGPR01, Temp);
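
Every call-site change in this file follows one pattern: buildPtrAdd becomes buildObjectPtrOffset which, judging by the "should get nuw" TODO/FIXME comments it deletes, additionally marks the emitted G_PTR_ADD as staying within the original object. A sketch of the pattern, assuming the same MachineIRBuilder overload this hunk already uses:

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

using namespace llvm;

// Same G_PTR_ADD shape as buildPtrAdd, but flagged as an in-object
// (non-wrapping) offset from the kernarg base pointer.
static Register addKernargOffset(MachineIRBuilder &B, Register KernargPtr,
                                 LLT PtrTy, int64_t Offset) {
  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
  return B.buildObjectPtrOffset(PtrTy, KernargPtr, COffset).getReg(0);
}
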
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 8767208..aa75534 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -53,8 +53,6 @@ private:
using FuncInfo = llvm::AMDGPULibFunc;
- bool UnsafeFPMath = false;
-
// -fuse-native.
bool AllNative = false;
@@ -117,7 +115,6 @@ private:
bool AllowStrictFP = false);
protected:
- bool isUnsafeMath(const FPMathOperator *FPOp) const;
bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;
bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
@@ -415,23 +412,17 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
return AMDGPULibFunc::parse(FMangledName, FInfo);
}
-bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
- return UnsafeFPMath || FPOp->isFast();
-}
-
bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
- return UnsafeFPMath ||
- (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
+ return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
}
bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
const FPMathOperator *FPOp) const {
// TODO: Refine to approxFunc or contract
- return isUnsafeMath(FPOp);
+ return FPOp->isFast();
}
void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
- UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
AC = &FAM.getResult<AssumptionAnalysis>(F);
TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index f471881..b45627d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -294,7 +294,8 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
BasePlusOffset = Base;
} else {
auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
- BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
+ BasePlusOffset =
+ B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
}
auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 9b05f7c..c8e45d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3501,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingMAD_64_32(B, OpdMapper);
return;
case AMDGPU::G_PREFETCH: {
- if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
+ if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
MI.eraseFromParent();
return;
}
Register PtrReg = MI.getOperand(0).getReg();
unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
- if (PtrBank == AMDGPU::VGPRRegBankID) {
+ if (PtrBank == AMDGPU::VGPRRegBankID &&
+ (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) {
+ // Cannot do I$ prefetch with divergent pointer.
MI.eraseFromParent();
return;
}
unsigned AS = MRI.getType(PtrReg).getAddressSpace();
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
- AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
+ (!Subtarget.hasSafeSmemPrefetch() &&
+ (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ !MI.getOperand(3).getImm() /* I$ prefetch */))) {
MI.eraseFromParent();
return;
}
@@ -4004,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
- case AMDGPU::G_SMIN:
- case AMDGPU::G_SMAX:
- case AMDGPU::G_UMIN:
- case AMDGPU::G_UMAX:
case AMDGPU::G_ABS:
case AMDGPU::G_SHUFFLE_VECTOR:
case AMDGPU::G_SBFX:
@@ -4017,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX:
+ if (isSALUMapping(MI)) {
+ // There are no scalar 64-bit min and max instructions; use the vector
+ // instructions instead.
+ if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 &&
+ Subtarget.hasIntMinMax64())
+ return getDefaultMappingVOP(MI);
+ return getDefaultMappingSOP(MI);
+ }
+ return getDefaultMappingVOP(MI);
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FMUL:
@@ -4561,8 +4574,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16:
+ case Intrinsic::amdgcn_cvt_sr_pk_f16_f32:
+ case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
+ case Intrinsic::amdgcn_cvt_pk_fp8_f16:
+ case Intrinsic::amdgcn_cvt_pk_bf8_f16:
+ case Intrinsic::amdgcn_cvt_sr_fp8_f16:
+ case Intrinsic::amdgcn_cvt_sr_bf8_f16:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4:
+ case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
@@ -4614,8 +4642,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pk_f32_fp8:
case Intrinsic::amdgcn_cvt_pk_f32_bf8:
case Intrinsic::amdgcn_cvt_pk_fp8_f32:
+ case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3:
case Intrinsic::amdgcn_cvt_pk_bf8_f32:
case Intrinsic::amdgcn_cvt_sr_fp8_f32:
+ case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3:
case Intrinsic::amdgcn_cvt_sr_bf8_f32:
case Intrinsic::amdgcn_cvt_sr_bf16_f32:
case Intrinsic::amdgcn_cvt_sr_f16_f32:
@@ -5175,6 +5205,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_ds_load_tr16_b128:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr6_b96:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -5353,6 +5389,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
@@ -5437,6 +5481,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch:
+ return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index a8e1967..f580f43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -159,7 +159,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// If the inputs are tied and the same register, we can shortcut and
// directly replace the register.
- if (Src2->getReg() != CopySrcReg) {
+ if (!Src2->isReg() || Src2->getReg() != CopySrcReg ||
+ Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(
dbgs()
<< "Replacing untied VGPR MFMAs with AGPR form not yet handled\n");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c865082..c1f1703 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -104,7 +104,9 @@
#include "llvm/Transforms/Scalar/FlattenCFG.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
+#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/Transforms/Scalar/Sink.h"
@@ -836,8 +838,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
// eager removal of potentially reachable symbols.
- if (EnableHipStdPar)
+ if (EnableHipStdPar) {
+ PM.addPass(HipStdParMathFixupPass());
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+ }
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
}
@@ -916,8 +920,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// selection after linking; otherwise we end up removing
// potentially reachable symbols that were exported as external in other
// modules.
- if (EnableHipStdPar)
+ if (EnableHipStdPar) {
+ PM.addPass(HipStdParMathFixupPass());
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+ }
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
@@ -2062,7 +2068,12 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
// TODO: May want to move later or split into an early and late one.
addPass(AMDGPUCodeGenPreparePass(TM));
- // TODO: LICM
+ // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
+ // have expanded.
+ if (TM.getOptLevel() > CodeGenOptLevel::Less) {
+ addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()),
+ /*UseMemorySSA=*/true));
+ }
}
Base::addIRPasses(addPass);
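
The comment above explains the motivation for the new LICM run: AMDGPUCodeGenPrepare expands integer divisions into long instruction sequences, and when the denominator is loop-invariant, most of that expansion is too. An illustrative source pattern (plain C++, hypothetical example):

#include <cstdint>

// The expansion of the divide by `d` depends only on `d`, so after
// AMDGPUCodeGenPrepare runs, LICM can hoist most of it out of the loop.
static void scale(uint64_t *v, int n, uint64_t d) {
  for (int i = 0; i < n; ++i)
    v[i] /= d;
}
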
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 24f4df2..a0c99b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -597,7 +597,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
// Estimate all types may be fused with contract/unsafe flags
const TargetOptions &Options = TLI->getTargetMachine().Options;
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath ||
(FAdd->hasAllowContract() && CxtI->hasAllowContract()))
return TargetTransformInfo::TCC_Free;
}
@@ -650,8 +649,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
return LT.first * Cost * NElts;
}
- if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
- TLI->getTargetMachine().Options.UnsafeFPMath)) {
+ if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
// Fast unsafe fdiv lowering:
// f32 rcp
// f32 fmul
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 421fc42..a83caa0 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -180,6 +180,7 @@ public:
ImmTyMatrixBFMT,
ImmTyMatrixAReuse,
ImmTyMatrixBReuse,
+ ImmTyScaleSel,
ImmTyByteSel,
};
@@ -689,6 +690,8 @@ public:
bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); }
+ bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); }
+
bool isVISrcB32() const {
return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32);
}
@@ -1182,6 +1185,7 @@ public:
case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break;
case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
+ case ImmTyScaleSel: OS << "ScaleSel" ; break;
case ImmTyByteSel: OS << "ByteSel" ; break;
}
// clang-format on
@@ -2036,6 +2040,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_KIMM16:
return &APFloat::IEEEhalf();
case AMDGPU::OPERAND_REG_IMM_BF16:
@@ -2405,6 +2410,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_KIMM32:
@@ -2456,6 +2462,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
setImmKindConst();
return;
}
+ [[fallthrough]];
+
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
setImmKindLiteral();
@@ -3761,6 +3770,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16)
return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm());
+ if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16)
+ return false;
+
llvm_unreachable("invalid operand type");
}
default:
@@ -6066,6 +6078,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
ExprVal, ValRange);
if (Val)
ImpliedUserSGPRCount += 1;
+ } else if (ID == ".amdhsa_uses_cu_stores") {
+ if (!isGFX1250())
+ return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
+
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
} else if (ID == ".amdhsa_wavefront_size32") {
EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 10)
@@ -9350,6 +9368,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
}
}
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::scale_sel))
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyScaleSel);
+
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyClamp);
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
Inst.addOperand(Inst.getOperand(0));
@@ -9357,10 +9383,6 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
AMDGPUOperand::ImmTyByteSel);
}
- if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
- addOptionalImmOperand(Inst, Operands, OptionalIdx,
- AMDGPUOperand::ImmTyClamp);
-
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOModSI);
@@ -9414,8 +9436,22 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp8_gfx1250 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) {
Inst.addOperand(Inst.getOperand(0));
}
@@ -10010,9 +10046,12 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyClamp);
- if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
+ if (VdstInIdx == static_cast<int>(Inst.getNumOperands()))
+ Inst.addOperand(Inst.getOperand(0));
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyByteSel);
+ }
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index f99e716..1956a15 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> {
}
//===----------------------------------------------------------------------===//
-// MUBUF - GFX11, GFX12.
+// MUBUF - GFX11, GFX12, GFX1250.
//===----------------------------------------------------------------------===//
// gfx11 instructions that accept both old and new assembler names.
@@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op,
def : Mnem_gfx12<gfx11_name, gfx12_name>;
}
+multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> :
+ MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>,
+ MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> {
+ def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>;
+}
+
defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>;
defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>;
@@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>;
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>;
+defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>;
+defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">;
+defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">;
+
//===----------------------------------------------------------------------===//
// MUBUF - GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 42edec0..c466f9c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -199,6 +199,7 @@ add_llvm_target(AMDGPUCodeGen
Instrumentation
MC
MIRParser
+ ObjCARC
Passes
Scalar
SelectionDAG
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 319cc9d..3ff675d 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
+defm DS_ADD_F64 : DS_Real_gfx12<0x054>;
+defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>;
+
let AssemblerPredicate = HasLdsBarrierArriveAtomic in {
defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>;
defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 5c1989b..ffe6b06 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ if (isGFX1250())
+ PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
+ KERNEL_CODE_PROPERTY_USES_CU_STORES);
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 679c55d..d5d1074 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,8 +11,10 @@ let WantsRoot = true in {
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
+ def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
+ def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
}
@@ -368,31 +370,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
}
}
-class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+// Async loads, introduced in gfx1250, store directly to the DS address
+// carried in $vdst (they do not use M0 for the DS address).
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo<
opName,
(outs ),
!con(
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
- (ins flat_offset:$offset, CPol_0:$cpol)),
- " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
- let LGKM_CNT = 1;
+ !if(IsAsync, (ins VGPR_32:$vdst), (ins)),
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let LGKM_CNT = !not(IsAsync);
+ let VM_CNT = !not(IsAsync);
+ let ASYNC_CNT = IsAsync;
let is_flat_global = 1;
let lds = 1;
let has_data = 0;
+ let has_vdst = IsAsync; // vdst for ds address with IsAsync
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_saddr = 1;
+ let enabled_saddr = EnableSaddr;
+ let VALU = 1;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]);
+ let Defs = !if(IsAsync, [ASYNCcnt], []);
+ let SchedRW = [WriteVMEM, WriteLDS];
+}
+
+multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> {
+ def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>,
+ GlobalSaddrTable<1, opName>;
+}
+
+class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+ opName,
+ (outs ),
+ !con(
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let VM_CNT = 0;
+ let ASYNC_CNT = 1;
+ let is_flat_global = 1;
+ let lds = 1;
+ let has_data = 1; // vdata for ds address
let has_vdst = 0;
let mayLoad = 1;
let mayStore = 1;
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
let VALU = 1;
- let Uses = [M0, EXEC];
+ let Uses = [EXEC, ASYNCcnt];
+ let Defs = [ASYNCcnt];
let SchedRW = [WriteVMEM, WriteLDS];
}
-multiclass FLAT_Global_Load_LDS_Pseudo<string opName> {
- def "" : FLAT_Global_Load_LDS_Pseudo<opName>,
+multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> {
+ def "" : FLAT_Global_STORE_LDS_Pseudo<opName>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>,
+ def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>,
GlobalSaddrTable<1, opName>;
}
@@ -464,6 +503,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n
let sve = 0;
}
+class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> :
+ FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> {
+ let has_vdst = 0;
+ let has_data = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let VM_CNT = 0;
+ let LGKM_CNT = 0;
+}
+
+multiclass FLAT_Flat_Prefetch_Pseudo<string opName> {
+ def "" : FLAT_Prefetch_Pseudo<opName>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let OtherPredicates = [HasFlatGVSMode];
+ let enabled_saddr = 1;
+ }
+}
+
+multiclass FLAT_Global_Prefetch_Pseudo<string opName> {
+ let is_flat_global = 1, has_saddr = 1 in {
+ def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">,
+ GlobalSaddrTable<1, opName> {
+ let enabled_saddr = 1;
+ }
+ }
+}
+
class FlatScratchInst <string sv_op, string mode> {
string SVOp = sv_op;
string Mode = mode;
@@ -1124,6 +1194,15 @@ let SubtargetPredicate = isGFX12Plus in {
let SubtargetPredicate = isGFX1250Plus in {
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">;
+
def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
} // End SubtargetPredicate = isGFX1250Plus
@@ -1162,6 +1241,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u
defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">;
defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
+let SubtargetPredicate = isGFX125xOnly in {
+defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>;
+defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>;
+defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>;
+defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>;
+defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
+} // End SubtargetPredicate = isGFX125xOnly
+
let SubtargetPredicate = isGFX12Plus in {
let Uses = [EXEC, M0] in {
defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
@@ -1218,6 +1307,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">;
+ defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">;
+}
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -1228,6 +1322,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
(inst $vaddr, $offset)
>;
+class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))),
+ (inst $vaddr, $offset, $cpol)
+>;
+
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1249,8 +1348,8 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value
>;
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
- (inst $saddr, $voffset, $offset, 0, $in)
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
+ (inst $saddr, $voffset, $offset, $cpol, $in)
>;
class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1263,9 +1362,29 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $dsaddr, $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $dsaddr, $saddr, $voffset, $offset, $cpol)
+>;
+
+class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $vaddr, $dsaddr, $offset, $cpol)
+>;
+
+class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $saddr, $voffset, $dsaddr, $offset, $cpol)
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
- (inst $saddr, $voffset, $offset, (i32 0))
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (inst $saddr, $voffset, $offset, $cpol)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1278,6 +1397,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
+ (inst $vaddr, $offset, $cpol)
+>;
+
+class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))),
+ (inst $saddr, $voffset, $offset, $cpol)
+>;
+
class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
(node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)),
@@ -1459,10 +1588,30 @@ class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
>;
class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))),
- (inst $vaddr, $saddr, $offset, 0)
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (inst $vaddr, $saddr, $offset, $cpol)
>;
+multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatLoadLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
+ def : FlatStoreLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat <inst, node, vt> {
let AddedComplexity = 10;
@@ -1473,6 +1622,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
}
}
+multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadSignedPat_CPOL<inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatSignedLoadPat_D16 <inst, node, vt> {
let AddedComplexity = 10;
@@ -2009,6 +2168,28 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts]
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>;
}
+let OtherPredicates = [isGFX125xOnly] in {
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>;
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>;
+ def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>;
+
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>;
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>;
+ defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
+} // End OtherPredicates = [isGFX125xOnly]
+
+let OtherPredicates = [isGFX1250Plus] in {
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>;
+
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
+}
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -2138,6 +2319,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
+def PrefetchLoc: SDNodeXForm<timm, [{
+ uint32_t V = N->getZExtValue();
+ V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT;
+ if (!Subtarget->hasSafeCUPrefetch())
+ V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
+ return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
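The transform inverts the IR prefetch locality (0 = no locality ... 3 = maximally local) onto the cache-policy scope field. A standalone rework with the worked values (constants copied from SIDefines.h):

#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr uint32_t SCOPE_SHIFT = 3;
constexpr uint32_t SCOPE_MASK = 0x3;
constexpr uint32_t SCOPE_SE = 1u << SCOPE_SHIFT;

// Locality 0 -> SCOPE_SYS (0x18), 1 -> SCOPE_DEV (0x10), 2 -> SCOPE_SE (0x8),
// 3 -> SCOPE_CU (0x0); without safe CU prefetch the result is floored at SE.
static uint32_t prefetchLocToScope(uint32_t Loc, bool SafeCUPrefetch) {
  uint32_t V = (SCOPE_MASK - (Loc & SCOPE_MASK)) << SCOPE_SHIFT;
  if (!SafeCUPrefetch)
    V = std::max(V, SCOPE_SE); // CU scope is unsafe
  return V;
}

int main() {
  for (uint32_t Loc = 0; Loc < 4; ++Loc)
    std::printf("locality %u -> cpol 0x%x\n", Loc,
                prefetchLocToScope(Loc, /*SafeCUPrefetch=*/false));
  // prints 0x18, 0x10, 0x8, 0x8
}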
+
+def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
+ }];
+}
+
+def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch()); }]> {
+ let GISelPredicateCode = [{
+ return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ !Subtarget->hasSafeSmemPrefetch());
+ }];
+}
+
+multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> {
+ def : GCNPat <
+ (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25);
+ }
+
+ def : GCNPat <
+ (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc)))
+ > {
+ let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30);
+ }
+}
+
+multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
+ def : GCNPat <
+ (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
+ >;
+
+ def : GCNPat <
+ (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
+ (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
+ let AddedComplexity = 11;
+ }
+}
+
+let SubtargetPredicate = HasVmemPrefInsts in {
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>;
+
+ // Patterns for forced vector prefetch with rw = 1.
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>;
+ defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>;
+
+ // Patterns for target intrinsics
+ defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
+ defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
+} // End SubtargetPredicate = HasVmemPrefInsts
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -3210,12 +3462,40 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME
defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>;
defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>;
+defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>;
+
+defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
+defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
+defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
+defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x05f>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x060>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x061>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x062>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x063>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x064>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x065>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x066>;
+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
+defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
+defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
+
+defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
+defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">;
+defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">;
+
def True16D16Table : GenericTable {
let FilterClass = "True16D16Table";
let CppTypeName = "True16D16Info";
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 7d6723a..334afd3 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI) {
- return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR);
+ return STI->isSGPRClass(RC)
+ ? SGPR
+ : (STI->isAGPRClass(RC)
+ ? AGPR
+ : (STI->isVectorSuperClass(RC) ? AVGPR : VGPR));
}
void GCNRegPressure::inc(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 3749b6d..ea33a22 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -29,43 +29,57 @@ class raw_ostream;
class SlotIndex;
struct GCNRegPressure {
- enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS };
+ enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
GCNRegPressure() {
clear();
}
- bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; }
+ bool empty() const {
+ return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
+ }
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
- /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p
- /// UnifiedVGPRFile
+ /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
+ /// dependent upon \p UnifiedVGPRFile
unsigned getVGPRNum(bool UnifiedVGPRFile) const {
if (UnifiedVGPRFile) {
- return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR])
- : Value[VGPR];
+ return Value[AGPR]
+ ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR])
+ : Value[VGPR] + Value[AVGPR];
}
- return std::max(Value[VGPR], Value[AGPR]);
+ // AVGPR assignment priority is based on the width of the register. Count
+ // AVGPR pressure as VGPR pressure.
+ return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]);
}
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
- /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file.
+ /// \p NumAGPRs AGPRs, and \p NumAVGPRs AVGPRs, for a target with a unified
+ /// VGPR file.
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
- unsigned NumAGPRs) {
- return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
+ unsigned NumAGPRs,
+ unsigned NumAVGPRs) {
+ // Assume AVGPRs will be assigned as VGPRs.
+ return alignTo(NumArchVGPRs + NumAVGPRs,
+ AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
NumAGPRs;
}
- /// \returns the ArchVGPR32 pressure
- unsigned getArchVGPRNum() const { return Value[VGPR]; }
+ /// \returns the ArchVGPR32 pressure, plus the AVGPRs which we assume will be
+ /// allocated as VGPRs
+ unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; }
/// \returns the AccVGPR32 pressure
unsigned getAGPRNum() const { return Value[AGPR]; }
+ /// \returns the AVGPR32 pressure
+ unsigned getAVGPRNum() const { return Value[AVGPR]; }
unsigned getVGPRTuplesWeight() const {
- return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]);
+ return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR],
+ Value[TOTAL_KINDS + AGPR]);
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
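A worked, standalone version of the unified-file arithmetic above: AVGPRs are folded into the ArchVGPR term before aligning, since they are assumed to be allocated as VGPRs (the granule value here is illustrative; the real one is per-target).

#include <cstdio>

static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

// Mirrors getUnifiedVGPRNum above.
static unsigned unifiedVGPRNum(unsigned ArchVGPRs, unsigned AGPRs,
                               unsigned AVGPRs, unsigned Granule) {
  return alignTo(ArchVGPRs + AVGPRs, Granule) + AGPRs;
}

int main() {
  // 10 ArchVGPRs + 2 AVGPRs align (granule 4) to 12; plus 5 AGPRs -> 17.
  std::printf("%u\n", unifiedVGPRNum(10, 5, 2, 4)); // prints 17
}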
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a655308..33b66a6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -896,15 +896,10 @@ GCNScheduleDAGMILive::getRegionLiveInMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> RegionFirstMIs;
RegionFirstMIs.reserve(Regions.size());
- auto I = Regions.rbegin(), E = Regions.rend();
- do {
- const MachineBasicBlock *MBB = I->first->getParent();
- auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
- RegionFirstMIs.push_back(MI);
- do {
- ++I;
- } while (I != E && I->first->getParent() == MBB);
- } while (I != E);
+ for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
+ RegionFirstMIs.push_back(
+ &*skipDebugInstructionsForward(RegionBegin, RegionEnd));
+
return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}
@@ -1911,14 +1906,12 @@ void PreRARematStage::rematerialize() {
for (auto &[DefMI, Remat] : Rematerializations) {
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
- unsigned SubReg = DefMI->getOperand(0).getSubReg();
unsigned DefRegion = MIRegion.at(DefMI);
// Rematerialize DefMI to its use block.
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
- *DAG.TRI);
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+ AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
Remat.RematMI = &*std::prev(InsertPos);
- Remat.RematMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
// Update region boundaries in regions we sinked from (remove defining MI)
@@ -2064,14 +2057,13 @@ void PreRARematStage::finalizeGCNSchedStage() {
MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
MachineBasicBlock *MBB = RegionBB[DefRegion];
Register Reg = RematMI.getOperand(0).getReg();
- unsigned SubReg = RematMI.getOperand(0).getSubReg();
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized exactly in the same position as originally within
// the region, but it should not matter much.
- TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
+ TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
+ *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
- NewMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
auto UseRegion = MIRegion.find(Remat.UseMI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 9a2bab1..0a0a107 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return getMaxNumVGPRs(MF.getFunction());
}
+std::pair<unsigned, unsigned>
+GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
+ const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
+
+ unsigned MaxNumVGPRs = MaxVectorRegs;
+ unsigned MaxNumAGPRs = 0;
+
+ // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
+ // a wave may have up to 512 total vector registers, combining both
+ // VGPRs and AGPRs. Hence, in an entry function without calls and without
+ // AGPRs used within it, it is possible to use the whole vector register
+ // budget for VGPRs.
+ //
+ // TODO: it should be possible to estimate the maximum AGPR/VGPR pressure and
+ // split the register file accordingly.
+ if (hasGFX90AInsts()) {
+ unsigned MinNumAGPRs = 0;
+ const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
+ const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+ const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
+
+ // TODO: The lower bound should probably force the number of required
+ // registers up, overriding amdgpu-waves-per-eu.
+ std::tie(MinNumAGPRs, MaxNumAGPRs) =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
+ /*OnlyFirstRequired=*/true);
+
+ if (MinNumAGPRs == DefaultNumAGPR.first) {
+ // Default to splitting half the registers if AGPRs are required.
+ MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
+ } else {
+ // Align to accum_offset's allocation granularity.
+ MinNumAGPRs = alignTo(MinNumAGPRs, 4);
+
+ MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
+ }
+
+ // Clamp values to our limits, and ensure min <= max.
+
+ MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
+ MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
+
+ MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
+ MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
+
+ assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
+ MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
+ "invalid register counts");
+ } else if (hasMAIInsts()) {
+ // On gfx908 the number of AGPRs always equals the number of VGPRs.
+ MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
+ }
+
+ return std::pair(MaxNumVGPRs, MaxNumAGPRs);
+}
+
void GCNSubtarget::adjustSchedDependency(
SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
const TargetSchedModel *SchedModel) const {
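A standalone model of the clamping sequence in getMaxNumVectorRegs, under assumed 256-entry VGPR/AGPR files (the real sizes come from the register classes); it shows how an "amdgpu-agpr-alloc" minimum is aligned and how the remaining budget becomes the VGPR cap.

#include <algorithm>
#include <cstdio>
#include <utility>

static std::pair<unsigned, unsigned>
splitVectorRegs(unsigned MaxVectorRegs, unsigned MinAGPRs, unsigned MaxAGPRs,
                bool HasAttribute) {
  const unsigned TotalAGPRs = 256, TotalVGPRs = 256; // assumed file sizes
  if (!HasAttribute) {
    MinAGPRs = MaxAGPRs = MaxVectorRegs / 2; // default: split in half
  } else {
    MinAGPRs = std::min((MinAGPRs + 3) / 4 * 4, TotalAGPRs); // align to 4
  }
  MaxAGPRs = std::min(std::max(MinAGPRs, MaxAGPRs), MaxVectorRegs);
  MinAGPRs = std::min(std::min(MinAGPRs, TotalAGPRs), MaxAGPRs);
  unsigned MaxVGPRs = std::min(MaxVectorRegs - MinAGPRs, TotalVGPRs);
  MaxAGPRs = std::min(MaxVectorRegs - MaxVGPRs, MaxAGPRs);
  return {MaxVGPRs, MaxAGPRs};
}

int main() {
  // "amdgpu-agpr-alloc"="6" with a 512-register budget: the minimum rounds
  // up to 8 and the VGPR cap saturates at the 256-entry file.
  auto [VGPRs, AGPRs] = splitVectorRegs(512, 6, 6, true);
  std::printf("VGPRs=%u AGPRs=%u\n", VGPRs, AGPRs); // VGPRs=256 AGPRs=8
}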
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 0435e7f..bdd900d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -123,6 +123,7 @@ protected:
bool HasSMemRealTime = false;
bool HasIntClamp = false;
bool HasFmaMixInsts = false;
+ bool HasFmaMixBF16Insts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarDwordx3Loads = false;
@@ -244,7 +245,10 @@ protected:
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
+ bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
+ bool HasSafeCUPrefetch = false;
+ bool HasCUStores = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -269,6 +273,7 @@ protected:
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
bool HasAddSubU64Insts = false;
+ bool HasMadU32Inst = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
@@ -462,6 +467,8 @@ public:
return HasFmaMixInsts;
}
+ bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; }
+
bool hasCARRY() const {
return true;
}
@@ -709,7 +716,9 @@ public:
bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
// DS_ADD_F64/DS_ADD_RTN_F64
- bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
+ bool hasLdsAtomicAddF64() const {
+ return hasGFX90AInsts() || hasGFX1250Insts();
+ }
bool hasMultiDwordFlatScratchAddressing() const {
return getGeneration() >= GFX9;
@@ -987,8 +996,14 @@ public:
bool hasPrefetch() const { return GFX12Insts; }
+ bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
+
bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+ bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+
+ bool hasCUStores() const { return HasCUStores; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1507,9 +1522,22 @@ public:
// \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+ // \returns true if the target has V_MAD_U32 instruction.
+ bool hasMadU32Inst() const { return HasMadU32Inst; }
+
// \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
bool hasVectorMulU64() const { return GFX1250Insts; }
+ // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
+ // instructions.
+ bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
+ bool hasIntMinMax64() const { return GFX1250Insts; }
+
+ // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
+ bool hasAddMinMaxInsts() const { return GFX1250Insts; }
+
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
@@ -1658,6 +1686,10 @@ public:
return getMaxNumVGPRs(F);
}
+ /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
+ /// of waves per execution unit required for the function \p MF.
+ std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
+
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 11b072e..42c4d8b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O))
return;
break;
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ break;
default:
llvm_unreachable("bad operand type");
}
@@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
@@ -1790,4 +1793,14 @@ void AMDGPUInstPrinter::printBitOp3(const MCInst *MI, unsigned OpNo,
O << formatHex(static_cast<uint64_t>(Imm));
}
+void AMDGPUInstPrinter::printScaleSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint8_t Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " scale_sel:" << formatDec(Imm);
+}
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index e0b7aa5..f6739b14 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -173,6 +173,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O,
StringRef Prefix, bool PrintInHex, bool AlwaysPrint);
+ void printScaleSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printBitOp3(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index c49ad79..f358084 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm))
.value_or(255);
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ return 255;
+
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
case AMDGPU::OPERAND_KIMM64:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 10f6d33..43ca548 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
".amdhsa_user_sgpr_private_segment_size");
+ if (isGFX1250(STI))
+ PrintField(KD.kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
+ ".amdhsa_uses_cu_stores");
if (IVersion.Major >= 10)
PrintField(KD.kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 3902d4c..c564145 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -208,6 +208,7 @@ enum OperandType : unsigned {
OPERAND_REG_IMM_V2BF16,
OPERAND_REG_IMM_V2FP16,
OPERAND_REG_IMM_V2INT16,
+ OPERAND_REG_IMM_NOINLINE_V2FP16,
OPERAND_REG_IMM_V2INT32,
OPERAND_REG_IMM_V2FP32,
@@ -392,11 +393,13 @@ enum CPol {
TH_ATOMIC_CASCADE = 4, // Cascading vs regular
// Scope
- SCOPE = 0x3 << 3, // All Scope bits
- SCOPE_CU = 0 << 3,
- SCOPE_SE = 1 << 3,
- SCOPE_DEV = 2 << 3,
- SCOPE_SYS = 3 << 3,
+ SCOPE_SHIFT = 3,
+ SCOPE_MASK = 0x3,
+ SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits
+ SCOPE_CU = 0 << SCOPE_SHIFT,
+ SCOPE_SE = 1 << SCOPE_SHIFT,
+ SCOPE_DEV = 2 << SCOPE_SHIFT,
+ SCOPE_SYS = 3 << SCOPE_SHIFT,
NV = 1 << 5, // Non-volatile bit
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e5d1eaa..e934152 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
@@ -1062,9 +1063,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
switch (OpTy) {
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
break;
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
break;
default:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f1a8ee1..ad26757 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -882,7 +882,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
- if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
+ if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
if (Subtarget->hasIEEEMinimumMaximumInsts()) {
@@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
Custom);
}
+ if (Subtarget->hasIntMinMax64())
+ setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64,
+ Legal);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
@@ -1061,10 +1065,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
// where this is OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
- return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
- (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
- DestVT.getScalarType() == MVT::f32 &&
- SrcVT.getScalarType() == MVT::f16 &&
+ return DestVT.getScalarType() == MVT::f32 &&
+ ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ SrcVT.getScalarType() == MVT::f16) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
+ SrcVT.getScalarType() == MVT::bf16)) &&
// TODO: This probably only requires no input flushing?
denormalModeIsFlushAllF32(DAG.getMachineFunction());
}
@@ -1254,6 +1260,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
+static unsigned getIntrMemWidth(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ return 8;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ return 32;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ return 64;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ return 128;
+ default:
+ llvm_unreachable("Unknown width");
+ }
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1475,6 +1500,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1519,6 +1550,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(1);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
@@ -1548,7 +1599,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_s_prefetch_data: {
+ case Intrinsic::amdgcn_s_prefetch_data:
+ case Intrinsic::amdgcn_flat_prefetch:
+ case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
@@ -1599,18 +1652,32 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_load_monitor_b128:
+ case Intrinsic::amdgcn_flat_load_monitor_b32:
+ case Intrinsic::amdgcn_flat_load_monitor_b64:
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
+ case Intrinsic::amdgcn_global_load_monitor_b128:
+ case Intrinsic::amdgcn_global_load_monitor_b32:
+ case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
Ptr = II->getArgOperand(1);
break;
default:
@@ -4225,7 +4292,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = BaseAddr.getValue(1);
Align StackAlign = TFL->getStackAlign();
if (Alignment > StackAlign) {
- uint64_t ScaledAlignment = (uint64_t)Alignment.value()
+ uint64_t ScaledAlignment = Alignment.value()
<< Subtarget->getWavefrontSizeLog2();
uint64_t StackAlignMask = ScaledAlignment - 1;
SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
@@ -4440,19 +4507,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
}
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
- if (Op->isDivergent())
+ if (Op->isDivergent() &&
+ (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
+ // Cannot do I$ prefetch with divergent pointer.
return SDValue();
switch (cast<MemSDNode>(Op)->getAddressSpace()) {
case AMDGPUAS::FLAT_ADDRESS:
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
break;
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ if (Subtarget->hasSafeSmemPrefetch())
+ break;
+ [[fallthrough]];
default:
return SDValue();
}
+ // I$ prefetch
+ if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
+ return SDValue();
+
return Op;
}
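Condensed into a decision table (address-space checks omitted), the lowering above accepts a prefetch in exactly these cases; IsData models operand 4 of the PREFETCH node (1 = data prefetch, 0 = instruction prefetch).

#include <cstdio>

static bool canLowerPrefetch(bool Divergent, bool IsData,
                             bool HasVmemPrefInsts,
                             bool HasSafeSmemPrefetch) {
  // A divergent pointer requires the VMEM prefetch instructions, and even
  // then only a data prefetch is possible.
  if (Divergent && (!HasVmemPrefInsts || !IsData))
    return false;
  // An instruction prefetch is only emitted through the safe scalar path.
  if (!HasSafeSmemPrefetch && !IsData)
    return false;
  return true;
}

int main() {
  // gfx1250-style target: VMEM prefetch instructions, no safe SMEM prefetch.
  std::printf("%d\n", canLowerPrefetch(/*Divergent=*/true, /*IsData=*/true,
                                       true, false)); // prints 1
}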
@@ -7123,7 +7199,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
- if (getTargetMachine().Options.UnsafeFPMath) {
+ if (Op->getFlags().hasApproximateFuncs()) {
SDValue Flags = Op.getOperand(1);
SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
@@ -11218,8 +11294,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool AllowInaccurateRcp =
- Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
+ bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
// Without !fpmath accuracy information, we can't do more because we don't
@@ -11238,7 +11313,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
// 1.0 / sqrt(x) -> rsq(x)
- // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+ // XXX - Is afn sufficient to do this for f64? The maximum ULP
// error seems really high at 2^29 ULP.
// 1.0 / x -> rcp(x)
return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
@@ -11272,8 +11347,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool AllowInaccurateDiv =
- Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
+ bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
if (!AllowInaccurateDiv)
return SDValue();
@@ -14154,6 +14228,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
+ (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
+ (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
@@ -14523,7 +14599,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;
- if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
(N0->getFlags().hasAllowContract() &&
N1->getFlags().hasAllowContract())) &&
isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
@@ -15646,9 +15722,9 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
// fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
// regardless of the denorm mode setting. Therefore,
- // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
+ // fp-contract is sufficient to allow generating fdot2.
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
(N->getFlags().hasAllowContract() &&
FMA->getFlags().hasAllowContract())) {
Op1 = Op1.getOperand(0);
@@ -15869,6 +15945,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
+SDValue SITargetLowering::performSelectCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ // Try to fold CMP + SELECT patterns with shared constants (both FP and
+ // integer).
+ // Detect when CMP and SELECT use the same constant and fold them to avoid
+ // loading the constant twice. Specifically handles patterns like:
+ // %cmp = icmp eq i32 %val, 4242
+ // %sel = select i1 %cmp, i32 4242, i32 %other
+  // The select can then reuse %val instead of materializing 4242 again.
+ SDValue Cond = N->getOperand(0);
+ SDValue TrueVal = N->getOperand(1);
+ SDValue FalseVal = N->getOperand(2);
+
+ // Check if condition is a comparison.
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
+ bool isInteger = LHS.getValueType().isInteger();
+
+ // Handle simple floating-point and integer types only.
+ if (!isFloatingPoint && !isInteger)
+ return SDValue();
+
+ bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
+ bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
+ if (!isEquality && !isNonEquality)
+ return SDValue();
+
+ SDValue ArgVal, ConstVal;
+ if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
+ (isInteger && isa<ConstantSDNode>(RHS))) {
+ ConstVal = RHS;
+ ArgVal = LHS;
+ } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
+ (isInteger && isa<ConstantSDNode>(LHS))) {
+ ConstVal = LHS;
+ ArgVal = RHS;
+ } else {
+ return SDValue();
+ }
+
+ // Skip optimization for inlinable immediates.
+ if (isFloatingPoint) {
+ const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
+ if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
+ return SDValue();
+ } else {
+ if (AMDGPU::isInlinableIntLiteral(
+ cast<ConstantSDNode>(ConstVal)->getSExtValue()))
+ return SDValue();
+ }
+
+ // For equality and non-equality comparisons, patterns:
+ // select (setcc x, const), const, y -> select (setcc x, const), x, y
+ // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
+ if (!(isEquality && TrueVal == ConstVal) &&
+ !(isNonEquality && FalseVal == ConstVal))
+ return SDValue();
+
+ SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
+ SDValue SelectRHS =
+ (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
+ return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
+ SelectLHS, SelectRHS);
+}
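
Why the rewrite is sound can be seen in a scalar analogue; a minimal sketch, not LLVM API code:

    // When the equality holds, X == C, so returning X equals returning C; the
    // symmetric argument covers the != form. Reusing X saves rematerializing a
    // literal that has no inline encoding.
    static long long selectEqFolded(long long X, long long C, long long Y) {
      return (X == C) ? X /* was C */ : Y;
    }
    static long long selectNeFolded(long long X, long long C, long long Y) {
      return (X != C) ? Y : X /* was C */;
    }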
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -15917,6 +16065,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMulCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
+ case ISD::SELECT:
+ if (auto Res = performSelectCombine(N, DCI))
+ return Res;
+ break;
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index acf6158..dedd9ae 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -211,6 +211,7 @@ private:
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9faf497..4b48fc4 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -552,7 +552,7 @@ public:
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
// FLAT and SCRATCH instructions may access scratch. Other VMEM
// instructions do not.
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+ if (TII->mayAccessScratchThroughFlat(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
@@ -565,7 +565,6 @@ public:
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -1381,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = &II;
+ } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
+ assert(ST->hasVMemToLDSLoad());
+ LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
+ << "Before: " << Wait.LoadCnt << '\n';);
+ ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+ LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+
+ // It is possible (but unlikely) that this is the only wait instruction,
+ // in which case, we exit this loop without a WaitcntInstr to consume
+ // `Wait`. But that works because `Wait` was passed in by reference, and
+ // the callee eventually calls createNewWaitcnt on it. We test this
+      // possibility in an artificial MIR test since such a situation cannot be
+ // recreated by running the memory legalizer.
+ II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1552,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
+      // Architectures newer than GFX10 do not have direct loads to
+      // LDS, so no work is required here yet.
+ II.eraseFromParent();
+ continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
@@ -2108,8 +2126,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
assert(TII->isFLAT(MI));
- // All flat instructions use the VMEM counter.
- assert(TII->usesVM_CNT(MI));
+ // All flat instructions use the VMEM counter except prefetch.
+ if (!TII->usesVM_CNT(MI))
+ return false;
// If there are no memory operands then conservatively assume the flat
// operation may access VMEM.
@@ -2159,32 +2178,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either scratch or FLAT.
-bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
- const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // SCRATCH instructions always access scratch.
- if (TII->isFLATScratch(MI))
- return true;
-
- // GLOBAL instructions never access scratch.
- if (TII->isFLATGlobal(MI))
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access scratch.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves scratch.
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
- unsigned AS = Memop->getAddrSpace();
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
- });
-}
-
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
@@ -2295,9 +2288,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}
- // A Flat memory operation must access at least one address space.
- assert(FlatASCount);
-
// This is a flat memory operation that access both VMEM and LDS, so note it
// - it will require that both the VM and LGKM be flushed to zero if it is
// pending when a VM or LGKM dependency occurs.
@@ -2444,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_lds_direct ||
counterTypeForInstr(Opcode).has_value();
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 40e6871..044a681 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(DstHi);
}
break;
+
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ assert(ST.hasBF16PackedInsts());
+ MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
+ MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
+ auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+ Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
+ auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
+ break;
}
+
return true;
}
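
The pseudo expansion relies on the packed op treating each 16-bit half independently; a rough, hedged illustration (plain integer max stands in for the hardware's bf16 maxNum semantics, which this does not model):

    #include <algorithm>
    #include <cstdint>
    // Illustration only: v_pk_max_num_bf16 applies max lane-wise over the two
    // 16-bit halves of a 32-bit register, so a scalar bf16 max just consumes
    // one lane of the packed result.
    static uint32_t pkMaxLanes(uint32_t A, uint32_t B) {
      uint16_t Lo = std::max<uint16_t>(A & 0xFFFFu, B & 0xFFFFu);
      uint16_t Hi = std::max<uint16_t>(A >> 16, B >> 16);
      return uint32_t(Lo) | (uint32_t(Hi) << 16);
    }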
@@ -2733,49 +2746,47 @@ static MachineInstr *swapImmOperands(MachineInstr &MI,
}
bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
- const MachineOperand *MO0, unsigned OpIdx1,
- const MachineOperand *MO1) const {
+ unsigned OpIdx1) const {
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
- const TargetRegisterClass *DefinedRC1 =
- OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
- const TargetRegisterClass *DefinedRC0 =
- OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
unsigned Opc = MI.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ const MachineOperand &MO0 = MI.getOperand(OpIdx0);
+ const MachineOperand &MO1 = MI.getOperand(OpIdx1);
+
// Swap doesn't breach constant bus or literal limits.
// It may move a literal to a position other than src0, which is not allowed
// pre-gfx10. However, most test cases need literals in Src0 for VOP.
// FIXME: After gfx9, literal can be in place other than Src0
if (isVALU(MI)) {
- if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
- !isInlineConstant(*MO0, OpInfo1))
+ if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
+ !isInlineConstant(MO0, OpInfo1))
return false;
- if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
- !isInlineConstant(*MO1, OpInfo0))
+ if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
+ !isInlineConstant(MO1, OpInfo0))
return false;
}
- if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
- if (!DefinedRC1)
+ if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
+ if (OpInfo1.RegClass == -1)
return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
- return isLegalRegOperand(MI, OpIdx1, *MO0) &&
- (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1));
+ return isLegalRegOperand(MI, OpIdx1, MO0) &&
+ (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
}
- if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
- if (!DefinedRC0)
+ if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
+ if (OpInfo0.RegClass == -1)
return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
- return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) &&
- isLegalRegOperand(MI, OpIdx0, *MO1);
+ return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
+ isLegalRegOperand(MI, OpIdx0, MO1);
}
// No need to check 64-bit literals since swapping does not bring new
// 64-bit literals into current instruction to fold to 32-bit
- return isImmOperandLegal(MI, OpIdx1, *MO0);
+ return isImmOperandLegal(MI, OpIdx1, MO0);
}
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
@@ -2797,12 +2808,12 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
static_cast<int>(Src1Idx) &&
"inconsistency with findCommutedOpIndices");
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
- MachineOperand &Src1 = MI.getOperand(Src1Idx);
- if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
+ if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
return nullptr;
- }
+
MachineInstr *CommutedMI = nullptr;
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
if (Src0.isReg() && Src1.isReg()) {
// Be sure to copy the source modifiers to the right place.
CommutedMI =
@@ -4238,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
+bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
+ if (!isFLAT(MI) || isFLATGlobal(MI))
+ return false;
+
+ // If scratch is not initialized, we can never access it.
+ if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
+ return false;
+
+ // SCRATCH instructions always access scratch.
+ if (isFLATScratch(MI))
+ return true;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access scratch.
+ if (MI.memoperands_empty())
+ return true;
+
+  // TODO (?): Does this need to be taught how to read noalias.addrspace?
+
+ // See if any memory operand specifies an address space that involves scratch.
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+ unsigned AS = Memop->getAddrSpace();
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ });
+}
+
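
The new home of this check keeps the same conservative shape, which can be sketched independently of MIR (function name invented; AMDGPU address-space numbering assumed: FLAT = 0, PRIVATE = 5):

    #include <vector>
    // Sketch of the pattern above: with no memory operands, answer "maybe
    // scratch" (true); otherwise any flat or private operand keeps the
    // conservative answer.
    static bool mayTouchScratch(const std::vector<unsigned> &MemOperandAS) {
      if (MemOperandAS.empty())
        return true; // no information, stay conservative
      for (unsigned AS : MemOperandAS)
        if (AS == 0 /*FLAT*/ || AS == 5 /*PRIVATE*/)
          return true;
      return false;
    }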
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
// Skip the full operand and register alias search modifiesRegister
// does. There's only a handful of instructions that touch this, it's only an
@@ -4401,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
return AMDGPU::isInlinableLiteralV2BF16(Imm);
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ return false;
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
@@ -9244,6 +9283,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
default:
if (MI.isMetaInstruction())
return 0;
+
+    // If this is a D16 pseudo instruction, get the correct MC code size.
+ const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
+ if (D16Info) {
+      // Assume the d16_lo and d16_hi variants are always the same size.
+ unsigned LoInstOpcode = D16Info->LoOp;
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
+ DescSize = Desc.getSize();
+ }
+
return DescSize;
}
}
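
The D16 special case amounts to a two-step table walk; a hedged sketch with hypothetical table types (the real helper is AMDGPU::getT16D16Helper), assuming lo and hi variants encode to the same width as the comment above states:

    #include <map>
    struct T16D16Entry { unsigned LoOp, HiOp; }; // hypothetical row shape
    static unsigned pseudoInstSize(unsigned Opc,
                                   const std::map<unsigned, T16D16Entry> &T16Map,
                                   const std::map<unsigned, unsigned> &MCSizes,
                                   unsigned DescSize) {
      auto It = T16Map.find(Opc);
      if (It == T16Map.end())
        return DescSize; // not a D16 pseudo: keep the generic size
      return MCSizes.at(It->second.LoOp); // lo/hi assumed equal in size
    }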
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 800ea9a..e042b59 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -197,8 +197,7 @@ protected:
AMDGPU::OpName Src0OpName, MachineOperand &Src1,
AMDGPU::OpName Src1OpName) const;
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx,
- const MachineOperand *fromMO, unsigned toIdx,
- const MachineOperand *toMO) const;
+ unsigned toIdx) const;
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
@@ -679,6 +678,12 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with
+ /// SCRATCH_ memory operands.
+ /// Conservatively correct; will return true if \p MI cannot be proven
+  /// not to access scratch.
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+
static bool isBlockLoadStore(uint16_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 485ca78..a3e20ba 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1313,6 +1313,10 @@ def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">;
def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
+def ScaleSel : NamedIntOperand<"scale_sel"> {
+ let Validator = "isUInt<3>";
+}
+
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_KIMM"#vt.Size;
@@ -1662,6 +1666,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
+def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
@@ -2857,15 +2863,18 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;
def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>;
+def VOP_I16_V2F16 : VOPProfile<[i16, v2f16, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
+def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>;
def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>;
def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>;
@@ -2917,10 +2926,13 @@ def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=
def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>;
def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
+def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>;
+def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>;
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
@@ -2936,6 +2948,13 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;
def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>;
def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>;
+def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>;
+def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>;
+def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>;
+def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>;
+def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>;
+def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>;
+def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>;
def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d05be8f..54fa192 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
let SubtargetPredicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
+// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+let True16Predicate = UseFakeTrue16Insts in
+def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
@@ -1903,6 +1906,13 @@ def : GCNPat <
>;
}
+let SubtargetPredicate = HasBF16PackedInsts in {
+def : GCNPat <
+ (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
+ (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
+ $src0_modifiers, $src0, DSTCLAMP.ENABLE)
+>;
+} // End SubtargetPredicate = HasBF16PackedInsts
/********** ================================ **********/
/********** Floating point absolute/negative **********/
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 9f61bf8..9509199 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
MachineRegisterInfo &MRI = MF.getRegInfo();
BitVector ReservedRegs = TRI->getReservedRegs(MF);
BitVector NonWwmAllocMask(TRI->getNumRegs());
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future
// to have a balanced allocation between WWM values and per-thread vector
@@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
NumRegs =
std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs);
- auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF);
+ auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
// Try to use the highest available registers for now. Later after
// vgpr-regalloc, they can be shifted to the lowest range.
unsigned I = 0;
@@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
// Reserve an arbitrary register and report the error.
TRI->markSuperRegs(RegMask, AMDGPU::VGPR0);
MF.getFunction().getContext().emitError(
- "can't find enough VGPRs for wwm-regalloc");
+ "cannot find enough VGPRs for wwm-regalloc");
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index f0be204..9a1448f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -81,11 +81,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
- if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
- ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
- !mayUseAGPRs(F))
- MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ MayNeedAGPRs = ST.hasMAIInsts();
+ if (ST.hasGFX90AInsts()) {
+ // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
+ // should be separated from availability of AGPRs
+ if (MFMAVGPRForm ||
+ (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
+ !mayUseAGPRs(F)))
+ MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ }
if (AMDGPU::isChainCC(CC)) {
// Chain functions don't receive an SP from their caller, but are free to
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0e8a420..53f554e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -321,7 +321,7 @@ public:
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
- virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+ virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
};
@@ -602,7 +602,7 @@ public:
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
- bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
+ bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
+ Changed = true;
+ }
+
if (Pos == Position::AFTER)
--MI;
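
S_WAITCNT_lds_direct follows a common two-phase compiler pattern: one pass records an abstract obligation, and a later pass that tracks counters makes it concrete. A schematic sketch (all names invented):

    enum class WaitKind { LdsDirect, Concrete };
    struct WaitMarker { WaitKind Kind; unsigned Count; };
    // Phase 1 (memory legalizer): record the obligation without knowing counts.
    static WaitMarker emitPlaceholder() { return {WaitKind::LdsDirect, 0}; }
    // Phase 2 (SIInsertWaitcnts): resolve against outstanding direct LDS loads,
    // yielding the vmcnt() the comment above promises.
    static WaitMarker resolve(WaitMarker M, unsigned OutstandingDirectLoads) {
      if (M.Kind == WaitKind::LdsDirect)
        return {WaitKind::Concrete, OutstandingDirectLoads};
      return M;
    }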
@@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
+ Changed = true;
+ }
+
if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
@@ -2536,9 +2556,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
- if (Op == SIMemOp::STORE)
- Changed |= insertWaitsBeforeSystemScopeStore(MI);
-
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2551,11 +2568,26 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
-bool SIGfx12CacheControl::expandSystemScopeStore(
- MachineBasicBlock::iterator &MI) const {
- MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
- if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
- return insertWaitsBeforeSystemScopeStore(MI);
+bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol)
+ return false;
+
+ const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+
+ // GFX12.0 only: Extra waits needed before system scope stores.
+ if (!ST.hasGFX1250Insts()) {
+ if (!Atomic && Scope == CPol::SCOPE_SYS)
+ return insertWaitsBeforeSystemScopeStore(MI);
+ return false;
+ }
+
+ // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
+ // space.
+  // We also require at least SCOPE_SE if we do not have the "cu-stores" feature.
+ if (Scope == CPol::SCOPE_CU &&
+ (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
+ return setScope(MI, CPol::SCOPE_SE);
return false;
}
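
The two generations finalizeStore now serves can be summarized as one decision function; a hedged sketch with invented names, assuming the usual CU < SE < DEV < SYS scope ordering:

    enum Scope { SCOPE_CU, SCOPE_SE, SCOPE_DEV, SCOPE_SYS };
    struct StoreFixup { bool InsertWaits; Scope NewScope; };
    static StoreFixup finalizeStoreSketch(Scope S, bool Atomic, bool IsGfx1250,
                                          bool HasCUStores, bool MayHitScratch) {
      // GFX12.0: plain system-scope stores need extra waits inserted first.
      if (!IsGfx1250)
        return {!Atomic && S == SCOPE_SYS, S};
      // GFX12.5: promote CU-scoped stores to SE scope when CU stores are
      // unsupported or the store may land in scratch.
      if (S == SCOPE_CU && (!HasCUStores || MayHitScratch))
        return {false, SCOPE_SE};
      return {false, S};
    }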
@@ -2658,6 +2690,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ // FIXME: Necessary hack because iterator can lose track of the store.
+ MachineInstr &StoreMI = *MI;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2674,6 +2708,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
return Changed;
}
@@ -2686,7 +2721,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
- Changed |= CC->expandSystemScopeStore(MI);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 84cfa87..f3acc5c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}
-std::pair<unsigned, unsigned>
-SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
- const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF);
-
- unsigned MaxNumVGPRs = MaxVectorRegs;
- unsigned MaxNumAGPRs = 0;
-
- // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
- // a wave may have up to 512 total vector registers combining together both
- // VGPRs and AGPRs. Hence, in an entry function without calls and without
- // AGPRs used within it, it is possible to use the whole vector register
- // budget for VGPRs.
- //
- // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
- // register file accordingly.
- if (ST.hasGFX90AInsts()) {
- unsigned MinNumAGPRs = 0;
- const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
- const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
-
- const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
-
- // TODO: Move this logic into subtarget on IR function
- //
- // TODO: The lower bound should probably force the number of required
- // registers up, overriding amdgpu-waves-per-eu.
- std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
- MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
- /*OnlyFirstRequired=*/true);
-
- if (MinNumAGPRs == DefaultNumAGPR.first) {
- // Default to splitting half the registers if AGPRs are required.
- MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
- } else {
- // Align to accum_offset's allocation granularity.
- MinNumAGPRs = alignTo(MinNumAGPRs, 4);
-
- MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
- }
-
- // Clamp values to be inbounds of our limits, and ensure min <= max.
-
- MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
- MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
-
- MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
- MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
-
- assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
- MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
- "invalid register counts");
- } else if (ST.hasMAIInsts()) {
- // On gfx908 the number of AGPRs always equals the number of VGPRs.
- MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
- }
-
- return std::pair(MaxNumVGPRs, MaxNumAGPRs);
-}
-
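
For reference, with a gfx90a-class budget of 512 vector registers and no "amdgpu-agpr-alloc" attribute, the deleted logic (now expected from GCNSubtarget::getMaxNumVectorRegs) lands on an even 256/256 split; a worked sketch of the default path:

    #include <algorithm>
    #include <cassert>
    static void defaultSplitExample() {
      const unsigned MaxVectorRegs = 512, TotalVGPRs = 256, TotalAGPRs = 256;
      // Attribute absent: default to splitting half the registers.
      unsigned MinAGPRs = MaxVectorRegs / 2, MaxAGPRs = MaxVectorRegs / 2;
      MaxAGPRs = std::min(std::max(MinAGPRs, MaxAGPRs), MaxVectorRegs);    // 256
      MinAGPRs = std::min(std::min(MinAGPRs, TotalAGPRs), MaxAGPRs);       // 256
      unsigned MaxVGPRs = std::min(MaxVectorRegs - MinAGPRs, TotalVGPRs);  // 256
      MaxAGPRs = std::min(MaxVectorRegs - MaxVGPRs, MaxAGPRs);             // 256
      assert(MaxVGPRs == 256 && MaxAGPRs == 256);
    }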
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::MODE);
@@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Reserve VGPRs/AGPRs.
//
- auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);
+ auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
for (const TargetRegisterClass *RC : regclasses()) {
if (RC->isBaseClass() && isVGPRClass(RC)) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 0008e5f..5508f07 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -90,11 +90,6 @@ public:
/// spilling is needed.
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
- /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
- /// of waves per execution unit required for the function \p MF.
- std::pair<unsigned, unsigned>
- getMaxNumVectorRegs(const MachineFunction &MF) const;
-
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 0039d2f..36d1a3b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
let TSFlags{2} = HasVGPR;
let TSFlags{3} = HasAGPR;
let TSFlags{4} = HasSGPR;
+
+  // RA uses RegisterClass AllocationPriority, among other information (e.g. ordering in the basic
+  // block), to decide which registers to try to assign first. This priority is usually weighted
+  // very heavily, if not the most heavily, when choosing which VirtReg to allocate next.
+ //
+ // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
+ // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
+ // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32).
+ //
+  // The interesting case is vector registers on architectures that have ARegs, VRegs, and AVRegs.
+ // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
+ // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
+ // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
+ // is used for scaling of the bit (i.e. 1 << 4).
+ field int BaseClassPriority = 1;
+ field int BaseClassScaleFactor = 16;
+
}
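
For the tuple classes defined via VRegClassBase below, the resulting 5-bit encoding reduces to one formula; a sketch mirroring the TableGen arithmetic (function name invented):

    // AllocationPriority = SizePriority (bits 0-3) + BaseClassPriority * 16 (bit 4).
    static unsigned allocPriority(unsigned NumRegs, bool IsBaseClass) {
      unsigned SizePriority =
          NumRegs <= 14 ? NumRegs - 1 : (NumRegs <= 16 ? 14u : 15u);
      return SizePriority + (IsBaseClass ? 16u : 0u); // always fits in 5 bits
    }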
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -575,7 +592,7 @@ let HasVGPR = 1 in {
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
(sequence "VGPR%u_HI16", 0, 255)))> {
- let AllocationPriority = 2;
+ let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 16;
let GeneratePressureSet = 0;
@@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types
// Identical to VGPR_32 except it only contains the low 128 (Lo128) registers.
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 127))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;
@@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "AGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Requires n v_mov_b32 to copy
let CopyCost = numRegs;
- let AllocationPriority = !sub(numRegs, 1);
+
+  // Since we only have 5 bits for the RegisterClass AllocationPriority, and since we use the
+  // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
+  // of this encoding, registers with numRegs 15 or 16 get a SizePriority of 14, and registers
+  // with numRegs 17+ get a SizePriority of 15. In practice, there is only one
+  // RegClass per vector register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
+  // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
+  defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
+
+  let AllocationPriority = !add(SizePriority, !mul(BaseClassPriority, BaseClassScaleFactor));
let Weight = numRegs;
}
// Define a register tuple class, along with one requiring an even
// aligned base register.
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let HasVGPR = 1 in {
+ let HasVGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
}
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
+ let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
let HasVGPR = 1;
let HasAGPR = 1;
+ let BaseClassPriority = 0;
let Size = 32;
}
} // End GeneratePressureSet = 0
@@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
// aligned base register.
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
dag vregList, dag aregList> {
- let HasVGPR = 1, HasAGPR = 1 in {
+ let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
@@ -1191,6 +1218,8 @@ def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> {
def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">;
def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">;
+def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">;
+
//===----------------------------------------------------------------------===//
// VRegSrc_* Operands with a VGPR
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 38cc51b..4bda51d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -856,9 +856,9 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>;
def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
- [{ return !N->getOperand(1)->isDivergent();}]> {
+ [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> {
let GISelPredicateCode = [{
- return isInstrUniform(MI);
+ return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch();
}];
}
@@ -1152,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
}
defm : SMPrefetchPat<"INST", i32imm_zero>;
+let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case.
defm : SMPrefetchPat<"DATA", i32imm_one>;
let SubtargetPredicate = isGFX12Plus in {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e103ccc..8303410 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
+// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
+
+def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
+ let hasSideEffects = 0;
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b5b3cc9..5827f18 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) {
}
bool isAsyncStore(unsigned Opc) {
- return false; // placeholder before async store implementation.
+ return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250;
}
bool isTensorStore(unsigned Opc) {
@@ -2652,6 +2659,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
@@ -3016,6 +3024,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) {
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
return isInlinableLiteralV2BF16(Literal);
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ return false;
default:
llvm_unreachable("bad packed operand type");
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index c09a9d6..74d59f4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1636,6 +1636,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
return 2;
default:
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 550ec9d..9de7d6d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1344,6 +1344,8 @@ def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">;
} // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit]
let SubtargetPredicate = HasPkFmacF16Inst in {
+// FIXME: V_PK_FMAC_F16 is currently not used in instruction selection.
+// If this changes, ensure the DPP variant is not used for GFX11+.
defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;
} // End SubtargetPredicate = HasPkFmacF16Inst
@@ -1904,7 +1906,7 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName,
VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> :
- VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>;
+ VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>;
multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> :
VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index b6f9568..1ffe39d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -32,9 +32,10 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
let HasExtDPP = 0;
}
-let HasExt64BitDPP = 1 in {
-def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
-def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
+def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> {
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
+}
def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let HasClamp = 1;
@@ -44,6 +45,10 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
}
+let HasExt64BitDPP = 1 in {
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
+
class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
let HasExtVOP3DPP = 0;
let HasExtDPP = 0;
@@ -52,10 +57,13 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>;
def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>;
-
-def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> {
+def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> {
let HasExtVOP3DPP = 0;
- let HasExtDPP = 0;
+ let HasExt64BitDPP = 1;
+}
+def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>;
+def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> {
+ let HasClamp = 1;
}
} // End HasExt64BitDPP = 1;
@@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32
defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">;
defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+let SchedRW = [WriteIntMul] in {
+ let SubtargetPredicate = HasMadU32Inst in
+ defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>;
+ let SubtargetPredicate = isGFX1250Plus in {
+ defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>;
+ defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>;
+ }
+}
+
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">;
@@ -185,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f
} // End SchedRW = [WriteDoubleAdd]
} // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1
+let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in {
+defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>;
+defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>;
+defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>;
+defm V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>;
+} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd]
+
} // End isReMaterializable = 1
let Uses = [MODE, VCC, EXEC] in {
@@ -601,8 +625,9 @@ def shl_0_to_4 : PatFrag<
}];
}
-def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
- defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
+class VOP3_CVT_PK_F8_F32_Profile<bit _HasClamp = 0> : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
+ defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
+ (ins VGPR_32:$vdst_in, op_sel0:$op_sel));
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
0, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
@@ -612,12 +637,13 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, false>.ret,
Tail);
- let HasClamp = 0;
+ let HasClamp = _HasClamp;
let HasExtVOP3DPP = 1;
}
-def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
- defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
+class VOP3_CVT_PK_F8_F32_Profile_fake16<bit _HasClamp = 0> : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
+ defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
+ (ins VGPR_32:$vdst_in, op_sel0:$op_sel));
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
0, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
@@ -627,14 +653,15 @@ def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, false>.ret,
Tail);
- let HasClamp = 0;
+ let HasClamp = _HasClamp;
let HasExtVOP3DPP = 1;
}
// This t16 profile with vdst_in operand is for backward compatibility and is used
// for user controlled packing
-def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
- defvar Tail = (ins VGPR_16:$vdst_in, op_sel0:$op_sel);
+class VOP3_CVT_PK_F8_F32_Profile_t16<bit _HasClamp = 0> : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
+ defvar Tail = !con(!if(_HasClamp, (ins Clamp:$clamp), (ins)),
+ (ins VGPR_16:$vdst_in, op_sel0:$op_sel));
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
0, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
@@ -644,7 +671,7 @@ def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_O
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, false>.ret,
Tail);
- let HasClamp = 0;
+ let HasClamp = _HasClamp;
let HasExtVOP3DPP = 1;
}
@@ -678,10 +705,10 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
HasModifiers, DstVT>.ret);
}
-class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
+class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT, bit _HasClamp = 0> :
VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
let HasFP8DstByteSel = 1;
- let HasClamp = 0;
+ let HasClamp = _HasClamp;
}
def IsPow2Plus1: PatLeaf<(i32 imm), [{
@@ -722,6 +749,13 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
defm V_MAXIMUM3_F16 : VOP3Inst_t16 <"v_maximum3_f16", VOP_F16_F16_F16_F16, AMDGPUfmaximum3>;
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in {
+ defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>;
+ defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>;
+ defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>;
+ defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>;
+}
+
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;
@@ -749,15 +783,23 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", V_LSHL_ADD_U64_PROF>;
let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
SchedRW = [WriteFloatCvt] in {
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
- defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile,
- VOP3_CVT_PK_F8_F32_Profile_t16,
- VOP3_CVT_PK_F8_F32_Profile_fake16>;
- defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile,
- VOP3_CVT_PK_F8_F32_Profile_t16,
- VOP3_CVT_PK_F8_F32_Profile_fake16>;
+ let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
+ defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile<>,
+ VOP3_CVT_PK_F8_F32_Profile_t16<>,
+ VOP3_CVT_PK_F8_F32_Profile_fake16<>>;
+ let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in
+ defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32_gfx1250", VOP3_CVT_PK_F8_F32_Profile<true>,
+ VOP3_CVT_PK_F8_F32_Profile_t16<true>,
+ VOP3_CVT_PK_F8_F32_Profile_fake16<true>>;
+ defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile<>,
+ VOP3_CVT_PK_F8_F32_Profile_t16<>,
+ VOP3_CVT_PK_F8_F32_Profile_fake16<>>;
let SubtargetPredicate = isGFX12Plus in {
- defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
+ let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
+ defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
+ let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in
+ defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx1250", VOP3_CVT_SR_F8_ByteSel_Profile<f32, true>>;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
}
}
@@ -776,6 +818,11 @@ class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : G
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
>;
+class Cvt_PK_F8_F32_E5M3_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst, int Clamp> : GCNPat<
+ (i32 (node f32:$src0, f32:$src1, i32:$old, index)),
+ (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, Clamp, $old, 0)
+>;
+
multiclass Cvt_PK_F8_F32_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst> {
def : GCNPat<
(i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
@@ -791,6 +838,21 @@ def : GCNPat<
>;
}
+multiclass Cvt_PK_F8_F32_E5M3_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst, int Clamp> {
+def : GCNPat<
+ (i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
+ (REG_SEQUENCE VGPR_32,
+ (i16 (EXTRACT_SUBREG $old, lo16)), lo16,
+ (i16 (inst SRCMODS.DST_OP_SEL, $src0, 0, $src1, Clamp, (i16 (EXTRACT_SUBREG $old, hi16)), 0)), hi16)
+>;
+def : GCNPat<
+ (i32 (node f32:$src0, f32:$src1, i32:$old, 0)),
+ (REG_SEQUENCE VGPR_32,
+ (i16 (inst 0, $src0, 0, $src1, Clamp, (i16 (EXTRACT_SUBREG $old, lo16)), 0)), lo16,
+ (i16 (EXTRACT_SUBREG $old, hi16)), hi16)
+>;
+}
+
class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
(i32 (node f32:$src0, i32:$src1, i32:$old, index)),
(inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
@@ -803,21 +865,37 @@ class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType
(inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel))
>;
+class Cvt_SR_F8_ByteSel_E5M3_Pat<SDPatternOperator node, VOP3_Pseudo inst,
+ ValueType SrcVT, int Clamp> : GCNPat<
+ (i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers),
+ i32:$old, timm:$byte_sel)),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1, Clamp, $old, (as_i32timm $byte_sel))
+>;
+
let OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, -1] in {
let True16Predicate = NotHasTrue16BitInsts in {
- def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
+ let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
}
let True16Predicate = UseFakeTrue16Insts in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_fake16_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_fake16_e64>;
+ let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
+ def : Cvt_PK_F8_F32_E5M3_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_gfx1250_fake16_e64, DSTCLAMP.NONE>;
+ def : Cvt_PK_F8_F32_E5M3_Pat<int_amdgcn_cvt_pk_fp8_f32_e5m3, Index, V_CVT_PK_FP8_F32_gfx1250_fake16_e64, DSTCLAMP.ENABLE>;
+ }
}
}
let True16Predicate = UseRealTrue16Insts in {
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_t16_e64>;
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_bf8_f32, V_CVT_PK_BF8_F32_t16_e64>;
+ let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
+ defm : Cvt_PK_F8_F32_E5M3_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_gfx1250_t16_e64, DSTCLAMP.NONE>;
+ defm : Cvt_PK_F8_F32_E5M3_t16_Pat<int_amdgcn_cvt_pk_fp8_f32_e5m3, V_CVT_PK_FP8_F32_gfx1250_t16_e64, DSTCLAMP.ENABLE>;
+ }
}
let SubtargetPredicate = isGFX940Plus in {
@@ -828,7 +906,12 @@ let SubtargetPredicate = isGFX940Plus in {
}
let SubtargetPredicate = isGFX12Plus in {
- def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
+ let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
+ let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in {
+ def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.NONE>;
+ def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32_e5m3, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.ENABLE>;
+ }
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
}
}
@@ -848,6 +931,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in
+ def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>;
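+// e.g. (add (mul a, b), c) now selects v_mad_u32 a, b, c (a * b + c);
+// AddedComplexity = 10 gives this pattern priority over separate mul/add
+// selection.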
+
def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
@@ -858,6 +944,13 @@ def : GCNPat<
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
>;
+let SubtargetPredicate = HasAddMinMaxInsts in {
+def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
+def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
+def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
+def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
+}
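+// e.g. smax(add(a, b), c) -> v_add_max_i32 a, b, c; the umax/smin/umin
+// variants map the same way.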
+
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
@@ -972,10 +1065,12 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim
unsigned Val = N->getZExtValue();
unsigned New = 0;
if (}] # modifier_idx # [{ == 0) {
- New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
- : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
- } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) {
- New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+ New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
+ : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
+  } else if (}] # modifier_idx # [{ == 1) {
+    New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+  } else if (}] # modifier_idx # [{ == 2) {
+    New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
}
return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
}]>;
@@ -1427,34 +1522,72 @@ let SubtargetPredicate = isGFX12Plus in {
} // End SubtargetPredicate = isGFX12Plus
-let SubtargetPredicate = HasBitOp3Insts in {
+let HasClamp = 0, HasModifiers = 1 in {
+def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>;
+def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>;
+def BitOp3_B16_fake16_Profile : VOP3_Profile_Fake16<BitOp3_B16_Profile>;
+}
+
+let OtherPredicates = [HasBitOp3Insts] in {
let isReMaterializable = 1 in {
- defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16",
- VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>>;
+ let SubtargetPredicate = isGFX940Plus in
+ defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", BitOp3_B16_Profile>;
+ let SubtargetPredicate = isGFX1250Plus in
+ defm V_BITOP3_B16_gfx1250 : VOP3Inst_t16_with_profiles <"v_bitop3_b16_gfx1250", BitOp3_B16_Profile,
+ BitOp3_B16_t16_Profile, BitOp3_B16_fake16_Profile>;
defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32",
VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>,
VOPD_Component<0x12, "v_bitop2_b32">;
}
+
def : GCNPat<
(i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)),
(i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
>;
def : GCNPat<
- (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
- (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
- >;
-
- def : GCNPat<
(i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)),
(i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
>;
- def : GCNPat<
- (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
- (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
- >;
-} // End SubtargetPredicate = HasBitOp3Insts
+ let SubtargetPredicate = isGFX940Plus in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+ } // End SubtargetPredicate = isGFX940Plus
+
+ let SubtargetPredicate = isGFX1250Plus in {
+ let True16Predicate = UseFakeTrue16Insts in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
+ }
+ let True16Predicate = UseRealTrue16Insts in {
+ def : GCNPat<
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
+ (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0))
+ >;
+ }
+ } // End SubtargetPredicate = isGFX1250Plus
+
+} // End OtherPredicates = [HasBitOp3Insts]
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
(AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
@@ -1531,6 +1664,7 @@ def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true;
let SubtargetPredicate = HasBF16ConversionInsts in {
let ReadsModeReg = 0 in {
defm V_CVT_PK_BF16_F32 : VOP3Inst<"v_cvt_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32>>;
+ defm V_CVT_SR_PK_BF16_F32 : VOP3Inst<"v_cvt_sr_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_bf16_f32>;
}
def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)),
(V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>;
@@ -1541,6 +1675,85 @@ let SubtargetPredicate = HasBF16ConversionInsts in {
(V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>;
}
+class VOP3_CVT_SCALE_PK_F16_F864_Profile<VOPProfile P> : VOP3_CVT_SCALEF32_PK_F864_Profile<P> {
+ let Src0RC64 = getVOP3VRegSrcForVT<Src0VT>.ret;
+ let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+ (ins ScaleSel:$scale_sel));
+ let Asm64 = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
+ HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0Mods, HasSrc1Mods,
+ HasSrc2Mods, DstVT>.ret # "$scale_sel";
+}
+
+multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator node> {
+ def _e64 : VOP3InstBase<OpName, VOP3_CVT_SCALE_PK_F16_F864_Profile<P>> {
+ let Pattern = [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0)), i32:$src1, i32:$scale_sel))];
+ }
+}
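+// Usage sketch (assembly syntax assumed from the trailing ScaleSel operand):
+//   v_cvt_scale_pk8_f16_fp8 v[0:3], v[4:5], v6 scale_sel:1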
+
+let Src0RC64 = VSrc_NoInline_v2f16 in {
+def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>;
+def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>;
+def VOP3_CVT_PK_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_PK_F8_F16_Profile>;
+}
+
+let ReadsModeReg = 0, IsPacked = 0, SubtargetPredicate = isGFX125xOnly in {
+ defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f16_gfx1250",
+ VOP3_CVT_PK_F8_F16_Profile,
+ VOP3_CVT_PK_F8_F16_True16_Profile,
+ VOP3_CVT_PK_F8_F16_Fake16_Profile,
+ int_amdgcn_cvt_pk_fp8_f16>;
+ defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f16_gfx1250",
+ VOP3_CVT_PK_F8_F16_Profile,
+ VOP3_CVT_PK_F8_F16_True16_Profile,
+ VOP3_CVT_PK_F8_F16_Fake16_Profile,
+ int_amdgcn_cvt_pk_bf8_f16>;
+}
+
+let HasClamp = 0, HasOpSel = 1 in {
+def VOP3_CVT_SR_F8_F16_Profile : VOP3_CVT_SR_F8_ByteSel_Profile<f16>;
+def VOP3_CVT_SR_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_SR_F8_F16_Profile>;
+def VOP3_CVT_SR_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_SR_F8_F16_Profile>;
+}
+
+let SubtargetPredicate = isGFX1250Plus in {
+ let ReadsModeReg = 0 in {
+ defm V_CVT_SR_PK_F16_F32 : VOP3Inst<"v_cvt_sr_pk_f16_f32", VOP3_Profile<VOP_V2F16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_f16_f32>;
+
+    // These instructions use op_sel in a non-standard way: bits 2 and 3 of
+    // op_sel select a byte in the vdst, while bits 0 and 1 are unused.
+ let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile,
+ VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>;
+ defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile,
+ VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>;
+ }
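+    // For example, byte_sel = 2 steers the 8-bit result into byte 2 of the
+    // tied $vdst_in value (encoded through op_sel bits 3-2 as noted above),
+    // which is why $vdst must be tied to $vdst_in.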
+
+ let Constraints = "@earlyclobber $vdst" in {
+ defm V_CVT_SCALE_PK8_F16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp8>;
+ defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp8>;
+ defm V_CVT_SCALE_PK8_F16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_bf8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_bf8>;
+ defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>;
+ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>;
+ defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>;
+ } // End Constraints = "@earlyclobber $vdst"
+
+ defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>;
+ defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp4", VOP_V8BF16_I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp4>;
+ defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>;
+ } // End ReadsModeReg = 0
+
+ let True16Predicate = UseRealTrue16Insts in {
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>;
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>;
+ }
+ let True16Predicate = UseFakeTrue16Insts in {
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_fake16_e64, f16>;
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_fake16_e64, f16>;
+ }
+} // End SubtargetPredicate = isGFX1250Plus
+
class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat<
(DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)),
(inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in)
@@ -1746,10 +1959,20 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
-defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
-defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
-defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
-defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
+defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">;
+defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>;
+
+defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>;
+defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>;
+defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>;
+defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>;
+defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>;
+defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>;
+defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>;
+defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>;
+defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>;
+defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>;
+defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>;
//===----------------------------------------------------------------------===//
// GFX11, GFX12
@@ -1911,6 +2134,13 @@ defm V_AND_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36
defm V_OR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x363, "v_or_b16">;
defm V_XOR_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x364, "v_xor_b16">;
+defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<0x369, "v_cvt_pk_fp8_f32">;
+defm V_CVT_PK_FP8_F32_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x369, "v_cvt_pk_fp8_f32">;
+defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
+defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32">;
+defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Only_Realtriple_with_name_gfx1250<0x36b, "V_CVT_SR_FP8_F32_gfx1250", "v_cvt_sr_fp8_f32">;
+defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx11_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
+
let AssemblerPredicate = isGFX11Plus in {
def : AMDGPUMnemonicAlias<"v_add3_nc_u32", "v_add3_u32">;
def : AMDGPUMnemonicAlias<"v_xor_add_u32", "v_xad_u32">;
@@ -1918,7 +2148,25 @@ let AssemblerPredicate = isGFX11Plus in {
// These instructions differ from GFX12 variant by supporting DPP:
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
+defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>;
+defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>;
+defm V_CVT_SCALE_PK8_F16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x29f>;
+defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a0>;
+defm V_CVT_SCALE_PK8_F32_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a1>;
+defm V_CVT_SCALE_PK8_F16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a8>;
+defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a9>;
+defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>;
+defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>;
+defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>;
+defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>;
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
+defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>;
+defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>;
+defm V_CVT_SR_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x370>;
+defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x372, "v_cvt_pk_fp8_f16">;
+defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x373, "v_cvt_pk_bf8_f16">;
+defm V_CVT_SR_FP8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x374>;
+defm V_CVT_SR_BF8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x375>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ea14c77..95fcd4a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
bit UseTiedOutput = useTiedOutput;
+ defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret;
+ defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret;
+ defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret;
+
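+  // Deriving the source constraints from the profile's VTs (rather than
+  // hard-coding VCSrc_f16) lets this one Mix profile also serve the new
+  // bf16 mix instructions defined below.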
dag srcs =
- (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
- FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ (ins FP16InputMods:$src0_modifiers, Src0RC:$src0,
+ FP16InputMods:$src1_modifiers, Src1RC:$src1,
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
dag dpp_srcs =
(ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ FP16InputMods:$src2_modifiers, Src2RC:$src2);
// FIXME: Clamp0 misbehaves with the non-default vdst_in
// following it. For now workaround this by requiring clamp
@@ -161,38 +165,42 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
- Instruction mixhi_inst> {
+ Instruction mixhi_inst,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
+ defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
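+  // With VT = bf16 these resolve to the BF16 mods fragments, so the same
+  // patterns below also cover the v_fma_mix*_bf16 instructions.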
  // At least one of the operands needs to be an fpextend of an f16 (or bf16
  // for the BF16 mix variants) for this to be worthwhile, so we need three
  // patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
- (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
- (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
+ (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
def : GCNPat <
(AMDGPUclamp (build_vector
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
- (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+ (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0,
$hi_src1_modifiers, $hi_src1,
$hi_src2_modifiers, $hi_src2,
DSTCLAMP.ENABLE,
@@ -204,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
@@ -214,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
DSTCLAMP.NONE,
@@ -224,9 +232,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
(mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
@@ -241,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = p in {
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
@@ -253,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
@@ -268,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
- (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
- (v2f16 (mixlo_inst $src0_modifiers, $src0,
+ (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
+ (vecVT (mixlo_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+ (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
>;
def : GCNPat <
- (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
def : GCNPat <
(build_vector
- f16:$elt0,
- (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
- (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ VT:$elt0,
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+ (vecVT (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
DSTCLAMP.ENABLE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
>;
} // end True16Predicate
}
@@ -360,6 +368,24 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}
+let SubtargetPredicate = HasFmaMixBF16Insts in {
+let isCommutable = 1 in {
+
+let isReMaterializable = 1 in
+defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>;
+
+let FPDPRounding = 1 in {
+defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+
+let ClampLo = 0, ClampHi = 1 in {
+defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+}
+} // End FPDPRounding = 1
+} // End isCommutable = 1
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+} // End SubtargetPredicate = HasFmaMixBF16Insts
+
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
let HasModifiers = 0;
}
@@ -1210,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in {
defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+
+ // Scalar pseudo used to emulate AMDGPUClamp.
+ // Expanded to V_PK_MAX_NUM_BF16 with unused high half.
+ // FIXME-TRUE16: Pseudo expansion of this won't work with True16.
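+  // Expansion sketch, assuming the usual clamp-as-max(x, x) idiom:
+  //   v_max_bf16 v0, v1, v1 clamp  ==>  v_pk_max_num_bf16 v0, v1, v1 clamp
+  // with only the low half of the packed result consumed.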
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
}
} // End isCommutable = 1, isReMaterializable = 1
@@ -2247,6 +2279,13 @@ defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>;
defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>;
defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>;
+defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
+defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
+defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
+
+let AssemblerPredicate = isGFX1250Plus in
+def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
+
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c21e2d3..f027ab0 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -401,6 +401,26 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{49-41} = src0;
}
+class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0);
+}
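+// Note: bitop3 is an 8-bit truth table applied bitwise to the three sources
+// (e.g. with a vpternlog-style index s0*4 + s1*2 + s2, table 0xE8 computes
+// the majority function); the table is scattered across the fields above.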
+
+class VOP3a_ScaleSel_gfx1250<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> {
+ bits<3> scale_sel;
+
+ let Inst{13-11} = scale_sel;
+ let Inst{14} = 0;
+}
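+// scale_sel occupies the op_sel bit positions Inst{13-11} (compare the
+// op_sel mapping in VOP3a_BITOP3_gfx12 above), with the remaining op_sel
+// bit cleared.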
+
class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<6> attr;
bits<2> attrchan;
@@ -1506,6 +1526,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1525,6 +1546,7 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1540,6 +1562,7 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
let HasFP8DstByteSel = P.HasFP8DstByteSel;
let HasOMod = P.HasOMod;
+ let HasBitOp3 = P.HasBitOp3;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1723,6 +1746,34 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
let Inst{14 - 8} = sdst;
}
+class VOP3_BITOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo p, GFXGen Gen, string asmName>
+ : VOP3_DPP16_Gen_t16<op, p, Gen, asmName> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0);
+}
+
+class VOP3_BITOP3_DPP8<bits<10> op, VOP_Pseudo p, string asmName>
+ : Base_VOP3_DPP8_t16<op, p, asmName> {
+ bits<8> bitop3;
+
+ let Inst{60-59} = bitop3{7-6};
+ let Inst{10-8} = bitop3{5-3};
+ let Inst{63-61} = bitop3{2-0};
+
+ let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0);
+}
+
class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: Base_VOP3_DPP8<op, ps, opName> {
bits<8> sdst;
@@ -1943,6 +1994,53 @@ multiclass VOP3be_Realtriple<
multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> :
VOP3be_Realtriple<Gen, op, 1>;
+multiclass VOP3_BITOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName> {
+ def _e64_dpp#Gen.Suffix :
+ VOP3_BITOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(NAME#"_e64"#"_dpp"), Gen, asmName>;
+}
+
+multiclass VOP3_BITOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(NAME#"_e64");
+ def _e64_dpp8#Gen.Suffix : VOP3_BITOP3_DPP8<op, ps, asmName> {
+ let DecoderNamespace =
        Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ }
+}
+
+multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ let IsSingle = ps.Pfl.IsSingle, AsmString = asmName # ps.AsmOperands in {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3a_BITOP3_gfx12<op, ps.Pfl>;
+ }
+}
+
+multiclass VOP3Only_ScaleSel_Real_gfx1250<bits<10> op> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ def _e64_gfx1250 :
+ VOP3_Real_Gen<ps, GFX1250Gen>,
+ VOP3a_ScaleSel_gfx1250<op, ps.Pfl>;
+}
+
+multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName, string opName = NAME,
+ string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
+ VOP3_Realtriple_with_name<GFX12Not12_50Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName,
+ string opName = NAME, string pseudo_mnemonic = ""> {
+ defm _t16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic, 1>;
+ defm _fake16 : VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic, 1>;
+}
+
+multiclass VOP3_Realtriple_with_name_gfx11_gfx12_not_gfx1250<bits<10> op, string opName,
+ string asmName, string pseudo_mnemonic = "",
+ bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
+ VOP3_Realtriple_with_name<GFX12Not12_50Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
//===----------------------------------------------------------------------===//
// VOP3 GFX11
//===----------------------------------------------------------------------===//
@@ -2004,6 +2102,15 @@ multiclass VOP3Only_Real_Base_gfx1250<bits<10> op> :
multiclass VOP3Only_Realtriple_gfx1250<bits<10> op, bit isSingle = 0> :
VOP3_Realtriple<GFX1250Gen, op, isSingle>;
+multiclass VOP3Only_Realtriple_with_name_gfx1250<bits<10> op, string opName,
+ string asmName, string pseudo_mnemonic = "",
+ bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX1250Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3Only_Realtriple_t16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+ string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> :
+ VOP3Only_Realtriple_with_name_gfx1250<op, opName, asmName, pseudo_mnemonic, isSingle>;
+
multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME,
string pseudo_mnemonic = "", bit isSingle = 0> :
VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
@@ -2024,6 +2131,13 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName,
defm _fake16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic>;
}
+multiclass VOP3Only_Realtriple_t16_and_fake16_gfx1250<bits<10> op,
+ string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+ string opName = NAME, string pseudo_mnemonic = ""> {
+ defm _t16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic>;
+ defm _fake16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic>;
+}
+
multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
string asmName, bit isSingle = 0> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -2046,6 +2160,16 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
+multiclass VOP3_Real_BITOP3_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> :
+ VOP3_BITOP3_Real_Base<GFX1250Gen, op, asmName>,
+ VOP3_BITOP3_Real_dpp_Base<GFX1250Gen, op, asmName>,
+ VOP3_BITOP3_Real_dpp8_Base<GFX1250Gen, op, asmName>;
+
+multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> {
+ defm _t16 : VOP3_Real_BITOP3_gfx1250<op, asmName>;
+  defm _fake16 : VOP3_Real_BITOP3_gfx1250<op, asmName>;
+}
+
multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0,
string opName = NAME> :
VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>,