Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.h | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.td | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 143
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 31
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUFeatures.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 59
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 66
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 32
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 26
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 80
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 119
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 15
-rw-r--r-- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/FLATInstructions.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 35
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 151
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 56
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 43
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 145
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 42
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIDefines.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 122
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrFormats.td | 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 22
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 62
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 60
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 15
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/SOPInstructions.td | 44
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 85
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 18
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 13
62 files changed, 1092 insertions, 557 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 007b481..0059a86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
-struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &);
extern char &SIOptimizeExecMaskingPreRAID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index f266398..8e4b636 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
"gfx12",
- [FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128,
+ [FeatureFP64, FeatureMIMG_R128,
FeatureFlatAddressSpace, Feature16BitInsts,
FeatureInv2PiInlineImm, FeatureApertureRegs,
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
@@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
+ FeatureAddressableLocalMemorySize65536,
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureDot7Insts,
@@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
FeatureCUStores,
+ FeatureAddressableLocalMemorySize327680,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2a324e5..66c3fad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -41,6 +41,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Compiler.h"
@@ -733,6 +734,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutContext, IsLocal));
}
+ // Emit _dvgpr$ symbol when appropriate.
+ emitDVgprSymbol(MF);
+
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
@@ -875,6 +879,49 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
+// When appropriate, add a _dvgpr$ symbol, with the value of the function
+// symbol, plus an offset encoding one less than the number of VGPR blocks used
+// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
+// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
+// used by a front-end to have functions that are chained rather than called,
+// and a dispatcher that dynamically resizes the VGPR count before dispatching
+// to a function.
+void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI.isDynamicVGPREnabled() &&
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) {
+ MCContext &Ctx = MF.getContext();
+ unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
+ MCValue NumVGPRs;
+ if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
+ NumVGPRs, nullptr) ||
+ !NumVGPRs.isAbsolute()) {
+ llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
+ }
+ // Calculate number of VGPR blocks.
+ // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
+ unsigned NumBlocks =
+ divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
+
+ if (NumBlocks > 8) {
+ OutContext.reportError({},
+ "too many DVGPR blocks for _dvgpr$ symbol for '" +
+ Twine(CurrentFnSym->getName()) + "'");
+ return;
+ }
+ unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
+ // Add to function symbol to create _dvgpr$ symbol.
+ const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
+ MCSymbolRefExpr::create(CurrentFnSym, Ctx),
+ MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
+ MCSymbol *DVgprFuncSym =
+ Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
+ OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
+ emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
+ emitLinkage(&MF.getFunction(), DVgprFuncSym);
+ }
+}
+
// TODO: Fold this into emitFunctionBodyStart.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
// In the beginning all features are either 'Any' or 'NotSupported',
@@ -997,89 +1044,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const Function &F = MF.getFunction();
// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
- // dispatch registers are function args.
- unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
-
- if (isShader(F.getCallingConv())) {
- bool IsPixelShader =
- F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
-
- // Calculate the number of VGPR registers based on the SPI input registers
- uint32_t InputEna = 0;
- uint32_t InputAddr = 0;
- unsigned LastEna = 0;
-
- if (IsPixelShader) {
- // Note for IsPixelShader:
- // By this stage, all enabled inputs are tagged in InputAddr as well.
- // We will use InputAddr to determine whether the input counts against the
- // vgpr total and only use the InputEnable to determine the last input
- // that is relevant - if extra arguments are used, then we have to honour
- // the InputAddr for any intermediate non-enabled inputs.
- InputEna = MFI->getPSInputEnable();
- InputAddr = MFI->getPSInputAddr();
-
- // We only need to consider input args up to the last used arg.
- assert((InputEna || InputAddr) &&
- "PSInputAddr and PSInputEnable should "
- "never both be 0 for AMDGPU_PS shaders");
- // There are some rare circumstances where InputAddr is non-zero and
- // InputEna can be set to 0. In this case we default to setting LastEna
- // to 1.
- LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
- }
+ // dispatch registers as function args.
+ unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
+ WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
- // FIXME: We should be using the number of registers determined during
- // calling convention lowering to legalize the types.
- const DataLayout &DL = F.getDataLayout();
- unsigned PSArgCount = 0;
- unsigned IntermediateVGPR = 0;
- for (auto &Arg : F.args()) {
- unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
- if (Arg.hasAttribute(Attribute::InReg)) {
- WaveDispatchNumSGPR += NumRegs;
- } else {
- // If this is a PS shader and we're processing the PS Input args (first
- // 16 VGPR), use the InputEna and InputAddr bits to define how many
- // VGPRs are actually used.
- // Any extra VGPR arguments are handled as normal arguments (and
- // contribute to the VGPR count whether they're used or not).
- if (IsPixelShader && PSArgCount < 16) {
- if ((1 << PSArgCount) & InputAddr) {
- if (PSArgCount < LastEna)
- WaveDispatchNumVGPR += NumRegs;
- else
- IntermediateVGPR += NumRegs;
- }
- PSArgCount++;
- } else {
- // If there are extra arguments we have to include the allocation for
- // the non-used (but enabled with InputAddr) input arguments
- if (IntermediateVGPR) {
- WaveDispatchNumVGPR += IntermediateVGPR;
- IntermediateVGPR = 0;
- }
- WaveDispatchNumVGPR += NumRegs;
- }
- }
- }
+ if (WaveDispatchNumSGPR) {
ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
- {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+ {ProgInfo.NumSGPR,
+ MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
+ Ctx)},
+ Ctx);
+ }
+ if (WaveDispatchNumVGPR) {
ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
- } else if (isKernel(F.getCallingConv()) &&
- MFI->getNumKernargPreloadedSGPRs()) {
- // Consider cases where the total number of UserSGPRs with trailing
- // allocated preload SGPRs, is greater than the number of explicitly
- // referenced SGPRs.
- const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
- CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
- ProgInfo.NumSGPR =
- AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
@@ -1168,7 +1150,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = Mode.DX10Clamp;
unsigned LDSAlignShift;
- if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+ if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
+ // LDS is allocated in 256 dword blocks.
+ LDSAlignShift = 10;
+ } else if (STM.getFeatureBits().test(
+ FeatureAddressableLocalMemorySize163840)) {
// LDS is allocated in 320 dword blocks.
LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
@@ -1205,8 +1191,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
CreateExpr(STM.getWavefrontSize()), Ctx),
CreateExpr(1ULL << ScratchAlignShift));
- if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
+ if (STM.supportsWGP()) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
+ }
+
+ if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.MemOrdered = 1;
ProgInfo.FwdProgress = 1;
}
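
For reference, the symbol value emitted by emitDVgprSymbol above comes down to a few lines of arithmetic. A minimal standalone sketch (the helper name is hypothetical, not part of this patch):

#include <algorithm>
#include <cstdint>

// Mirrors the encoding in emitDVgprSymbol: one less than the number of
// VGPR blocks, placed in bits 5..3 of the symbol offset. BlockSize is
// 16 (for a max of 128 VGPRs) or 32 (for a max of 256 VGPRs).
uint32_t encodeDVgprBlocks(uint32_t NumVGPRs, uint32_t BlockSize) {
  // Treat 0 VGPRs as 1 to avoid underflow, then round up to whole blocks.
  uint32_t NumBlocks = (std::max(NumVGPRs, 1u) + BlockSize - 1) / BlockSize;
  // Valid range is 1..8 blocks; the asm printer reports an error beyond 8.
  return (NumBlocks - 1) << 3;
}

// Example: 40 VGPRs with BlockSize 16 -> 3 blocks -> (3 - 1) << 3 = 16,
// so _dvgpr$foo is assigned the value of foo plus 16.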
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 63589d2..9e854fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -54,6 +54,9 @@ private:
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
+ // When appropriate, add a _dvgpr$ symbol.
+ void emitDVgprSymbol(MachineFunction &MF);
+
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
const SIProgramInfo &KernelInfo,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 3d8d274..d1a5b4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
++i;
}
+ if (Info->getNumKernargPreloadedSGPRs())
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
+
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
@@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!determineAssignments(Assigner, SplitArgs, CCInfo))
return false;
+ if (IsEntryFunc) {
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ }
+
FormalArgHandler Handler(B, MRI);
if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
return false;
@@ -1464,9 +1476,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (Function *F = Info.CB->getCalledFunction())
if (F->isIntrinsic()) {
- assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
- "Unexpected intrinsic");
- return lowerChainCall(MIRBuilder, Info);
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::amdgcn_cs_chain:
+ return lowerChainCall(MIRBuilder, Info);
+ case Intrinsic::amdgcn_call_whole_wave:
+ Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
+
+ // Get the callee from the original instruction, so it doesn't look like
+ // this is an indirect call.
+ Info.Callee = MachineOperand::CreateGA(
+ cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
+ Info.OrigArgs.erase(Info.OrigArgs.begin());
+ Info.IsVarArg = false;
+ break;
+ default:
+ llvm_unreachable("Unexpected intrinsic call");
+ }
}
if (Info.IsVarArg) {
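
The two setNumWaveDispatch* calls above lean on CCState::getFirstUnallocated, which scans a register array in order and returns the index of the first register that was never allocated. A minimal sketch of that contract, using a hypothetical standalone analogue rather than the LLVM class:

#include <cstddef>
#include <functional>
#include <vector>

// Hypothetical analogue of CCState::getFirstUnallocated, for illustration.
size_t firstUnallocated(const std::vector<unsigned> &Regs,
                        const std::function<bool(unsigned)> &IsAllocated) {
  for (size_t I = 0; I != Regs.size(); ++I)
    if (!IsAllocated(Regs[I]))
      return I;
  return Regs.size();
}

// With SGPR0..SGPR5 allocated and the list in ascending order, this
// returns 6 -- exactly the wave-dispatch SGPR count recorded above. The
// "ascending order with no gaps" assumption is why lowerFormalArguments
// documents it explicitly.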
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index 74d1fae..d14b5ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
+def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>;
class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
"wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 9d6584a..04c4d00 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -76,6 +76,40 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
return false;
}
+static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
+ llvm::SelectionDAG *CurDAG,
+ const GCNSubtarget *Subtarget) {
+ if (!Subtarget->useRealTrue16Insts()) {
+ return Lo;
+ }
+
+ SDValue NewSrc;
+ SDLoc SL(Lo);
+
+ if (Lo->isDivergent()) {
+ SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ SL, Lo.getValueType()),
+ 0);
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
+ CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
+ CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
+
+ NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
+ Src.getValueType(), Ops),
+ 0);
+ } else {
+    // The S_MOV is needed since Lo could still be a VGPR16.
+    // With S_MOV, isel inserts a "sgpr32 = copy vgpr16" and we rely on
+    // the fixvgpr2sgprcopy pass to legalize it.
+ NewSrc = SDValue(
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
+ 0);
+ }
+
+ return NewSrc;
+}
+
// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
@@ -1162,18 +1196,25 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
+ SDVTList VTList;
unsigned Opc;
- if (Subtarget->hasMADIntraFwdBug())
- Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
- : AMDGPU::V_MAD_U64_U32_gfx11_e64;
- else
- Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
+ if (Subtarget->hasMadU64U32NoCarry()) {
+ VTList = CurDAG->getVTList(MVT::i64);
+ Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
+ } else {
+ VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
+ if (Subtarget->hasMADIntraFwdBug()) {
+ Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
+ : AMDGPU::V_MAD_U64_U32_gfx11_e64;
+ } else {
+ Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
+ }
+ }
SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
- SDNode *Mad = CurDAG->getMachineNode(
- Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops);
+ SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
if (!SDValue(N, 0).use_empty()) {
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
@@ -3412,8 +3453,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
// Really a scalar input. Just select from the low half of the register to
// avoid packing.
- if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
+ if (VecSize == Lo.getValueSizeInBits()) {
Src = Lo;
+ } else if (VecSize == 32) {
+ Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
} else {
assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
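
The SelectMUL_LOHI change works because a 64-bit multiply-add with a zero addend produces both halves of the 32x32 product; the sub0/sub1 extracts then recover lo and hi. A self-contained check of that identity (plain C++, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0xDEADBEEFu, B = 0x12345678u;
  // V_MAD_U64_U32 with a zero addend: Full = A * B + 0.
  uint64_t Full = (uint64_t)A * (uint64_t)B + 0;
  uint32_t Lo = (uint32_t)Full;         // EXTRACT_SUBREG sub0
  uint32_t Hi = (uint32_t)(Full >> 32); // EXTRACT_SUBREG sub1
  assert(Lo == A * B); // low half matches the plain 32-bit multiply
  (void)Hi;
  return 0;
}

The V_MAD_NC_* opcodes have no carry-out, which is why the VT list shrinks to a single i64 on subtargets where hasMadU64U32NoCarry() holds.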
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab..a28e272 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4002,7 +4002,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
- case Intrinsic::amdgcn_tanh: {
+ case Intrinsic::amdgcn_tanh:
+ case Intrinsic::amdgcn_prng_b32: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b7fd131..5d31eed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2368,8 +2368,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
return selectNamedBarrierInit(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_get_named_barrier_state:
return selectNamedBarrierInst(I, IntrinsicID);
case Intrinsic::amdgcn_s_get_barrier_state:
@@ -5521,11 +5523,18 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
Register PtrBase;
int64_t ConstOffset;
- std::tie(PtrBase, ConstOffset) =
+ bool IsInBounds;
+ std::tie(PtrBase, ConstOffset, IsInBounds) =
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
- if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
- !isFlatScratchBaseLegal(Root.getReg())))
+ // Adding the offset to the base address with an immediate in a FLAT
+ // instruction must not change the memory aperture in which the address falls.
+ // Therefore we can only fold offsets from inbounds GEPs into FLAT
+ // instructions.
+ if (ConstOffset == 0 ||
+ (FlatVariant == SIInstrFlags::FlatScratch &&
+ !isFlatScratchBaseLegal(Root.getReg())) ||
+ (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -5577,7 +5586,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// Match the immediate offset first, which canonically is moved as low as
// possible.
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ std::tie(PtrBase, ConstOffset, std::ignore) =
+ getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0) {
if (NeedIOffset &&
@@ -5760,7 +5770,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
// Match the immediate offset first, which canonically is moved as low as
// possible.
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ std::tie(PtrBase, ConstOffset, std::ignore) =
+ getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
@@ -5836,7 +5847,8 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
// Match the immediate offset first, which canonically is moved as low as
// possible.
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ std::tie(PtrBase, ConstOffset, std::ignore) =
+ getPtrBaseWithConstantOffset(Addr, *MRI);
Register OrigAddr = Addr;
if (ConstOffset != 0 &&
@@ -5942,7 +5954,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
Register PtrBase;
int64_t ConstOffset;
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
+ std::tie(PtrBase, ConstOffset, std::ignore) =
+ getPtrBaseWithConstantOffset(VAddr, *MRI);
if (ConstOffset != 0) {
if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
(!STI.privateMemoryResourceIsRangeChecked() ||
@@ -6181,8 +6194,8 @@ AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const
Register PtrBase;
int64_t Offset;
- std::tie(PtrBase, Offset) =
- getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+ std::tie(PtrBase, Offset, std::ignore) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (Offset) {
if (isDSOffsetLegal(PtrBase, Offset)) {
@@ -6243,8 +6256,8 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
Register PtrBase;
int64_t Offset;
- std::tie(PtrBase, Offset) =
- getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+ std::tie(PtrBase, Offset, std::ignore) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (Offset) {
int64_t OffsetValue0 = Offset;
@@ -6265,22 +6278,25 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
}
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
-/// the base value with the constant offset. There may be intervening copies
-/// between \p Root and the identified constant. Returns \p Root, 0 if this does
-/// not match the pattern.
-std::pair<Register, int64_t>
+/// the base value with the constant offset, and whether the offset computation is
+/// known to be inbounds. There may be intervening copies between \p Root and
+/// the identified constant. Returns \p Root, 0, false if this does not match
+/// the pattern.
+std::tuple<Register, int64_t, bool>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
- Register Root, const MachineRegisterInfo &MRI) const {
+ Register Root, const MachineRegisterInfo &MRI) const {
MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
- return {Root, 0};
+ return {Root, 0, false};
MachineOperand &RHS = RootI->getOperand(2);
std::optional<ValueAndVReg> MaybeOffset =
getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
if (!MaybeOffset)
- return {Root, 0};
- return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
+ return {Root, 0, false};
+ bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
+ return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
+ IsInBounds};
}
static void addZeroImm(MachineInstrBuilder &MIB) {
@@ -6358,7 +6374,8 @@ AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
Register PtrBase;
int64_t Offset;
- std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
+ std::tie(PtrBase, Offset, std::ignore) =
+ getPtrBaseWithConstantOffset(Src, *MRI);
if (isUInt<32>(Offset)) {
Data.N0 = PtrBase;
Data.Offset = Offset;
@@ -6757,6 +6774,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
switch (IntrID) {
default:
llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_IMM;
case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_IMM;
};
@@ -6764,6 +6783,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
switch (IntrID) {
default:
llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_M0;
case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_M0;
};
@@ -6814,8 +6835,11 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+ unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
+ ? AMDGPU::S_BARRIER_INIT_M0
+ : AMDGPU::S_BARRIER_SIGNAL_M0;
MachineInstrBuilder MIB;
- MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0));
+ MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
I.eraseFromParent();
return true;
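
Since getPtrBaseWithConstantOffset now returns a three-element tuple, call sites fall into the two patterns visible in the hunks above; a short usage sketch:

// Caller that needs the inbounds flag (selectFlatOffsetImpl):
Register PtrBase;
int64_t ConstOffset;
bool IsInBounds;
std::tie(PtrBase, ConstOffset, IsInBounds) =
    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

// Callers that do not care discard the flag with std::ignore:
std::tie(PtrBase, ConstOffset, std::ignore) =
    getPtrBaseWithConstantOffset(Addr, *MRI);

std::tie is used rather than structured bindings because the variables are already declared at the existing call sites.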
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c9da419..0924396 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -156,6 +156,7 @@ private:
bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
+ bool selectSBarrierLeave(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
@@ -295,7 +296,7 @@ private:
InstructionSelector::ComplexRendererFns
selectDSReadWrite2(MachineOperand &Root, unsigned size) const;
- std::pair<Register, int64_t>
+ std::tuple<Register, int64_t, bool>
getPtrBaseWithConstantOffset(Register Root,
const MachineRegisterInfo &MRI) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 523c66c..56113e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -545,7 +545,8 @@ public:
AU.addRequired<TargetPassConfig>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<UniformityInfoWrapperPass>();
- AU.setPreservesAll();
+ // Invalidates UniformityInfo
+ AU.setPreservesCFG();
}
bool runOnFunction(Function &F) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 40d960e..600a130 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -137,6 +138,14 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+// Retrieves the scalar type that's the same size as the mem desc
+static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ return std::make_pair(TypeIdx, LLT::scalar(MemSize));
+ };
+}
+
// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
@@ -384,6 +393,16 @@ static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
};
}
+// If we have a truncating store or an extending load with a data size larger
+// than 32 bits and a memory size that is a power of 2
+static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
+ isPowerOf2_64(MemSize);
+ };
+}
+
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
@@ -1635,11 +1654,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// May need relegalization for the scalars.
return std::pair(0, EltTy);
})
- .minScalar(0, S32)
- .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
- .widenScalarToNextPow2(0)
- .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
- .lower();
+ .minScalar(0, S32)
+ .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
+ getScalarTypeFromMemDesc(0))
+ .widenScalarToNextPow2(0)
+ .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
+ .lower();
}
// FIXME: Unaligned accesses not lowered.
@@ -5653,7 +5673,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
unsigned SplitSize = 32;
if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
ST.hasDPALU_DPP() &&
- AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
+ AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
SplitSize = 64;
if (Size == SplitSize) {
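
To see how the two new callbacks combine, consider a hypothetical query (a hedged walkthrough; the predicate wording approximates isWideScalarExtLoadTruncStore, defined earlier in this file):

// Hypothetical example: G_STORE of s64 data with a 16-bit memory type.
//   Query.Types[0]              == s64  (data)
//   Query.MMODescrs[0].MemoryTy == s16  (memory)
// isTruncStoreToSizePowerOf2(0) holds: the store truncates from a scalar
// wider than 32 bits, and 16 is a power of two. getScalarTypeFromMemDesc(0)
// then returns {0, LLT::scalar(16)}, so narrowScalarIf rewrites in one step:
//   G_STORE s64, ... (store 2 bytes)  ->  G_STORE s16, ... (store 2 bytes)
// instead of first clamping the data type to s32 as the old rule did.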
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 304e91e..139cad6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -599,8 +599,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) {
IRB.SetInsertPoint(&SI);
Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName());
- for (auto *Dbg : at::getAssignmentMarkers(&SI))
- Dbg->setValue(IntV);
+ for (auto *Dbg : at::getDVRAssignmentMarkers(&SI))
+ Dbg->setRawLocation(ValueAsMetadata::get(IntV));
SI.setOperand(0, IntV);
return true;
@@ -1361,6 +1361,7 @@ public:
PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI);
PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP);
+ PtrParts visitPtrToAddrInst(PtrToAddrInst &PA);
PtrParts visitPtrToIntInst(PtrToIntInst &PI);
PtrParts visitIntToPtrInst(IntToPtrInst &IP);
PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I);
@@ -1954,6 +1955,21 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
return {nullptr, nullptr};
}
+PtrParts SplitPtrStructs::visitPtrToAddrInst(PtrToAddrInst &PA) {
+ Value *Ptr = PA.getPointerOperand();
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&PA);
+
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *Res = IRB.CreateIntCast(Off, PA.getType(), /*isSigned=*/false);
+ copyMetadata(Res, &PA);
+ Res->takeName(&PA);
+ SplitUsers.insert(&PA);
+ PA.replaceAllUsesWith(Res);
+ return {nullptr, nullptr};
+}
+
PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) {
if (!isSplitFatPtr(IP.getType()))
return {nullptr, nullptr};
@@ -2350,8 +2366,12 @@ static bool containsBufferFatPointers(const Function &F,
BufferFatPtrToStructTypeMap *TypeMap) {
bool HasFatPointers = false;
for (const BasicBlock &BB : F)
- for (const Instruction &I : BB)
+ for (const Instruction &I : BB) {
HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
+ // Catch null pointer constants in loads, stores, etc.
+ for (const Value *V : I.operand_values())
+ HasFatPointers |= (V->getType() != TypeMap->remapType(V->getType()));
+ }
return HasFatPointers;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index aa72c3e..dfe7c53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -352,7 +352,10 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
case Intrinsic::amdgcn_s_barrier_signal:
case Intrinsic::amdgcn_s_barrier_signal_var:
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_barrier_wait:
+ case Intrinsic::amdgcn_s_barrier_leave:
case Intrinsic::amdgcn_s_get_barrier_state:
case Intrinsic::amdgcn_wave_barrier:
case Intrinsic::amdgcn_sched_barrier:
@@ -381,7 +384,7 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
AAResults *AA) {
MemorySSAWalker *Walker = MSSA->getWalker();
SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
- SmallSet<MemoryAccess *, 8> Visited;
+ SmallPtrSet<MemoryAccess *, 8> Visited;
MemoryLocation Loc(MemoryLocation::get(Load));
LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6c6d92..6ddfa38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,7 +29,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
-MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 3a37518..28d5400 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -134,8 +134,8 @@ static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
- SmallSet<const Value *, 32> WorkSet;
- SmallSet<const Value *, 32> Visited;
+ SmallPtrSet<const Value *, 32> WorkSet;
+ SmallPtrSet<const Value *, 32> Visited;
if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
if (isGlobalAddr(MO))
WorkSet.insert(MO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
index 4009451..90c4f4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
@@ -109,7 +109,7 @@ AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
TRI(*ST.getRegisterInfo()) {}
bool AMDGPUPreloadKernArgProlog::run() {
- if (!ST.hasKernargPreload())
+ if (!ST.needsKernArgPreloadProlog())
return false;
unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
index 984c1ee..a386fe6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -37,6 +37,11 @@ static cl::opt<unsigned> KernargPreloadCount(
"amdgpu-kernarg-preload-count",
cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
+static cl::opt<bool>
+ EnableKernargPreload("amdgpu-kernarg-preload",
+                     cl::desc("Enable preloading of kernel arguments into SGPRs"),
+ cl::init(true));
+
namespace {
class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
@@ -275,6 +280,9 @@ AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
: ModulePass(ID), TM(TM) {}
static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
+ if (!EnableKernargPreload)
+ return false;
+
SmallVector<Function *, 4> FunctionsToErase;
bool Changed = false;
for (auto &F : M) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 5a6ad40..8c56c21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -724,10 +724,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
addRulesForGOpcs({G_PTR_ADD})
- .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
- .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
- .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}})
- .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}});
+ .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
+ .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
+ .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
+ .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
addRulesForGOpcs({G_INTTOPTR})
.Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 868b1a2..2379296 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3342,6 +3342,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(1).empty());
constrainOpWithReadfirstlane(B, MI, 1);
return;
+ case Intrinsic::amdgcn_s_barrier_join:
+ constrainOpWithReadfirstlane(B, MI, 1);
+ return;
+ case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
@@ -5515,6 +5519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_s_sleep_var:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
break;
+ case Intrinsic::amdgcn_s_barrier_join:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ break;
+ case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
index e2e5c57..d2ec7dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
@@ -195,13 +195,17 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) {
// Delete FeatureWavefrontSize32 functions for
// gfx9 and below targets that don't support the mode.
- // gfx10+ is implied to support both wave32 and 64 features.
+ // gfx10, gfx11, gfx12 are implied to support both wave32 and 64 features.
// They are not in the feature set. So, we need a separate check
- if (ST->getGeneration() < AMDGPUSubtarget::GFX10 &&
- ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
+ if (!ST->supportsWave32() && ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32);
return true;
}
+  // gfx125x only supports FeatureWavefrontSize32.
+ if (!ST->supportsWave64() && ST->hasFeature(AMDGPU::FeatureWavefrontSize64)) {
+ reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize64);
+ return true;
+ }
return false;
}
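
The two checks above are symmetric. Conceptually (a hedged sketch; the helper below is hypothetical, while the subtarget hooks are the ones used in the patch):

// Remove a function when it requires a wave size the subtarget cannot run.
bool removeForWaveSize(const GCNSubtarget &ST, const Function &F) {
  // gfx9 and below: no wave32 support, so drop wave32 functions.
  if (!ST.supportsWave32() && requestsFeature(F, "wavefrontsize32"))
    return true;
  // gfx125x: wave32 only, so drop wave64 functions.
  if (!ST.supportsWave64() && requestsFeature(F, "wavefrontsize64"))
    return true;
  return false;
}
// requestsFeature is a hypothetical stand-in for querying the function's
// subtarget features, as checkFunction does via ST->hasFeature(...).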
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 8101c68..ccd2de1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -241,6 +241,9 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
if (!RC || !TRI.isVGPRClass(RC))
continue;
+ if (MI.isCall() || MI.isMetaInstruction())
+ continue;
+
unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index f580f43..20b5fd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -57,27 +57,47 @@ public:
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
LIS(LIS) {}
+ // TODO: Remove this restriction
+ bool mfmaHasSameSrc2AndDstReg(const MachineInstr &MI) const {
+ const MachineOperand *Src2 = TII.getNamedOperand(MI, AMDGPU::OpName::src2);
+ const MachineOperand *Dst = TII.getNamedOperand(MI, AMDGPU::OpName::vdst);
+ return Src2->getReg() == Dst->getReg() &&
+ Src2->getSubReg() == Dst->getSubReg();
+ }
+
+ bool isRewriteCandidate(const MachineInstr &MI) const {
+ return TII.isMAI(MI) &&
+ AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1 &&
+ mfmaHasSameSrc2AndDstReg(MI);
+ }
+
/// Compute the register class constraints based on the uses of \p Reg,
- /// excluding uses from \p ExceptMI. This should be nearly identical to
+  /// excluding MFMA uses which can be rewritten to change the register
+ /// class constraint. This should be nearly identical to
/// MachineRegisterInfo::recomputeRegClass.
const TargetRegisterClass *
- recomputeRegClassExcept(Register Reg, const TargetRegisterClass *OldRC,
- const TargetRegisterClass *NewRC,
- const MachineInstr *ExceptMI) const;
+ recomputeRegClassExceptRewritable(Register Reg,
+ const TargetRegisterClass *OldRC,
+ const TargetRegisterClass *NewRC) const;
bool run(MachineFunction &MF) const;
};
const TargetRegisterClass *
-AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExcept(
+AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
Register Reg, const TargetRegisterClass *OldRC,
- const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const {
+ const TargetRegisterClass *NewRC) const {
// Accumulate constraints from all uses.
for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
// Apply the effect of the given operand to NewRC.
MachineInstr *MI = MO.getParent();
- if (MI == ExceptMI)
+
+ // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
+ // effects of rewrite candidates. It just so happens that we can use either
+ // AGPR or VGPR in src0/src1, so don't bother checking the constraint
+ // effects of the individual operands.
+ if (isRewriteCandidate(*MI))
continue;
unsigned OpNo = &MO - &MI->getOperand(0);
@@ -96,8 +116,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
return false;
// Early exit if no AGPRs were assigned.
- if (!LRM.isPhysRegUsed(AMDGPU::AGPR0))
+ if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) {
+ LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n");
return false;
+ }
bool MadeChange = false;
@@ -109,17 +131,25 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// Find AV_* registers assigned to AGPRs.
const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg);
- if (!TRI.isVectorSuperClass(VirtRegRC))
+ if (!TRI.hasAGPRs(VirtRegRC))
continue;
- const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
- if (!TRI.isAGPRClass(AssignedRC))
- continue;
+ const TargetRegisterClass *AssignedRC = VirtRegRC;
+ if (TRI.hasVGPRs(VirtRegRC)) {
+ // If this is an AV register, we have to check if the actual assignment is
+ // to an AGPR
+ AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
+ if (!TRI.isAGPRClass(AssignedRC))
+ continue;
+ }
LiveInterval &LI = LIS.getInterval(VReg);
// TODO: Test multiple uses
for (VNInfo *VNI : LI.vnis()) {
+ if (VNI->isPHIDef() || VNI->isUnused())
+ continue;
+
MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
// TODO: Handle SplitKit produced copy bundles for partially defined
@@ -183,10 +213,13 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// first place, as well as need to assign another register, and need to
// figure out where to put them. The live range splitting is smarter than
// anything we're doing here, so trust it did something reasonable.
- const TargetRegisterClass *Src2ExceptRC = recomputeRegClassExcept(
- Src2->getReg(), Src2VirtRegRC, VirtRegRC, CopySrcMI);
- if (!Src2ExceptRC)
+ const TargetRegisterClass *Src2ExceptRC =
+ recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC,
+ VirtRegRC);
+ if (!Src2ExceptRC) {
+ LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n");
continue;
+ }
const TargetRegisterClass *NewSrc2ConstraintRC =
TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF);
@@ -196,8 +229,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
const TargetRegisterClass *NewSrc2RC =
TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC);
if (!NewSrc2RC) {
- // TODO: This is ignoring ther rewritable uses. e.g. a rewritable MFMA
- // using a rewritable MFMA can be rewritten as a pair.
LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI)
<< " are incompatible with replacement class\n");
continue;
@@ -208,8 +239,19 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
CopySrcMI->setDesc(TII.get(AGPROp));
- // TODO: Is replacing too aggressive, fixup these instructions only?
- MRI.replaceRegWith(CopySrcReg, VReg);
+ // Perform replacement of the register, rewriting the rewritable uses.
+ for (MachineInstr &UseMI :
+ make_early_inc_range(MRI.reg_instructions(CopySrcReg))) {
+ if (TII.isMAI(UseMI)) {
+ // Note the register we need to rewrite may still appear in src0/src1,
+ // but that's fine since those can use A or V anyway.
+ int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(UseMI.getOpcode());
+ if (ReplacementOp != -1)
+ UseMI.setDesc(TII.get(ReplacementOp));
+ }
+
+ UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI);
+ }
LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 10b8606..7be1899 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -378,6 +378,7 @@ foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_dead>;
+def : SourceOfDivergence<int_amdgcn_call_whole_wave>;
class AlwaysUniform<Intrinsic intr> {
Intrinsic Intr = intr;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
index b60ded3..56aa3f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -195,7 +195,7 @@ bool AMDGPUSetWavePriority::run(MachineFunction &MF) {
// Lower the priority on edges where control leaves blocks from which
// the VMEM loads are reachable.
- SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+ SmallPtrSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
for (MachineBasicBlock &MBB : MF) {
if (MBBInfos[&MBB].MayReachVMEMLoad) {
if (MBB.succ_empty())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c1f1703..e393aa19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -848,8 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (Level == OptimizationLevel::O0)
return;
- PM.addPass(AMDGPUUnifyMetadataPass());
-
// We don't want to run internalization at per-module stage.
if (InternalizeSymbols && !isLTOPreLink(Phase)) {
PM.addPass(InternalizePass(mustPreserveGV));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
deleted file mode 100644
index e400491..0000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// \file
-// This pass that unifies multiple OpenCL metadata due to linking.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace {
-
- namespace kOCLMD {
-
- const char SpirVer[] = "opencl.spir.version";
- const char OCLVer[] = "opencl.ocl.version";
- const char UsedExt[] = "opencl.used.extensions";
- const char UsedOptCoreFeat[] = "opencl.used.optional.core.features";
- const char CompilerOptions[] = "opencl.compiler.options";
- const char LLVMIdent[] = "llvm.ident";
-
- } // end namespace kOCLMD
-
- /// Unify version metadata.
- /// \return true if changes are made.
- /// Assume the named metadata has operands each of which is a pair of
- /// integer constant, e.g.
- /// !Name = {!n1, !n2}
- /// !n1 = {i32 1, i32 2}
- /// !n2 = {i32 2, i32 0}
- /// Keep the largest version as the sole operand if PickFirst is false.
- /// Otherwise pick it from the first value, representing kernel module.
- bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) {
- auto *NamedMD = M.getNamedMetadata(Name);
- if (!NamedMD || NamedMD->getNumOperands() <= 1)
- return false;
- MDNode *MaxMD = nullptr;
- auto MaxVer = 0U;
- for (auto *VersionMD : NamedMD->operands()) {
- assert(VersionMD->getNumOperands() == 2);
- auto *CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0));
- auto VersionMajor = CMajor->getZExtValue();
- auto *CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1));
- auto VersionMinor = CMinor->getZExtValue();
- auto Ver = (VersionMajor * 100) + (VersionMinor * 10);
- if (Ver > MaxVer) {
- MaxVer = Ver;
- MaxMD = VersionMD;
- }
- if (PickFirst)
- break;
- }
- NamedMD->eraseFromParent();
- NamedMD = M.getOrInsertNamedMetadata(Name);
- NamedMD->addOperand(MaxMD);
- return true;
- }
-
- /// Unify version metadata.
- /// \return true if changes are made.
- /// Assume the named metadata has operands each of which is a list e.g.
- /// !Name = {!n1, !n2}
- /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}}
- /// !n2 = !{!"cl_khr_image"}
- /// Combine it into a single list with unique operands.
- bool unifyExtensionMD(Module &M, StringRef Name) {
- auto *NamedMD = M.getNamedMetadata(Name);
- if (!NamedMD || NamedMD->getNumOperands() == 1)
- return false;
-
- SmallVector<Metadata *, 4> All;
- for (auto *MD : NamedMD->operands())
- for (const auto &Op : MD->operands())
- if (!llvm::is_contained(All, Op.get()))
- All.push_back(Op.get());
-
- NamedMD->eraseFromParent();
- NamedMD = M.getOrInsertNamedMetadata(Name);
- for (const auto &MD : All)
- NamedMD->addOperand(MDNode::get(M.getContext(), MD));
-
- return true;
- }
-
- /// Unify multiple OpenCL metadata due to linking.
- bool unifyMetadataImpl(Module &M) {
- const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer};
- const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat,
- kOCLMD::CompilerOptions, kOCLMD::LLVMIdent};
-
- bool Changed = false;
-
- for (auto &I : Vers)
- Changed |= unifyVersionMD(M, I, true);
-
- for (auto &I : Exts)
- Changed |= unifyExtensionMD(M, I);
-
- return Changed;
- }
-
- } // end anonymous namespace
-
- PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- return unifyMetadataImpl(M) ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
- }
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0d2feeb..9514732 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5052,11 +5052,13 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
if (DppCtrlIdx >= 0) {
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
- if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
- AMDGPU::isDPALU_DPP(MII.get(Opc))) {
- // DP ALU DPP is supported for row_newbcast only on GFX9*
+ if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) &&
+ AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) {
+ // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share
+ // only on GFX12.
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
- Error(S, "DP ALU dpp only supports row_newbcast");
+ Error(S, isGFX12() ? "DP ALU dpp only supports row_share"
+ : "DP ALU dpp only supports row_newbcast");
return false;
}
}
@@ -6268,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
ExprVal, ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
- if (IVersion.Major < 10)
- return Error(IDRange.Start, "directive requires gfx10+", IDRange);
+ if (!supportsWGP(getSTI()))
+ return Error(IDRange.Start,
+ "directive unsupported on " + getSTI().getCPU(), IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal,
ValRange);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c466f9c..dc9dd22 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -114,7 +114,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUTargetTransformInfo.cpp
AMDGPUWaitSGPRHazards.cpp
AMDGPUUnifyDivergentExitNodes.cpp
- AMDGPUUnifyMetadata.cpp
R600MachineCFGStructurizer.cpp
GCNCreateVOPD.cpp
GCNDPPCombine.cpp
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index d5d1074..f5d4384 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1274,7 +1274,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
}
-let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in {
+let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in {
defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>;
defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index f9a907a..184929a 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -421,6 +421,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
DPPInst.addImm(ByteSelOpr->getImm());
}
+ if (MachineOperand *BitOp3 =
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::bitop3)) {
+ assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::bitop3));
+ DPPInst.add(*BitOp3);
+ }
}
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
@@ -544,11 +549,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
return false;
}
- if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
- MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
- auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
- assert(DppCtrl && DppCtrl->isImm());
- if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
+ auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
+ assert(DppCtrl && DppCtrl->isImm());
+ unsigned DppCtrlVal = DppCtrl->getImm();
+ if ((MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp)) {
+ if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP)) {
+ LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move is unsupported\n");
+ // Split it.
+ return false;
+ }
+ if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal)) {
LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported"
" control value\n");
// Let it split, then control may become legal.
@@ -704,6 +715,20 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
break;
}
+ if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP) &&
+ AMDGPU::isDPALU_DPP32BitOpc(OrigOp)) {
+ LLVM_DEBUG(dbgs() << " " << OrigMI
+ << " failed: DPP ALU DPP is not supported\n");
+ break;
+ }
+
+ if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) &&
+ AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) {
+ LLVM_DEBUG(dbgs() << " " << OrigMI
+ << " failed: not valid 64-bit DPP control value\n");
+ break;
+ }
+
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (Use == Src0) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 96cb5ae..a3b64ae 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1200,6 +1200,14 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
+ if (ST.requiresWaitIdleBeforeGetReg())
+ fixGetRegWaitIdle(MI);
+ if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
+ fixDsAtomicAsyncBarrierArriveB64(MI);
+ if (ST.hasScratchBaseForwardingHazard())
+ fixScratchBaseForwardingHazard(MI);
+ if (ST.setRegModeNeedsVNOPs())
+ fixSetRegMode(MI);
}
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
@@ -1350,6 +1358,9 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
return (Decoded.DsCnt == 0);
}
default:
+ assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
+ MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
+ "unexpected wait count instruction");
// SOPP instructions cannot mitigate the hazard.
if (TII->isSOPP(MI))
return false;
@@ -1731,7 +1742,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0x0fff);
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
return true;
}
@@ -1781,7 +1792,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
SIInstrInfo::isEXP(I) ||
(I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- I.getOperand(0).getImm() == 0x0fff))
+ AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
return HazardExpired;
// Track registers writes
@@ -2239,19 +2250,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
return true;
- switch (MI.getOpcode()) {
- case AMDGPU::S_WAITCNT:
- case AMDGPU::S_WAITCNT_VSCNT:
- case AMDGPU::S_WAITCNT_VMCNT:
- case AMDGPU::S_WAITCNT_EXPCNT:
- case AMDGPU::S_WAITCNT_LGKMCNT:
- case AMDGPU::S_WAIT_IDLE:
- return true;
- default:
- break;
- }
-
- return false;
+ return SIInstrInfo::isWaitcnt(MI.getOpcode());
};
return FPAtomicToDenormModeWaitStates -
@@ -3428,3 +3427,125 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
return true;
}
+
+bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
+ if (!isSGetReg(MI->getOpcode()))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ switch (getHWReg(TII, *MI)) {
+ default:
+ return false;
+ case AMDGPU::Hwreg::ID_STATUS:
+ case AMDGPU::Hwreg::ID_STATE_PRIV:
+ case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV:
+ case AMDGPU::Hwreg::ID_EXCP_FLAG_USER:
+ break;
+ }
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0);
+ return true;
+}
+
+bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
+ if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xFFE3);
+ BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xFFE3);
+
+ return true;
+}
+
+bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
+ // No reason to check this in pre-RA scheduling; SGPRs have to be allocated
+ // for the hazard to trigger.
+ if (!IsHazardRecognizerMode)
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
+ const int FlatScrBaseWaitStates = 10;
+
+ bool ReadsFlatScrLo =
+ MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
+ bool ReadsFlatScrHi =
+ MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
+ if (isSGetReg(MI->getOpcode())) {
+ switch (getHWReg(TII, *MI)) {
+ default:
+ break;
+ case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
+ ReadsFlatScrLo = true;
+ break;
+ case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
+ ReadsFlatScrHi = true;
+ break;
+ }
+ }
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ auto IsRegDefHazard = [&](Register Reg) -> bool {
+ DenseSet<const MachineBasicBlock *> Visited;
+ auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
+ return MI.modifiesRegister(Reg, TRI);
+ };
+
+ // This abuses the waitstates mechanism: instead of counting wait states,
+ // it returns 1 when the instruction writes an SGPR and 0 otherwise.
+ auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
+ if (!TII->isSALU(MI) && !TII->isVALU(MI))
+ return 0;
+ for (const MachineOperand &MO : MI.all_defs()) {
+ if (TRI->isSGPRReg(MRI, MO.getReg()))
+ return 1;
+ }
+ return 0;
+ };
+
+ auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
+ if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
+ unsigned Wait = MI.getOperand(0).getImm();
+ if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
+ AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
+ return true;
+ }
+ return SgprWrites >= FlatScrBaseWaitStates;
+ };
+
+ return ::getWaitStatesSince(
+ IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
+ 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
+ };
+
+ if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
+ !IsRegDefHazard(AMDGPU::SGPR102)) &&
+ (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
+ !IsRegDefHazard(AMDGPU::SGPR103)))
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
+ AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
+ return true;
+}
+
+bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
+ if (!isSSetReg(MI->getOpcode()) ||
+ MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
+ return true;
+}
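
The hunks above also replace magic S_WAITCNT_DEPCTR immediates such as 0x0fff with the named field helpers (encodeFieldVaVdst, decodeFieldSaSdst, and friends). A standalone sketch of the pack/unpack pattern those helpers follow; the shift and width values here are placeholders, the real per-field values live in AMDGPU::DepCtr:

  #include <cstdint>

  // Generic pack/unpack for one bitfield inside a depctr-style immediate.
  constexpr unsigned encodeField(unsigned Encoded, unsigned Value,
                                 unsigned Shift, unsigned Width) {
    unsigned Mask = ((1u << Width) - 1) << Shift;
    return (Encoded & ~Mask) | ((Value << Shift) & Mask);
  }

  constexpr unsigned decodeField(unsigned Encoded, unsigned Shift,
                                 unsigned Width) {
    return (Encoded >> Shift) & ((1u << Width) - 1);
  }

  // Writing a field and reading it back yields the written value.
  static_assert(decodeField(encodeField(0xFFFF, 0, 4, 4), 4, 4) == 0,
                "field round-trips");

Using named field accessors instead of raw constants is what lets the hazard checks above test only the field they care about (e.g. VaVdst == 0) rather than comparing whole immediates.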
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index f796eeae..67beffa 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -110,6 +110,10 @@ private:
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
+ bool fixGetRegWaitIdle(MachineInstr *MI);
+ bool fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI);
+ bool fixScratchBaseForwardingHazard(MachineInstr *MI);
+ bool fixSetRegMode(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 334afd3..ef63acc 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
////////////////////////////////////////////////////////////////////////////////
// GCNRPTarget
-GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF);
+ setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F));
}
GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
- setRegLimits(NumSGPRs, NumVGPRs, MF);
+ const MachineFunction &MF, const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
+ setTarget(NumSGPRs, NumVGPRs);
}
GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+ const GCNRegPressure &RP)
+ : GCNRPTarget(RP, MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
- ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF);
+ setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
+ ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize));
}
-void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF) {
+void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
- MaxUnifiedVGPRs =
- ST.hasGFX90AInsts()
- ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
- : 0;
+ if (UnifiedRF) {
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxUnifiedVGPRs =
+ std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs);
+ } else {
+ MaxUnifiedVGPRs = 0;
+ }
}
-bool GCNRPTarget::isSaveBeneficial(Register Reg,
- const MachineRegisterInfo &MRI) const {
+bool GCNRPTarget::isSaveBeneficial(Register Reg) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
@@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
return RP.getSGPRNum() > MaxSGPRs;
unsigned NumVGPRs =
SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
- return isVGPRBankSaveBeneficial(NumVGPRs);
+ // The addressable limit must always be respected.
+ if (NumVGPRs > MaxVGPRs)
+ return true;
+ // For unified RFs, combined VGPR usage limit must be respected as well.
+ return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs;
}
bool GCNRPTarget::satisfied() const {
- if (RP.getSGPRNum() > MaxSGPRs)
+ if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs)
return false;
- if (RP.getVGPRNum(false) > MaxVGPRs &&
- (!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
+ if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs)
return false;
- return satisfiesUnifiedTarget();
+ return true;
}
///////////////////////////////////////////////////////////////////////////////
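
The refactor above folds the old CombineVGPRSavings special cases into a single rule: the SGPR and per-bank VGPR limits must always hold, and on unified register files the combined VGPR count must fit as well. A standalone sketch of the resulting satisfied() logic, with illustrative field names standing in for GCNRegPressure queries:

  struct RPTargetSketch {
    unsigned SGPRs, VGPRs, UnifiedVGPRs;           // current pressure
    unsigned MaxSGPRs, MaxVGPRs, MaxUnifiedVGPRs;  // target limits
    bool UnifiedRF;                                // e.g. gfx90a

    bool satisfied() const {
      if (SGPRs > MaxSGPRs || VGPRs > MaxVGPRs)
        return false;
      // For unified RFs, the combined ArchVGPR+AGPR usage must also fit.
      return !UnifiedRF || UnifiedVGPRs <= MaxUnifiedVGPRs;
    }
  };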
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ea33a22..a9c58bb 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -186,20 +186,22 @@ public:
/// Sets up the target such that the register pressure starting at \p RP does
/// not show register spilling on function \p MF (w.r.t. the function's
/// minimum target occupancy).
- GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
- bool CombineVGPRSavings = false);
+ GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p
/// MF.
GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
/// Sets up the target such that the register pressure starting at \p RP does
/// not prevent achieving an occupancy of at least \p Occupancy on function
/// \p MF.
GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
- const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ const GCNRegPressure &RP);
+
+ /// Changes the target (same semantics as constructor).
+ void setTarget(unsigned NumSGPRs, unsigned NumVGPRs);
const GCNRegPressure &getCurrentRP() const { return RP; }
@@ -207,7 +209,7 @@ public:
/// Determines whether saving virtual register \p Reg will be beneficial
/// towards achieving the RP target.
- bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const;
+ bool isSaveBeneficial(Register Reg) const;
/// Saves virtual register \p Reg with lanemask \p Mask.
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) {
@@ -227,15 +229,15 @@ public:
if (Target.MaxUnifiedVGPRs) {
OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
<< " VGPRs (unified)";
- } else if (Target.CombineVGPRSavings) {
- OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
- << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
}
return OS;
}
#endif
private:
+ const MachineFunction &MF;
+ const bool UnifiedRF;
+
/// Current register pressure.
GCNRegPressure RP;
@@ -246,29 +248,10 @@ private:
/// Target number of overall VGPRs for subtargets with unified RFs. Always 0
/// for subtargets with non-unified RFs.
unsigned MaxUnifiedVGPRs;
- /// Whether we consider that the register allocator will be able to swap
- /// between ArchVGPRs and AGPRs by copying them to a super register class.
- /// Concretely, this allows savings in one of the VGPR banks to help toward
- /// savings in the other VGPR bank.
- bool CombineVGPRSavings;
-
- inline bool satisifiesVGPRBanksTarget() const {
- assert(CombineVGPRSavings && "only makes sense with combined savings");
- return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
- }
-
- /// Always satisified when the subtarget doesn't have a unified RF.
- inline bool satisfiesUnifiedTarget() const {
- return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
- }
-
- inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
- return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() ||
- (CombineVGPRSavings && !satisifiesVGPRBanksTarget());
- }
- void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs,
- const MachineFunction &MF);
+ GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF)
+ : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()),
+ RP(RP) {}
};
///////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 96d5668..254b75b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
}
/// Allows to easily filter for this stage's debug output.
-#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
+#define REMAT_PREFIX "[PreRARemat] "
+#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
@@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() {
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
- REMAT_DEBUG(
- dbgs() << "Retrying function scheduling with new min. occupancy of "
- << AchievedOcc << " from rematerializing (original was "
- << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
+ REMAT_DEBUG({
+ dbgs() << "Retrying function scheduling with new min. occupancy of "
+ << AchievedOcc << " from rematerializing (original was "
+ << DAG.MinOccupancy;
+ if (TargetOcc)
+ dbgs() << ", target was " << *TargetOcc;
+ dbgs() << ")\n";
+ });
+
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
- mayCauseSpilling(WavesAfter) ||
- (IncreaseOccupancy && WavesAfter < TargetOcc);
+ mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
}
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
- REMAT_DEBUG({
- dbgs() << "Collecting rematerializable instructions in ";
- MF.getFunction().printAsOperand(dbgs(), false);
- dbgs() << '\n';
- });
+ const Function &F = MF.getFunction();
// Maps optimizable regions (i.e., regions at minimum and register-limited
// occupancy, or regions with spilling) to the target RP we would like to
// reach.
DenseMap<unsigned, GCNRPTarget> OptRegions;
- const Function &F = MF.getFunction();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
-
- std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
- const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
- const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
- const unsigned MaxSGPRsIncOcc =
- ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
- const unsigned MaxVGPRsIncOcc =
- ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
- IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
-
- // Collect optimizable regions. If there is spilling in any region we will
- // just try to reduce spilling. Otherwise we will try to increase occupancy by
- // one in the whole function.
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- GCNRegPressure &RP = DAG.Pressure[I];
- // We allow ArchVGPR or AGPR savings to count as savings of the other kind
- // of VGPR only when trying to eliminate spilling. We cannot do this when
- // trying to increase occupancy since VGPR class swaps only occur later in
- // the register allocator i.e., the scheduler will not be able to reason
- // about these savings and will not report an increase in the achievable
- // occupancy, triggering rollbacks.
- GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
- /*CombineVGPRSavings=*/true);
- if (!Target.satisfied() && IncreaseOccupancy) {
- // There is spilling in the region and we were so far trying to increase
- // occupancy. Strop trying that and focus on reducing spilling.
- IncreaseOccupancy = false;
- OptRegions.clear();
- } else if (IncreaseOccupancy) {
- // There is no spilling in the region, try to increase occupancy.
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
- /*CombineVGPRSavings=*/false);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
+ auto ResetTargetRegions = [&]() {
+ OptRegions.clear();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ const GCNRegPressure &RP = DAG.Pressure[I];
+ GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
+ if (!Target.satisfied())
+ OptRegions.insert({I, Target});
}
- if (!Target.satisfied())
- OptRegions.insert({I, Target});
- }
- if (OptRegions.empty())
- return false;
+ };
-#ifndef NDEBUG
- if (IncreaseOccupancy) {
- REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
- << ") in regions:\n");
+ ResetTargetRegions();
+ if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
+ // In addition to register usage being above addressable limits, occupancy
+ // below the minimum is considered like "spilling" as well.
+ TargetOcc = std::nullopt;
} else {
- REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
- << WavesPerEU.first << ") in regions:\n");
- }
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
- REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
+ // There is no spilling and room to improve occupancy; set up "increased
+ // occupancy targets" for all regions.
+ TargetOcc = DAG.MinOccupancy + 1;
+ unsigned VGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
+ MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
+ ResetTargetRegions();
}
-#endif
-
- // When we are reducing spilling, the target is the minimum target number of
- // waves/EU determined by the subtarget. In cases where either one of
- // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
- // minimum region occupancy may be higher than the latter.
- TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
- : std::max(DAG.MinOccupancy, WavesPerEU.first);
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ if (OptRegions.empty()) {
+ dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU();
+ } else if (!TargetOcc) {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ')';
+ } else {
+ dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
+ << *TargetOcc;
+ }
+ dbgs() << '\n';
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
+ << '\n';
+ }
+ }
+ });
+ if (OptRegions.empty())
+ return false;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
@@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
bool &Progress) -> bool {
GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg, DAG.MRI))
+ if (!Target.isSaveBeneficial(Reg))
return false;
Progress = true;
Target.saveReg(Reg, Mask, DAG.MRI);
@@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
}
}
- if (IncreaseOccupancy) {
+ if (TargetOcc) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
@@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() {
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
- AchievedOcc = TargetOcc;
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ AchievedOcc = MFI.getMaxWavesPerEU();
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
RescheduleRegions[I] = !IsEmptyRegion;
@@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
- AchievedOcc = std::min(
- AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
- ->getDynamicVGPRBlockSize()));
+ AchievedOcc =
+ std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
@@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
// which case we do not want to rollback either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
- if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
+ if (!TargetOcc || MaxOcc >= *TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 32139a9..790370f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -470,15 +470,12 @@ private:
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;
- /// Target occupancy the stage estimates is reachable through
- /// rematerialization. Greater than or equal to the pre-stage min occupancy.
- unsigned TargetOcc;
+ /// The target occupancy the stage is trying to achieve. Empty when the
+ /// objective is spilling reduction.
+ std::optional<unsigned> TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
- /// Whether the stage is attempting to increase occupancy in the abscence of
- /// spilling.
- bool IncreaseOccupancy;
/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do one, collects instructions in
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f47ddf5..2a8385d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -390,7 +390,11 @@ public:
/// the original value.
bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
- bool supportsWGP() const { return getGeneration() >= GFX10; }
+ bool supportsWGP() const {
+ if (GFX1250Insts)
+ return false;
+ return getGeneration() >= GFX10;
+ }
bool hasIntClamp() const {
return HasIntClamp;
@@ -1341,6 +1345,10 @@ public:
bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
+ bool setRegModeNeedsVNOPs() const {
+ return GFX1250Insts && getGeneration() == GFX12;
+ }
+
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
@@ -1573,6 +1581,12 @@ public:
// extended VA to 57 bits.
bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
+ // \returns true if the target needs to create a prolog for backward
+ // compatibility when preloading kernel arguments.
+ bool needsKernArgPreloadProlog() const {
+ return hasKernargPreload() && !GFX1250Insts;
+ }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
@@ -1722,6 +1736,10 @@ public:
/// unit requirement.
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+ bool supportsWave32() const { return getGeneration() >= GFX10; }
+
+ bool supportsWave64() const { return !hasGFX1250Insts(); }
+
bool isWave32() const {
return getWavefrontSize() == 32;
}
@@ -1785,11 +1803,11 @@ public:
// \returns true if the subtarget has a hazard requiring an "s_nop 0"
// instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
- bool requiresNopBeforeDeallocVGPRs() const {
- // Currently all targets that support the dealloc VGPRs message also require
- // the nop.
- return true;
- }
+ bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; }
+
+ // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
+ // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
+ bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; }
bool isDynamicVGPREnabled() const { return DynamicVGPR; }
unsigned getDynamicVGPRBlockSize() const {
@@ -1801,6 +1819,18 @@ public:
// to the same register.
return false;
}
+
+ // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 must not be claused with anything
+ // and must be surrounded by S_WAIT_ALU(0xFFE3).
+ bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
+ return getGeneration() == GFX12;
+ }
+
+ // Requires s_wait_alu(0) between an s102/s103 write and a
+ // src_flat_scratch_base read.
+ bool hasScratchBaseForwardingHazard() const {
+ return GFX1250Insts && getGeneration() == GFX12;
+ }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ee8683a..aafbdc2 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -976,8 +976,10 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
unsigned Imm = MI->getOperand(OpNo).getImm();
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) {
- O << " /* DP ALU dpp only supports row_newbcast */";
+ if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) &&
+ AMDGPU::isDPALU_DPP(Desc, STI)) {
+ O << " /* DP ALU dpp only supports "
+ << (isGFX12(STI) ? "row_share" : "row_newbcast") << " */";
return;
}
if (Imm <= DppCtrl::QUAD_PERM_LAST) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index f358084..61f6732 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -389,6 +389,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
// Matrix B format operand reuses op_sel_hi.
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) &&
+ // Matrix B scale operand reuses op_sel_hi.
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_scale) &&
// Matrix B reuse operand reuses op_sel_hi.
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
Encoding |= getImplicitOpSelHiEncoding(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 68302f0..e20581d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -26,7 +26,6 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -563,11 +562,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PrintField(KD.compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split");
- if (IVersion.Major >= 10) {
+ if (AMDGPU::supportsWGP(STI))
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
".amdhsa_workgroup_processor_mode");
+ if (IVersion.Major >= 10) {
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED,
@@ -885,7 +885,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
if (!SymbolELF->isBindingSet())
SymbolELF->setBinding(ELF::STB_GLOBAL);
- if (SymbolELF->declareCommon(Size, Alignment, true)) {
+ if (SymbolELF->declareCommon(Size, Alignment)) {
report_fatal_error("Symbol: " + Symbol->getName() +
" redeclared as different type");
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 2d0102f..7c01903 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -197,7 +197,7 @@ enum ClassFlags : unsigned {
namespace AMDGPU {
enum OperandType : unsigned {
- /// Operands with register or 32-bit immediate
+ /// Operands with register, 32-bit, or 64-bit immediate
OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
OPERAND_REG_IMM_INT64,
OPERAND_REG_IMM_INT16,
@@ -407,7 +407,7 @@ enum CPol {
SCAL = 1 << 11, // Scale offset bit
- ALL = TH | SCOPE,
+ ALL = TH | SCOPE | NV,
// Helper bits
TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy
@@ -440,6 +440,7 @@ enum Id { // Message ID, width(4) [3:0].
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
ID_GS_ALLOC_REQ = 9, // added in GFX9
ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11
+ ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250
ID_GET_DDID = 11, // added in GFX10, removed in GFX11
ID_SYSMSG = 15,
@@ -513,6 +514,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
ID_PERF_SNAPSHOT_DATA_gfx11 = 27,
+ ID_IB_STS2 = 28,
ID_SHADER_CYCLES = 29,
ID_SHADER_CYCLES_HI = 30,
ID_DVGPR_ALLOC_LO = 31,
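
The CPol::ALL change above widens the mask of valid cache-policy bits to include the new NV bit, so policies carrying NV are no longer rejected as having stray bits. A standalone sketch of how such a validity mask gates a policy word; the bit assignments below are placeholders, the real ones are the CPol enumerators in SIDefines.h:

  // Placeholder field layout, for illustration only.
  enum CPolSketch : unsigned {
    TH = 0x7,          // temporal-hint field
    SCOPE = 0x3 << 3,  // scope field
    NV = 1 << 5,       // non-volatile bit, newly folded into ALL
    ALL = TH | SCOPE | NV,
  };

  // A policy word is legal only if it sets no bits outside the mask.
  bool isLegalCPol(unsigned CPol) { return (CPol & ~unsigned(ALL)) == 0; }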
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f018f77..dce4e6f 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -460,7 +460,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
// List of clobbering instructions.
SmallVector<MachineInstr*, 8> Clobbers;
// List of instructions marked for deletion.
- SmallSet<MachineInstr*, 8> MergedInstrs;
+ SmallPtrSet<MachineInstr *, 8> MergedInstrs;
bool Changed = false;
@@ -808,7 +808,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
bool AllAGPRUses = true;
SetVector<const MachineInstr *> worklist;
- SmallSet<const MachineInstr *, 4> Visited;
+ SmallPtrSet<const MachineInstr *, 4> Visited;
SetVector<MachineInstr *> PHIOperands;
worklist.insert(&MI);
Visited.insert(&MI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b327fb..561019b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!IsKernel) {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+
+ // This assumes the registers are allocated by CCInfo in ascending order
+ // with no gaps.
+ Info->setNumWaveDispatchSGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+ Info->setNumWaveDispatchVGPRs(
+ CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+ } else if (Info->getNumKernargPreloadedSGPRs()) {
+ Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
}
SmallVector<SDValue, 16> Chains;
@@ -6612,7 +6621,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned SplitSize = 32;
if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
ST->hasDPALU_DPP() &&
- AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
+ AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
SplitSize = 64;
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
@@ -10816,6 +10825,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain),
0);
+ case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var: {
// these two intrinsics have two operands: barrier pointer and member count
SDValue Chain = Op->getOperand(0);
@@ -10823,6 +10833,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue BarOp = Op->getOperand(2);
SDValue CntOp = Op->getOperand(3);
SDValue M0Val;
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
+ ? AMDGPU::S_BARRIER_INIT_M0
+ : AMDGPU::S_BARRIER_SIGNAL_M0;
// extract the BarrierID from bits 4-9 of BarOp
SDValue BarID;
BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
@@ -10846,8 +10859,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
- auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL,
- Op->getVTList(), Ops);
+ auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
+ case Intrinsic::amdgcn_s_barrier_join: {
+ // this intrinsic has one operand: the barrier pointer
+ SDValue Chain = Op->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ SDValue BarOp = Op->getOperand(2);
+ unsigned Opc;
+
+ if (isa<ConstantSDNode>(BarOp)) {
+ uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
+ Opc = AMDGPU::S_BARRIER_JOIN_IMM;
+
+ // extract the BarrierID from bits 4-9 of the immediate
+ unsigned BarID = (BarVal >> 4) & 0x3F;
+ SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
+ Ops.push_back(K);
+ Ops.push_back(Chain);
+ } else {
+ Opc = AMDGPU::S_BARRIER_JOIN_M0;
+
+ // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
+ SDValue M0Val;
+ M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
+ DAG.getShiftAmountConstant(4, MVT::i32, DL));
+ M0Val =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+ DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+ 0);
+ Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+ }
+
+ auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
case Intrinsic::amdgcn_s_prefetch_data: {
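
Both the immediate and the M0 paths above extract the barrier ID from bits 4-9 of the barrier operand; the immediate path folds it into the instruction, the M0 path materializes it with an SRL/AND pair. The same extraction as one plain expression, worth spelling out since it appears twice:

  #include <cstdint>

  // The barrier ID lives in bits [9:4] of the barrier operand, six bits wide.
  unsigned extractBarrierID(uint64_t BarVal) { return (BarVal >> 4) & 0x3F; }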
@@ -11495,9 +11540,22 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
return FastLowered;
SDLoc SL(Op);
+ EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
+ SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
+ SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
+
+ if (VT == MVT::bf16) {
+ SDValue ExtDiv =
+ DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
+ return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
+ DAG.getTargetConstant(0, SL, MVT::i32));
+ }
+
+ assert(VT == MVT::f16);
+
// a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
// b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
// r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
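
The new bf16 path above lowers the divide by extending both operands to f32, dividing there, and rounding the quotient back to bf16. A self-contained sketch of that numeric behavior, modeling bf16 as the high 16 bits of an IEEE f32 with round-to-nearest-even; this is host-side emulation of the semantics, not the lowering itself:

  #include <cstdint>
  #include <cstring>

  static float bf16ToF32(uint16_t H) {
    uint32_t Bits = uint32_t(H) << 16;
    float F;
    std::memcpy(&F, &Bits, sizeof(F));
    return F;
  }

  // f32 -> bf16 with round-to-nearest-even (NaN handling omitted).
  static uint16_t f32ToBF16(float F) {
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    Bits += 0x7FFF + ((Bits >> 16) & 1);
    return uint16_t(Bits >> 16);
  }

  uint16_t bf16Div(uint16_t A, uint16_t B) {
    return f32ToBF16(bf16ToF32(A) / bf16ToF32(B));
  }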
@@ -11514,9 +11572,6 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
// We will use ISD::FMA on targets that don't support ISD::FMAD.
unsigned FMADOpCode =
isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
-
- SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
- SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
SDValue Rcp =
DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
@@ -15684,7 +15739,7 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
EVT VT = N->getValueType(0);
- if (VT != MVT::f16 || !Subtarget->has16BitInsts())
+ if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
return SDValue();
SDValue LHS = N->getOperand(0);
@@ -16849,6 +16904,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
+ // Check if we cannot determine the bit size of the given value type. This
+ // can happen, for example, with an empty struct (size 0):
+ // `call void asm "", "v"({} poison)`.
+ if (VT == MVT::Other)
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
const unsigned BitWidth = VT.getSizeInBits();
switch (Constraint[0]) {
default:
@@ -16897,13 +16957,26 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
}
break;
}
- // We actually support i128, i16 and f16 as inline parameters
- // even if they are not reported as legal
- if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
- VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
- return std::pair(0U, RC);
+ } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
+ const unsigned BitWidth = VT.getSizeInBits();
+ switch (BitWidth) {
+ case 16:
+ RC = &AMDGPU::AV_32RegClass;
+ break;
+ default:
+ RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
+ if (!RC)
+ return std::pair(0U, nullptr);
+ break;
+ }
}
+ // We actually support i128, i16 and f16 as inline parameters
+ // even if they are not reported as legal
+ if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
+ VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
+ return std::pair(0U, RC);
+
auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
if (Kind != '\0') {
if (Kind == 'v') {
@@ -16916,7 +16989,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
if (RC) {
if (NumRegs > 1) {
- if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs())
+ if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
return std::pair(0U, nullptr);
uint32_t Width = NumRegs * 32;
@@ -16988,6 +17061,9 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
case 'a':
return C_RegisterClass;
}
+ } else if (Constraint.size() == 2) {
+ if (Constraint == "VA")
+ return C_RegisterClass;
}
if (isImmConstraint(Constraint)) {
return C_Other;
@@ -17727,23 +17803,9 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
/// Return if a flat address space atomicrmw can access private memory.
static bool flatInstrMayAccessPrivate(const Instruction *I) {
- const MDNode *NoaliasAddrSpaceMD =
- I->getMetadata(LLVMContext::MD_noalias_addrspace);
- if (!NoaliasAddrSpaceMD)
- return true;
-
- for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
- ++I) {
- auto *Low = mdconst::extract<ConstantInt>(
- NoaliasAddrSpaceMD->getOperand(2 * I + 0));
- if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
- auto *High = mdconst::extract<ConstantInt>(
- NoaliasAddrSpaceMD->getOperand(2 * I + 1));
- return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
- }
- }
-
- return true;
+ const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
+ return !MD ||
+ !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
TargetLowering::AtomicExpansionKind
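
flatInstrMayAccessPrivate now delegates to a shared helper that scans the half-open range pairs of !noalias.addrspace-style metadata instead of open-coding the loop this hunk deletes. A sketch of the membership test and how the caller negates it; hasValueInRanges and mayAccessPrivate are stand-in names, and the ranges are modeled as plain pairs rather than MDNode operands:

  #include <utility>
  #include <vector>

  // Range-like metadata is a list of half-open [Lo, Hi) pairs; answer
  // whether Value falls inside any of them.
  bool hasValueInRanges(
      const std::vector<std::pair<unsigned, unsigned>> &Ranges,
      unsigned Value) {
    for (const auto &[Lo, Hi] : Ranges)
      if (Lo <= Value && Value < Hi)
        return true;
    return false;
  }

  // A flat access may touch private memory unless metadata proves the
  // private address space is excluded.
  bool mayAccessPrivate(const std::vector<std::pair<unsigned, unsigned>> *MD,
                        unsigned PrivateAS) {
    return !MD || !hasValueInRanges(*MD, PrivateAS);
  }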
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 4b48fc4..343e455 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2341,6 +2341,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
case AMDGPU::S_MEMREALTIME:
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+ case AMDGPU::S_BARRIER_LEAVE:
case AMDGPU::S_GET_BARRIER_STATE_M0:
case AMDGPU::S_GET_BARRIER_STATE_IMM:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 89d9b0d..50964a9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -473,6 +473,7 @@ class VIMAGE_VSAMPLE_Common <bits<8> op> : Enc96 {
let Inst{4} = r128;
let Inst{5} = d16;
let Inst{6} = a16;
+ let Inst{7} = cpol{5}; // nv
let Inst{21-14} = op;
let Inst{25-22} = dmask;
let Inst{39-32} = vdata;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 19e6bcf..cc4bee0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2616,9 +2616,9 @@ std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
- if (ST.hasMovB64() &&
+ if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
AMDGPU::isLegalDPALU_DPPControl(
- getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
+ ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
return std::pair(&MI, nullptr);
}
@@ -2905,7 +2905,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MachineBasicBlock &RestoreBB,
const DebugLoc &DL, int64_t BrOffset,
RegScavenger *RS) const {
- assert(RS && "RegScavenger required for long branching");
assert(MBB.empty() &&
"new block should be inserted for expanding unconditional branch");
assert(MBB.pred_size() == 1);
@@ -4241,6 +4240,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
MI.getOpcode() == AMDGPU::S_SETPRIO ||
+ MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
changesVGPRIndexingMode(MI);
}
@@ -4267,12 +4267,15 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
if (MI.memoperands_empty())
return true;
- // TODO (?): Does this need to be taught how to read noalias.addrspace ?
-
// See if any memory operand specifies an address space that involves scratch.
return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
unsigned AS = Memop->getAddrSpace();
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
+ return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
+ *MD, AMDGPUAS::PRIVATE_ADDRESS);
+ }
+ return AS == AMDGPUAS::PRIVATE_ADDRESS;
});
}
@@ -5433,7 +5436,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
- !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
+ !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
+ AMDGPU::isDPALU_DPP(Desc, ST)) {
ErrInfo = "Invalid dpp_ctrl value: "
"DP ALU dpp only support row_newbcast";
return false;
@@ -9225,7 +9229,7 @@ bool SIInstrInfo::isHighLatencyDef(int Opc) const {
(isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}
-unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
+Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
int &FrameIndex) const {
const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
if (!Addr || !Addr->isFI())
@@ -9238,7 +9242,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}
-unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
+Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
int &FrameIndex) const {
const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
assert(Addr && Addr->isFI());
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 6b9403f..12ffae7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -996,6 +996,11 @@ public:
bool isBarrier(unsigned Opcode) const {
return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
+ Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
+ Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
+ Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
+ Opcode == AMDGPU::S_BARRIER_LEAVE ||
+ Opcode == AMDGPU::S_BARRIER_LEAVE_IMM ||
Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER;
}
@@ -1051,7 +1056,7 @@ public:
}
}
- bool isWaitcnt(unsigned Opcode) const {
+ static bool isWaitcnt(unsigned Opcode) {
switch (getNonSoftWaitcntOpcode(Opcode)) {
case AMDGPU::S_WAITCNT:
case AMDGPU::S_WAITCNT_VSCNT:
@@ -1402,8 +1407,8 @@ public:
return get(pseudoToMCOpcode(Opcode));
}
- unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const;
- unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+ Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+ Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const;
Register isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index c552f1a..c425d97 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1954,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
!eq(VT, v2f16) : VCSrc_v2f16,
!eq(VT, v2bf16) : VCSrc_v2bf16,
!eq(VT, f32) : VCSrc_f32,
+ !eq(VT, f64) : VCSrc_f64,
!eq(VT, v2i32) : VCSrc_v2b32,
1 : VCSrc_b32);
}
@@ -2707,7 +2708,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
isModifierType<Src2VT>.ret,
HasOMod);
field bit HasNeg = HasModifiers;
- field bit HasMatrixReuse = 0;
field bit HasMatrixFMT = 0;
field bit HasMatrixScale = 0;
field bit HasMatrixReuse = 0;
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b49c5a9..e204d6b 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -87,6 +87,8 @@ enum InstClassEnum {
GLOBAL_STORE_SADDR,
FLAT_LOAD,
FLAT_STORE,
+ FLAT_LOAD_SADDR,
+ FLAT_STORE_SADDR,
GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
GLOBAL_STORE // any CombineInfo, they are only ever returned by
// getCommonInstClass.
@@ -354,6 +356,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
case AMDGPU::FLAT_LOAD_DWORD:
case AMDGPU::FLAT_STORE_DWORD:
+ case AMDGPU::FLAT_LOAD_DWORD_SADDR:
+ case AMDGPU::FLAT_STORE_DWORD_SADDR:
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
@@ -367,6 +371,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX2:
case AMDGPU::FLAT_STORE_DWORDX2:
+ case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
@@ -380,6 +386,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX3:
case AMDGPU::FLAT_STORE_DWORDX3:
+ case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
@@ -393,6 +401,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX4:
case AMDGPU::FLAT_STORE_DWORDX4:
+ case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
@@ -575,6 +585,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
return GLOBAL_STORE_SADDR;
+ case AMDGPU::FLAT_LOAD_DWORD_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
+ return FLAT_LOAD_SADDR;
+ case AMDGPU::FLAT_STORE_DWORD_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
+ return FLAT_STORE_SADDR;
}
}
@@ -661,6 +681,16 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
+ case AMDGPU::FLAT_LOAD_DWORD_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
+ return AMDGPU::FLAT_LOAD_DWORD_SADDR;
+ case AMDGPU::FLAT_STORE_DWORD_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
+ return AMDGPU::FLAT_STORE_DWORD_SADDR;
}
}
@@ -776,6 +806,14 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORD_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
+ case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
+ case AMDGPU::FLAT_STORE_DWORD_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
+ case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
Result.SAddr = true;
[[fallthrough]];
case AMDGPU::GLOBAL_LOAD_DWORD:
@@ -1875,6 +1913,28 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case 4:
return AMDGPU::FLAT_STORE_DWORDX4;
}
+ case FLAT_LOAD_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
+ }
+ case FLAT_STORE_SADDR:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
+ case 3:
+ return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
+ case 4:
+ return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
+ }
case MIMG:
assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
"No overlaps");
@@ -2508,12 +2568,14 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
case FLAT_LOAD:
+ case FLAT_LOAD_SADDR:
case GLOBAL_LOAD:
case GLOBAL_LOAD_SADDR:
NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
OptimizeListAgain |= CI.Width + Paired.Width < 4;
break;
case FLAT_STORE:
+ case FLAT_STORE_SADDR:
case GLOBAL_STORE:
case GLOBAL_STORE_SADDR:
NewMI = mergeFlatStorePair(CI, Paired, Where->I);
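
The new FLAT_*_SADDR classes plug into the same width-driven merge machinery as the existing global SADDR forms: two accesses whose combined width is 2, 3, or 4 dwords collapse into the matching DWORDX form, and a pair still below 4 dwords re-enters the optimizer for another round. A sketch of that opcode selection, with string return values standing in for the real opcode enums:

  #include <string>

  // Combined width of a merged pair -> merged opcode. Width 1 cannot
  // result from a merge; anything above 4 has no single instruction.
  std::string newFlatLoadSaddrOpcode(unsigned Width) {
    switch (Width) {
    case 2: return "FLAT_LOAD_DWORDX2_SADDR";
    case 3: return "FLAT_LOAD_DWORDX3_SADDR";
    case 4: return "FLAT_LOAD_DWORDX4_SADDR";
    default: return ""; // no merged form
    }
  }

  // A pair whose combined width is still below 4 may merge again later.
  bool optimizeListAgain(unsigned W0, unsigned W1) { return W0 + W1 < 4; }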
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f8878f3..e97536d 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -57,6 +57,7 @@
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -76,10 +77,11 @@ private:
LiveIntervals *LIS = nullptr;
LiveVariables *LV = nullptr;
MachineDominatorTree *MDT = nullptr;
+ MachinePostDominatorTree *PDT = nullptr;
MachineRegisterInfo *MRI = nullptr;
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
- SmallSet<MachineBasicBlock *, 4> KillBlocks;
+ SmallPtrSet<MachineBasicBlock *, 4> KillBlocks;
SmallSet<Register, 8> RecomputeRegs;
const TargetRegisterClass *BoolRC = nullptr;
@@ -138,8 +140,8 @@ private:
public:
SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV,
- MachineDominatorTree *MDT)
- : LIS(LIS), LV(LV), MDT(MDT) {}
+ MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
+ : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {}
bool run(MachineFunction &MF);
};
@@ -159,6 +161,7 @@ public:
AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
// Should preserve the same set that TwoAddressInstructions does.
AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
AU.addPreserved<SlotIndexesWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<LiveVariablesWrapperPass>();
@@ -457,7 +460,7 @@ MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
- SmallSet<const MachineBasicBlock *, 4> Visited;
+ SmallPtrSet<const MachineBasicBlock *, 4> Visited;
MachineBasicBlock *B = &MBB;
do {
if (!Visited.insert(B).second)
@@ -506,13 +509,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock *SplitBB = &MBB;
if (NeedBlockSplit) {
SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
- if (MDT && SplitBB != &MBB) {
- MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
- SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
- MBBNode->end());
- MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
- for (MachineDomTreeNode *Child : Children)
- MDT->changeImmediateDominator(Child, SplitBBNode);
+ if (SplitBB != &MBB && (MDT || PDT)) {
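+ // splitAt moved all of MBB's successors to SplitBB; mirror that in the
+ // (post)dominator trees and add the new MBB -> SplitBB edge.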
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+ if (MDT)
+ MDT->applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->applyUpdates(DTUpdates);
}
Opcode = OrTermrOpc;
InsPt = MI;
@@ -727,26 +735,27 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
MachineBasicBlock *Succ = *MBB.succ_begin();
MachineBasicBlock *FallThrough = nullptr;
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 8> DTUpdates;
+
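+ // Collect the rerouted predecessor edges so both trees can be updated in
+ // one batch once MBB is detached.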
while (!MBB.predecessors().empty()) {
MachineBasicBlock *P = *MBB.pred_begin();
if (P->getFallThrough(false) == &MBB)
FallThrough = P;
P->ReplaceUsesOfBlockWith(&MBB, Succ);
+ DTUpdates.push_back({DomTreeT::Insert, P, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, P, &MBB});
}
MBB.removeSuccessor(Succ);
if (LIS) {
for (auto &I : MBB.instrs())
LIS->RemoveMachineInstrFromMaps(I);
}
- if (MDT) {
- // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
- // updating by changing Succ's idom to the one of MBB; otherwise, MBB must
- // be a leaf node in MDT and could be erased directly.
- if (MDT->dominates(&MBB, Succ))
- MDT->changeImmediateDominator(MDT->getNode(Succ),
- MDT->getNode(&MBB)->getIDom());
- MDT->eraseNode(&MBB);
- }
+ if (MDT)
+ MDT->applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->applyUpdates(DTUpdates);
+
MBB.clear();
MBB.eraseFromParent();
if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
@@ -875,7 +884,11 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) {
LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr;
auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
- return SILowerControlFlow(LIS, LV, MDT).run(MF);
+ auto *PDTWrapper =
+ getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
+ MachinePostDominatorTree *PDT =
+ PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
+ return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
}
PreservedAnalyses
@@ -885,13 +898,16 @@ SILowerControlFlowPass::run(MachineFunction &MF,
LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF);
MachineDominatorTree *MDT =
MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
+ MachinePostDominatorTree *PDT =
+ MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
- bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF);
+ bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
if (!Changed)
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
PA.preserve<MachineDominatorTreeAnalysis>();
+ PA.preserve<MachinePostDominatorTreeAnalysis>();
PA.preserve<SlotIndexesAnalysis>();
PA.preserve<LiveIntervalsAnalysis>();
PA.preserve<LiveVariablesAnalysis>();
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 9509199..09b737c 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -209,10 +209,13 @@ void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) {
// So set the save points for those.
// Use the points found by shrink-wrapping, if any.
- if (MFI.getSavePoint()) {
- SaveBlocks.push_back(MFI.getSavePoint());
- assert(MFI.getRestorePoint() && "Both restore and save must be set");
- MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+ if (!MFI.getSavePoints().empty()) {
+ assert(MFI.getSavePoints().size() == 1 &&
+ "Multiple save points not yet supported!");
+ SaveBlocks.push_back(MFI.getSavePoints().front());
+ assert(MFI.getRestorePoints().size() == 1 &&
+ "Multiple restore points not yet supported!");
+ MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front();
// If RestoreBlock does not have any successor and is not a return block
// then the end point is unreachable and we do not need to insert any
// epilogue.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9a1448f..8a11203 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
-cl::opt<bool> MFMAVGPRForm(
+static cl::opt<bool> MFMAVGPRForm(
"amdgpu-mfma-vgpr-form", cl::Hidden,
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
"unspecified, default to compiler heuristics"),
@@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+ NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
+ NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
Occupancy(MFI.getOccupancy()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
@@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+ NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
+ NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 08b0206..ca8f803 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool WaveLimiter = false;
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
+ uint16_t NumWaveDispatchSGPRs = 0;
+ uint16_t NumWaveDispatchVGPRs = 0;
uint32_t HighBitsOf32BitAddress = 0;
// TODO: 10 may be a better default since it's the maximum.
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false);
+ YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false);
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
StringValue("$private_rsrc_reg"));
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
@@ -465,6 +469,9 @@ private:
unsigned NumUserSGPRs = 0;
unsigned NumSystemSGPRs = 0;
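+ // Numbers of SGPRs/VGPRs set up at wave dispatch.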
+ unsigned NumWaveDispatchSGPRs = 0;
+ unsigned NumWaveDispatchVGPRs = 0;
+
bool HasSpilledSGPRs = false;
bool HasSpilledVGPRs = false;
bool HasNonSpillStackObjects = false;
@@ -991,6 +998,14 @@ public:
return UserSGPRInfo.getNumKernargPreloadSGPRs();
}
+ unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; }
+
+ void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; }
+
+ unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; }
+
+ void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
+
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 205a45a..38d9a4b 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -130,6 +130,9 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
if (VirtReg.isPhysical())
continue;
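+ // Also skip operands that have no register assigned at all.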
+ if (!VirtReg.isValid())
+ continue;
+
if (!VRM->hasPhys(VirtReg))
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 81655f5..0293d40 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1166,7 +1166,8 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName>
}
//===----------------------------------------------------------------------===//
-// SSrc_* Operands with an SGPR or a 32-bit immediate
+// SSrc_* Operands with an SGPR, a 32-bit immediate, or a 64-bit immediate
+// when supported by the target.
//===----------------------------------------------------------------------===//
class SrcRegOrImm9<RegisterClass regClass, string operandType>
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 431d73b..a003a46 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -484,6 +484,24 @@ def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (o
let isConvergent = 1;
}
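+
+// Split barriers: s_barrier_init sets up the named barrier selected via M0
+// or an immediate, and s_barrier_join adds the wave to it.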
+def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
} // End Uses = [M0]
def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
@@ -501,6 +519,12 @@ def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (out
let isConvergent = 1;
}
+def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
} // End has_sdst = 0
def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
@@ -1588,6 +1612,17 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm
let isConvergent = 1;
}
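+
+// s_barrier_leave removes the wave from its current barrier; note that it
+// defines SCC.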
+def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> {
+ let SchedRW = [WriteBarrier];
+ let simm16 = 0;
+ let fixed_imm = 1;
+ let isConvergent = 1;
+ let Defs = [SCC];
+}
+
+def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave",
+ (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>;
+
def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
@@ -1630,7 +1665,9 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
-def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
+def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> {
+ let SubtargetPredicate = isNotGFX1250Plus;
+}
// On SI the documentation says sleep for approximately 64 * low 2
// bits, consistent with the reported maximum of 448. On VI the
@@ -2144,9 +2181,13 @@ defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>;
defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>;
defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>;
defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>;
defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>;
defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>;
defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>;
defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>;
defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
@@ -2639,6 +2680,7 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> {
}
defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>;
+defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>;
defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>;
defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>;
defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 3d9455f..c740b5e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -106,7 +106,7 @@ static constexpr CustomOperand MsgOperands[] = {
{{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10},
{{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus},
{{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus},
- {{""}},
+ {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250},
{{"MSG_SYSMSG"}, ID_SYSMSG},
{{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus},
{{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus},
@@ -195,7 +195,7 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
{{""}},
{{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11},
- {{""}},
+ {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250},
{{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11},
{{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus},
{{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus},
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 1e3e9a2..6e4e087 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -1160,17 +1161,28 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 65536;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
return 163840;
+ if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+ return 327680;
return 0;
}
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
// "Per CU" really means "per whatever functional block the waves of a
- // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+ // workgroup must share".
+
+ // GFX12.5 only supports CU mode, and each of its CUs contains four SIMDs.
+ if (isGFX1250(*STI)) {
+ assert(STI->getFeatureBits().test(FeatureCuMode));
+ return 4;
+ }
+
+ // For gfx10 in CU mode the functional block is the CU, which contains
// two SIMDs.
if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode))
return 2;
- // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
- // two CUs, so a total of four SIMDs.
+
+ // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP
+ // contains two CUs, so a total of four SIMDs.
return 4;
}
@@ -1666,6 +1678,29 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size) {
return Vals;
}
+bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
+ assert((MD.getNumOperands() % 2 == 0) && "invalid number of operands!");
+ for (unsigned I = 0, E = MD.getNumOperands() / 2; I != E; ++I) {
+ auto Low =
+ mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 0))->getValue();
+ auto High =
+ mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 1))->getValue();
+ // There are two types of [A; B) ranges:
+ // A < B, e.g. [4; 5) which is a range that only includes 4.
+ // A > B, e.g. [5; 4) which is a range that wraps around and includes
+ // everything except 4.
+ if (Low.ult(High)) {
+ if (Low.ule(Val) && High.ugt(Val))
+ return true;
+ } else {
+ if (Low.ule(Val) || High.ugt(Val))
+ return true;
+ }
+ }
+
+ return false;
+}
+
unsigned getVmcntBitMask(const IsaVersion &Version) {
return (1 << (getVmcntBitWidthLo(Version.Major) +
getVmcntBitWidthHi(Version.Major))) -
@@ -2406,7 +2441,11 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) {
return 0;
}
-unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; }
+unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) {
+ if (isGFX1250(STI))
+ return 32;
+ return 16;
+}
bool isSI(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureSouthernIslands);
@@ -2478,6 +2517,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
}
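+
+// GFX12.5 only supports CU mode, so WGP mode is only available on gfx10
+// through gfx12.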
+bool supportsWGP(const MCSubtargetInfo &STI) {
+ if (isGFX1250(STI))
+ return false;
+ return isGFX10Plus(STI);
+}
+
bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); }
bool isNotGFX10Plus(const MCSubtargetInfo &STI) {
@@ -3309,13 +3354,39 @@ bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
return false;
}
-bool isDPALU_DPP(const MCInstrDesc &OpDesc) {
+bool isDPALU_DPP32BitOpc(unsigned Opc) {
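+ // 32-bit multiply/mad opcodes that count as DP ALU DPP only on gfx1250
+ // (see isDPALU_DPP below).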
+ switch (Opc) {
+ case AMDGPU::V_MUL_LO_U32_e64:
+ case AMDGPU::V_MUL_LO_U32_e64_dpp:
+ case AMDGPU::V_MUL_LO_U32_e64_dpp_gfx1250:
+ case AMDGPU::V_MUL_HI_U32_e64:
+ case AMDGPU::V_MUL_HI_U32_e64_dpp:
+ case AMDGPU::V_MUL_HI_U32_e64_dpp_gfx1250:
+ case AMDGPU::V_MUL_HI_I32_e64:
+ case AMDGPU::V_MUL_HI_I32_e64_dpp:
+ case AMDGPU::V_MUL_HI_I32_e64_dpp_gfx1250:
+ case AMDGPU::V_MAD_U32_e64:
+ case AMDGPU::V_MAD_U32_e64_dpp:
+ case AMDGPU::V_MAD_U32_e64_dpp_gfx1250:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
+ if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP))
+ return false;
+
+ if (isDPALU_DPP32BitOpc(OpDesc.getOpcode()))
+ return ST.hasFeature(AMDGPU::FeatureGFX1250Insts);
+
return hasAny64BitVGPROperands(OpDesc);
}
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
- // Currently this is 128 for all subtargets
- return 128;
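+ // Subtargets with 320 KiB of addressable LDS allocate in 256-dword (1 KiB)
+ // granules; all others use 128 dwords (512 bytes).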
+ return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
+ : 128;
}
bool isPackedFP32Inst(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1bcd36c..70dfb63 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -35,6 +35,7 @@ class MCInstrInfo;
class MCRegisterClass;
class MCRegisterInfo;
class MCSubtargetInfo;
+class MDNode;
class StringRef;
class Triple;
class raw_ostream;
@@ -1064,6 +1065,9 @@ SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
std::optional<SmallVector<unsigned>>
getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
+/// Checks if \p Val is inside \p MD, a !range-like metadata.
+bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
+
/// Represents the counter values to wait for in an s_waitcnt instruction.
///
/// Large values (including the maximum possible integer) can be used to
@@ -1549,6 +1553,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
bool isGFX12(const MCSubtargetInfo &STI);
bool isGFX12Plus(const MCSubtargetInfo &STI);
bool isGFX1250(const MCSubtargetInfo &STI);
+bool supportsWGP(const MCSubtargetInfo &STI);
bool isNotGFX12Plus(const MCSubtargetInfo &STI);
bool isNotGFX11Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
@@ -1750,15 +1755,22 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST);
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
LLVM_READNONE
-inline bool isLegalDPALU_DPPControl(unsigned DC) {
- return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) {
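+ // DP ALU instructions accept only a narrow, subtarget-specific set of DPP
+ // controls: row_share on gfx12 and row_newbcast on gfx90a.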
+ if (isGFX12(ST))
+ return DC >= DPP::ROW_SHARE_FIRST && DC <= DPP::ROW_SHARE_LAST;
+ if (isGFX90A(ST))
+ return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+ return false;
}
/// \returns true if an instruction may have a 64-bit VGPR operand.
bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc);
+/// \returns true if \p Opc is a DP ALU DPP opcode without any 64-bit operands.
+bool isDPALU_DPP32BitOpc(unsigned Opc);
+
/// \returns true if an instruction is a DP ALU DPP.
-bool isDPALU_DPP(const MCInstrDesc &OpDesc);
+bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST);
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index b128207..11c7275 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -706,7 +706,6 @@ def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>;
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
- // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
let SubtargetPredicate = isGFX12PlusNot12_50 in
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
let SubtargetPredicate = isGFX125xOnly in
@@ -731,7 +730,6 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSe
>;
let OtherPredicates = [HasFP8ConversionInsts] in {
- // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
let SubtargetPredicate = isGFX12PlusNot12_50 in
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
let SubtargetPredicate = isGFX125xOnly in {
@@ -740,7 +738,6 @@ let OtherPredicates = [HasFP8ConversionInsts] in {
def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel),
(V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>;
}
- // FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
let SubtargetPredicate = isGFX12Plus in
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
}
@@ -1058,11 +1055,6 @@ multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> :
multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> :
VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>;
-multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
- string opName, string asmName> :
- VOP1_Real_e32_with_name<Gen, op, opName, asmName>,
- VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>;
-
multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
bits<9> op, string asmName = !tolower(NAME), string opName = NAME> {
defm opName#"_t16" :
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index f4b6af6..329d003 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -2084,6 +2084,9 @@ multiclass VOP3_Realtriple_gfx11_gfx12<bits<10> op> :
multiclass VOP3_Real_Base_gfx11_gfx12<bits<10> op> :
VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Gen, op>;
+multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250<bits<10> op> :
+ VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Not12_50Gen, op>;
+
multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
string asmName> :
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
@@ -2211,9 +2214,9 @@ defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>;
defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>;
defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>;
defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>;
-defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>;
-defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>;
-defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>;
+defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>;
+defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>;
+defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>;
defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>;
defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x338, "v_lshlrev_b16">;
defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">;
@@ -2242,6 +2245,10 @@ let AssemblerPredicate = isGFX11Plus in {
}
// These instructions differ from GFX12 variant by supporting DPP:
+defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>;
+defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>;
+defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>;
+
defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>;
defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>;
defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>;