Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 108
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 234
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 26
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 104
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 150
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrFormats.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 113
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 11
34 files changed, 564 insertions(+), 409 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 23f106a..007b481 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -153,6 +153,9 @@ private:
const TargetMachine &TM;
};
+void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
+extern char &AMDGPUPrepareAGPRAllocLegacyID;
+
void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &);
extern char &AMDGPUReserveWWMRegsLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index faf59c1..0e0e83b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1118,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
"Has v_bitop3_b32/v_bitop3_b16 instructions"
>;
+def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
+ "HasTanhInsts",
+ "true",
+ "Has v_tanh_f32/f16 instructions"
+>;
+
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
@@ -1979,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
+ FeatureTanhInsts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
@@ -2703,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;
+def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
+ AssemblerPredicate<(all_of FeatureTanhInsts)>;
+
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 7b5d4077e..2bfd56f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -137,6 +137,9 @@ def gi_global_offset :
def gi_global_saddr :
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
GIComplexPatternEquiv<GlobalSAddr>;
+def gi_global_saddr_glc :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
+ GIComplexPatternEquiv<GlobalSAddrGLC>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 25672a5..00c7f0e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1968,6 +1968,29 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ return false;
+
+ unsigned CPolVal = AMDGPU::CPol::GLC;
+ CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
+ return true;
+}
+
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 9967f46..acbab3d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -163,6 +163,12 @@ private:
SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset) const;
+ bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
+ bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 1a63c48..d161c03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5485,7 +5485,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
+ unsigned CPolBits) const {
Register Addr = Root.getReg();
Register PtrBase;
int64_t ConstOffset;
@@ -5529,6 +5530,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
MIB.addReg(HighBits);
}, // voffset
[=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
}};
}
}
@@ -5568,6 +5570,9 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(ImmOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // cpol
+ MIB.addImm(CPolBits);
}}};
}
}
@@ -5591,11 +5596,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
}};
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
+ return selectGlobalSAddr(Root, 0);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
+ return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
Register PtrBase;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 2cb7904..34bdf0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -254,7 +254,11 @@ private:
selectScratchOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const;
+ InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 250547a..b6c6d92 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -114,6 +114,7 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse
MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
+MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
new file mode 100644
index 0000000..3b06e9b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
@@ -0,0 +1,108 @@
+//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Make simple transformations to relax register constraints for cases which
+// can allocate to either AGPRs or VGPRs. Replace materializations of inline
+// immediates into an AGPR or VGPR with a pseudo that carries an AV_* register
+// class constraint, allowing later passes to inflate the register class if
+// necessary; the register allocator itself cannot rewrite instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUPrepareAGPRAlloc.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc"
+
+namespace {
+
+class AMDGPUPrepareAGPRAllocImpl {
+private:
+ const SIInstrInfo &TII;
+ MachineRegisterInfo &MRI;
+
+public:
+ AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
+ : TII(*ST.getInstrInfo()), MRI(MRI) {}
+ bool run(MachineFunction &MF);
+};
+
+class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) {
+ initializeAMDGPUPrepareAGPRAllocLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+ "AMDGPU Prepare AGPR Alloc", false, false)
+INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+ "AMDGPU Prepare AGPR Alloc", false, false)
+
+char AMDGPUPrepareAGPRAllocLegacy::ID = 0;
+
+char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID;
+
+bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF);
+}
+
+PreservedAnalyses
+AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF);
+ return PreservedAnalyses::all();
+}
+
+bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
+ if (MRI.isReserved(AMDGPU::AGPR0))
+ return false;
+
+ const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO);
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ TII.isInlineConstant(MI, 1)) ||
+ (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOperand(1).isImm())) {
+ MI.setDesc(AVImmPseudo);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
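
Note: the filter in AMDGPUPrepareAGPRAllocImpl::run() above only retags two kinds of moves, and it bails out early when AGPR0 is reserved (i.e. when the function gets no AGPRs at all). The following standalone sketch is illustration only; the Opcode enum and Inst struct are stand-ins for the real MachineInstr/SIInstrInfo queries, not LLVM API:

    #include <cstdio>

    // Stand-in opcodes; the real pass checks MachineInstr opcodes from the
    // generated AMDGPU instruction tables.
    enum Opcode { V_MOV_B32_e32, V_ACCVGPR_WRITE_B32_e64, Other };

    struct Inst {
      Opcode Op;
      bool Src0IsImm;         // operand 1 is an immediate
      bool Src0IsInlineConst; // ...and fits the inline-constant encoding
    };

    // Mirrors the condition in the pass: VGPR moves of inline constants and
    // AGPR writes of immediates are both retagged to AV_MOV_B32_IMM_PSEUDO,
    // whose AV_* operand constraint lets later passes pick AGPR or VGPR as
    // allocation pressure dictates.
    static bool shouldUseAVImmPseudo(const Inst &I) {
      return (I.Op == V_MOV_B32_e32 && I.Src0IsInlineConst) ||
             (I.Op == V_ACCVGPR_WRITE_B32_e64 && I.Src0IsImm);
    }

    int main() {
      Inst InlineImmMov{V_MOV_B32_e32, true, true};
      Inst LiteralMov{V_MOV_B32_e32, true, false};
      std::printf("%d %d\n", shouldUseAVImmPseudo(InlineImmMov),
                  shouldUseAVImmPseudo(LiteralMov)); // prints "1 0"
    }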
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h
new file mode 100644
index 0000000..dc598c9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h
@@ -0,0 +1,23 @@
+//===- AMDGPUPrepareAGPRAlloc.h ---------------------------------*- C++- *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+class AMDGPUPrepareAGPRAllocPass
+ : public PassInfoMixin<AMDGPUPrepareAGPRAllocPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index cbbb57c..bf2f37b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4558,6 +4558,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pk_u16:
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
+ case Intrinsic::amdgcn_sat_pk4_i4_i8:
+ case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_cubema:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 46027b8..8101c68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -167,77 +167,39 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+ Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
+ /*IncludeCalls=*/false);
+ if (ST.hasMAIInsts())
+ Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
+ /*IncludeCalls=*/false);
// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
- Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
- Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
- if (ST.hasMAIInsts())
- Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+ Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
+ /*IncludeCalls=*/false);
return Info;
}
int32_t MaxVGPR = -1;
- int32_t MaxAGPR = -1;
- int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
- // TODO: Check regmasks? Do they occur anywhere except calls?
- for (const MachineOperand &MO : MI.operands()) {
- unsigned Width = 0;
- bool IsSGPR = false;
- bool IsAGPR = false;
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
switch (Reg) {
- case AMDGPU::EXEC:
- case AMDGPU::EXEC_LO:
- case AMDGPU::EXEC_HI:
- case AMDGPU::SCC:
- case AMDGPU::M0:
- case AMDGPU::M0_LO16:
- case AMDGPU::M0_HI16:
- case AMDGPU::SRC_SHARED_BASE_LO:
- case AMDGPU::SRC_SHARED_BASE:
- case AMDGPU::SRC_SHARED_LIMIT_LO:
- case AMDGPU::SRC_SHARED_LIMIT:
- case AMDGPU::SRC_PRIVATE_BASE_LO:
- case AMDGPU::SRC_PRIVATE_BASE:
- case AMDGPU::SRC_PRIVATE_LIMIT_LO:
- case AMDGPU::SRC_PRIVATE_LIMIT:
- case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
- case AMDGPU::SGPR_NULL:
- case AMDGPU::SGPR_NULL64:
- case AMDGPU::MODE:
- continue;
-
case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;
- case AMDGPU::VCC:
- case AMDGPU::VCC_LO:
- case AMDGPU::VCC_HI:
- case AMDGPU::VCC_LO_LO16:
- case AMDGPU::VCC_LO_HI16:
- case AMDGPU::VCC_HI_LO16:
- case AMDGPU::VCC_HI_HI16:
- Info.UsesVCC = true;
- continue;
-
- case AMDGPU::FLAT_SCR:
- case AMDGPU::FLAT_SCR_LO:
- case AMDGPU::FLAT_SCR_HI:
- continue;
-
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
@@ -267,170 +229,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
break;
}
- if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
- AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
- AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 1;
- } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
- AMDGPU::VGPR_16RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 1;
- } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
- AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 1;
- } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 2;
- } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 3;
- } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 3;
- } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 3;
- } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 4;
- } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 5;
- } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 5;
- } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 5;
- } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 6;
- } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 6;
- } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 6;
- } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 7;
- } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 7;
- } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 7;
- } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 8;
- } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 8;
- } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 8;
- } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 9;
- } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 9;
- } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 9;
- } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 10;
- } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 10;
- } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 10;
- } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 11;
- } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 11;
- } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 11;
- } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 12;
- } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 12;
- } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 12;
- } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 16;
- } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 16;
- } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 16;
- } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 32;
- } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 32;
- } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 32;
- } else {
- // We only expect TTMP registers or registers that do not belong to
- // any RC.
- assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
- AMDGPU::TTMP_64RegClass.contains(Reg) ||
- AMDGPU::TTMP_128RegClass.contains(Reg) ||
- AMDGPU::TTMP_256RegClass.contains(Reg) ||
- AMDGPU::TTMP_512RegClass.contains(Reg) ||
- !TRI.getPhysRegBaseClass(Reg)) &&
- "Unknown register class");
- }
+ const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
+ assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) ||
+ TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) ||
+ AMDGPU::TTMP_64RegClass.contains(Reg) ||
+ AMDGPU::TTMP_128RegClass.contains(Reg) ||
+ AMDGPU::TTMP_256RegClass.contains(Reg) ||
+ AMDGPU::TTMP_512RegClass.contains(Reg)) &&
+ "Unknown register class");
+
+ if (!RC || !TRI.isVGPRClass(RC))
+ continue;
+
+ unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
- if (IsSGPR) {
- MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
- } else if (IsAGPR) {
- MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
- } else {
- MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
- }
+ MaxVGPR = std::max(MaxUsed, MaxVGPR);
}
if (MI.isCall()) {
@@ -492,9 +306,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
}
}
- Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
- Info.NumAGPR = MaxAGPR + 1;
return Info;
}
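
Note: with the per-class if/else ladder removed, the rewritten loop above derives a register's width directly from its class size and only tracks the highest VGPR index it touches; SGPR and AGPR counts are taken from MachineRegisterInfo up front with IncludeCalls=false. A self-contained sketch of that arithmetic, with made-up register uses:

    #include <algorithm>
    #include <cstdio>

    // Same result as llvm::divideCeil(RegSizeInBits, 32): a register class of
    // N bits spans ceil(N / 32) 32-bit registers.
    static unsigned widthInDwords(unsigned RegSizeInBits) {
      return (RegSizeInBits + 31) / 32;
    }

    int main() {
      // (hardware register index, register size in bits) pairs, made up.
      struct Use { unsigned HWReg, Bits; } Uses[] = {{0, 32}, {4, 96}, {10, 64}};
      int MaxVGPR = -1;
      for (const Use &U : Uses)
        MaxVGPR = std::max<int>(MaxVGPR, U.HWReg + widthInDwords(U.Bits) - 1);
      std::printf("NumVGPR = %d\n", MaxVGPR + 1); // v10..v11 is the highest use -> 12
    }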
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 31a80e0..c865082 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -25,6 +25,7 @@
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
+#include "AMDGPUPrepareAGPRAlloc.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUResourceUsageAnalysis.h"
@@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGlobalISel(*PR);
initializeAMDGPUAsmPrinterPass(*PR);
initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
+ initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
initializeGCNDPPCombineLegacyPass(*PR);
initializeSILowerI1CopiesLegacyPass(*PR);
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
@@ -1196,6 +1198,7 @@ public:
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
+ void addPreRegAlloc() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
@@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() {
TargetPassConfig::addFastRegAlloc();
}
+void GCNPassConfig::addPreRegAlloc() {
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(&AMDGPUPrepareAGPRAllocLegacyID);
+}
+
void GCNPassConfig::addOptimizedRegAlloc() {
if (EnableDCEInRA)
insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
@@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
Base::addOptimizedRegAlloc(addPass);
}
+void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const {
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(AMDGPUPrepareAGPRAllocPass());
+}
+
Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
AddMachinePass &addPass) const {
// TODO: Check --regalloc-npm option
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 3b2f39c..e0f1296 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -181,7 +181,9 @@ public:
void addMachineSSAOptimization(AddMachinePass &) const;
void addPostRegAlloc(AddMachinePass &) const;
void addPreEmitPass(AddMachinePass &) const;
+ void addPreEmitRegAlloc(AddMachinePass &) const;
Error addRegAssignmentOptimized(AddMachinePass &) const;
+ void addPreRegAlloc(AddMachinePass &) const;
void addOptimizedRegAlloc(AddMachinePass &) const;
void addPreSched2(AddMachinePass &) const;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 43d4e8db..de17fcc 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5280,6 +5280,15 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
unsigned CPol = Inst.getOperand(CPolPos).getImm();
+ if (!isGFX1250()) {
+ if (CPol & CPol::NV) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
+ Error(S, "nv is not supported on this GPU");
+ }
+ }
+
if (isGFX12Plus())
return validateTHAndScopeBits(Inst, Operands, CPol);
@@ -6916,6 +6925,7 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
int64_t CPolVal = 0;
ParseStatus ResTH = ParseStatus::NoMatch;
ParseStatus ResScope = ParseStatus::NoMatch;
+ ParseStatus ResNV = ParseStatus::NoMatch;
for (;;) {
if (ResTH.isNoMatch()) {
@@ -6940,10 +6950,24 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
}
}
+ // The NV bit exists on GFX12+, but it only has an effect starting with
+ // GFX1250. Allow it to parse on all GFX12 targets and reject it during
+ // validation so the diagnostic can point at the token.
+ if (ResNV.isNoMatch()) {
+ if (trySkipId("nv")) {
+ ResNV = ParseStatus::Success;
+ CPolVal |= CPol::NV;
+ continue;
+ } else if (trySkipId("no", "nv")) {
+ ResNV = ParseStatus::Success;
+ continue;
+ }
+ }
+
break;
}
- if (ResTH.isNoMatch() && ResScope.isNoMatch())
+ if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch())
return ParseStatus::NoMatch;
Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc,
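
Note: the parser change above deliberately accepts "nv"/"nonv" on every GFX12 target and defers the subtarget check to validateCoherencyBits, where the "nv is not supported on this GPU" error can point at the offending token. A toy sketch of that parse-then-validate split (not the real MC parser; both helpers below are hypothetical):

    #include <cstdio>
    #include <string>

    constexpr unsigned CPOL_NV = 1u << 5; // mirrors CPol::NV from SIDefines.h

    static bool parseCPolToken(const std::string &Tok, unsigned &CPolVal) {
      if (Tok == "nv")   { CPolVal |= CPOL_NV; return true; }
      if (Tok == "nonv") { return true; } // negated form: nothing to set
      return false;
    }

    static bool validateCPol(unsigned CPolVal, bool IsGFX1250) {
      if ((CPolVal & CPOL_NV) && !IsGFX1250) {
        std::fprintf(stderr, "error: nv is not supported on this GPU\n");
        return false;
      }
      return true;
    }

    int main() {
      unsigned CPol = 0;
      parseCPolToken("nv", CPol);
      return validateCPol(CPol, /*IsGFX1250=*/false) ? 0 : 1; // diagnoses, exits 1
    }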
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 0caabe4..e994aee 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2451,6 +2451,7 @@ class VBUFFER_Real <bits<8> op, BUF_Pseudo ps, string real_name> :
let Inst{62} = ps.offen;
let Inst{63} = ps.idxen;
+ let Inst{7} = cpol{5}; // nv
let Inst{54-53} = cpol{2-1}; // th{2-1}
let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0}
let Inst{51-50} = cpol{4-3}; // scope
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index e3519f1..42edec0 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
+ AMDGPUPrepareAGPRAlloc.cpp
AMDGPUSwLowerLDS.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 74632c7..f7f29f1 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,7 +11,8 @@ let WantsRoot = true in {
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
- def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [], -10>;
+ def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
+ def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
}
@@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
bits<7> saddr;
bits<8> vdst;
- bits<6> cpol;
+ bits<12> cpol;
bits<8> vdata; // vsrc
bits<8> vaddr;
bits<24> offset;
@@ -192,6 +193,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{31-26} = 0x3b;
let Inst{39-32} = !if(ps.has_vdst, vdst, ?);
let Inst{49} = ps.sve;
+ let Inst{7} = cpol{5}; // nv
let Inst{54-53} = cpol{2-1}; // th{2-1}
let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0}
let Inst{51-50} = cpol{4-3}; // scope
@@ -200,6 +202,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{95-72} = !if(ps.has_offset, offset, ?);
}
+// TODO: Rename to FlatSaddrTable; it now handles both the global and flat GVS addressing modes.
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
bit IsSaddr = is_saddr;
string SaddrOp = Name;
@@ -1251,13 +1254,13 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
>;
class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
- (inst $saddr, $voffset, $offset, (i32 0), $in)
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
+ (inst $saddr, $voffset, $offset, $cpol, $in)
>;
class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
- (inst $saddr, $voffset, $offset, (i32 0))
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (inst $saddr, $voffset, $offset, $cpol)
>;
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1271,26 +1274,26 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
>;
class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
- (inst $saddr, $voffset, $offset, 0)
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (inst $saddr, $voffset, $offset, $cpol)
>;
class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)),
- (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
+ (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)),
+ (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol)
>;
-class GlobalAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
- ValueType vt, ValueType data_vt = vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), data_vt:$data)),
- (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset)
+class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat,
+ ValueType vt, ValueType data_vt = vt> : GCNPat <
+ (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)),
+ (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)
>;
class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data),
- (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
+ (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$data),
+ (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol)
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1319,6 +1322,12 @@ multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt,
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+ def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node),
+ GlobalSAddr, vt, data_vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
}
multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix,
@@ -1337,6 +1346,11 @@ multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+ def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> {
+ let AddedComplexity = 8;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
}
multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
@@ -1506,7 +1520,8 @@ multiclass GlobalFLATAtomicPatsNoRtnBase<string inst, string node, ValueType vt,
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<SDPatternOperator>(node), vt, data_vt>;
let AddedComplexity = 13 in
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), vt, data_vt>;
+ def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node),
+ GlobalSAddr, vt, data_vt>;
}
multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
@@ -1517,7 +1532,7 @@ multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
let AddedComplexity = 12 in
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
+ def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt>;
}
multiclass GlobalFLATAtomicPatsNoRtn<string inst, string node, ValueType vt,
@@ -1723,6 +1738,7 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
@@ -1734,7 +1750,7 @@ defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>;
foreach vt = VReg_64.RegTypes in {
defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>;
}
defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>;
@@ -1746,6 +1762,7 @@ defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
+defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
@@ -1791,12 +1808,16 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
} // end foreach as
-let SubtargetPredicate = isGFX12Plus in {
- defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
+defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-}
+} // End OtherPredicates = [HasFlatAddressSpace]
+
+let OtherPredicates = [isGFX12Plus] in
+defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
+
+let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in
+defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
let OtherPredicates = [HasD16LoadStore] in {
defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
@@ -1805,23 +1826,21 @@ defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
let OtherPredicates = [D16PreservesUnusedBits] in {
// TODO: Handle atomic loads
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
-def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
-} // End OtherPredicates = [HasFlatAddressSpace]
-
let OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>;
@@ -1889,6 +1908,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
// appropriate waits.
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -1928,6 +1948,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
@@ -2946,13 +2967,6 @@ multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op,
defm _SADDR : VFLAT_Real_gfx12<op, name>;
}
-multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
- string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = name> :
- VFLAT_Real_Base_gfx12<op, name, alias> {
- defm _SADDR : VFLAT_Real_gfx12<op, name>;
-}
-
multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
let AssemblerPredicate = isGFX12Not12_50 in {
defm "" : VFLAT_Real_gfx12<op>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 67c6daa..268162b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -234,6 +234,7 @@ protected:
bool HasRestrictedSOffset = false;
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
+ bool HasTanhInsts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
@@ -1380,6 +1381,8 @@ public:
return HasMinimum3Maximum3F16;
}
+ bool hasTanhInsts() const { return HasTanhInsts; }
+
bool hasAddPC64Inst() const { return GFX1250Insts; }
bool hasMinimum3Maximum3PKF16() const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ec9248b..44d2f94 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -160,6 +160,9 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
printTH(MI, TH, Scope, O);
printScope(Scope, O);
+ if (Imm & CPol::NV)
+ O << " nv";
+
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index a864997..edc74605 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -398,6 +398,8 @@ enum CPol {
SCOPE_DEV = 2 << 3,
SCOPE_SYS = 3 << 3,
+ NV = 1 << 5, // Non-volatile bit
+
SWZ = 1 << 6, // Swizzle bit
ALL = TH | SCOPE,
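
Note: CPol::NV occupies bit 5 of the cache-policy operand, and the encoding changes in BUFInstructions.td and FLATInstructions.td ("let Inst{7} = cpol{5}; // nv") copy that bit into bit 7 of the instruction word. A minimal sketch of that bit plumbing, with the constant taken from the enum above:

    #include <cstdint>
    #include <cstdio>

    static uint64_t encodeNV(uint64_t Inst, unsigned CPol) {
      const unsigned NV = 1u << 5; // CPol::NV, as defined above
      if (CPol & NV)
        Inst |= uint64_t(1) << 7;  // lands in Inst{7}
      return Inst;
    }

    int main() {
      std::printf("0x%llx\n", (unsigned long long)encodeNV(0, 1u << 5)); // 0x80
    }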
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 9b5a463..f018f77 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
default:
return false;
case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
SMovOp = AMDGPU::S_MOV_B32;
break;
case AMDGPU::V_MOV_B64_PSEUDO:
@@ -946,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
// Copies and REG_SEQUENCE do not contribute to the final assembly
// So, skip them but take care of the SGPR to VGPR copies bookkeeping.
- if (Inst->isCopy() || Inst->isRegSequence()) {
- if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
- if (!Inst->isCopy() ||
- !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
- Info.NumSVCopies++;
- continue;
- }
+ if (Inst->isRegSequence() &&
+ TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+ Info.NumSVCopies++;
+ continue;
+ }
+ if (Inst->isCopy()) {
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI);
+ if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) &&
+ !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+ Info.NumSVCopies++;
+ continue;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dfe6f65..0c76ff2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
#include <optional>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
#define DEBUG_TYPE "si-lower"
@@ -9308,7 +9310,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
- Module *M = const_cast<Module *>(MF.getFunction().getParent());
+ Module *M = MF.getFunction().getParent();
const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
auto *RelocSymbol = cast<GlobalVariable>(
@@ -11131,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
- SDValue Cond = Op.getOperand(0);
+ SDValue Cond = DAG.getFreeze(Op.getOperand(0));
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);
@@ -14561,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
- assert(N->getOpcode() == ISD::ADD);
+ assert(N->isAnyAdd());
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -14594,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
for (SDNode *User : LHS->users()) {
// There is a use that does not feed into addition, so the multiply can't
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
- if (User->getOpcode() != ISD::ADD)
+ if (!User->isAnyAdd())
return SDValue();
// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14706,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
SDValue Hi = getHiHalf64(LHS, DAG);
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::PTRADD)
+ Opcode = ISD::ADD;
SDValue AddHi =
- DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+ DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -15181,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() == ISD::ADD) {
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
- // y is not, and (add y, z) is used only once.
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
- // z is not, and (add y, z) is used only once.
- // The goal is to move constant offsets to the outermost ptradd, to create
- // more opportunities to fold offsets into memory instructions.
- // Together with the generic combines in DAGCombiner.cpp, this also
- // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
- //
- // This transform is here instead of in the general DAGCombiner as it can
- // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
- // AArch64's CPA.
- SDValue X = N0;
- SDValue Y = N1.getOperand(0);
- SDValue Z = N1.getOperand(1);
- if (N1.hasOneUse()) {
- bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
- bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
- if (ZIsConstant != YIsConstant) {
- // If both additions in the original were NUW, the new ones are as well.
- SDNodeFlags Flags =
- (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
- if (YIsConstant)
- std::swap(Y, Z);
+ // The following folds transform PTRADDs into regular arithmetic in cases
+ // where the PTRADD wouldn't be folded as an immediate offset into memory
+ // instructions anyway. They are target-specific in that other targets might
+ // prefer to not lose information about the pointer arithmetic.
+
+ // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
+ // Adapted from DAGCombiner::visitADDLikeCommutative.
+ SDValue V, K;
+ if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
+ SDNodeFlags ShlFlags = N1->getFlags();
+ // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
+ // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
+ // preserved.
+ SDNodeFlags NewShlFlags =
+ ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
+ ? SDNodeFlags::NoSignedWrap
+ : SDNodeFlags();
+ SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
+ }
+
+ // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
+ // performAddCombine.
+ if (N1.getOpcode() == ISD::MUL) {
+ if (Subtarget->hasMad64_32()) {
+ if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+ return Folded;
+ }
+ }
- SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+ // If the 32 low bits of the constant are all zero, there is nothing to fold
+ // into an immediate offset, so it's better to eliminate the unnecessary
+ // addition for the lower 32 bits than to preserve the PTRADD.
+ // Analogous to a fold in performAddCombine.
+ if (VT == MVT::i64) {
+ if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+ return Folded;
+ }
+
+ if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
+ // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
+ // global address GA and constant c, such that c can be folded into GA.
+ SDValue GAValue = N0.getOperand(0);
+ if (const GlobalAddressSDNode *GA =
+ dyn_cast<GlobalAddressSDNode>(GAValue)) {
+ if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
+ // If both additions in the original were NUW, reassociation preserves
+ // that.
+ SDNodeFlags Flags =
+ (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+ SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
DCI.AddToWorklist(Inner.getNode());
- return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
}
}
}
+ if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
+ return SDValue();
+
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+ // y is not, and (add y, z) is used only once.
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+ // z is not, and (add y, z) is used only once.
+ // The goal is to move constant offsets to the outermost ptradd, to create
+ // more opportunities to fold offsets into memory instructions.
+ // Together with the generic combines in DAGCombiner.cpp, this also
+ // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
+ //
+ // This transform is here instead of in the general DAGCombiner as it can
+ // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+ // AArch64's CPA.
+ SDValue X = N0;
+ SDValue Y = N1.getOperand(0);
+ SDValue Z = N1.getOperand(1);
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+ // If both additions in the original were NUW, reassociation preserves that.
+ SDNodeFlags ReassocFlags =
+ (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+
+ if (ZIsConstant != YIsConstant) {
+ if (YIsConstant)
+ std::swap(Y, Z);
+ SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
+ }
+
+ // If one of Y and Z is constant, they have been handled above. If both were
+ // constant, the addition would have been folded in SelectionDAG::getNode
+ // already. This ensures that the generic DAG combines won't undo the
+ // following reassociation.
+ assert(!YIsConstant && !ZIsConstant);
+
+ if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
+ // y are uniform and z isn't.
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
+ // z are uniform and y isn't.
+ // The goal is to push uniform operands up in the computation, so that they
+ // can be handled with scalar operations. We can't use reassociateScalarOps
+ // for this since it requires two identical commutative operations to
+ // reassociate.
+ if (Y->isDivergent())
+ std::swap(Y, Z);
+ SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(UniformInner.getNode());
+ return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
+ }
+
return SDValue();
}
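
Note: the first fold above relies on the identity x + ((0 - v) << k) == x - (v << k) in the modular arithmetic the DAG nodes model; the flag handling only decides which wrap flags survive. The later reassociations then push constants and uniform values toward the outer ptradd so they can fold into an immediate offset or be computed with scalar instructions, as the comments spell out. A quick self-contained check of the identity (illustration only):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t X = 0x1000, V = 7, K = 4;
      uint64_t Lhs = X + ((0 - V) << K); // ptradd x, shl(0 - v, k)
      uint64_t Rhs = X - (V << K);       // sub x, shl(v, k)
      std::printf("%#llx %#llx %s\n", (unsigned long long)Lhs,
                  (unsigned long long)Rhs, Lhs == Rhs ? "equal" : "differ");
    }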
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index a368bc5..6b41934 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -317,6 +317,7 @@ def CPolBit {
int SLC = 1;
int DLC = 2;
int SCC = 4;
+ int NV = 5;
}
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9da8a1c..c8935f0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -687,7 +687,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
if (!SafeToPropagate)
break;
- DefOp.setIsKill(false);
+ for (auto I = Def; I != MI; ++I)
+ I->clearRegisterKills(DefOp.getReg(), &RI);
}
MachineInstrBuilder Builder =
@@ -1625,41 +1626,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
}
}
-static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
- switch (Size) {
- case 4:
- return AMDGPU::SI_SPILL_A32_SAVE;
- case 8:
- return AMDGPU::SI_SPILL_A64_SAVE;
- case 12:
- return AMDGPU::SI_SPILL_A96_SAVE;
- case 16:
- return AMDGPU::SI_SPILL_A128_SAVE;
- case 20:
- return AMDGPU::SI_SPILL_A160_SAVE;
- case 24:
- return AMDGPU::SI_SPILL_A192_SAVE;
- case 28:
- return AMDGPU::SI_SPILL_A224_SAVE;
- case 32:
- return AMDGPU::SI_SPILL_A256_SAVE;
- case 36:
- return AMDGPU::SI_SPILL_A288_SAVE;
- case 40:
- return AMDGPU::SI_SPILL_A320_SAVE;
- case 44:
- return AMDGPU::SI_SPILL_A352_SAVE;
- case 48:
- return AMDGPU::SI_SPILL_A384_SAVE;
- case 64:
- return AMDGPU::SI_SPILL_A512_SAVE;
- case 128:
- return AMDGPU::SI_SPILL_A1024_SAVE;
- default:
- llvm_unreachable("unknown register size");
- }
-}
-
static unsigned getAVSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -1707,22 +1673,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
}
-static unsigned getVectorRegSpillSaveOpcode(Register Reg,
- const TargetRegisterClass *RC,
- unsigned Size,
- const SIRegisterInfo &TRI,
- const SIMachineFunctionInfo &MFI) {
- bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
+ Register Reg, const TargetRegisterClass *RC, unsigned Size,
+ const SIMachineFunctionInfo &MFI) const {
+ bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
// Choose the right opcode if spilling a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
- if (IsVectorSuperClass)
+ // TODO: Check if AGPRs are available
+ if (ST.hasMAIInsts())
return getAVSpillSaveOpcode(Size);
- return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
- : getVGPRSpillSaveOpcode(Size);
+ return getVGPRSpillSaveOpcode(Size);
}
void SIInstrInfo::storeRegToStackSlot(
@@ -1770,8 +1734,8 @@ void SIInstrInfo::storeRegToStackSlot(
return;
}
- unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
- SpillSize, RI, *MFI);
+ unsigned Opcode =
+ getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
@@ -1854,41 +1818,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
}
}
-static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
- switch (Size) {
- case 4:
- return AMDGPU::SI_SPILL_A32_RESTORE;
- case 8:
- return AMDGPU::SI_SPILL_A64_RESTORE;
- case 12:
- return AMDGPU::SI_SPILL_A96_RESTORE;
- case 16:
- return AMDGPU::SI_SPILL_A128_RESTORE;
- case 20:
- return AMDGPU::SI_SPILL_A160_RESTORE;
- case 24:
- return AMDGPU::SI_SPILL_A192_RESTORE;
- case 28:
- return AMDGPU::SI_SPILL_A224_RESTORE;
- case 32:
- return AMDGPU::SI_SPILL_A256_RESTORE;
- case 36:
- return AMDGPU::SI_SPILL_A288_RESTORE;
- case 40:
- return AMDGPU::SI_SPILL_A320_RESTORE;
- case 44:
- return AMDGPU::SI_SPILL_A352_RESTORE;
- case 48:
- return AMDGPU::SI_SPILL_A384_RESTORE;
- case 64:
- return AMDGPU::SI_SPILL_A512_RESTORE;
- case 128:
- return AMDGPU::SI_SPILL_A1024_RESTORE;
- default:
- llvm_unreachable("unknown register size");
- }
-}
-
static unsigned getAVSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -1930,27 +1859,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
if (Size != 4)
llvm_unreachable("unknown wwm register spill size");
- if (IsVectorSuperClass)
+ if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
}
-static unsigned
-getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
- unsigned Size, const SIRegisterInfo &TRI,
- const SIMachineFunctionInfo &MFI) {
- bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
+ Register Reg, const TargetRegisterClass *RC, unsigned Size,
+ const SIMachineFunctionInfo &MFI) const {
+ bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
// Choose the right opcode if restoring a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
- if (IsVectorSuperClass)
+ // TODO: Check if AGPRs are available
+ if (ST.hasMAIInsts())
return getAVSpillRestoreOpcode(Size);
- return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
- : getVGPRSpillRestoreOpcode(Size);
+ assert(!RI.isAGPRClass(RC));
+ return getVGPRSpillRestoreOpcode(Size);
}
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -1998,7 +1927,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
}
unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
- SpillSize, RI, *MFI);
+ SpillSize, *MFI);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 3a48e65..5e92921 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -33,6 +33,7 @@ class LiveVariables;
class MachineDominatorTree;
class MachineRegisterInfo;
class RegScavenger;
+class SIMachineFunctionInfo;
class TargetRegisterClass;
class ScheduleHazardRecognizer;
@@ -287,6 +288,15 @@ public:
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
int64_t &ImmVal) const override;
+ unsigned getVectorRegSpillSaveOpcode(Register Reg,
+ const TargetRegisterClass *RC,
+ unsigned Size,
+ const SIMachineFunctionInfo &MFI) const;
+ unsigned
+ getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
+ unsigned Size,
+ const SIMachineFunctionInfo &MFI) const;
+
void storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex, const TargetRegisterClass *RC,
@@ -1103,7 +1113,6 @@ public:
// that will not require an additional 4-bytes; this function assumes that it
// will.
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const {
- assert(!MO.isReg() && "isInlineConstant called on register operand!");
if (!MO.isImm())
return false;
return isInlineConstant(MO.getImm(), OperandType);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index ab7d340..9e1951e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2850,6 +2850,7 @@ def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;
+def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2a6fcad..991d9f8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3427,30 +3427,32 @@ def : GCNPat <
(S_LSHL_B32 SReg_32:$src1, (i16 16))
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1))
>;
-
def : GCNPat <
- (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
- (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+ (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
+ (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
def : GCNPat <
- (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
- (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
+ (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
+}
def : GCNPat <
- (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
def : GCNPat <
- (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
- (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
foreach vecTy = [v2i16, v2f16, v2bf16] in {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 75ce67c..8c2e9b62 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };
using namespace llvm;
+// TODO -- delete this flag once we have more robust mechanisms to allocate the
+// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
+// where it is better to produce the VGPR form (e.g. if there are VGPR users
+// of the MFMA result).
+cl::opt<bool> MFMAVGPRForm(
+ "amdgpu-mfma-vgpr-form", cl::Hidden,
+ cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
+ "unspecified, default to compiler heuristics"),
+ cl::init(false));
+
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
@@ -69,8 +79,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts();
- if (ST.hasGFX90AInsts() &&
+ MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
+ if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
!mayUseAGPRs(F))
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9173041..fa2b8db 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4052,11 +4052,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
return 0;
}
-unsigned
-SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
- const TargetRegisterClass &RC) const {
+unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass &RC,
+ bool IncludeCalls) const {
for (MCPhysReg Reg : reverse(RC.getRegisters()))
- if (MRI.isPhysRegUsed(Reg))
+ if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
return getHWRegIndex(Reg) + 1;
return 0;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 06a7a17..0008e5f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -486,9 +486,11 @@ public:
unsigned SubReg) const;
// \returns a number of registers of a given \p RC used in a function.
- // Does not go inside function calls.
+ // Does not go inside function calls. If \p IncludeCalls is true, registers
+ // that may be clobbered by call-site register masks are also counted.
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
- const TargetRegisterClass &RC) const;
+ const TargetRegisterClass &RC,
+ bool IncludeCalls = true) const;
std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
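
For reference, a hypothetical call site using the new parameter (a sketch only; the helper below is not part of the patch and assumes it is built inside the AMDGPU backend where these headers are available):

    #include "SIRegisterInfo.h"
    #include "llvm/CodeGen/MachineFunction.h"

    // Count the VGPRs this function itself defines or reads, ignoring registers
    // that are only clobbered through call-site register masks.
    static unsigned countDirectlyUsedVGPRs(const llvm::MachineFunction &MF,
                                           const llvm::SIRegisterInfo &TRI) {
      return TRI.getNumUsedPhysRegs(MF.getRegInfo(),
                                    llvm::AMDGPU::VGPR_32RegClass,
                                    /*IncludeCalls=*/false);
    }
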
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 37dcc100..d8b52d2 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -87,7 +87,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
bits<7> sdst;
bits<32> offset;
bits<8> soffset;
- bits<5> cpol;
+ bits<12> cpol;
}
class OffsetMode<bit hasOffset, bit hasSOffset, string variant,
@@ -1485,6 +1485,7 @@ class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offs
RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
+ let Inst{20} = cpol{CPolBit.NV}; // non-volatile
let Inst{22-21} = cpol{4-3}; // scope
let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
}
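
Since CPolBit values are bit indices into the cpol operand (widened to 12 bits above) rather than masks, the new NV line extracts cpol bit 5 and places it at instruction bit 20. A standalone C++ sketch of that mapping (illustration only, not generated code):

    #include <cstdint>

    // Bit position of the non-volatile flag within the cpol operand,
    // mirroring CPolBit.NV = 5 from SIInstrFormats.td.
    constexpr unsigned CPolBitNV = 5;

    // Mirror of "let Inst{20} = cpol{CPolBit.NV}": copy cpol's NV bit into
    // bit 20 of a gfx12 SMEM encoding.
    uint64_t encodeNV(uint64_t Inst, unsigned Cpol) {
      uint64_t NV = (Cpol >> CPolBitNV) & 1;
      return Inst | (NV << 20);
    }
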
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index c91319e..f621f85 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
+
+let SubtargetPredicate = HasTanhInsts in
+defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
@@ -527,6 +530,10 @@ defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+let SubtargetPredicate = HasTanhInsts in {
+defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>;
+}
+
let SubtargetPredicate = HasBF16TransInsts in {
defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
@@ -535,6 +542,7 @@ defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>;
+defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>;
}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -795,6 +803,9 @@ let SubtargetPredicate = isGFX1250Plus in {
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>;
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_bf8, V_CVT_F16_BF8_fake16_e64, 1>;
}
+
+ defm V_SAT_PK4_I4_I8 : VOP1Inst_t16<"v_sat_pk4_i4_i8", VOP1_I16_I32, int_amdgcn_sat_pk4_i4_i8>;
+ defm V_SAT_PK4_U4_U8 : VOP1Inst_t16<"v_sat_pk4_u4_u8", VOP1_I16_I32, int_amdgcn_sat_pk4_u4_u8>;
} // End SubtargetPredicate = isGFX1250Plus
let SubtargetPredicate = isGFX10Plus in {
@@ -1072,6 +1083,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
}
+multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ def _e64_gfx1250 :
+ VOP3_Real_Gen<ps, GFX1250Gen>,
+ VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>;
+}
+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
@@ -1137,8 +1155,14 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
+defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
+defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
+defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
+defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
+defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
@@ -1149,6 +1173,7 @@ defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>;
defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>;
+defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 2b91ea7..a25ebdf 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
// Special case for v_permlane16_swap_b32/v_permlane32_swap_b32
// op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands.
-class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+class VOP3OpSelIsDPP_base {
bits<1> fi;
bits<1> bound_ctrl;
+}
+
+class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> {
+ // OPSEL[0] specifies FI
+ let Inst{11} = fi;
+ // OPSEL[1] specifies BOUND_CTRL
+ let Inst{12} = bound_ctrl;
+}
+class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> {
// OPSEL[0] specifies FI
let Inst{11} = fi;
// OPSEL[1] specifies BOUND_CTRL