Diffstat (limited to 'llvm/lib/Target/AMDGPU')
 llvm/lib/Target/AMDGPU/AMDGPU.h                          |   5 +
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def            |   1 +
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp        |  28 +++---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp           |   8 ++
 llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 160 +++++++++++++
 llvm/lib/Target/AMDGPU/CMakeLists.txt                    |   1 +
 llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h           |   6 ++
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td              |   8 +-
 8 files changed, 204 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0f2c335..ce2b4a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -562,6 +562,11 @@ public:
 void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
 extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
 
+struct AMDGPUUniformIntrinsicCombinePass
+    : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 9449e70..a6074ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
+MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
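
With the pass registered under this name, it can be exercised in isolation through opt (a minimal sketch, assuming an opt build that links the AMDGPU target; the input file name is illustrative):

  opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-uniform-intrinsic-combine \
      -S uniform-intrinsics.ll
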
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 557d87f..56807a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       //
       // vdst, srcA, srcB, srcC
       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+      bool UseAGPRForm = !Subtarget.hasGFX90AInsts() ||
+                         Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
       OpdsMapping[0] =
-          Info->getMinNumAGPRs() >= MinNumRegsRequired
-              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
-              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+          UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+                      : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
       OpdsMapping[4] =
-          Info->getMinNumAGPRs() >= MinNumRegsRequired
-              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
-              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+          UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+                      : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
       break;
     }
     case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
@@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
     case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
     case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
+      Register DstReg = MI.getOperand(0).getReg();
+      unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+      unsigned MinNumRegsRequired = DstSize / 32;
+      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+      bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
       // vdst, srcA, srcB, srcC, idx
-      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI)
+                                   : getVGPROpMapping(DstReg, MRI, *TRI);
+
       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
-      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+      OpdsMapping[4] =
+          UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+                      : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
       break;
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c7a91f4c..4958a20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> EnableUniformIntrinsicCombine(
+ "amdgpu-enable-uniform-intrinsic-combine",
+ cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
+ cl::init(true), cl::Hidden);
+
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         if (EarlyInlineAll && !EnableFunctionCalls)
           PM.addPass(AMDGPUAlwaysInlinePass());
+
+        if (EnableUniformIntrinsicCombine)
+          PM.addPass(AMDGPUUniformIntrinsicCombinePass());
       });
 
   PB.registerPeepholeEPCallback(
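
Since the combine defaults to enabled and is added from this pass-builder callback alongside the early-inlining passes, opting out of it in an otherwise default pipeline goes through the hidden flag (a sketch; assumes the tool links the AMDGPU target so the flag and the callback are registered):

  opt -mtriple=amdgcn-amd-amdhsa -passes='default<O2>' \
      -amdgpu-enable-uniform-intrinsic-combine=0 -S in.ll
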
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
new file mode 100644
index 0000000..50c78d8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -0,0 +1,160 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass simplifies certain intrinsic calls when their arguments can be
+/// proven uniform. Its transforms can leave an instruction whose operand was
+/// previously recognized as statically uniform no longer recognized as such.
+/// However, the semantics of program execution don't (and must not, for
+/// precisely this reason) depend on static uniformity; they only ever depend
+/// on dynamic uniformity. Every downstream instruction that cares about
+/// dynamic uniformity must be convergent, and isel will introduce
+/// v_readfirstlane for such instructions if their operands can't be proven
+/// statically uniform.
+///
+/// This pass is implemented as a ModulePass because intrinsic declarations
+/// exist at the module scope, allowing us to skip processing entirely if no
+/// declarations are present and to traverse their user lists directly when
+/// they are. A FunctionPass would instead require scanning every instruction
+/// in every function to find relevant intrinsics, which is far less efficient.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+/// Wrapper for querying uniformity info that first consults values created by
+/// this pass (tracked locally) before falling back to the analysis.
+static bool
+isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
+                      const ValueMap<const Value *, bool> &Tracker) {
+  Value *V = U.get();
+  if (auto It = Tracker.find(V); It != Tracker.end())
+    return !It->second; // divergent if marked false
+  return UI.isDivergentUse(U);
+}
+
+/// Optimizes intrinsic calls whose operands can be proven uniform.
+static bool optimizeUniformIntrinsic(IntrinsicInst &II,
+                                     const UniformityInfo &UI,
+                                     ValueMap<const Value *, bool> &Tracker) {
+  Intrinsic::ID IID = II.getIntrinsicID();
+
+  switch (IID) {
+  case Intrinsic::amdgcn_permlane64:
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readlane: {
+    Value *Src = II.getArgOperand(0);
+    if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
+      return false;
+    LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
+    II.replaceAllUsesWith(Src);
+    II.eraseFromParent();
+    return true;
+  }
+  case Intrinsic::amdgcn_ballot: {
+    Value *Src = II.getArgOperand(0);
+    if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
+      return false;
+    LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
+
+    bool Changed = false;
+    for (User *U : make_early_inc_range(II.users())) {
+      if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
+        Value *Op0 = ICmp->getOperand(0);
+        Value *Op1 = ICmp->getOperand(1);
+        ICmpInst::Predicate Pred = ICmp->getPredicate();
+        Value *OtherOp = Op0 == &II ? Op1 : Op0;
+
+        if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
+          // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, true
+          Instruction *NotOp =
+              BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+          Tracker[NotOp] = true; // NOT preserves uniformity
+          LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
+          ICmp->replaceAllUsesWith(NotOp);
+          ICmp->eraseFromParent();
+          Changed = true;
+        } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
+          // Case: (icmp ne %ballot, 0) -> %ballot_arg
+          LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
+                            << *Src << '\n');
+          ICmp->replaceAllUsesWith(Src);
+          ICmp->eraseFromParent();
+          Changed = true;
+        }
+      }
+    }
+    // Erase the intrinsic if it has no remaining uses.
+    if (II.use_empty())
+      II.eraseFromParent();
+    return Changed;
+  }
+  default:
+    llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+  }
+  return false;
+}
+
+/// Iterates over intrinsic declarations in the module to optimize their uses.
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+  bool IsChanged = false;
+  ValueMap<const Value *, bool> Tracker;
+
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  for (Function &F : M) {
+    switch (F.getIntrinsicID()) {
+    case Intrinsic::amdgcn_permlane64:
+    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_ballot:
+      break;
+    default:
+      continue;
+    }
+
+    for (User *U : make_early_inc_range(F.users())) {
+      auto *II = cast<IntrinsicInst>(U);
+      Function *ParentF = II->getFunction();
+      const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
+      IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
+    }
+  }
+  return IsChanged;
+}
+
+PreservedAnalyses
+AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
+  if (!runUniformIntrinsicCombine(M, AM))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<UniformityInfoAnalysis>();
+  return PA;
+}
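
For reference, a minimal LLVM IR sketch of the folds this pass performs (a hypothetical test input written for this note; function and value names are illustrative, not taken from the patch):

  declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
  declare i64 @llvm.amdgcn.ballot.i64(i1)

  define amdgpu_kernel void @example(ptr addrspace(1) %out, i32 %v) {
    ; Kernel arguments are uniform, so the readfirstlane call folds to %v.
    %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v)
    ; A ballot of a uniform condition compared against zero folds to the
    ; condition itself (icmp ne) or to its negation (icmp eq).
    %cond = icmp sgt i32 %v, 0
    %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
    %any = icmp ne i64 %ballot, 0 ; becomes %cond
    %res = select i1 %any, i32 %rfl, i32 0
    store i32 %res, ptr addrspace(1) %out
    ret void
  }
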
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index aae56ee..13f727b68 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
+  AMDGPUUniformIntrinsicCombine.cpp
   AMDGPUInstrInfo.cpp
   AMDGPUInstructionSelector.cpp
   AMDGPUISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index b7dbb59..2c1a13c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1202,6 +1202,12 @@ public:
   unsigned getMinNumAGPRs() const { return MinNumAGPRs; }
 
+  /// Return true if an MFMA that requires at least \p NumRegs should select
+  /// the AGPR form instead of the VGPR form.
+  bool selectAGPRFormMFMA(unsigned NumRegs) const {
+    return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs;
+  }
+
   // \returns true if a function has a use of AGPRs via inline asm or
   // has a call which may use it.
   bool mayUseAGPRs(const Function &F) const;
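
As a worked example of the register-count arithmetic that callers feed into this helper (a sketch with illustrative numbers, not code from the patch):

  // A v16f32 MFMA accumulator is 512 bits wide, i.e. 512 / 32 = 16 32-bit
  // registers -- the same DstSize / 32 computed in AMDGPURegisterBankInfo
  // above and the !srl(vt.Size, 5) used in the TableGen predicate below.
  unsigned MinNumRegsRequired = 512 / 32; // 16
  // The AGPR form wins only when the VGPR-form override is off and the
  // function's AGPR budget covers the accumulator.
  bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
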
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 7cfd059..6500fce 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa
 class CanUseAGPR_MAI<ValueType vt> {
   code PredicateCode = [{
     return !Subtarget->hasGFX90AInsts() ||
-           (!SIMachineFunctionInfo::MFMAVGPRForm &&
-            MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
-   }] # !srl(vt.Size, 5) # ");";
+           MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
+  }] # !srl(vt.Size, 5) # ");";
 
   code GISelPredicateCode = [{
     return !Subtarget->hasGFX90AInsts() ||
-           (!SIMachineFunctionInfo::MFMAVGPRForm &&
-            MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
+           MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
   }] # !srl(vt.Size, 5) # ");";
 }
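
After TableGen string concatenation, the predicate emitted for a 1024-bit accumulator type (vt.Size = 1024, so !srl(1024, 5) = 32) reads as follows; the expansion is shown purely for illustration:

  return !Subtarget->hasGFX90AInsts() ||
         MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(32);
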