diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 28 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 159 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 8 |
8 files changed, 203 insertions, 13 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 0f2c335..ce2b4a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,6 +562,11 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +struct AMDGPUUniformIntrinsicCombinePass + : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 9449e70..a6074ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) +MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 557d87f..56807a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // // vdst, srcA, srcB, srcC const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + bool UseAGPRForm = !Subtarget.hasGFX90AInsts() || + Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: @@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: { + Register DstReg = MI.getOperand(0).getReg(); + unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + // vdst, srcA, srcB, srcC, idx - OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI) + : getVGPROpMapping(DstReg, MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); - OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[4] = + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c7a91f4c..4958a20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption( cl::desc("Whether has closed-world assumption at link time"), cl::init(false), cl::Hidden); +static cl::opt<bool> EnableUniformIntrinsicCombine( + "amdgpu-enable-uniform-intrinsic-combine", + cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheR600Target()); @@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); + + if (EnableUniformIntrinsicCombine) + PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp new file mode 100644 index 0000000..50c78d8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -0,0 +1,159 @@ +//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass simplifies certain intrinsic calls when the arguments are uniform. +/// It's true that this pass has transforms that can lead to a situation where +/// some instruction whose operand was previously recognized as statically +/// uniform is later on no longer recognized as statically uniform. However, the +/// semantics of how programs execute don't (and must not, for this precise +/// reason) care about static uniformity, they only ever care about dynamic +/// uniformity. And every instruction that's downstream and cares about dynamic +/// uniformity must be convergent (and isel will introduce v_readfirstlane for +/// them if their operands can't be proven statically uniform). +/// +/// This pass is implemented as a ModulePass because intrinsic declarations +/// exist at the module scope, allowing us to skip processing entirely if no +/// declarations are present and to traverse their user lists directly when +/// they are. A FunctionPass would instead require scanning every instruction +/// in every function to find relevant intrinsics, which is far less efficient. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine" + +using namespace llvm; +using namespace llvm::AMDGPU; +using namespace llvm::PatternMatch; + +/// Wrapper for querying uniformity info that first checks locally tracked +/// instructions. +static bool +isDivergentUseWithNew(const Use &U, const UniformityInfo &UI, + const ValueMap<const Value *, bool> &Tracker) { + Value *V = U.get(); + if (auto It = Tracker.find(V); It != Tracker.end()) + return !It->second; // divergent if marked false + return UI.isDivergentUse(U); +} + +/// Optimizes uniform intrinsics calls if their operand can be proven uniform. +static bool optimizeUniformIntrinsic(IntrinsicInst &II, + const UniformityInfo &UI, + ValueMap<const Value *, bool> &Tracker) { + llvm::Intrinsic::ID IID = II.getIntrinsicID(); + + switch (IID) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n'); + II.replaceAllUsesWith(Src); + II.eraseFromParent(); + return true; + } + case Intrinsic::amdgcn_ballot: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n'); + + bool Changed = false; + for (User *U : make_early_inc_range(II.users())) { + if (auto *ICmp = dyn_cast<ICmpInst>(U)) { + Value *Op0 = ICmp->getOperand(0); + Value *Op1 = ICmp->getOperand(1); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + Value *OtherOp = Op0 == &II ? Op1 : Op0; + + if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) { + // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1 + Instruction *NotOp = + BinaryOperator::CreateNot(Src, "", ICmp->getIterator()); + Tracker[NotOp] = true; // NOT preserves uniformity + LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); + ICmp->replaceAllUsesWith(NotOp); + ICmp->eraseFromParent(); + Changed = true; + } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { + // Case: (icmp ne %ballot, 0) -> %ballot_arg + LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " + << *Src << '\n'); + ICmp->replaceAllUsesWith(Src); + ICmp->eraseFromParent(); + Changed = true; + } + } + } + // Erase the intrinsic if it has no remaining uses. + if (II.use_empty()) + II.eraseFromParent(); + return Changed; + } + default: + llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + } + return false; +} + +/// Iterates over intrinsic declarations in the module to optimize their uses. +static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { + bool IsChanged = false; + ValueMap<const Value *, bool> Tracker; + + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + switch (F.getIntrinsicID()) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_ballot: + break; + default: + continue; + } + + for (User *U : make_early_inc_range(F.users())) { + auto *II = cast<IntrinsicInst>(U); + Function *ParentF = II->getFunction(); + const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); + } + } + return IsChanged; +} + +PreservedAnalyses +AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { + if (!runUniformIntrinsicCombine(M, AM)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<UniformityInfoAnalysis>(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index aae56ee..13f727b68 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp + AMDGPUUniformIntrinsicCombine.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUISelDAGToDAG.cpp diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index b7dbb59..2c1a13c 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1202,6 +1202,12 @@ public: unsigned getMinNumAGPRs() const { return MinNumAGPRs; } + /// Return true if an MFMA that requires at least \p NumRegs should select to + /// the AGPR form, instead of the VGPR form. + bool selectAGPRFormMFMA(unsigned NumRegs) const { + return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs; + } + // \returns true if a function has a use of AGPRs via inline asm or // has a call which may use it. bool mayUseAGPRs(const Function &F) const; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7cfd059..6500fce 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa class CanUseAGPR_MAI<ValueType vt> { code PredicateCode = [{ return !Subtarget->hasGFX90AInsts() || - (!SIMachineFunctionInfo::MFMAVGPRForm && - MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= - }] # !srl(vt.Size, 5) # ");"; + MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA( + }] # !srl(vt.Size, 5) # ");"; code GISelPredicateCode = [{ return !Subtarget->hasGFX90AInsts() || - (!SIMachineFunctionInfo::MFMAVGPRForm && - MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA( }] # !srl(vt.Size, 5) # ");"; } |