Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPU.h                              5
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp                103
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp                3
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def                1
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp         9
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp               8
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp   159
-rw-r--r--   llvm/lib/Target/AMDGPU/CMakeLists.txt                        1
-rw-r--r--   llvm/lib/Target/AMDGPU/SIISelLowering.cpp                   60
9 files changed, 299 insertions, 50 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0f2c335..ce2b4a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -562,6 +562,11 @@ public:
 void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
 extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
 
+struct AMDGPUUniformIntrinsicCombinePass
+    : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index ef58004..9907c88f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1288,16 +1288,17 @@ static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
   return std::min(MaxVirtReg + MaxPhysReg, 256u);
 }
 
-// TODO: Migrate to range merge of amdgpu-agpr-alloc.
-struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
-  using Base = StateWrapper<BooleanState, AbstractAttribute>;
-  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+struct AAAMDGPUMinAGPRAlloc
+    : public StateWrapper<DecIntegerState<>, AbstractAttribute> {
+  using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
+  AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
 
-  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
-                                           Attributor &A) {
+  static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
+                                                 Attributor &A) {
     if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
-      return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
-    llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
+      return *new (A.Allocator) AAAMDGPUMinAGPRAlloc(IRP, A);
+    llvm_unreachable(
+        "AAAMDGPUMinAGPRAlloc is only valid for function position");
   }
 
   void initialize(Attributor &A) override {
@@ -1310,25 +1311,33 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
   }
 
   const std::string getAsStr(Attributor *A) const override {
-    return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
+    std::string Str = "amdgpu-agpr-alloc=";
+    raw_string_ostream OS(Str);
+    OS << getAssumed();
+    return OS.str();
   }
 
   void trackStatistics() const override {}
 
   ChangeStatus updateImpl(Attributor &A) override {
-    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
+    DecIntegerState<> Maximum;
 
-    auto CheckForNoAGPRs = [&](Instruction &I) {
+    // Check for cases which require allocation of AGPRs. The only cases where
+    // AGPRs are required are if there are direct references to AGPRs, so
+    // inline assembly and special intrinsics.
+    auto CheckForMinAGPRAllocs = [&](Instruction &I) {
       const auto &CB = cast<CallBase>(I);
       const Value *CalleeOp = CB.getCalledOperand();
-      const Function *Callee = dyn_cast<Function>(CalleeOp);
-      if (!Callee) {
-        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
-          return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
-        return false;
+
+      if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp)) {
+        // Technically, the inline asm could be invoking a call to an unknown
+        // external function that requires AGPRs, but ignore that.
+        unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
+        Maximum.takeAssumedMaximum(NumRegs);
+        return true;
       }
 
-      switch (Callee->getIntrinsicID()) {
+      switch (CB.getIntrinsicID()) {
       case Intrinsic::not_intrinsic:
         break;
       case Intrinsic::write_register:
@@ -1340,7 +1349,10 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
             ->getOperand(0));
         auto [Kind, RegIdx, NumRegs] =
             AMDGPU::parseAsmPhysRegName(RegName->getString());
-        return Kind != 'a';
+        if (Kind == 'a')
+          Maximum.takeAssumedMaximum(std::min(RegIdx + NumRegs, 256u));
+
+        return true;
       }
       default:
         // Some intrinsics may use AGPRs, but if we have a choice, we are not
@@ -1349,32 +1361,50 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
       }
 
       // TODO: Handle callsite attributes
-      const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
-          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
-      return CalleeInfo && CalleeInfo->isValidState() &&
-             CalleeInfo->getAssumed();
+      auto *CBEdges = A.getAAFor<AACallEdges>(
+          *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
+      if (!CBEdges || CBEdges->hasUnknownCallee()) {
+        Maximum.indicatePessimisticFixpoint();
+        return false;
+      }
+
+      for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
+        const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
+            *this, IRPosition::function(*PossibleCallee), DepClassTy::REQUIRED);
+        if (!CalleeInfo || !CalleeInfo->isValidState()) {
+          Maximum.indicatePessimisticFixpoint();
+          return false;
+        }
+
+        Maximum.takeAssumedMaximum(CalleeInfo->getAssumed());
+      }
+
+      return true;
     };
 
     bool UsedAssumedInformation = false;
-    if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
+    if (!A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *this,
                                            UsedAssumedInformation))
       return indicatePessimisticFixpoint();
-    return ChangeStatus::UNCHANGED;
+
+    return clampStateAndIndicateChange(getState(), Maximum);
   }
 
   ChangeStatus manifest(Attributor &A) override {
-    if (!getAssumed())
-      return ChangeStatus::UNCHANGED;
     LLVMContext &Ctx = getAssociatedFunction()->getContext();
-    return A.manifestAttrs(getIRPosition(),
-                           {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
+    SmallString<4> Buffer;
+    raw_svector_ostream OS(Buffer);
+    OS << getAssumed();
+
+    return A.manifestAttrs(
+        getIRPosition(), {Attribute::get(Ctx, "amdgpu-agpr-alloc", OS.str())});
   }
 
-  StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
+  StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }
  const char *getIdAddr() const override { return &ID; }
 
   /// This function should return true if the type of the \p AA is
-  /// AAAMDGPUNoAGPRs
+  /// AAAMDGPUMinAGPRAllocs
   static bool classof(const AbstractAttribute *AA) {
     return (AA->getIdAddr() == &ID);
   }
@@ -1382,7 +1412,7 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
   static const char ID;
 };
 
-const char AAAMDGPUNoAGPR::ID = 0;
+const char AAAMDGPUMinAGPRAlloc::ID = 0;
 
 /// An abstract attribute to propagate the function attribute
 /// "amdgpu-cluster-dims" from kernel entry functions to device functions.
@@ -1550,10 +1580,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
   DenseSet<const char *> Allowed(
       {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
        &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
-       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
-       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
-       &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
-       &AAIndirectCallInfo::ID, &AAAMDGPUClusterDims::ID});
+       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
+       &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
+       &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
+       &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
+       &AAAMDGPUClusterDims::ID});
 
   AttributorConfig AC(CGUpdater);
   AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1595,7 +1626,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
       A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
 
     if (ST.hasGFX90AInsts())
-      A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
+      A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRPosition::function(*F));
 
     for (auto &I : instructions(F)) {
       Value *Ptr = nullptr;
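The attributor change above swaps a boolean "uses no AGPRs" query for a max-merge over every call-like instruction a function can reach: inline asm demands, read/write_register of physical a-registers, and the resolved requirements of all possible callees, with unknown callees forcing the pessimistic fixpoint. A minimal standalone sketch of that merge, assuming an optimistic start of 0 and a pessimistic cap of 256; MinAGPRAllocState and its demands are illustrative, not LLVM's DecIntegerState API:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Illustrative stand-in for the decreasing-integer lattice the attribute
// uses; names here are hypothetical, not LLVM's DecIntegerState API.
struct MinAGPRAllocState {
  static constexpr uint32_t Pessimistic = 256; // whole AGPR file
  uint32_t Assumed = 0;                        // optimistic: no AGPRs needed

  void takeAssumedMaximum(uint32_t N) { Assumed = std::max(Assumed, N); }
  void indicatePessimisticFixpoint() { Assumed = Pessimistic; }
};

int main() {
  MinAGPRAllocState State;
  // Hypothetical demands gathered from call-like instructions in one
  // function: inline asm touching a[0:3] (4 registers) and a callee whose
  // own attribute resolved to 32.
  for (uint32_t Demand : {4u, 32u})
    State.takeAssumedMaximum(Demand);
  // Would manifest as "amdgpu-agpr-alloc"="32" on the function.
  std::cout << "amdgpu-agpr-alloc=" << State.Assumed << '\n';
}

The manifested "amdgpu-agpr-alloc" attribute then records only the worst demand actually seen, instead of the old all-or-nothing "amdgpu-no-agpr" answer.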
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e4d328a..b8b419d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1112,8 +1112,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
                          {N->getOperand(0), N->getOperand(1),
                           CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
   } else {
-    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
-                                                : AMDGPU::S_USUBO_PSEUDO;
+    unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
 
     CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                          {N->getOperand(0), N->getOperand(1)});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 9449e70..a6074ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
+MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index fedb694..89c16da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -482,12 +482,13 @@ void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
   }
 
   sort(StackIntervals, [](const LiveInterval *A, const LiveInterval *B) {
+    // The ordering has to be strictly weak.
     /// Sort heaviest intervals first to prioritize their unspilling
-    if (A->weight() > B->weight())
-      return true;
+    if (A->weight() != B->weight())
+      return A->weight() > B->weight();
 
-    if (A->getSize() > B->getSize())
-      return true;
+    if (A->getSize() != B->getSize())
+      return A->getSize() > B->getSize();
 
     // Tie breaker by number to avoid need for stable sort
     return A->reg().stackSlotIndex() < B->reg().stackSlotIndex();
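The comparator fix above is subtler than it looks: the old version returned true independently for each key, so an interval with smaller weight but larger size compared "before" its peer in both directions, violating the strict weak ordering that std::sort (and llvm::sort) require, which is undefined behavior. A self-contained sketch of the corrected tiered comparison; the Interval struct is a stand-in for LiveInterval:

#include <algorithm>
#include <cstdio>
#include <vector>

struct Interval {
  float Weight;
  unsigned Size;
  int Slot;
};

int main() {
  // With the old comparator, comparing {1.0, 8} against {2.0, 4} returned
  // true in *both* directions (weight test fails, size test succeeds),
  // violating asymmetry.
  std::vector<Interval> Ints = {{1.0f, 8, 2}, {2.0f, 4, 0}, {2.0f, 4, 1}};

  std::sort(Ints.begin(), Ints.end(),
            [](const Interval &A, const Interval &B) {
              // Fall through to the next key only on a tie.
              if (A.Weight != B.Weight)
                return A.Weight > B.Weight; // heaviest first
              if (A.Size != B.Size)
                return A.Size > B.Size; // then largest
              return A.Slot < B.Slot;   // deterministic tie-break
            });

  for (const Interval &I : Ints)
    std::printf("slot %d: weight %.1f size %u\n", I.Slot, I.Weight, I.Size);
}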
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c7a91f4c..4958a20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
     cl::desc("Whether has closed-world assumption at link time"),
     cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EnableUniformIntrinsicCombine(
+    "amdgpu-enable-uniform-intrinsic-combine",
+    cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
+    cl::init(true), cl::Hidden);
+
 extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
         if (EarlyInlineAll && !EnableFunctionCalls)
           PM.addPass(AMDGPUAlwaysInlinePass());
+
+        if (EnableUniformIntrinsicCombine)
+          PM.addPass(AMDGPUUniformIntrinsicCombinePass());
       });
 
   PB.registerPeepholeEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
new file mode 100644
index 0000000..50c78d8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -0,0 +1,159 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass simplifies certain intrinsic calls when the arguments are
+/// uniform. It's true that this pass has transforms that can lead to a
+/// situation where some instruction whose operand was previously recognized
+/// as statically uniform is later on no longer recognized as statically
+/// uniform. However, the semantics of how programs execute don't (and must
+/// not, for this precise reason) care about static uniformity, they only ever
+/// care about dynamic uniformity. And every instruction that's downstream and
+/// cares about dynamic uniformity must be convergent (and isel will introduce
+/// v_readfirstlane for them if their operands can't be proven statically
+/// uniform).
+///
+/// This pass is implemented as a ModulePass because intrinsic declarations
+/// exist at the module scope, allowing us to skip processing entirely if no
+/// declarations are present and to traverse their user lists directly when
+/// they are. A FunctionPass would instead require scanning every instruction
+/// in every function to find relevant intrinsics, which is far less
+/// efficient.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+/// Wrapper for querying uniformity info that first checks locally tracked
+/// instructions.
+static bool
+isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
+                      const ValueMap<const Value *, bool> &Tracker) {
+  Value *V = U.get();
+  if (auto It = Tracker.find(V); It != Tracker.end())
+    return !It->second; // divergent if marked false
+  return UI.isDivergentUse(U);
+}
+
+/// Optimizes uniform intrinsics calls if their operand can be proven uniform.
+static bool optimizeUniformIntrinsic(IntrinsicInst &II,
+                                     const UniformityInfo &UI,
+                                     ValueMap<const Value *, bool> &Tracker) {
+  llvm::Intrinsic::ID IID = II.getIntrinsicID();
+
+  switch (IID) {
+  case Intrinsic::amdgcn_permlane64:
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readlane: {
+    Value *Src = II.getArgOperand(0);
+    if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
+      return false;
+    LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
+    II.replaceAllUsesWith(Src);
+    II.eraseFromParent();
+    return true;
+  }
+  case Intrinsic::amdgcn_ballot: {
+    Value *Src = II.getArgOperand(0);
+    if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
+      return false;
+    LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
+
+    bool Changed = false;
+    for (User *U : make_early_inc_range(II.users())) {
+      if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
+        Value *Op0 = ICmp->getOperand(0);
+        Value *Op1 = ICmp->getOperand(1);
+        ICmpInst::Predicate Pred = ICmp->getPredicate();
+        Value *OtherOp = Op0 == &II ? Op1 : Op0;
+
+        if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
+          // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
+          Instruction *NotOp =
+              BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+          Tracker[NotOp] = true; // NOT preserves uniformity
+          LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
+          ICmp->replaceAllUsesWith(NotOp);
+          ICmp->eraseFromParent();
+          Changed = true;
+        } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
+          // Case: (icmp ne %ballot, 0) -> %ballot_arg
+          LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
+                            << *Src << '\n');
+          ICmp->replaceAllUsesWith(Src);
+          ICmp->eraseFromParent();
+          Changed = true;
+        }
+      }
+    }
+    // Erase the intrinsic if it has no remaining uses.
+    if (II.use_empty())
+      II.eraseFromParent();
+    return Changed;
+  }
+  default:
+    llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+  }
+  return false;
+}
+
+/// Iterates over intrinsic declarations in the module to optimize their uses.
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+  bool IsChanged = false;
+  ValueMap<const Value *, bool> Tracker;
+
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  for (Function &F : M) {
+    switch (F.getIntrinsicID()) {
+    case Intrinsic::amdgcn_permlane64:
+    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_ballot:
+      break;
+    default:
+      continue;
+    }
+
+    for (User *U : make_early_inc_range(F.users())) {
+      auto *II = cast<IntrinsicInst>(U);
+      Function *ParentF = II->getFunction();
+      const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
+      IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
+    }
+  }
+  return IsChanged;
+}
+
+PreservedAnalyses
+AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
+  if (!runUniformIntrinsicCombine(M, AM))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<UniformityInfoAnalysis>();
+  return PA;
+}
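A toy model of why the ballot rewrites in the new pass are sound (the pass itself can be exercised standalone through the amdgpu-uniform-intrinsic-combine pipeline name registered above): for a uniform input every active lane contributes the same bit, so the mask is all-ones or all-zeros and its comparisons with zero collapse to the input or its negation. The ballot() below is a simulation under that assumption, not the real wave intrinsic:

#include <cassert>
#include <cstdint>

// WaveSize and ballot() are illustrative; a real wave may have inactive
// lanes, but a uniform input still yields a mask that is zero exactly when
// the condition is false.
constexpr unsigned WaveSize = 32;

uint64_t ballot(bool UniformCond) {
  // Every active lane contributes the same bit when the input is uniform.
  return UniformCond ? ((uint64_t(1) << WaveSize) - 1) : 0;
}

int main() {
  for (bool Cond : {false, true}) {
    assert((ballot(Cond) != 0) == Cond);  // icmp ne (ballot %c), 0 -> %c
    assert((ballot(Cond) == 0) == !Cond); // icmp eq (ballot %c), 0 -> not %c
  }
}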
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index aae56ee..13f727b68 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
+  AMDGPUUniformIntrinsicCombine.cpp
   AMDGPUInstrInfo.cpp
   AMDGPUInstructionSelector.cpp
   AMDGPUISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1a686a9..730be69 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6073,9 +6073,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MachineOperand &Src0 = MI.getOperand(2);
     MachineOperand &Src1 = MI.getOperand(3);
     MachineOperand &Src2 = MI.getOperand(4);
-    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
-                       ? AMDGPU::S_ADDC_U32
-                       : AMDGPU::S_SUBB_U32;
 
     if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
       Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
@@ -6124,11 +6121,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
           .addImm(0);
     }
 
-    // clang-format off
-    BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
-        .add(Src0)
-        .add(Src1);
-    // clang-format on
+    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
+                       ? AMDGPU::S_ADDC_U32
+                       : AMDGPU::S_SUBB_U32;
+
+    BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
 
     unsigned SelOpc =
         ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
@@ -16571,6 +16568,53 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
     }
   }
 
+  // Eliminate setcc by using carryout from add/sub instruction
+
+  // LHS = ADD i64 RHS, Z          LHSlo = UADDO       i32 RHSlo, Zlo
+  // setcc LHS ult RHS        ->   LHSHi = UADDO_CARRY i32 RHShi, Zhi
+  // similarly for subtraction
+
+  // LHS = ADD i64 Y, 1            LHSlo = UADDO       i32 Ylo, 1
+  // setcc LHS eq 0           ->   LHSHi = UADDO_CARRY i32 Yhi, 0
+
+  if (VT == MVT::i64 &&
+      ((CC == ISD::SETULT &&
+        sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
+       (CC == ISD::SETUGT &&
+        sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
+       (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
+        sd_match(LHS, m_Add(m_Value(), m_One()))))) {
+    bool IsAdd = LHS.getOpcode() == ISD::ADD;
+
+    SDValue Op0 = LHS.getOperand(0);
+    SDValue Op1 = LHS.getOperand(1);
+
+    SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
+    SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
+
+    SDValue Op0Hi = getHiHalf64(Op0, DAG);
+    SDValue Op1Hi = getHiHalf64(Op1, DAG);
+
+    SDValue NodeLo =
+        DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
+                    DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
+
+    SDValue CarryInHi = NodeLo.getValue(1);
+    SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
+                                 SL, DAG.getVTList(MVT::i32, MVT::i1),
+                                 {Op0Hi, Op1Hi, CarryInHi});
+
+    SDValue ResultLo = NodeLo.getValue(0);
+    SDValue ResultHi = NodeHi.getValue(0);
+
+    SDValue JoinedResult =
+        DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
+
+    SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
+    SDValue Overflow = NodeHi.getValue(1);
+    DCI.CombineTo(LHS.getNode(), Result);
+    return Overflow;
+  }
+
   if (VT != MVT::f32 && VT != MVT::f64 &&
       (!Subtarget->has16BitInsts() || VT != MVT::f16))
     return SDValue();
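The identity behind the new performSetCCCombine pattern can be sanity-checked in plain C++: for unsigned values, (rhs + z) u< rhs holds exactly when the 64-bit add carries out, and that carry is reproduced by the chained 32-bit UADDO / UADDO_CARRY split the combine emits. A hedged sketch of the addition case, not the DAG code:

#include <cassert>
#include <cstdint>

// Mirrors the UADDO / UADDO_CARRY expansion: add the low halves, feed the
// carry into the high-half add, and report the final carry-out.
bool carryOutViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) + uint32_t(B);
  bool CarryLo = Lo < uint32_t(A); // UADDO's overflow bit
  uint64_t HiWide = uint64_t(uint32_t(A >> 32)) + uint32_t(B >> 32) + CarryLo;
  return (HiWide >> 32) != 0; // UADDO_CARRY's carry-out
}

int main() {
  const uint64_t Samples[] = {0, 1, 0x7FFFFFFFu, 0xFFFFFFFFu,
                              0x8000000000000000u, ~uint64_t(0)};
  for (uint64_t R : Samples) {
    // setcc (add r, z), r, ult == carry-out of the 64-bit add.
    for (uint64_t Z : Samples)
      assert(((R + Z) < R) == carryOutViaHalves(R, Z));
    // setcc (add y, 1), 0, eq: y + 1 wraps to zero exactly when it carries.
    assert(((R + 1) == 0) == carryOutViaHalves(R, 1));
  }
}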