Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp          | 103
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp        |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp |   9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp            |  60
4 files changed, 125 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index ef58004..9907c88f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1288,16 +1288,17 @@ static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
return std::min(MaxVirtReg + MaxPhysReg, 256u);
}
-// TODO: Migrate to range merge of amdgpu-agpr-alloc.
-struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
- using Base = StateWrapper<BooleanState, AbstractAttribute>;
- AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+struct AAAMDGPUMinAGPRAlloc
+ : public StateWrapper<DecIntegerState<>, AbstractAttribute> {
+ using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
+ AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
- static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
- Attributor &A) {
+ static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
+ Attributor &A) {
if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
- return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
- llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
+ return *new (A.Allocator) AAAMDGPUMinAGPRAlloc(IRP, A);
+ llvm_unreachable(
+ "AAAMDGPUMinAGPRAlloc is only valid for function position");
}
void initialize(Attributor &A) override {
@@ -1310,25 +1311,33 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
}
const std::string getAsStr(Attributor *A) const override {
- return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
+ std::string Str = "amdgpu-agpr-alloc=";
+ raw_string_ostream OS(Str);
+ OS << getAssumed();
+ return OS.str();
}
void trackStatistics() const override {}
ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
+ DecIntegerState<> Maximum;
- auto CheckForNoAGPRs = [&](Instruction &I) {
+ // Check for cases which require allocation of AGPRs. AGPRs are only
+ // required when there are direct references to them, i.e., in inline
+ // assembly and certain intrinsics.
+ auto CheckForMinAGPRAllocs = [&](Instruction &I) {
const auto &CB = cast<CallBase>(I);
const Value *CalleeOp = CB.getCalledOperand();
- const Function *Callee = dyn_cast<Function>(CalleeOp);
- if (!Callee) {
- if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
- return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
- return false;
+
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp)) {
+ // Technically, the inline asm could call an unknown external function
+ // that requires AGPRs, but ignore that possibility.
+ unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
+ Maximum.takeAssumedMaximum(NumRegs);
+ return true;
}
- switch (Callee->getIntrinsicID()) {
+ switch (CB.getIntrinsicID()) {
case Intrinsic::not_intrinsic:
break;
case Intrinsic::write_register:
@@ -1340,7 +1349,10 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
->getOperand(0));
auto [Kind, RegIdx, NumRegs] =
AMDGPU::parseAsmPhysRegName(RegName->getString());
- return Kind != 'a';
+ if (Kind == 'a')
+ Maximum.takeAssumedMaximum(std::min(RegIdx + NumRegs, 256u));
+
+ return true;
}
default:
// Some intrinsics may use AGPRs, but if we have a choice, we are not
@@ -1349,32 +1361,50 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
}
// TODO: Handle callsite attributes
- const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
- *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
- return CalleeInfo && CalleeInfo->isValidState() &&
- CalleeInfo->getAssumed();
+ auto *CBEdges = A.getAAFor<AACallEdges>(
+ *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
+ if (!CBEdges || CBEdges->hasUnknownCallee()) {
+ Maximum.indicatePessimisticFixpoint();
+ return false;
+ }
+
+ for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
+ const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
+ *this, IRPosition::function(*PossibleCallee), DepClassTy::REQUIRED);
+ if (!CalleeInfo || !CalleeInfo->isValidState()) {
+ Maximum.indicatePessimisticFixpoint();
+ return false;
+ }
+
+ Maximum.takeAssumedMaximum(CalleeInfo->getAssumed());
+ }
+
+ return true;
};
bool UsedAssumedInformation = false;
- if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
+ if (!A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *this,
UsedAssumedInformation))
return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
+
+ return clampStateAndIndicateChange(getState(), Maximum);
}
ChangeStatus manifest(Attributor &A) override {
- if (!getAssumed())
- return ChangeStatus::UNCHANGED;
LLVMContext &Ctx = getAssociatedFunction()->getContext();
- return A.manifestAttrs(getIRPosition(),
- {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
+ SmallString<4> Buffer;
+ raw_svector_ostream OS(Buffer);
+ OS << getAssumed();
+
+ return A.manifestAttrs(
+ getIRPosition(), {Attribute::get(Ctx, "amdgpu-agpr-alloc", OS.str())});
}
- StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
+ StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }
const char *getIdAddr() const override { return &ID; }
/// This function should return true if the type of the \p AA is
- /// AAAMDGPUNoAGPRs
+ /// AAAMDGPUMinAGPRAlloc
static bool classof(const AbstractAttribute *AA) {
return (AA->getIdAddr() == &ID);
}
@@ -1382,7 +1412,7 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
static const char ID;
};
-const char AAAMDGPUNoAGPR::ID = 0;
+const char AAAMDGPUMinAGPRAlloc::ID = 0;
/// An abstract attribute to propagate the function attribute
/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
@@ -1550,10 +1580,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
- &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
- &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
- &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
- &AAIndirectCallInfo::ID, &AAAMDGPUClusterDims::ID});
+ &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
+ &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
+ &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
+ &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
+ &AAAMDGPUClusterDims::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1595,7 +1626,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
if (ST.hasGFX90AInsts())
- A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
+ A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRPosition::function(*F));
for (auto &I : instructions(F)) {
Value *Ptr = nullptr;
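The shape of the new attribute is worth restating: the state is a decreasing integer, the minimum number of AGPRs the function must be able to allocate, manifested as "amdgpu-agpr-alloc=<N>". Every demand source (inline asm operands, read/write_register of a physical AGPR, each possible callee) widens the assumed value with takeAssumedMaximum, and any unknown call edge collapses to the pessimistic fixpoint. Below is a minimal sketch of that merge logic outside the Attributor framework, with hypothetical names (MinAGPRState, mergeCallees), assuming the 256-register architectural cap used above:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct MinAGPRState {
  static constexpr uint32_t Pessimistic = 256; // architectural AGPR limit
  uint32_t Assumed = 0;                        // optimistic: no AGPRs needed

  // Mirrors DecIntegerState::takeAssumedMaximum: only ever widen, capped.
  void takeMaximum(uint32_t N) {
    Assumed = std::min(std::max(Assumed, N), Pessimistic);
  }
  void indicatePessimisticFixpoint() { Assumed = Pessimistic; }
};

// A call site's requirement is the maximum over all possible callees; an
// unknown callee forces the pessimistic fixpoint, as the
// hasUnknownCallee() path does in the patch.
MinAGPRState mergeCallees(const std::vector<const MinAGPRState *> &Callees,
                          bool HasUnknownCallee) {
  MinAGPRState S;
  if (HasUnknownCallee) {
    S.indicatePessimisticFixpoint();
    return S;
  }
  for (const MinAGPRState *C : Callees)
    S.takeMaximum(C->Assumed);
  return S;
}
```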
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e4d328a..b8b419d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1112,8 +1112,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
{N->getOperand(0), N->getOperand(1),
CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
} else {
- unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
- : AMDGPU::S_USUBO_PSEUDO;
+ unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
{N->getOperand(0), N->getOperand(1)});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index fedb694..89c16da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -482,12 +482,13 @@ void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
}
sort(StackIntervals, [](const LiveInterval *A, const LiveInterval *B) {
+ // The comparator must define a strict weak ordering.
/// Sort heaviest intervals first to prioritize their unspilling
- if (A->weight() > B->weight())
- return true;
+ if (A->weight() != B->weight())
+ return A->weight() > B->weight();
- if (A->getSize() > B->getSize())
- return true;
+ if (A->getSize() != B->getSize())
+ return A->getSize() > B->getSize();
// Tie-break by stack slot index to avoid needing a stable sort
return A->reg().stackSlotIndex() < B->reg().stackSlotIndex();
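The comparator fix deserves a standalone illustration: returning true independently for each "greater" field is not a strict weak ordering, and llvm::sort (like std::sort) has undefined behavior with such a comparator. With A = {weight 2, size 1} and B = {weight 1, size 2}, the old form answers true for both cmp(A, B) and cmp(B, A). A minimal sketch, where Item is a hypothetical stand-in for LiveInterval:

```cpp
#include <algorithm>
#include <vector>

struct Item { double Weight; unsigned Size; int Slot; };

// The old pattern: falls through to the next field even when the first
// field compares *less*, so cmp(A, B) and cmp(B, A) can both be true.
bool brokenCmp(const Item &A, const Item &B) {
  if (A.Weight > B.Weight) return true;
  if (A.Size > B.Size) return true;
  return A.Slot < B.Slot;
}

// The fixed pattern: decide on the first field that differs.
bool fixedCmp(const Item &A, const Item &B) {
  if (A.Weight != B.Weight) return A.Weight > B.Weight; // heaviest first
  if (A.Size != B.Size) return A.Size > B.Size;         // then largest
  return A.Slot < B.Slot;                               // deterministic tie-break
}

int main() {
  std::vector<Item> V = {{1.0, 2, 0}, {2.0, 1, 1}};
  std::sort(V.begin(), V.end(), fixedCmp); // safe: strict weak ordering
}
```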
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1a686a9..730be69 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6073,9 +6073,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineOperand &Src0 = MI.getOperand(2);
MachineOperand &Src1 = MI.getOperand(3);
MachineOperand &Src2 = MI.getOperand(4);
- unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
- ? AMDGPU::S_ADDC_U32
- : AMDGPU::S_SUBB_U32;
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
@@ -6124,11 +6121,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addImm(0);
}
- // clang-format off
- BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
- .add(Src0)
- .add(Src1);
- // clang-format on
+ unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
+ ? AMDGPU::S_ADDC_U32
+ : AMDGPU::S_SUBB_U32;
+
+ BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
unsigned SelOpc =
ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
@@ -16571,6 +16568,53 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
}
+ // Eliminate the setcc by using the carryout from the add/sub instruction:
+
+ //   LHS = ADD i64 RHS, Z         LHSlo = UADDO       i32 RHSlo, Zlo
+ //   setcc LHS ult RHS      ->    LHShi = UADDO_CARRY i32 RHShi, Zhi
+ //   (similarly for subtraction)
+
+ //   LHS = ADD i64 Y, 1           LHSlo = UADDO       i32 Ylo, 1
+ //   setcc LHS eq 0         ->    LHShi = UADDO_CARRY i32 Yhi, 0
+
+ if (VT == MVT::i64 && ((CC == ISD::SETULT &&
+ sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
+ (CC == ISD::SETUGT &&
+ sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
+ (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
+ sd_match(LHS, m_Add(m_Value(), m_One()))))) {
+ bool IsAdd = LHS.getOpcode() == ISD::ADD;
+
+ SDValue Op0 = LHS.getOperand(0);
+ SDValue Op1 = LHS.getOperand(1);
+
+ SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
+ SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
+
+ SDValue Op0Hi = getHiHalf64(Op0, DAG);
+ SDValue Op1Hi = getHiHalf64(Op1, DAG);
+
+ SDValue NodeLo =
+ DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
+ DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
+
+ SDValue CarryInHi = NodeLo.getValue(1);
+ SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
+ SL, DAG.getVTList(MVT::i32, MVT::i1),
+ {Op0Hi, Op1Hi, CarryInHi});
+
+ SDValue ResultLo = NodeLo.getValue(0);
+ SDValue ResultHi = NodeHi.getValue(0);
+
+ SDValue JoinedResult =
+ DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
+
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
+ SDValue Overflow = NodeHi.getValue(1);
+ DCI.CombineTo(LHS.getNode(), Result);
+ return Overflow;
+ }
+
if (VT != MVT::f32 && VT != MVT::f64 &&
(!Subtarget->has16BitInsts() || VT != MVT::f16))
return SDValue();
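The identity behind this combine: for unsigned values, x + y < x holds exactly when the 64-bit addition produces a carry-out, and that carry can be recomputed from a 32-bit UADDO / UADDO_CARRY chain over the two halves, which is what the code above builds. A quick standalone check of the equivalence in plain C++ (names are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Recompute the carry-out of a 64-bit add from its 32-bit halves, the
// way the UADDO / UADDO_CARRY pair does.
bool carryOutViaHalves(uint64_t X, uint64_t Y) {
  uint32_t XLo = uint32_t(X), XHi = uint32_t(X >> 32);
  uint32_t YLo = uint32_t(Y), YHi = uint32_t(Y >> 32);

  uint32_t Lo = XLo + YLo;
  bool CarryLo = Lo < XLo; // UADDO: carry out of the low half

  uint32_t Hi = XHi + YHi + CarryLo; // UADDO_CARRY: carry in and out
  bool CarryHi = Hi < XHi || (CarryLo && Hi == XHi);
  return CarryHi;
}

int main() {
  for (uint64_t X : {0ull, 1ull, ~0ull, 0xFFFFFFFFull, 0x8000000000000000ull})
    for (uint64_t Y : {0ull, 1ull, ~0ull, 0xFFFFFFFFull})
      assert(carryOutViaHalves(X, Y) == (X + Y < X)); // the setcc ult form
}
```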