author    NAKAMURA Takumi <geek4civic@gmail.com>  2025-01-09 17:16:04 +0900
committer NAKAMURA Takumi <geek4civic@gmail.com>  2025-01-09 17:16:04 +0900
commit    0aa930a41f2d1ebf1fa90ec42da8f96d15a4dcbb (patch)
tree      6a77b463f700e090df586672c26b9fe765fd115b /llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent    ec6892d1c979ce0b84c86918d5cdbb03037b409a (diff)
parent    6d16b1c5c468a79ecf867293023c89ac518ecdda (diff)
Merge branch 'users/chapuni/cov/single/pair' into users/chapuni/cov/single/nextcount-base
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 114
1 file changed, 82 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 58b061f..0ac84f4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4017,29 +4017,26 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
-// except for stack growth direction(default: downwards, AMDGPU: upwards) and
-// applying the wave size scale to the increment amount.
-SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
- SelectionDAG &DAG) const {
+// except for:
+// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
+// 2. Scaling the increment: scale = wave-reduction(alloca-size) * wave-size
+SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
SDLoc dl(Op);
EVT VT = Op.getValueType();
- SDValue Tmp1 = Op;
- SDValue Tmp2 = Op.getValue(1);
- SDValue Tmp3 = Op.getOperand(2);
- SDValue Chain = Tmp1.getOperand(0);
-
+ SDValue Chain = Op.getOperand(0);
Register SPReg = Info->getStackPtrOffsetReg();
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
- SDValue Size = Tmp2.getOperand(1);
+ SDValue Size = Op.getOperand(1);
SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
- Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
+ Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
@@ -4057,30 +4054,36 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
DAG.getSignedConstant(-ScaledAlignment, dl, VT));
}
- SDValue ScaledSize = DAG.getNode(
- ISD::SHL, dl, VT, Size,
- DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
-
- SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
+ assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
+ SDValue NewSP;
+ if (isa<ConstantSDNode>(Size)) {
+ // For a constant-sized alloca, scale the alloca size by the wave size.
+ SDValue ScaledSize = DAG.getNode(
+ ISD::SHL, dl, VT, Size,
+ DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+ NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
+ } else {
+ // For a dynamic-sized alloca, perform a wave-wide reduction to get the
+ // max of the (divergent) alloca size, then scale it by the wave size.
+ SDValue WaveReduction =
+ DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
+ Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
+ Size, DAG.getConstant(0, dl, MVT::i32));
+ SDValue ScaledSize = DAG.getNode(
+ ISD::SHL, dl, VT, Size,
+ DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+ NewSP =
+ DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
+ SDValue ReadFirstLaneID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
+ NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
+ NewSP);
+ }
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
- Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+ SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
- return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
-}
-
-SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
- SelectionDAG &DAG) const {
- // We only handle constant sizes here to allow non-entry block, static sized
- // allocas. A truly dynamic value is more difficult to support because we
- // don't know if the size value is uniform or not. If the size isn't uniform,
- // we would need to do a wave reduction to get the maximum size to know how
- // much to increment the uniform stack pointer.
- SDValue Size = Op.getOperand(1);
- if (isa<ConstantSDNode>(Size))
- return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
-
- return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
+ return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
}
SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
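The lowering above reduces to a small piece of pointer arithmetic. Below is a minimal standalone sketch of it, not LLVM API code: the helper name, the wave64 assumption, and the plain integer types are illustrative stand-ins for the SelectionDAG nodes the patch actually builds (amdgcn_wave_reduce_umax, shl, add, amdgcn_readfirstlane).

#include <algorithm>
#include <cstdint>
#include <vector>

constexpr unsigned WavefrontSizeLog2 = 6; // assumption: a wave64 target

struct AllocaResult {
  uint32_t BaseAddr; // value the alloca returns (old, aligned SP)
  uint32_t NewSP;    // uniform stack pointer after the bump
};

// Hypothetical helper modelling the emitted computation; not patch code.
// ScaledAlignment must be a power of two; LaneSizes must be non-empty.
AllocaResult lowerDynamicAlloca(uint32_t SP, uint32_t ScaledAlignment,
                                const std::vector<uint32_t> &LaneSizes) {
  // AMDGPU's stack grows upward, so align the SP up, not down
  // (mirrors the AND with -ScaledAlignment in the patch).
  uint32_t Base = (SP + ScaledAlignment - 1) & -ScaledAlignment;
  // Divergent size: wave-wide max (models amdgcn_wave_reduce_umax) so one
  // uniform bump covers every lane's request.
  uint32_t Size = *std::max_element(LaneSizes.begin(), LaneSizes.end());
  // Scale by the wave size: private stack space is reserved per lane.
  return {Base, Base + (Size << WavefrontSizeLog2)};
}

Taking the wave-wide maximum keeps the stack pointer uniform: every lane bumps SP by the same amount, sized for the largest per-lane request, which is why the divergent path ends in a readfirstlane.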
@@ -13982,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
return Accum;
}
+SDValue
+SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue RHS = N->getOperand(1);
+ auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (!CRHS)
+ return SDValue();
+
+ // TODO: Worth using computeKnownBits? Maybe expensive since it's so
+ // common.
+ uint64_t Val = CRHS->getZExtValue();
+ if (countr_zero(Val) >= 32) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+
+ // Avoid carry machinery if we know the low half of the add does not
+ // contribute to the final result.
+ //
+ // add i64:x, K if computeTrailingZeros(K) >= 32
+ // => build_pair (add x.hi, K.hi), x.lo
+
+ // Breaking the 64-bit add here with this strange constant is unlikely
+ // to interfere with addressing mode patterns.
+
+ SDValue Hi = getHiHalf64(LHS, DAG);
+ SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ SDValue AddHi =
+ DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+ return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
+ }
+
+ return SDValue();
+}
+
// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is 8 bytes.
static std::optional<ByteProvider<SDValue>>
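The fold above rests on a carry argument: when the low 32 bits of K are zero, the low half of the 64-bit add cannot change and cannot produce a carry, so the high halves add independently. A minimal standalone check of that identity (plain C++, not patch code):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  uint64_t K = 0xdeadbeef00000000ULL; // countr_zero(K) >= 32

  uint32_t Lo = static_cast<uint32_t>(X); // x.lo passes through unchanged
  uint32_t Hi = static_cast<uint32_t>(X >> 32) +
                static_cast<uint32_t>(K >> 32); // independent 32-bit add
  uint64_t Rebuilt = (static_cast<uint64_t>(Hi) << 32) | Lo;

  assert(Rebuilt == X + K); // matches the full 64-bit add
  return 0;
}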
@@ -14258,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return V;
}
+ if (VT == MVT::i64) {
+ if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+ return Folded;
+ }
+
if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
(Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
SDValue TempNode(N, 0);
@@ -14443,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
+ if (VT == MVT::i64) {
+ if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+ return Folded;
+ }
+
if (VT != MVT::i32)
return SDValue();
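performSubCombine gains the same MVT::i64 gate, and the identity holds in the other direction too: a zero low half means no borrow, so only the high halves participate. The matching standalone check, again illustrative rather than patch code:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x0badc0de12345678ULL;
  uint64_t K = 0x0000004200000000ULL; // countr_zero(K) >= 32

  uint32_t Lo = static_cast<uint32_t>(X); // no borrow out of the low half
  uint32_t Hi = static_cast<uint32_t>(X >> 32) -
                static_cast<uint32_t>(K >> 32); // independent 32-bit sub
  uint64_t Rebuilt = (static_cast<uint64_t>(Hi) << 32) | Lo;

  assert(Rebuilt == X - K); // matches the full 64-bit sub
  return 0;
}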