author     Matt Arsenault <Matthew.Arsenault@amd.com>    2026-02-02 09:33:16 +0100
committer  GitHub <noreply@github.com>                   2026-02-02 09:33:16 +0100
commit     80662c1de160c94e93086f8337255e219752468c (patch)
tree       6ba2fe2354fc3f86426c59876a8b437d203fd645
parent     8d0830e344c2f537ac70499a92d26ccd7568f385 (diff)
AMDGPU: Use SimplifyQuery in AMDGPUCodeGenPrepare (#179133)
Enables assumes in more contexts. Of particular interest is the nan check for the fract pattern. The device libs f32 and f64 sin implementations have a range check, and this pattern appears inside the large path. After a small patch to invert that check to send nans down the small path, this will enable the fold unconditionally on the large path.
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp  |  49
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fract-match.ll          |  79
2 files changed, 106 insertions, 22 deletions
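
The sketch below condenses the query pattern the hunks adopt: build one SimplifyQuery from the DataLayout, TargetLibraryInfo, DominatorTree, and AssumptionCache the pass already has, then attach the per-use context instruction with getWithInstruction(), which is what lets a dominating llvm.assume (such as the fcmp ord check in the new test) prove the fract argument is never nan. This is an illustrative helper under those assumptions, not code from the patch; the name isNeverNaNAt is made up, and only calls that appear in the diff below are used.

    // Minimal sketch, assuming the listed LLVM headers; not part of the patch.
    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/SimplifyQuery.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Hypothetical helper: is V known to never be nan at the point CtxI?
    static bool isNeverNaNAt(const Value *V, const Instruction *CtxI,
                             const DataLayout &DL, const TargetLibraryInfo *TLI,
                             const DominatorTree *DT, AssumptionCache *AC) {
      // Bundle the analyses once; without DT and AC, facts established by
      // llvm.assume cannot flow into the query.
      SimplifyQuery SQ(DL, TLI, DT, AC);
      // Attach the use point so only assumes dominating CtxI are consulted.
      return isKnownNeverNaN(V, SQ.getWithInstruction(CtxI));
    }

Constructing the query once in the pass constructor and reusing it at every call site, as the first hunk does, is what replaces the separate AC and DT members that the patch removes.
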
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 5845b14..e51d2c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -101,10 +101,9 @@ public:
const GCNSubtarget &ST;
const AMDGPUTargetMachine &TM;
const TargetLibraryInfo *TLI;
- AssumptionCache *AC;
- const DominatorTree *DT;
const UniformityInfo &UA;
const DataLayout &DL;
+ SimplifyQuery SQ;
const bool HasFP32DenormalFlush;
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
@@ -116,8 +115,8 @@ public:
AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
const TargetLibraryInfo *TLI, AssumptionCache *AC,
const DominatorTree *DT, const UniformityInfo &UA)
- : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
- DT(DT), UA(UA), DL(F.getDataLayout()),
+ : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
+ DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
DenormalMode::getPreserveSign()) {}
@@ -150,7 +149,8 @@ public:
/// Wrapper to pass all the arguments to computeKnownFPClass
KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
const Instruction *CtxI) const {
- return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT);
+ return llvm::computeKnownFPClass(V, Interested,
+ SQ.getWithInstruction(CtxI));
}
bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
@@ -161,12 +161,12 @@ public:
/// \returns The minimum number of bits needed to store the value of \Op as an
/// unsigned integer. Truncating to this size and then zero-extending to
/// the original will not change the value.
- unsigned numBitsUnsigned(Value *Op) const;
+ unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
/// \returns The minimum number of bits needed to store the value of \Op as a
/// signed integer. Truncating to this size and then sign-extending to
/// the original size will not change the value.
- unsigned numBitsSigned(Value *Op) const;
+ unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
/// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
/// SelectionDAG has an issue where an and asserting the bits are known
@@ -319,12 +319,16 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
}
-unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
- return computeKnownBits(Op, DL, AC).countMaxActiveBits();
+unsigned
+AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
+ const Instruction *CtxI) const {
+ return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits();
}
-unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
- return ComputeMaxSignificantBits(Op, DL, AC);
+unsigned
+AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
+ const Instruction *CtxI) const {
+ return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
}
static void extractValues(IRBuilder<> &Builder,
@@ -375,12 +379,12 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
unsigned LHSBits = 0, RHSBits = 0;
bool IsSigned = false;
- if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
- (RHSBits = numBitsUnsigned(RHS)) <= 24) {
+ if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
+ (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
IsSigned = false;
- } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
- (RHSBits = numBitsSigned(RHS)) <= 24) {
+ } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
+ (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
IsSigned = true;
} else
@@ -1019,13 +1023,13 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
Den->getType()->getScalarSizeInBits());
unsigned SSBits = Num->getType()->getScalarSizeInBits();
if (IsSigned) {
- unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I);
+ unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
// A sign bit needs to be reserved for shrinking.
unsigned DivBits = SSBits - RHSSignBits + 1;
if (DivBits > MaxDivBits)
return SSBits;
- unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I);
+ unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I);
unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
DivBits = SSBits - SignBits + 1;
@@ -1034,7 +1038,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
// All bits are used for unsigned division for Num or Den in range
// (SignedMax, UnsignedMax].
- KnownBits Known = computeKnownBits(Den, DL, AC, &I);
+ KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I));
if (Known.isNegative() || !Known.isNonNegative())
return SSBits;
unsigned RHSSignBits = Known.countMinLeadingZeros();
@@ -1042,7 +1046,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
if (DivBits > MaxDivBits)
return SSBits;
- Known = computeKnownBits(Num, DL, AC, &I);
+ Known = computeKnownBits(Num, SQ.getWithInstruction(&I));
if (Known.isNegative() || !Known.isNonNegative())
return SSBits;
unsigned LHSSignBits = Known.countMinLeadingZeros();
@@ -1179,7 +1183,7 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
// If there's no wider mulhi, there's only a better expansion for powers of
// two.
// TODO: Should really know for each vector element.
- if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT))
+ if (isKnownToBeAPowerOfTwo(C, true, SQ.getWithInstruction(&I)))
return true;
return false;
@@ -1189,7 +1193,8 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
// fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
if (BinOpDen->getOpcode() == Instruction::Shl &&
isa<Constant>(BinOpDen->getOperand(0)) &&
- isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) {
+ isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true,
+ SQ.getWithInstruction(&I))) {
return true;
}
}
@@ -2078,7 +2083,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
// Match pattern for fract intrinsic in contexts where the nan check has been
// optimized out (and hope the knowledge the source can't be nan wasn't lost).
- if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
+ if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
return false;
IRBuilder<> Builder(&I);
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 9016d4f..1da8a9fb 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -3871,6 +3871,85 @@ entry:
ret float %min
}
+define double @fract_match_f64_assume_not_nan(double %x) #0 {
+; GFX6-LABEL: fract_match_f64_assume_not_nan:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3
+; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000
+; GFX6-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: fract_match_f64_assume_not_nan:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s4, 0
+; GFX7-NEXT: s_mov_b32 s5, 0x7ff00000
+; GFX7-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fract_match_f64_assume_not_nan:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0
+; GFX8-NEXT: s_mov_b32 s5, 0x7ff00000
+; GFX8-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fract_match_f64_assume_not_nan:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v2 :: v_dual_cndmask_b32 v1, 0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fract_match_f64_assume_not_nan:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v2 :: v_dual_cndmask_b32 v1, 0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %is.ord = fcmp ord double %x, 0.000000e+00
+ tail call void @llvm.assume(i1 %is.ord)
+ %floor = tail call double @llvm.floor.f64(double %x)
+ %sub = fsub double %x, %floor
+ %min = tail call double @llvm.minnum.f64(double %sub, double 0x3FEFFFFFFFFFFFFF)
+ %x.abs = tail call double @llvm.fabs.f64(double %x)
+ %is.inf = fcmp oeq double %x.abs, 0x7FF0000000000000
+ %result = select i1 %is.inf, double 0.0, double %min
+ ret double %result
+}
+
declare half @llvm.floor.f16(half) #0
declare float @llvm.floor.f32(float) #0
declare double @llvm.floor.f64(double) #0