| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2026-02-02 09:33:16 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-02-02 09:33:16 +0100 |
| commit | 80662c1de160c94e93086f8337255e219752468c (patch) | |
| tree | 6ba2fe2354fc3f86426c59876a8b437d203fd645 | |
| parent | 8d0830e344c2f537ac70499a92d26ccd7568f385 (diff) | |
Enables assumes in more contexts. Of particular interest is the NaN check for the fract pattern.

The device libs f32 and f64 sin implementations have a range check, and the fract pattern appears inside the large path. After a small patch that inverts this check to send NaNs down the small path, this will enable the fold unconditionally on the large path.
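The way the patch enables assumes is mechanical: instead of passing DL/TLI/AC/DT piecemeal, the pass keeps one SimplifyQuery and attaches a context instruction via getWithInstruction() at each ValueTracking call, so facts from dominating llvm.assume calls can be used. A minimal sketch of that pattern, assuming current ValueTracking/SimplifyQuery headers (the free-standing helper and its name are illustrative, not from the patch):

```cpp
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/SimplifyQuery.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Illustrative helper: is V known to never be NaN *at* CtxI?  Building the
// query with AC/DT and attaching the context instruction is what lets a
// dominating `llvm.assume(fcmp ord ...)` answer the question.
static bool knownNeverNaNAt(const Value *V, const Instruction *CtxI,
                            const Function &F, const TargetLibraryInfo *TLI,
                            const DominatorTree *DT, AssumptionCache *AC) {
  // The pass builds this once per function and reuses it.
  SimplifyQuery SQ(F.getDataLayout(), TLI, DT, AC);
  return isKnownNeverNaN(V, SQ.getWithInstruction(CtxI));
}
```

Inside the pass, the query is constructed once in the constructor (`SQ(DL, TLI, DT, AC)`) and reused at every call site, which is what the constructor change in the diff below does.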
| | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 49 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/fract-match.ll | 79 |

2 files changed, 106 insertions, 22 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 5845b14..e51d2c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -101,10 +101,9 @@ public:
   const GCNSubtarget &ST;
   const AMDGPUTargetMachine &TM;
   const TargetLibraryInfo *TLI;
-  AssumptionCache *AC;
-  const DominatorTree *DT;
   const UniformityInfo &UA;
   const DataLayout &DL;
+  SimplifyQuery SQ;
   const bool HasFP32DenormalFlush;
   bool FlowChanged = false;
   mutable Function *SqrtF32 = nullptr;
@@ -116,8 +115,8 @@ public:
   AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
                            const TargetLibraryInfo *TLI, AssumptionCache *AC,
                            const DominatorTree *DT, const UniformityInfo &UA)
-      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
-        DT(DT), UA(UA), DL(F.getDataLayout()),
+      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
+        DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
         HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
                              DenormalMode::getPreserveSign()) {}
 
@@ -150,7 +149,8 @@ public:
   /// Wrapper to pass all the arguments to computeKnownFPClass
   KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                    const Instruction *CtxI) const {
-    return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT);
+    return llvm::computeKnownFPClass(V, Interested,
+                                     SQ.getWithInstruction(CtxI));
   }
 
   bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
@@ -161,12 +161,12 @@ public:
   /// \returns The minimum number of bits needed to store the value of \Op as an
   /// unsigned integer. Truncating to this size and then zero-extending to
   /// the original will not change the value.
-  unsigned numBitsUnsigned(Value *Op) const;
+  unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
 
   /// \returns The minimum number of bits needed to store the value of \Op as a
   /// signed integer. Truncating to this size and then sign-extending to
   /// the original size will not change the value.
-  unsigned numBitsSigned(Value *Op) const;
+  unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
 
   /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
   /// SelectionDAG has an issue where an and asserting the bits are known
@@ -319,12 +319,16 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
   return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
 }
 
-unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
-  return computeKnownBits(Op, DL, AC).countMaxActiveBits();
+unsigned
+AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
+                                          const Instruction *CtxI) const {
+  return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits();
 }
 
-unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
-  return ComputeMaxSignificantBits(Op, DL, AC);
+unsigned
+AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
+                                        const Instruction *CtxI) const {
+  return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
 }
 
 static void extractValues(IRBuilder<> &Builder,
@@ -375,12 +379,12 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
   unsigned LHSBits = 0, RHSBits = 0;
   bool IsSigned = false;
 
-  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
-      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
+  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
+      (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
     IsSigned = false;
 
-  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
-             (RHSBits = numBitsSigned(RHS)) <= 24) {
+  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
+             (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
     IsSigned = true;
 
   } else
@@ -1019,13 +1023,13 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                  Den->getType()->getScalarSizeInBits());
   unsigned SSBits = Num->getType()->getScalarSizeInBits();
   if (IsSigned) {
-    unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I);
+    unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
     // A sign bit needs to be reserved for shrinking.
     unsigned DivBits = SSBits - RHSSignBits + 1;
     if (DivBits > MaxDivBits)
       return SSBits;
 
-    unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I);
+    unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I);
 
     unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
     DivBits = SSBits - SignBits + 1;
@@ -1034,7 +1038,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
 
   // All bits are used for unsigned division for Num or Den in range
   // (SignedMax, UnsignedMax].
-  KnownBits Known = computeKnownBits(Den, DL, AC, &I);
+  KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I));
   if (Known.isNegative() || !Known.isNonNegative())
     return SSBits;
   unsigned RHSSignBits = Known.countMinLeadingZeros();
@@ -1042,7 +1046,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
   if (DivBits > MaxDivBits)
     return SSBits;
 
-  Known = computeKnownBits(Num, DL, AC, &I);
+  Known = computeKnownBits(Num, SQ.getWithInstruction(&I));
   if (Known.isNegative() || !Known.isNonNegative())
     return SSBits;
   unsigned LHSSignBits = Known.countMinLeadingZeros();
@@ -1179,7 +1183,7 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
     // If there's no wider mulhi, there's only a better expansion for powers of
     // two.
     // TODO: Should really know for each vector element.
-    if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT))
+    if (isKnownToBeAPowerOfTwo(C, true, SQ.getWithInstruction(&I)))
       return true;
 
     return false;
@@ -1189,7 +1193,8 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
     // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
     if (BinOpDen->getOpcode() == Instruction::Shl &&
         isa<Constant>(BinOpDen->getOperand(0)) &&
-        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) {
+        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true,
+                               SQ.getWithInstruction(&I))) {
       return true;
     }
   }
@@ -2078,7 +2083,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
 
   // Match pattern for fract intrinsic in contexts where the nan check has been
   // optimized out (and hope the knowledge the source can't be nan wasn't lost).
-  if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
+  if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
     return false;
 
   IRBuilder<> Builder(&I);
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 9016d4f..1da8a9fb 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -3871,6 +3871,85 @@ entry:
   ret float %min
 }
 
+define double @fract_match_f64_assume_not_nan(double %x) #0 {
+; GFX6-LABEL: fract_match_f64_assume_not_nan:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3
+; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000
+; GFX6-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: fract_match_f64_assume_not_nan:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s4, 0
+; GFX7-NEXT: s_mov_b32 s5, 0x7ff00000
+; GFX7-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fract_match_f64_assume_not_nan:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0
+; GFX8-NEXT: s_mov_b32 s5, 0x7ff00000
+; GFX8-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fract_match_f64_assume_not_nan:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v2 :: v_dual_cndmask_b32 v1, 0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fract_match_f64_assume_not_nan:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v2 :: v_dual_cndmask_b32 v1, 0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %is.ord = fcmp ord double %x, 0.000000e+00
+  tail call void @llvm.assume(i1 %is.ord)
+  %floor = tail call double @llvm.floor.f64(double %x)
+  %sub = fsub double %x, %floor
+  %min = tail call double @llvm.minnum.f64(double %sub, double 0x3FEFFFFFFFFFFFFF)
+  %x.abs = tail call double @llvm.fabs.f64(double %x)
+  %is.inf = fcmp oeq double %x.abs, 0x7FF0000000000000
+  %result = select i1 %is.inf, double 0.0, double %min
+  ret double %result
+}
+
 declare half @llvm.floor.f16(half) #0
 declare float @llvm.floor.f32(float) #0
 declare double @llvm.floor.f64(double) #0
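The new fract_match_f64_assume_not_nan test checks that, with the assume in place, the f64 fract pattern folds straight to v_fract_f64 on targets where that is safe. For orientation, a hypothetical source-level equivalent of that IR written with Clang builtins (illustrative only; this is not the device libs sin code the commit message refers to):

```cpp
// Hypothetical illustration: an assume-guarded fract that produces the
// floor/fsub/minnum/select pattern AMDGPUCodeGenPrepare can now fold.
// __builtin_assume lowers to llvm.assume, __builtin_fmin to llvm.minnum,
// __builtin_floor to llvm.floor.
static inline double fract_assume_not_nan(double x) {
  __builtin_assume(!__builtin_isnan(x));              // "x is ordered"
  double f = __builtin_fmin(x - __builtin_floor(x),   // x - floor(x)
                            0x1.fffffffffffffp-1);    // largest double < 1.0
  // Mirrors the |x| == inf select in the new test.
  return __builtin_fabs(x) == __builtin_inf() ? 0.0 : f;
}
```

Compiled for AMDGPU with the assume intact, this should take roughly the same floor/fsub/minnum/select shape as the IR in the test above.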
