diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 91 |
1 files changed, 34 insertions, 57 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 24bef82..8e35ba7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "SIModeRegisterDefaults.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -27,6 +28,7 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -106,6 +108,7 @@ public: bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; mutable Function *LdexpF32 = nullptr; + mutable SmallVector<WeakVH> DeadVals; DenseMap<const PHINode *, bool> BreakPhiNodesCache; @@ -242,6 +245,8 @@ public: Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, FastMathFlags FMF) const; + bool tryNarrowMathIfNoOverflow(Instruction *I); + public: bool visitFDiv(BinaryOperator &I); @@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() { BreakPhiNodesCache.clear(); bool MadeChange = false; - Function::iterator NextBB; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { - BasicBlock *BB = &*FI; - NextBB = std::next(FI); - - BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Next = std::next(I); - - MadeChange |= visit(*I); - - if (Next != E) { // Control flow changed - BasicBlock *NextInstBB = Next->getParent(); - if (NextInstBB != BB) { - BB = NextInstBB; - E = BB->end(); - FE = F.end(); - } - } + // Need to use make_early_inc_range because integer division expansion is + // handled by Transform/Utils, and it can delete instructions such as the + // terminator of the BB. + for (BasicBlock &BB : reverse(F)) { + for (Instruction &I : make_early_inc_range(reverse(BB))) { + if (!isInstructionTriviallyDead(&I, TLI)) + MadeChange |= visit(I); } } + + while (!DeadVals.empty()) { + if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + } + return MadeChange; } @@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { Value *NewVal = insertValues(Builder, Ty, ResultVals); NewVal->takeName(&I); I.replaceAllUsesWith(NewVal); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { FoldedT, FoldedF); NewSelect->takeName(&BO); BO.replaceAllUsesWith(NewSelect); - BO.eraseFromParent(); + DeadVals.push_back(&BO); if (CastOp) - CastOp->eraseFromParent(); - Sel->eraseFromParent(); + DeadVals.push_back(CastOp); + DeadVals.push_back(Sel); return true; } @@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { if (NewVal) { FDiv.replaceAllUsesWith(NewVal); NewVal->takeName(&FDiv); - RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI); + DeadVals.push_back(&FDiv); } return true; @@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`. We accept this change since the non-byte load assumes the upper bits within the byte are all 0. */ -static bool tryNarrowMathIfNoOverflow(Instruction *I, - const SITargetLowering *TLI, - const TargetTransformInfo &TTI, - const DataLayout &DL) { +bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) { unsigned Opc = I->getOpcode(); Type *OldType = I->getType(); @@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, NewType = I->getType()->getWithNewBitWidth(NewBit); // Old cost + const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F); InstructionCost OldCost = TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput); // New cost of new op @@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, Value *Zext = Builder.CreateZExt(Arith, OldType); I->replaceAllUsesWith(Zext); - I->eraseFromParent(); + DeadVals.push_back(I); return true; } @@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (UseMul24Intrin && replaceMulWithMul24(I)) return true; - if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), - TM.getTargetTransformInfo(F), DL)) + if (tryNarrowMathIfNoOverflow(&I)) return true; bool Changed = false; @@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (NewDiv) { I.replaceAllUsesWith(NewDiv); - I.eraseFromParent(); + DeadVals.push_back(&I); Changed = true; } } @@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); I.replaceAllUsesWith(ValOrig); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Fract->takeName(&I); I.replaceAllUsesWith(Fract); - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } @@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { } I.replaceAllUsesWith(Vec); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { auto *Intrin = B.CreateIntrinsic( I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)}); I.replaceAllUsesWith(Intrin); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *Fract = applyFractPat(Builder, FractArg); Fract->takeName(&I); I.replaceAllUsesWith(Fract); - - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } -static bool isOneOrNegOne(const Value *Val) { - const APFloat *C; - return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; -} - // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Type *Ty = Sqrt.getType()->getScalarType(); @@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { if (ReqdAccuracy < 1.0f) return false; - // FIXME: This is an ugly hack for this pass using forward iteration instead - // of reverse. If it worked like a normal combiner, the rsq would form before - // we saw a sqrt call. - auto *FDiv = - dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser()); - if (FDiv && FDiv->getOpcode() == Instruction::FDiv && - FDiv->getFPAccuracy() >= 1.0f && - canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && - // TODO: We should also handle the arcp case for the fdiv with non-1 value - isOneOrNegOne(FDiv->getOperand(0))) - return false; - Value *SrcVal = Sqrt.getOperand(0); bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt); @@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals); NewSqrt->takeName(&Sqrt); Sqrt.replaceAllUsesWith(NewSqrt); - Sqrt.eraseFromParent(); + DeadVals.push_back(&Sqrt); return true; } |