Diffstat (limited to 'llvm/lib')
68 files changed, 3029 insertions, 718 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 299ea33..b3f5b12 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -866,21 +866,6 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, Type *IntIdxTy = DL.getIndexType(Ptr->getType()); - // If this is "gep i8* Ptr, (sub 0, V)", fold this as: - // "inttoptr (sub (ptrtoint Ptr), V)" - if (Ops.size() == 2 && ResElemTy->isIntegerTy(8)) { - auto *CE = dyn_cast<ConstantExpr>(Ops[1]); - assert((!CE || CE->getType() == IntIdxTy) && - "CastGEPIndices didn't canonicalize index types!"); - if (CE && CE->getOpcode() == Instruction::Sub && - CE->getOperand(0)->isNullValue()) { - Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType()); - Res = ConstantExpr::getSub(Res, CE->getOperand(1)); - Res = ConstantExpr::getIntToPtr(Res, ResTy); - return ConstantFoldConstant(Res, DL, TLI); - } - } - for (unsigned i = 1, e = Ops.size(); i != e; ++i) if (!isa<ConstantInt>(Ops[i])) return nullptr; @@ -1336,6 +1321,19 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C, DL, BaseOffset, /*AllowNonInbounds=*/true)); if (Base->isNullValue()) { FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset); + } else { + // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V + if (GEP->getNumIndices() == 1 && + GEP->getSourceElementType()->isIntegerTy(8)) { + auto *Ptr = cast<Constant>(GEP->getPointerOperand()); + auto *Sub = dyn_cast<ConstantExpr>(GEP->getOperand(1)); + Type *IntIdxTy = DL.getIndexType(Ptr->getType()); + if (Sub && Sub->getType() == IntIdxTy && + Sub->getOpcode() == Instruction::Sub && + Sub->getOperand(0)->isNullValue()) + FoldedValue = ConstantExpr::getSub( + ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1)); + } } } if (FoldedValue) { @@ -3038,7 +3036,7 @@ static Constant *ConstantFoldFixedVectorCall( // Gather a column of constants. for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) { // Some intrinsics use a scalar type for certain arguments. - if (hasVectorIntrinsicScalarOpd(IntrinsicID, J)) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, J)) { Lane[J] = Operands[J]; continue; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index e03cf6c..e4d706a 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -227,12 +227,10 @@ static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst, return true; } -bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, - Loop *TheLoop, FastMathFlags FuncFMF, - RecurrenceDescriptor &RedDes, - DemandedBits *DB, - AssumptionCache *AC, - DominatorTree *DT) { +bool RecurrenceDescriptor::AddReductionVar( + PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF, + RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE) { if (Phi->getNumIncomingValues() != 2) return false; @@ -249,6 +247,12 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = nullptr; + + // Variable to keep last visited store instruction. By the end of the + // algorithm this variable will be either empty or having intermediate + // reduction value stored in invariant address. 
+ StoreInst *IntermediateStore = nullptr; + // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; @@ -314,6 +318,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // - By instructions outside of the loop (safe). // * One value may have several outside users, but all outside // uses must be of the same value. + // - By store instructions with a loop invariant address (safe with + // the following restrictions): + // * If there are several stores, all must have the same address. + // * Final value should be stored in that loop invariant address. // - By an instruction that is not part of the reduction (not safe). // This is either: // * An instruction type other than PHI or the reduction operation. @@ -321,6 +329,43 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, while (!Worklist.empty()) { Instruction *Cur = Worklist.pop_back_val(); + // Store instructions are allowed iff it is the store of the reduction + // value to the same loop invariant memory location. + if (auto *SI = dyn_cast<StoreInst>(Cur)) { + if (!SE) { + LLVM_DEBUG(dbgs() << "Store instructions are not processed without " + << "Scalar Evolution Analysis\n"); + return false; + } + + const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand()); + // Check it is the same address as previous stores + if (IntermediateStore) { + const SCEV *OtherScev = + SE->getSCEV(IntermediateStore->getPointerOperand()); + + if (OtherScev != PtrScev) { + LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses " + << "inside the loop: " << *SI->getPointerOperand() + << " and " + << *IntermediateStore->getPointerOperand() << '\n'); + return false; + } + } + + // Check the pointer is loop invariant + if (!SE->isLoopInvariant(PtrScev, TheLoop)) { + LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address " + << "inside the loop: " << *SI->getPointerOperand() + << '\n'); + return false; + } + + // IntermediateStore is always the last store in the loop. + IntermediateStore = SI; + continue; + } + // No Users. // If the instruction has no users then this is a broken chain and can't be // a reduction variable. @@ -443,10 +488,17 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // reductions which are represented as a cmp followed by a select. InstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI).second) { - if (isa<PHINode>(UI)) + if (isa<PHINode>(UI)) { PHIs.push_back(UI); - else + } else { + StoreInst *SI = dyn_cast<StoreInst>(UI); + if (SI && SI->getPointerOperand() == Cur) { + // Reduction variable chain can only be stored somewhere but it + // can't be used as an address. + return false; + } NonPHIs.push_back(UI); + } } else if (!isa<PHINode>(UI) && ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) && !isa<SelectInst>(UI)) || @@ -474,6 +526,32 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1) return false; + if (IntermediateStore) { + // Check that stored value goes to the phi node again. This way we make sure + // that the value stored in IntermediateStore is indeed the final reduction + // value. 
+ if (!is_contained(Phi->operands(), IntermediateStore->getValueOperand())) { + LLVM_DEBUG(dbgs() << "Not a final reduction value stored: " + << *IntermediateStore << '\n'); + return false; + } + + // If there is an exit instruction it's value should be stored in + // IntermediateStore + if (ExitInstruction && + IntermediateStore->getValueOperand() != ExitInstruction) { + LLVM_DEBUG(dbgs() << "Last store Instruction of reduction value does not " + "store last calculated value of the reduction: " + << *IntermediateStore << '\n'); + return false; + } + + // If all uses are inside the loop (intermediate stores), then the + // reduction value after the loop will be the one used in the last store. + if (!ExitInstruction) + ExitInstruction = cast<Instruction>(IntermediateStore->getValueOperand()); + } + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; @@ -535,9 +613,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. - RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst, - RecurrenceType, IsSigned, IsOrdered, CastInsts, - MinWidthCastToRecurrenceType); + RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind, + FMF, ExactFPMathInst, RecurrenceType, IsSigned, + IsOrdered, CastInsts, MinWidthCastToRecurrenceType); RedDes = RD; return true; @@ -761,7 +839,8 @@ bool RecurrenceDescriptor::hasMultipleUsesOf( bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC, - DominatorTree *DT) { + DominatorTree *DT, + ScalarEvolution *SE) { BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); FastMathFlags FMF; @@ -770,72 +849,85 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, FMF.setNoSignedZeros( F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool()); - if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." 
<< *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) { + if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC, - DT)) { + DT, SE)) { LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI." << " PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, - DT)) { + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT, + SE)) { LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); return true; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index b1773db..d0276df 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1993,9 +1993,12 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, for (StoreInst *ST : Stores) { Value *Ptr = ST->getPointerOperand(); - if (isUniform(Ptr)) + if (isUniform(Ptr)) { + // Record store instructions to loop invariant addresses + StoresToInvariantAddresses.push_back(ST); HasDependenceInvolvingLoopInvariantAddress |= !UniformStores.insert(Ptr).second; + } // If we did *not* see this pointer before, insert it to the read-write // list. At this phase it is only a 'write' list. 
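The IVDescriptors and LoopAccessAnalysis changes above teach reduction detection to tolerate stores of the running reduction value to a loop-invariant address. A minimal source-level sketch of the kind of loop this targets (the function and variable names here are illustrative, not taken from the patch or its tests):

// A sum reduction whose running value is also written out on every iteration.
// All stores go to the same loop-invariant address (*out), and the value
// stored last is the final reduction value, so the phi can still be treated
// as a reduction under the checks added above.
void sum_to_invariant_address(const int *a, int n, int *out) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += a[i];
    *out = sum; // intermediate store to a loop-invariant address
  }
}

If the stores went to several different addresses, or to an address that varies with the induction variable, the new checks reject the phi as a reduction, as the LLVM_DEBUG messages above describe.
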
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp index b7806b3..eacd2621 100644 --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -103,14 +103,24 @@ static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize, return StepRec == &ElemSize; } -/// Compute the trip count for the given loop \p L. Return the SCEV expression -/// for the trip count or nullptr if it cannot be computed. -static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) { +/// Compute the trip count for the given loop \p L or assume a default value if +/// it is not a compile time constant. Return the SCEV expression for the trip +/// count. +static const SCEV *computeTripCount(const Loop &L, const SCEV &ElemSize, + ScalarEvolution &SE) { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L); - if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) || - !isa<SCEVConstant>(BackedgeTakenCount)) - return nullptr; - return SE.getTripCountFromExitCount(BackedgeTakenCount); + const SCEV *TripCount = (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && + isa<SCEVConstant>(BackedgeTakenCount)) + ? SE.getTripCountFromExitCount(BackedgeTakenCount) + : nullptr; + + if (!TripCount) { + LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() + << " could not be computed, using DefaultTripCount\n"); + TripCount = SE.getConstant(ElemSize.getType(), DefaultTripCount); + } + + return TripCount; } //===----------------------------------------------------------------------===// @@ -274,22 +284,18 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L, return 1; } - const SCEV *TripCount = computeTripCount(L, SE); - if (!TripCount) { - LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName() - << " could not be computed, using DefaultTripCount\n"); - const SCEV *ElemSize = Sizes.back(); - TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount); - } + const SCEV *TripCount = computeTripCount(L, *Sizes.back(), SE); + assert(TripCount && "Expecting valid TripCount"); LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n"); - // If the indexed reference is 'consecutive' the cost is - // (TripCount*Stride)/CLS, otherwise the cost is TripCount. - const SCEV *RefCost = TripCount; - + const SCEV *RefCost = nullptr; if (isConsecutive(L, CLS)) { + // If the indexed reference is 'consecutive' the cost is + // (TripCount*Stride)/CLS. const SCEV *Coeff = getLastCoefficient(); const SCEV *ElemSize = Sizes.back(); + assert(Coeff->getType() == ElemSize->getType() && + "Expecting the same type"); const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType()); const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS); @@ -303,10 +309,33 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L, LLVM_DEBUG(dbgs().indent(4) << "Access is consecutive: RefCost=(TripCount*Stride)/CLS=" << *RefCost << "\n"); - } else + } else { + // If the indexed reference is not 'consecutive' the cost is proportional to + // the trip count and the depth of the dimension which the subject loop + // subscript is accessing. We try to estimate this by multiplying the cost + // by the trip counts of loops corresponding to the inner dimensions. For + // example, given the indexed reference 'A[i][j][k]', and assuming the + // i-loop is in the innermost position, the cost would be equal to the + // iterations of the i-loop multiplied by iterations of the j-loop. 
+ RefCost = TripCount; + + int Index = getSubscriptIndex(L); + assert(Index >= 0 && "Cound not locate a valid Index"); + + for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) { + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(I)); + assert(AR && AR->getLoop() && "Expecting valid loop"); + const SCEV *TripCount = + computeTripCount(*AR->getLoop(), *Sizes.back(), SE); + Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType()); + RefCost = SE.getMulExpr(SE.getNoopOrAnyExtend(RefCost, WiderType), + SE.getNoopOrAnyExtend(TripCount, WiderType)); + } + LLVM_DEBUG(dbgs().indent(4) - << "Access is not consecutive: RefCost=TripCount=" << *RefCost - << "\n"); + << "Access is not consecutive: RefCost=" << *RefCost << "\n"); + } + assert(RefCost && "Expecting a valid RefCost"); // Attempt to fold RefCost into a constant. if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost)) @@ -481,6 +510,16 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const { return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize); } +int IndexedReference::getSubscriptIndex(const Loop &L) const { + for (auto Idx : seq<int>(0, getNumSubscripts())) { + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(Idx)); + if (AR && AR->getLoop() == &L) { + return Idx; + } + } + return -1; +} + const SCEV *IndexedReference::getLastCoefficient() const { const SCEV *LastSubscript = getLastSubscript(); auto *AR = cast<SCEVAddRecExpr>(LastSubscript); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 75381f5..0144ce4 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -282,6 +282,20 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, match(LHS, m_c_And(m_Specific(M), m_Value()))) return true; } + + // X op (Y & ~X) + if (match(RHS, m_c_And(m_Not(m_Specific(LHS)), m_Value())) || + match(LHS, m_c_And(m_Not(m_Specific(RHS)), m_Value()))) + return true; + + // X op ((X & Y) ^ Y) -- this is the canonical form of the previous pattern + // for constant Y. + Value *Y; + if (match(RHS, + m_c_Xor(m_c_And(m_Specific(LHS), m_Value(Y)), m_Deferred(Y))) || + match(LHS, m_c_Xor(m_c_And(m_Specific(RHS), m_Value(Y)), m_Deferred(Y)))) + return true; + // Look for: (A & B) op ~(A | B) { Value *A, *B; diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 5f8fa13..a53b216 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -40,7 +40,7 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor( /// Return true if all of the intrinsic's arguments and return type are scalars /// for the scalar form of the intrinsic, and vectors for the vector form of the /// intrinsic (except operands that are marked as always being scalar by -/// hasVectorIntrinsicScalarOpd). +/// isVectorIntrinsicWithScalarOpAtArg). bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { switch (ID) { case Intrinsic::abs: // Begin integer bit-manipulation. @@ -89,6 +89,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::fmuladd: case Intrinsic::powi: case Intrinsic::canonicalize: + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: return true; default: return false; @@ -96,8 +98,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { } /// Identifies if the vector form of the intrinsic has a scalar operand. 
-bool llvm::hasVectorIntrinsicScalarOpd(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { +bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, + unsigned ScalarOpdIdx) { switch (ID) { case Intrinsic::abs: case Intrinsic::ctlz: @@ -114,11 +116,14 @@ bool llvm::hasVectorIntrinsicScalarOpd(Intrinsic::ID ID, } } -bool llvm::hasVectorIntrinsicOverloadedScalarOpd(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { +bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, + unsigned OpdIdx) { switch (ID) { + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: + return OpdIdx == 0; case Intrinsic::powi: - return (ScalarOpdIdx == 1); + return OpdIdx == 1; default: return false; } diff --git a/llvm/lib/BinaryFormat/Magic.cpp b/llvm/lib/BinaryFormat/Magic.cpp index 5d999a90..d48adb1 100644 --- a/llvm/lib/BinaryFormat/Magic.cpp +++ b/llvm/lib/BinaryFormat/Magic.cpp @@ -225,6 +225,11 @@ file_magic llvm::identify_magic(StringRef Magic) { if (startswith(Magic, "--- !tapi") || startswith(Magic, "---\narchs:")) return file_magic::tapi_file; break; + + case 'D': // DirectX container file - DXBC + if (startswith(Magic, "DXBC") && Magic.size() == 4) + return file_magic::dxcontainer_object; + break; default: break; diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 06830e8..f2b0024 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -1033,7 +1033,32 @@ void MachineFunction::substituteDebugValuesForInst(const MachineInstr &Old, } } -auto MachineFunction::salvageCopySSA(MachineInstr &MI) +auto MachineFunction::salvageCopySSA( + MachineInstr &MI, DenseMap<Register, DebugInstrOperandPair> &DbgPHICache) + -> DebugInstrOperandPair { + const TargetInstrInfo &TII = *getSubtarget().getInstrInfo(); + + // Check whether this copy-like instruction has already been salvaged into + // an operand pair. + Register Dest; + if (auto CopyDstSrc = TII.isCopyInstr(MI)) { + Dest = CopyDstSrc->Destination->getReg(); + } else { + assert(MI.isSubregToReg()); + Dest = MI.getOperand(0).getReg(); + } + + auto CacheIt = DbgPHICache.find(Dest); + if (CacheIt != DbgPHICache.end()) + return CacheIt->second; + + // Calculate the instruction number to use, or install a DBG_PHI. + auto OperandPair = salvageCopySSAImpl(MI); + DbgPHICache.insert({Dest, OperandPair}); + return OperandPair; +} + +auto MachineFunction::salvageCopySSAImpl(MachineInstr &MI) -> DebugInstrOperandPair { MachineRegisterInfo &MRI = getRegInfo(); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); @@ -1189,6 +1214,7 @@ void MachineFunction::finalizeDebugInstrRefs() { MI.getOperand(1).ChangeToRegister(0, false); }; + DenseMap<Register, DebugInstrOperandPair> ArgDbgPHIs; for (auto &MBB : *this) { for (auto &MI : MBB) { if (!MI.isDebugRef() || !MI.getOperand(0).isReg()) @@ -1211,7 +1237,7 @@ void MachineFunction::finalizeDebugInstrRefs() { // instruction that defines the source value, see salvageCopySSA docs // for why this is important. 
if (DefMI.isCopyLike() || TII->isCopyInstr(DefMI)) { - auto Result = salvageCopySSA(DefMI); + auto Result = salvageCopySSA(DefMI, ArgDbgPHIs); MI.getOperand(0).ChangeToImmediate(Result.first); MI.getOperand(1).setImm(Result.second); } else { diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 6887347..87b8ac5 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -109,7 +109,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, auto *ArgType = Arg.value()->getType(); // Vector calls to intrinsics can still have // scalar operands for specific arguments. - if (hasVectorIntrinsicScalarOpd(IntrinsicID, Arg.index())) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { ScalarTypes.push_back(ArgType); } else { // The argument in this place should be a vector if diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e139cf6..e483c3a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -519,7 +519,9 @@ namespace { SDValue XformToShuffleWithZero(SDNode *N); bool reassociationCanBreakAddressingModePattern(unsigned Opc, - const SDLoc &DL, SDValue N0, + const SDLoc &DL, + SDNode *N, + SDValue N0, SDValue N1); SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1); @@ -996,6 +998,7 @@ static bool canSplitIdx(LoadSDNode *LD) { bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, const SDLoc &DL, + SDNode *N, SDValue N0, SDValue N1) { // Currently this only tries to ensure we don't undo the GEP splits done by @@ -1025,7 +1028,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return false; const int64_t CombinedValue = CombinedValueIntVal.getSExtValue(); - for (SDNode *Node : N0->uses()) { + for (SDNode *Node : N->uses()) { auto LoadStore = dyn_cast<MemSDNode>(Node); if (LoadStore) { // Is x[offset2] already not a legal addressing mode? If so then @@ -2447,7 +2450,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { return NewSel; // reassociate add - if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) { + if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N, N0, N1)) { if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags())) return RADD; @@ -15527,7 +15530,7 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { // This means this is also safe for a signed input and unsigned output, since // a negative input would lead to undefined behavior. 
unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned; - unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned; + unsigned OutputSize = (int)VT.getScalarSizeInBits(); unsigned ActualSize = std::min(InputSize, OutputSize); const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType()); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d667988..90e4b5d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4684,26 +4684,33 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { return false; } +static bool haveNoCommonBitsSetCommutative(SDValue A, SDValue B) { + // Match masked merge pattern (X & ~M) op (Y & M) + // Including degenerate case (X & ~M) op M + auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue Other) { + if (isBitwiseNot(NotM, true)) { + SDValue NotOperand = NotM->getOperand(0); + if (Other == NotOperand) + return true; + if (Other->getOpcode() == ISD::AND) + return NotOperand == Other->getOperand(0) || + NotOperand == Other->getOperand(1); + } + return false; + }; + if (A->getOpcode() == ISD::AND) + return MatchNoCommonBitsPattern(A->getOperand(0), B) || + MatchNoCommonBitsPattern(A->getOperand(1), B); + return false; +} + // FIXME: unify with llvm::haveNoCommonBitsSet. bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); - // Match masked merge pattern (X & ~M) op (Y & M) - if (A->getOpcode() == ISD::AND && B->getOpcode() == ISD::AND) { - auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue And) { - if (isBitwiseNot(NotM, true)) { - SDValue NotOperand = NotM->getOperand(0); - return NotOperand == And->getOperand(0) || - NotOperand == And->getOperand(1); - } - return false; - }; - if (MatchNoCommonBitsPattern(A->getOperand(0), B) || - MatchNoCommonBitsPattern(A->getOperand(1), B) || - MatchNoCommonBitsPattern(B->getOperand(0), A) || - MatchNoCommonBitsPattern(B->getOperand(1), A)) - return true; - } + if (haveNoCommonBitsSetCommutative(A, B) || + haveNoCommonBitsSetCommutative(B, A)) + return true; return KnownBits::haveNoCommonBitsSet(computeKnownBits(A), computeKnownBits(B)); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 9732a17..b209aecf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9044,7 +9044,9 @@ void TargetLowering::expandUADDSUBO( if (IsAdd && isOneConstant(RHS)) { // Special case: uaddo X, 1 overflowed if X+1 is 0. This potential reduces // the live range of X. We assume comparing with 0 is cheap. - // TODO: This generalizes to (X + C) < C. + // The general case (X + C) < C is not necessarily beneficial. Although we + // reduce the live range of X, we may introduce the materialization of + // constant C. SetCC = DAG.getSetCC(dl, SetCCType, Result, DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETEQ); diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 3cade80..1aa2d44 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2029,3 +2029,14 @@ void AttributeFuncs::mergeAttributesForOutlining(Function &Base, // that aspect in the merged function. 
mergeFnAttrs(Base, ToMerge); } + +void AttributeFuncs::updateMinLegalVectorWidthAttr(Function &Fn, + uint64_t Width) { + Attribute Attr = Fn.getFnAttribute("min-legal-vector-width"); + if (Attr.isValid()) { + uint64_t OldWidth; + Attr.getValueAsString().getAsInteger(0, OldWidth); + if (Width > OldWidth) + Fn.addFnAttr("min-legal-vector-width", llvm::utostr(Width)); + } +} diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 5dcf1ba..c182513 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -2068,6 +2068,17 @@ Constant *ConstantExpr::getTruncOrBitCast(Constant *C, Type *Ty) { return getTrunc(C, Ty); } +Constant *ConstantExpr::getSExtOrTrunc(Constant *C, Type *Ty) { + assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() && + "Can only sign extend/truncate integers!"); + Type *CTy = C->getType(); + if (CTy->getScalarSizeInBits() < Ty->getScalarSizeInBits()) + return getSExt(C, Ty); + if (CTy->getScalarSizeInBits() > Ty->getScalarSizeInBits()) + return getTrunc(C, Ty); + return C; +} + Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) { assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) && diff --git a/llvm/lib/Object/Binary.cpp b/llvm/lib/Object/Binary.cpp index 67ed44a..1703f76 100644 --- a/llvm/lib/Object/Binary.cpp +++ b/llvm/lib/Object/Binary.cpp @@ -84,6 +84,7 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer, case file_magic::unknown: case file_magic::cuda_fatbinary: case file_magic::coff_cl_gl_object: + case file_magic::dxcontainer_object: // Unrecognized object file format. return errorCodeToError(object_error::invalid_file_type); case file_magic::minidump: diff --git a/llvm/lib/Object/CMakeLists.txt b/llvm/lib/Object/CMakeLists.txt index 0825210..ba612e3 100644 --- a/llvm/lib/Object/CMakeLists.txt +++ b/llvm/lib/Object/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMObject COFFModuleDefinition.cpp COFFObjectFile.cpp Decompressor.cpp + DXContainer.cpp ELF.cpp ELFObjectFile.cpp Error.cpp diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp new file mode 100644 index 0000000..e1aea562 --- /dev/null +++ b/llvm/lib/Object/DXContainer.cpp @@ -0,0 +1,44 @@ +//===- DXContainer.cpp - DXContainer object file implementation -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/DXContainer.h" +#include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Object/Error.h" + +using namespace llvm; +using namespace llvm::object; + +static Error parseFailed(const Twine &Msg) { + return make_error<GenericBinaryError>(Msg.str(), object_error::parse_failed); +} + +template <typename T> +static Error readStruct(StringRef Buffer, const char *P, T &Struct) { + // Don't read before the beginning or past the end of the file + if (P < Buffer.begin() || P + sizeof(T) > Buffer.end()) + return parseFailed("Reading structure out of file bounds"); + + memcpy(&Struct, P, sizeof(T)); + // DXContainer is always BigEndian + if (sys::IsBigEndianHost) + Struct.byteSwap(); + return Error::success(); +} + +DXContainer::DXContainer(MemoryBufferRef O) : Data(O) {} + +Error DXContainer::parseHeader() { + return readStruct(Data.getBuffer(), Data.getBuffer().data(), Header); +} + +Expected<DXContainer> DXContainer::create(MemoryBufferRef Object) { + DXContainer Container(Object); + if (Error Err = Container.parseHeader()) + return std::move(Err); + return Container; +} diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp index fed6726..609dfae 100644 --- a/llvm/lib/Object/ObjectFile.cpp +++ b/llvm/lib/Object/ObjectFile.cpp @@ -147,6 +147,7 @@ ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type, case file_magic::minidump: case file_magic::goff_object: case file_magic::cuda_fatbinary: + case file_magic::dxcontainer_object: return errorCodeToError(object_error::invalid_file_type); case file_magic::tapi_file: return errorCodeToError(object_error::invalid_file_type); diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 4c92502..3e5fff9 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -918,21 +918,34 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) { return I - 1; } -// Windows treats whitespace, double quotes, and backslashes specially. +// Windows treats whitespace, double quotes, and backslashes specially, except +// when parsing the first token of a full command line, in which case +// backslashes are not special. static bool isWindowsSpecialChar(char C) { return isWhitespaceOrNull(C) || C == '\\' || C == '\"'; } +static bool isWindowsSpecialCharInCommandName(char C) { + return isWhitespaceOrNull(C) || C == '\"'; +} // Windows tokenization implementation. The implementation is designed to be // inlined and specialized for the two user entry points. -static inline void -tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, - function_ref<void(StringRef)> AddToken, - bool AlwaysCopy, function_ref<void()> MarkEOL) { +static inline void tokenizeWindowsCommandLineImpl( + StringRef Src, StringSaver &Saver, function_ref<void(StringRef)> AddToken, + bool AlwaysCopy, function_ref<void()> MarkEOL, bool InitialCommandName) { SmallString<128> Token; + // Sometimes, this function will be handling a full command line including an + // executable pathname at the start. In that situation, the initial pathname + // needs different handling from the following arguments, because when + // CreateProcess or cmd.exe scans the pathname, it doesn't treat \ as + // escaping the quote character, whereas when libc scans the rest of the + // command line, it does. 
+ bool CommandName = InitialCommandName; + // Try to do as much work inside the state machine as possible. enum { INIT, UNQUOTED, QUOTED } State = INIT; + for (size_t I = 0, E = Src.size(); I < E; ++I) { switch (State) { case INIT: { @@ -947,19 +960,29 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, if (I >= E) break; size_t Start = I; - while (I < E && !isWindowsSpecialChar(Src[I])) - ++I; + if (CommandName) { + while (I < E && !isWindowsSpecialCharInCommandName(Src[I])) + ++I; + } else { + while (I < E && !isWindowsSpecialChar(Src[I])) + ++I; + } StringRef NormalChars = Src.slice(Start, I); if (I >= E || isWhitespaceOrNull(Src[I])) { // No special characters: slice out the substring and start the next // token. Copy the string if the caller asks us to. AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars); - if (I < E && Src[I] == '\n') + if (I < E && Src[I] == '\n') { MarkEOL(); + CommandName = InitialCommandName; + } else { + CommandName = false; + } } else if (Src[I] == '\"') { Token += NormalChars; State = QUOTED; } else if (Src[I] == '\\') { + assert(!CommandName && "or else we'd have treated it as a normal char"); Token += NormalChars; I = parseBackslash(Src, I, Token); State = UNQUOTED; @@ -976,12 +999,16 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, // token. AddToken(Saver.save(Token.str())); Token.clear(); - if (Src[I] == '\n') + if (Src[I] == '\n') { + CommandName = InitialCommandName; MarkEOL(); + } else { + CommandName = false; + } State = INIT; } else if (Src[I] == '\"') { State = QUOTED; - } else if (Src[I] == '\\') { + } else if (Src[I] == '\\' && !CommandName) { I = parseBackslash(Src, I, Token); } else { Token.push_back(Src[I]); @@ -999,7 +1026,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, // Otherwise, end the quoted portion and return to the unquoted state. 
State = UNQUOTED; } - } else if (Src[I] == '\\') { + } else if (Src[I] == '\\' && !CommandName) { I = parseBackslash(Src, I, Token); } else { Token.push_back(Src[I]); @@ -1008,7 +1035,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver, } } - if (State == UNQUOTED) + if (State != INIT) AddToken(Saver.save(Token.str())); } @@ -1021,7 +1048,7 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, NewArgv.push_back(nullptr); }; tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, - /*AlwaysCopy=*/true, OnEOL); + /*AlwaysCopy=*/true, OnEOL, false); } void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver, @@ -1029,7 +1056,19 @@ void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver, auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); }; auto OnEOL = []() {}; tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false, - OnEOL); + OnEOL, false); +} + +void cl::TokenizeWindowsCommandLineFull(StringRef Src, StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv, + bool MarkEOLs) { + auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); }; + auto OnEOL = [&]() { + if (MarkEOLs) + NewArgv.push_back(nullptr); + }; + tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, + /*AlwaysCopy=*/true, OnEOL, true); } void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver, diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 98272bb..976599f 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -296,6 +296,12 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { } } + if (Implementer == "0xc0") { // Ampere Computing + return StringSwitch<const char *>(Part) + .Case("0xac3", "ampere1") + .Default("generic"); + } + return "generic"; } diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc index dfaab16..e415674 100644 --- a/llvm/lib/Support/Windows/Process.inc +++ b/llvm/lib/Support/Windows/Process.inc @@ -247,7 +247,7 @@ windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args, SmallVector<const char *, 20> TmpArgs; StringSaver Saver(Alloc); - cl::TokenizeWindowsCommandLine(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false); + cl::TokenizeWindowsCommandLineFull(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false); for (const char *Arg : TmpArgs) { EC = WildcardExpand(Arg, Args, Saver); @@ -255,6 +255,9 @@ windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args, return EC; } + if (Args.size() == 0) + return std::make_error_code(std::errc::invalid_argument); + SmallVector<char, MAX_PATH> Arg0(Args[0], Args[0] + strlen(Args[0])); SmallVector<char, MAX_PATH> Filename; sys::path::remove_filename(Arg0); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index bd6deb5..2682b9b 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -567,6 +567,7 @@ include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64SchedPredAmpere.td" include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; @@ -636,6 +637,7 @@ include "AArch64SchedThunderX2T99.td" include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" +include "AArch64SchedAmpere1.td" def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors">; @@ -956,6 +958,16 @@ def TuneTSV110 : 
SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", FeatureFuseAES, FeaturePostRAScheduler]>; +def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", + "Ampere Computing Ampere-1 processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureLSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals]>; def ProcessorFeatures { list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, @@ -1067,6 +1079,8 @@ def ProcessorFeatures { list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureFP16FML, FeatureDotProd]; + list<SubtargetFeature> Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. The extensions do not @@ -1205,6 +1219,10 @@ def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX, def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel, [TuneCarmel]>; +// Ampere Computing +def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, + [TuneAmpere1]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index c367d2d..71911b6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5092,12 +5092,19 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &OffImm) { const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root); const DataLayout &DL = CurDAG->getDataLayout(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); - return true; + // We can only encode VL scaled offsets, so only fold in frame indexes + // referencing SVE objects. + if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) { + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); + return true; + } + + return false; } if (MemVT == EVT()) @@ -5124,7 +5131,10 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + // We can only encode VL scaled offsets, so only fold in frame indexes + // referencing SVE objects. + if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64); diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td new file mode 100644 index 0000000..32f7299 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td @@ -0,0 +1,1136 @@ +//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1 to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1 core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). + +def Ampere1Model : SchedMachineModel { + let IssueWidth = 4; // 4-way decode and dispatch + let MicroOpBufferSize = 174; // micro-op re-order buffer size + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F); +} + +let SchedModel = Ampere1Model in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1. +// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP, +// and 2 memory) issue into. The integer and FP schedulers can each issue +// one uop per cycle, while the memory schedulers can each issue one load +// and one store address calculation per cycle. + +def Ampere1UnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1UnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1UnitL : ProcResource<2>; // load +def Ampere1UnitS : ProcResource<2>; // store address calculation +def Ampere1UnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1UnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1UnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>; +def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. 
+ +def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS, + Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 4; + 
let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 1; +} + +def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 7; + let NumMicroOps = 1; +} + +def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_2L_2XY : 
SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 4; +} + +def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA, + Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 5; +} + +def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 16; +} + +def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 6; +} + +def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 10; + let 
NumMicroOps = 3; +} + +def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 3; +} + +def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 4; +} + +def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 18; + let NumMicroOps = 1; +} + +def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 25; + let NumMicroOps = 1; +} + +def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 32; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 39; + let NumMicroOps = 1; +} + +def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 62; + let NumMicroOps = 1; +} + +// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), +// which are a single uop, and for extended registers, which have full flexibility +// across Unit A or B for both uops. +def Ampere1Write_Arith : SchedWriteVariant<[ + SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>, + SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1AB]>, + SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1AB]>]>; + +def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[ + SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>, + SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1A]>, + SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1A]>]>; + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latencies for Ampere-1. +// This provides a coarse model, which is then specialised below. 
+
+def : WriteRes<WriteImm, [Ampere1UnitAB]>;  // MOVN, MOVZ
+def : WriteRes<WriteI, [Ampere1UnitAB]>;  // ALU
+def : WriteRes<WriteISReg, [Ampere1UnitB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}  // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [Ampere1UnitAB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}  // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [Ampere1UnitB]>;  // EXTR shifts a reg pair
+def : WriteRes<WriteIS, [Ampere1UnitB]>;  // Shift/Scale
+def : WriteRes<WriteID32, [Ampere1UnitBS]> {
+  let Latency = 18;
+}  // 32-bit Divide
+def : WriteRes<WriteID64, [Ampere1UnitBS]> {
+  let Latency = 34;
+}  // 64-bit Divide
+def : WriteRes<WriteIM32, [Ampere1UnitBS]> {
+  let Latency = 3;
+}  // 32-bit Multiply
+def : WriteRes<WriteIM64, [Ampere1UnitBS]> {
+  let Latency = 3;
+}  // 64-bit Multiply
+def : WriteRes<WriteBr, [Ampere1UnitA]>;
+def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>;
+def : WriteRes<WriteLD, [Ampere1UnitL]> {
+  let Latency = 4;
+}  // Load from base addr plus immediate offset
+def : WriteRes<WriteST, [Ampere1UnitS]> {
+  let Latency = 1;
+}  // Store to base addr plus immediate offset
+def : WriteRes<WriteSTP, [Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}  // Store a register pair.
+def : WriteRes<WriteAdr, [Ampere1UnitAB]>;
+def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitS]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}  // Load from a register index (maybe scaled).
+def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}  // Store to a register index (maybe scaled).
+def : WriteRes<WriteF, [Ampere1UnitXY]> {
+  let Latency = 2;
+}  // General floating-point ops.
+def : WriteRes<WriteFCmp, [Ampere1UnitX]> {
+  let Latency = 5;
+}  // Floating-point compare.
+def : WriteRes<WriteFCvt, [Ampere1UnitXY]> {
+  let Latency = 6;
+}  // Float conversion.
+def : WriteRes<WriteFCopy, [Ampere1UnitXY]> {
+}  // Float-int register copy.
+def : WriteRes<WriteFImm, [Ampere1UnitXY]> {
+  let Latency = 2;
+}  // Floating-point immediate.
+def : WriteRes<WriteFMul, [Ampere1UnitXY]> {
+  let Latency = 5;
+}  // Floating-point multiply.
+def : WriteRes<WriteFDiv, [Ampere1UnitXY]> {
+  let Latency = 34;
+}  // Floating-point division.
+def : WriteRes<WriteVd, [Ampere1UnitXY]> {
+  let Latency = 3;
+}  // 64bit Vector D ops.
+def : WriteRes<WriteVq, [Ampere1UnitXY]> {
+  let Latency = 3;
+}  // 128bit Vector Q ops.
+def : WriteRes<WriteVLD, [Ampere1UnitL, Ampere1UnitL]> {
+  let Latency = 5;
+}  // Vector loads.
+def : WriteRes<WriteVST, [Ampere1UnitS, Ampere1UnitZ]> {
+  let Latency = 2;
+}  // Vector stores.
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> {
+  let Latency = 4;
+}  // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
+
+// Forwarding logic.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// Specialising the scheduling model further for Ampere-1.
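The coarse WriteRes defaults above and the per-instruction InstRW overrides that follow are merged by TableGen into the generated Ampere1Model, and consumers read the resolved numbers through the usual scheduling API rather than from either table directly. A minimal sketch of such a query, assuming only the public TargetSchedModel interface (the function and variable names are illustrative, not part of this patch):

// Sketch: read the latency that the Ampere1 model (WriteRes defaults plus
// any InstRW override) assigns to a machine instruction.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetSchedule.h"

unsigned resolvedLatency(const llvm::MachineFunction &MF,
                         const llvm::MachineInstr &MI) {
  llvm::TargetSchedModel SchedModel;
  SchedModel.init(&MF.getSubtarget()); // picks up Ampere1Model for -mcpu=ampere1
  // An InstRW override (e.g. Ampere1Write_4cyc_1X for SHA256H) takes
  // precedence over the coarse WriteRes mapping when both match.
  return SchedModel.computeInstrLatency(&MI);
}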
+ +def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>; + +// FP and vector load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : InstRW<[Ampere1Write_9cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : 
InstRW<[Ampere1Write_9cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_10cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1Write_9cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1Write_12cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1Write_11cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1Write_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1Write_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1Write_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures 
from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1Write_4cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>; +def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY], + (instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>; +def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : 
InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1Write_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>; +def : InstRW<[Ampere1Write_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(ADC|SBC)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1Write_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1Write_4cyc_1BS], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1Write_4cyc_2L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_5cyc_1AB_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1Write_2cyc_1AB_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex 
"EXTR(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1Write_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1Write_2cyc_1B_1S], + (instrs STPWi, STPXi)>; +def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB], + (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>; + +// Pointer authentication +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>; +def : InstRW<[Ampere1Write_8cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1Write_8cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>; +def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- arithmetic, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>; +// -- 
dot product +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1Write_6cyc_2XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1Write_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1Write_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1Write_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1Write_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1Model diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td new file mode 100644 index 0000000..8552c07 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td @@ -0,0 +1,25 @@ +//===- AArch64SchedPredAmpere.td - AArch64 Sched Preds 
-----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Ampere Computing processors. +// +//===----------------------------------------------------------------------===// + +// Auxiliary predicates. + +// Check for a LSL shift <= 4 +def AmpereCheapLSL : MCSchedPredicate< + CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3, + CheckShiftBy4]>]>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td index 5402b8b..4473f3a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td @@ -53,7 +53,7 @@ let FunctionMapper = "AArch64_AM::getShiftType" in { } // Check for shifting in arithmetic and logic instructions. -foreach I = {0-3, 8} in { +foreach I = {0-4, 8} in { let FunctionMapper = "AArch64_AM::getShiftValue" in def CheckShiftBy#I : CheckImmOperand<3, I>; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index b3eb65d..f9b7ca8 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -238,6 +238,12 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; + case Ampere1: + CacheLineSize = 64; + PrefFunctionLogAlignment = 6; + PrefLoopLogAlignment = 6; + MaxInterleaveFactor = 4; + break; } } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index e919263..d7878c4 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -40,6 +40,7 @@ public: enum ARMProcFamilyEnum : uint8_t { Others, A64FX, + Ampere1, AppleA7, AppleA10, AppleA11, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a184159..135b94d 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -371,6 +371,49 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return Entry->Cost; break; } + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + if (ICA.getArgTypes().empty()) + break; + bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; + auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); + EVT MTy = TLI->getValueType(DL, RetTy); + // Check for the legal types, which are where the size of the input and the + // output are the same, or we are using cvt f64->i32 or f32->i64. 
+ if ((LT.second == MVT::f32 || LT.second == MVT::f64 || + LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || + LT.second == MVT::v2f64) && + (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || + (LT.second == MVT::f64 && MTy == MVT::i32) || + (LT.second == MVT::f32 && MTy == MVT::i64))) + return LT.first; + // Similarly for fp16 sizes + if (ST->hasFullFP16() && + ((LT.second == MVT::f16 && MTy == MVT::i32) || + ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && + (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) + return LT.first; + + // Otherwise we use a legal convert followed by a min+max + if ((LT.second.getScalarType() == MVT::f32 || + LT.second.getScalarType() == MVT::f64 || + (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && + LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { + Type *LegalTy = + Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); + if (LT.second.isVector()) + LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); + InstructionCost Cost = 1; + IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs2, CostKind); + return LT.first * Cost; + } + break; + } default: break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 805b6c7..bfe2e9b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -126,7 +126,6 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( LLT::scalar(64)); const LLT S32 = LLT::scalar(32); - B.setMBB(*MI.getParent()); B.setInstrAndDebugLoc(MI); auto Unmerge = B.buildUnmerge(S32, Src); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 76c8edc..a8310c2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -737,7 +737,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Custom); } } @@ -4772,6 +4772,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SCALAR_TO_VECTOR: + return lowerSCALAR_TO_VECTOR(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: @@ -5768,14 +5770,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, EVT EltVT = VecVT.getVectorElementType(); unsigned VecSize = VecVT.getSizeInBits(); unsigned EltSize = EltVT.getSizeInBits(); + SDLoc SL(Op); - - assert(VecSize <= 64); - + // Specially handle the case of v4i16 with static indexing. 
unsigned NumElts = VecVT.getVectorNumElements(); - SDLoc SL(Op); auto KIdx = dyn_cast<ConstantSDNode>(Idx); - if (NumElts == 4 && EltSize == 16 && KIdx) { SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); @@ -5803,35 +5802,41 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); } + // Static indexing does not lower to stack access, and hence there is no need + // for special custom lowering to avoid stack access. if (isa<ConstantSDNode>(Idx)) return SDValue(); - MVT IntVT = MVT::getIntegerVT(VecSize); - - // Avoid stack access for dynamic indexing. + // Avoid stack access for dynamic indexing by custom lowering to // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - // Create a congruent vector with the target value in each element so that - // the required element can be masked and ORed into the target vector. - SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, - DAG.getSplatBuildVector(VecVT, SL, InsVal)); + assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits"); + MVT IntVT = MVT::getIntegerVT(VecSize); + + // Convert vector index to bit-index and get the required bit mask. assert(isPowerOf2_32(EltSize)); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); - - // Convert vector index to bit-index. SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - - SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); + // 1. Create a congruent vector with the target value in each element. + SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, + DAG.getSplatBuildVector(VecVT, SL, InsVal)); + + // 2. Mask off all other indicies except the required index within (1). SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); + + // 3. Mask off the required index within the target vector. + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec); + // 4. Get (2) and (3) ORed into the target vector. SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); } @@ -5954,6 +5959,22 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); } +SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDValue SVal = Op.getOperand(0); + EVT ResultVT = Op.getValueType(); + EVT SValVT = SVal.getValueType(); + SDValue UndefVal = DAG.getUNDEF(SValVT); + SDLoc SL(Op); + + SmallVector<SDValue, 8> VElts; + VElts.push_back(SVal); + for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) + VElts.push_back(UndefVal); + + return DAG.getBuildVector(ResultVT, SL, VElts); +} + SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -10661,39 +10682,64 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); } -SDValue SITargetLowering::performAddCombine(SDNode *N, +// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z). 
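Before the refactored combine that follows, a scalar restatement of the identity it relies on; this helper is purely illustrative and not part of the AMDGPU backend:

// Scalar model of what V_MAD_U64_U32 computes: a 32x32->64 multiply fused
// with a 64-bit add. This is why the combine first proves both multiply
// operands fit in 32 bits (numBitsUnsigned/numBitsSigned <= 32) before
// emitting the MAD node.
#include <cstdint>

uint64_t mad_u64_u32(uint32_t X, uint32_t Y, uint64_t Z) {
  return static_cast<uint64_t>(X) * Y + Z; // promoted to a full 64-bit product
}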
+SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { + assert(N->getOpcode() == ISD::ADD); + SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); SDLoc SL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) - && Subtarget->hasMad64_32() && - !VT.isVector() && VT.getScalarSizeInBits() > 32 && - VT.getScalarSizeInBits() <= 64) { - if (LHS.getOpcode() != ISD::MUL) - std::swap(LHS, RHS); + if (VT.isVector()) + return SDValue(); - SDValue MulLHS = LHS.getOperand(0); - SDValue MulRHS = LHS.getOperand(1); - SDValue AddRHS = RHS; + unsigned NumBits = VT.getScalarSizeInBits(); + if (NumBits <= 32 || NumBits > 64) + return SDValue(); - // TODO: Maybe restrict if SGPR inputs. - if (numBitsUnsigned(MulLHS, DAG) <= 32 && - numBitsUnsigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); - } + if (LHS.getOpcode() != ISD::MUL) { + assert(RHS.getOpcode() == ISD::MUL); + std::swap(LHS, RHS); + } + + SDValue MulLHS = LHS.getOperand(0); + SDValue MulRHS = LHS.getOperand(1); + SDValue AddRHS = RHS; + + // TODO: Maybe restrict if SGPR inputs. + if (numBitsUnsigned(MulLHS, DAG) <= 32 && + numBitsUnsigned(MulRHS, DAG) <= 32) { + MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); + MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); + AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); + return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); + } + + if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) { + MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); + MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); + AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); + return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); + } + + return SDValue(); +} + +SDValue SITargetLowering::performAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); - if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); + if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; } return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 72a00c1..18bb9fb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -151,6 +151,7 @@ private: SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; @@ -197,6 +198,7 @@ private: SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) 
const; + SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index c2cc302..b2765b2 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2562,8 +2562,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Make sure the LiveIns are still sorted and unique. MBB->sortUniqueLiveIns(); // Replace the edges to PrologueMBB by edges to the sequences - // we are about to add. - MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); + // we are about to add, but only update for immediate predecessors. + if (MBB->isSuccessor(&PrologueMBB)) + MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); } // The required stack size that is aligned to ARM constant criterion. diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index bbd5f5f..44a323d 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -376,7 +376,8 @@ def ProcessorFeatures { FeaturePartwordAtomic, FeatureQuadwordAtomic, FeaturePredictableSelectIsExpensive, - FeatureISA2_07 + FeatureISA2_07, + FeatureCRBits ]; list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion, diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 9d8290f..5b28f0d 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -32,7 +32,7 @@ class PassRegistry; bool lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP); -bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, +bool lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &MCOp, const AsmPrinter &AP); FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM); diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 7d1ec2a..5b2a247 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -63,7 +63,7 @@ public: // Wrapper needed for tblgenned pseudo lowering. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - return LowerRISCVMachineOperandToMCOperand(MO, MCOp, *this); + return lowerRISCVMachineOperandToMCOperand(MO, MCOp, *this); } void emitStartOfAsmFile(Module &M) override; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index de9c151..7fae031 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1121,16 +1121,15 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { SDValue V0 = CurDAG->getRegister(RISCV::V0, VT); // Otherwise use - // vmslt{u}.vx vd, va, x, v0.t; if mask policy is agnostic. + // vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0 + // The result is mask undisturbed. + // We use the same instructions to emulate mask agnostic behavior, because + // the agnostic result can be either undisturbed or all 1. SDValue Cmp = SDValue( CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT, {MaskedOff, Src1, Src2, V0, VL, SEW, Glue}), 0); - if (MaskedOff.isUndef()) { - ReplaceNode(Node, Cmp.getNode()); - return; - } - // Need vmxor.mm vd, vd, v0 to assign inactive value. + // vmxor.mm vd, vd, v0 is used to update active value. 
ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT, {Cmp, Mask, VL, MaskSEW})); return; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4cb3188..ff63b22 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -6918,7 +6918,10 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue Overflow; if (IsAdd && isOneConstant(RHS)) { // Special case uaddo X, 1 overflowed if the addition result is 0. - // FIXME: We can do this for any constant RHS by using (X + C) < C. + // The general case (X + C) < C is not necessarily beneficial. Although we + // reduce the live range of X, we may introduce the materialization of + // constant C, especially when the setcc result is used by branch. We have + // no compare with constant and branch instructions. Overflow = DAG.getSetCC(DL, N->getValueType(1), Res, DAG.getConstant(0, DL, MVT::i64), ISD::SETEQ); } else { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index f7a1998..3831dc5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -114,11 +114,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_D, "fnmsub.d", DINX>; defm : FPFMADynFrmAlias_m<FNMADD_D, "fnmadd.d", DINX>; let SchedRW = [WriteFALU64, ReadFALU64, ReadFALU64] in { -defm FADD_D : FPALU_rr_frm_m<0b0000001, "fadd.d", DINX>; +defm FADD_D : FPALU_rr_frm_m<0b0000001, "fadd.d", DINX, /*Commutable*/1>; defm FSUB_D : FPALU_rr_frm_m<0b0000101, "fsub.d", DINX>; } let SchedRW = [WriteFMul64, ReadFMul64, ReadFMul64] in -defm FMUL_D : FPALU_rr_frm_m<0b0001001, "fmul.d", DINX>; +defm FMUL_D : FPALU_rr_frm_m<0b0001001, "fmul.d", DINX, /*Commutable*/1>; let SchedRW = [WriteFDiv64, ReadFDiv64, ReadFDiv64] in defm FDIV_D : FPALU_rr_frm_m<0b0001101, "fdiv.d", DINX>; @@ -140,8 +140,8 @@ defm FSGNJX_D : FPALU_rr_m<0b0010001, 0b010, "fsgnjx.d", DINX>; } let SchedRW = [WriteFMinMax64, ReadFMinMax64, ReadFMinMax64] in { -defm FMIN_D : FPALU_rr_m<0b0010101, 0b000, "fmin.d", DINX>; -defm FMAX_D : FPALU_rr_m<0b0010101, 0b001, "fmax.d", DINX>; +defm FMIN_D : FPALU_rr_m<0b0010101, 0b000, "fmin.d", DINX, /*Commutable*/1>; +defm FMAX_D : FPALU_rr_m<0b0010101, 0b001, "fmax.d", DINX, /*Commutable*/1>; } defm FCVT_S_D : FPUnaryOp_r_frm_m<0b0100000, 0b00001, FDINX, "fcvt.s.d">, @@ -152,7 +152,7 @@ defm FCVT_D_S : FPUnaryOp_r_m<0b0100001, 0b00000, 0b000, DFINX, "fcvt.d.s">, Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>; let SchedRW = [WriteFCmp64, ReadFCmp64, ReadFCmp64] in { -defm FEQ_D : FPCmp_rr_m<0b1010001, 0b010, "feq.d", DINX>; +defm FEQ_D : FPCmp_rr_m<0b1010001, 0b010, "feq.d", DINX, /*Commutable*/1>; defm FLT_D : FPCmp_rr_m<0b1010001, 0b001, "flt.d", DINX>; defm FLE_D : FPCmp_rr_m<0b1010001, 0b000, "fle.d", DINX>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index a2cd4a0..b1077ae 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -187,28 +187,32 @@ multiclass FPFMADynFrmAlias_m<FPFMA_rrr_frm Inst, string OpcodeStr, let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr, - DAGOperand rty> + DAGOperand rty, bit Commutable> : RVInstR<funct7, funct3, OPC_OP_FP, (outs rty:$rd), - (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">; + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let isCommutable = 
Commutable; +} multiclass FPALU_rr_m<bits<7> funct7, bits<3> funct3, string opcodestr, - list<ExtInfo_r> Exts> { + list<ExtInfo_r> Exts, bit Commutable = 0> { foreach Ext = Exts in let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in - def Ext.Suffix : FPALU_rr<funct7, funct3, opcodestr, Ext.Reg>; + def Ext.Suffix : FPALU_rr<funct7, funct3, opcodestr, Ext.Reg, Commutable>; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, UseNamedOperandTable = 1, hasPostISelHook = 1 in -class FPALU_rr_frm<bits<7> funct7, string opcodestr, DAGOperand rty> +class FPALU_rr_frm<bits<7> funct7, string opcodestr, DAGOperand rty, + bit Commutable> : RVInstRFrm<funct7, OPC_OP_FP, (outs rty:$rd), (ins rty:$rs1, rty:$rs2, frmarg:$frm), opcodestr, - "$rd, $rs1, $rs2, $frm">; - + "$rd, $rs1, $rs2, $frm"> { + let isCommutable = Commutable; +} multiclass FPALU_rr_frm_m<bits<7> funct7, string opcodestr, - list<ExtInfo_r> Exts> { + list<ExtInfo_r> Exts, bit Commutable = 0> { foreach Ext = Exts in let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in - def Ext.Suffix : FPALU_rr_frm<funct7, opcodestr, Ext.Reg>; + def Ext.Suffix : FPALU_rr_frm<funct7, opcodestr, Ext.Reg, Commutable>; } class FPALUDynFrmAlias<FPALU_rr_frm Inst, string OpcodeStr, @@ -269,14 +273,16 @@ multiclass FPUnaryOpDynFrmAlias_m<FPUnaryOp_r_frm Inst, string OpcodeStr, let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPCmp_rr<bits<7> funct7, bits<3> funct3, string opcodestr, - DAGOperand rty> + DAGOperand rty, bit Commutable> : RVInstR<funct7, funct3, OPC_OP_FP, (outs GPR:$rd), - (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">; + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2"> { + let isCommutable = Commutable; +} multiclass FPCmp_rr_m<bits<7> funct7, bits<3> funct3, string opcodestr, - list<ExtInfo_r> Exts> { + list<ExtInfo_r> Exts, bit Commutable = 0> { foreach Ext = Exts in let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in - def Ext.Suffix : FPCmp_rr<funct7, funct3, opcodestr, Ext.Reg>; + def Ext.Suffix : FPCmp_rr<funct7, funct3, opcodestr, Ext.Reg, Commutable>; } //===----------------------------------------------------------------------===// @@ -305,11 +311,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_S, "fnmsub.s", FINX>; defm : FPFMADynFrmAlias_m<FNMADD_S, "fnmadd.s", FINX>; let SchedRW = [WriteFALU32, ReadFALU32, ReadFALU32] in { -defm FADD_S : FPALU_rr_frm_m<0b0000000, "fadd.s", FINX>; +defm FADD_S : FPALU_rr_frm_m<0b0000000, "fadd.s", FINX, /*Commutable*/1>; defm FSUB_S : FPALU_rr_frm_m<0b0000100, "fsub.s", FINX>; } let SchedRW = [WriteFMul32, ReadFMul32, ReadFMul32] in -defm FMUL_S : FPALU_rr_frm_m<0b0001000, "fmul.s", FINX>; +defm FMUL_S : FPALU_rr_frm_m<0b0001000, "fmul.s", FINX, /*Commutable*/1>; let SchedRW = [WriteFDiv32, ReadFDiv32, ReadFDiv32] in defm FDIV_S : FPALU_rr_frm_m<0b0001100, "fdiv.s", FINX>; @@ -331,8 +337,8 @@ defm FSGNJX_S : FPALU_rr_m<0b0010000, 0b010, "fsgnjx.s", FINX>; } let SchedRW = [WriteFMinMax32, ReadFMinMax32, ReadFMinMax32] in { -defm FMIN_S : FPALU_rr_m<0b0010100, 0b000, "fmin.s", FINX>; -defm FMAX_S : FPALU_rr_m<0b0010100, 0b001, "fmax.s", FINX>; +defm FMIN_S : FPALU_rr_m<0b0010100, 0b000, "fmin.s", FINX, /*Commutable*/1>; +defm FMAX_S : FPALU_rr_m<0b0010100, 0b001, "fmax.s", FINX, /*Commutable*/1>; } defm FCVT_W_S : FPUnaryOp_r_frm_m<0b1100000, 0b00000, XFINX, "fcvt.w.s">, @@ -348,7 +354,7 @@ def FMV_X_W : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR32, "fmv.x.w">, Sched<[WriteFMovF32ToI32, 
ReadFMovF32ToI32]>; let SchedRW = [WriteFCmp32, ReadFCmp32, ReadFCmp32] in { -defm FEQ_S : FPCmp_rr_m<0b1010000, 0b010, "feq.s", FINX>; +defm FEQ_S : FPCmp_rr_m<0b1010000, 0b010, "feq.s", FINX, /*Commutable*/1>; defm FLT_S : FPCmp_rr_m<0b1010000, 0b001, "flt.s", FINX>; defm FLE_S : FPCmp_rr_m<0b1010000, 0b000, "fle.s", FINX>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index edaf158..835a0f5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -109,11 +109,11 @@ defm : FPFMADynFrmAlias_m<FNMSUB_H, "fnmsub.h", HINX>; defm : FPFMADynFrmAlias_m<FNMADD_H, "fnmadd.h", HINX>; let SchedRW = [WriteFALU16, ReadFALU16, ReadFALU16] in { -defm FADD_H : FPALU_rr_frm_m<0b0000010, "fadd.h", HINX>; +defm FADD_H : FPALU_rr_frm_m<0b0000010, "fadd.h", HINX, /*Commutable*/1>; defm FSUB_H : FPALU_rr_frm_m<0b0000110, "fsub.h", HINX>; } let SchedRW = [WriteFMul16, ReadFMul16, ReadFMul16] in -defm FMUL_H : FPALU_rr_frm_m<0b0001010, "fmul.h", HINX>; +defm FMUL_H : FPALU_rr_frm_m<0b0001010, "fmul.h", HINX, /*Commutable*/1>; let SchedRW = [WriteFDiv16, ReadFDiv16, ReadFDiv16] in defm FDIV_H : FPALU_rr_frm_m<0b0001110, "fdiv.h", HINX>; @@ -135,8 +135,8 @@ defm FSGNJX_H : FPALU_rr_m<0b0010010, 0b010, "fsgnjx.h", HINX>; } let SchedRW = [WriteFMinMax16, ReadFMinMax16, ReadFMinMax16] in { -defm FMIN_H : FPALU_rr_m<0b0010110, 0b000, "fmin.h", HINX>; -defm FMAX_H : FPALU_rr_m<0b0010110, 0b001, "fmax.h", HINX>; +defm FMIN_H : FPALU_rr_m<0b0010110, 0b000, "fmin.h", HINX, /*Commutable*/1>; +defm FMAX_H : FPALU_rr_m<0b0010110, 0b001, "fmax.h", HINX, /*Commutable*/1>; } defm FCVT_W_H : FPUnaryOp_r_frm_m<0b1100010, 0b00000, XHINX, "fcvt.w.h">, @@ -173,7 +173,7 @@ def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">, } // Predicates = [HasStdExtZfhOrZfhmin] let SchedRW = [WriteFCmp16, ReadFCmp16, ReadFCmp16] in { -defm FEQ_H : FPCmp_rr_m<0b1010010, 0b010, "feq.h", HINX>; +defm FEQ_H : FPCmp_rr_m<0b1010010, 0b010, "feq.h", HINX, /*Commutable*/1>; defm FLT_H : FPCmp_rr_m<0b1010010, 0b001, "flt.h", HINX>; defm FLE_H : FPCmp_rr_m<0b1010010, 0b000, "fle.h", HINX>; } diff --git a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp index c167c09..4b34bba 100644 --- a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp +++ b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp @@ -87,7 +87,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, return MCOperand::createExpr(ME); } -bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, +bool llvm::lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &MCOp, const AsmPrinter &AP) { switch (MO.getType()) { @@ -214,7 +214,7 @@ bool llvm::lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (LowerRISCVMachineOperandToMCOperand(MO, MCOp, AP)) + if (lowerRISCVMachineOperandToMCOperand(MO, MCOp, AP)) OutMI.addOperand(MCOp); } diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 4f37acc..60e1b05 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -1590,9 +1590,11 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, if (getParser().parseExpression(Expr)) return MatchOperand_NoMatch; - auto isOutOfRangeConstant = [&](const 
MCExpr *E) -> bool { + auto isOutOfRangeConstant = [&](const MCExpr *E, bool Negate) -> bool { if (auto *CE = dyn_cast<MCConstantExpr>(E)) { int64_t Value = CE->getValue(); + if (Negate) + Value = -Value; if ((Value & 1) || Value < MinVal || Value > MaxVal) return true; } @@ -1606,7 +1608,7 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, Error(StartLoc, "Expected PC-relative expression"); return MatchOperand_ParseFail; } - if (isOutOfRangeConstant(CE)) { + if (isOutOfRangeConstant(CE, false)) { Error(StartLoc, "offset out of range"); return MatchOperand_ParseFail; } @@ -1621,8 +1623,9 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, // For consistency with the GNU assembler, conservatively assume that a // constant offset must by itself be within the given size range. if (const auto *BE = dyn_cast<MCBinaryExpr>(Expr)) - if (isOutOfRangeConstant(BE->getLHS()) || - isOutOfRangeConstant(BE->getRHS())) { + if (isOutOfRangeConstant(BE->getLHS(), false) || + isOutOfRangeConstant(BE->getRHS(), + BE->getOpcode() == MCBinaryExpr::Sub)) { Error(StartLoc, "offset out of range"); return MatchOperand_ParseFail; } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index d825981..368b05e 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,14 +48,18 @@ let Predicates = [HasAMXTILE, In64BitMode] in { VEX, T8XD; // Pseduo instruction for RA. + let mayLoad = 1 in def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), [(int_x86_ldtilecfg_internal addr:$src)]>; + let mayLoad = 1 in def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + let mayLoad = 1 in def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + let mayStore = 1 in def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4), []>; @@ -67,9 +71,12 @@ let Predicates = [HasAMXTILE, In64BitMode] in { let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
// To be translated to the actual instructions in X86ISelLowering.cpp + let mayLoad = 1 in def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayLoad = 1 in def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayStore = 1 in def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), [(int_x86_tilezero timm:$src)]>; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 0ad3e6c..81f258d 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -964,7 +964,7 @@ static void combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { static bool combineLdSt(SmallVectorImpl<Instruction *> &Casts) { bool Change = false; for (auto *Cast : Casts) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(Cast); + auto *II = cast<IntrinsicInst>(Cast); // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector(x86_amx %42) // store <256 x i32> %43, <256 x i32>* %p, align 64 // --> @@ -984,7 +984,7 @@ static bool combineLdSt(SmallVectorImpl<Instruction *> &Casts) { Store->eraseFromParent(); } else { // x86_cast_vector_to_tile SmallVector<Instruction *, 2> DeadLoads; - LoadInst *Load = dyn_cast<LoadInst>(Cast->getOperand(0)); + auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0)); if (!Load || !Load->hasOneUse()) continue; // %65 = load <256 x i32>, <256 x i32>* %p, align 64 diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 67ed1d8..05364e3 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -935,8 +935,7 @@ def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)", - "MOVZX(16|32|64)rm(8|16)", - "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67? + "MOVZX(16|32|64)rm(8|16)")>; def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> { let Latency = 5; @@ -993,7 +992,8 @@ def: InstRW<[SKLWriteResGroup67], (instrs VBROADCASTSSrm, VPBROADCASTDrm, VPBROADCASTQrm)>; def: InstRW<[SKLWriteResGroup67], (instregex "(V?)MOVSHDUPrm", - "(V?)MOVSLDUPrm")>; + "(V?)MOVSLDUPrm", + "(V?)MOVDDUPrm")>; def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> { let Latency = 6; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 0189acd..b682b51 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -1055,8 +1055,7 @@ def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> { let ResourceCycles = [1]; } def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)", - "MOVZX(16|32|64)rm(8|16)", - "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71? 
+ "MOVZX(16|32|64)rm(8|16)")>; def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> { let Latency = 5; @@ -1159,11 +1158,10 @@ def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> { } def: InstRW<[SKXWriteResGroup71], (instrs VBROADCASTSSrm, VPBROADCASTDrm, - VPBROADCASTQrm, - VMOVSHDUPrm, - VMOVSLDUPrm, - MOVSHDUPrm, - MOVSLDUPrm)>; + VPBROADCASTQrm)>; +def: InstRW<[SKXWriteResGroup71], (instregex "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "(V?)MOVDDUPrm")>; def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> { let Latency = 6; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td index 2103169..5051d4c 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -485,12 +485,6 @@ defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>; def Zn2WriteMicrocoded : SchedWriteRes<[]> { let Latency = 100; } -defm : Zn2WriteResPair<WriteDPPS, [], 15>; -defm : Zn2WriteResPair<WriteFHAdd, [], 7>; -defm : Zn2WriteResPair<WriteFHAddY, [], 7>; -defm : Zn2WriteResPair<WritePHAdd, [], 3>; -defm : Zn2WriteResPair<WritePHAddX, [], 3>; -defm : Zn2WriteResPair<WritePHAddY, [], 3>; def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>; def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>; @@ -1108,6 +1102,14 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; //-- Arithmetic instructions --// +// HADD, HSUB PS/PD +// PHADD|PHSUB (S) W/D. +defm : Zn2WriteResPair<WriteFHAdd, [], 7>; +defm : Zn2WriteResPair<WriteFHAddY, [], 7>; +defm : Zn2WriteResPair<WritePHAdd, [], 3>; +defm : Zn2WriteResPair<WritePHAddX, [], 3>; +defm : Zn2WriteResPair<WritePHAddY, [], 3>; + // PCMPGTQ. def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>; def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; @@ -1478,6 +1480,7 @@ def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>; // DPPS. // x,x,i / v,v,v,i. +defm : Zn2WriteResPair<WriteDPPS, [], 15>; def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>; // x,m,i / v,v,m,i. diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 7f4ca3a..a0bde8dc 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3833,10 +3833,21 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, } } - // TODO: Use default extraction for now, but we should investigate extending this - // to handle repeated subvector extraction. - if (Extract) + if (Extract) { + // vXi1 can be efficiently extracted with MOVMSK. + // TODO: AVX512 predicate mask handling. + // NOTE: This doesn't work well for roundtrip scalarization. + if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { + unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements(); + unsigned MaxElts = ST->hasAVX2() ? 32 : 16; + unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; + return MOVMSKCost; + } + + // TODO: Use default extraction for now, but we should investigate extending + // this to handle repeated subvector extraction. 
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); + } return Cost; } diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 65bc392..a33cb0b 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -218,10 +218,17 @@ static Function *doPromotion( LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" << "From: " << *F); + uint64_t LargestVectorWidth = 0; + for (auto *I : Params) + if (auto *VT = dyn_cast<llvm::VectorType>(I)) + LargestVectorWidth = std::max( + LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinSize()); + // Recompute the parameter attributes list based on the new arguments for // the function. NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrVec)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NF, LargestVectorWidth); ArgAttrVec.clear(); F->getParent()->getFunctionList().insert(F->getIterator(), NF); @@ -313,6 +320,9 @@ static Function *doPromotion( Args.clear(); ArgAttrVec.clear(); + AttributeFuncs::updateMinLegalVectorWidthAttr(*CB.getCaller(), + LargestVectorWidth); + // Update the callgraph to know that the callsite has been transformed. if (ReplaceCallSite) (*ReplaceCallSite)(CB, *NewCS); diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 247da10..40dd6ee 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -2486,6 +2486,12 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } } + uint64_t LargestVectorWidth = 0; + for (auto *I : NewArgumentTypes) + if (auto *VT = dyn_cast<llvm::VectorType>(I)) + LargestVectorWidth = std::max( + LargestVectorWidth, VT->getPrimitiveSizeInBits().getKnownMinSize()); + FunctionType *OldFnTy = OldFn->getFunctionType(); Type *RetTy = OldFnTy->getReturnType(); @@ -2515,6 +2521,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( NewFn->setAttributes(AttributeList::get( Ctx, OldFnAttributeList.getFnAttrs(), OldFnAttributeList.getRetAttrs(), NewArgumentAttributes)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NewFn, LargestVectorWidth); // Since we have now created the new function, splice the body of the old // function right into the new function, leaving the old rotting hulk of the @@ -2592,6 +2599,9 @@ ChangeStatus Attributor::rewriteFunctionSignatures( Ctx, OldCallAttributeList.getFnAttrs(), OldCallAttributeList.getRetAttrs(), NewArgOperandAttributes)); + AttributeFuncs::updateMinLegalVectorWidthAttr(*NewCB->getCaller(), + LargestVectorWidth); + CallSitePairs.push_back({OldCB, NewCB}); return true; }; diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 72b94cd..157cb27e 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" @@ -904,7 +905,7 @@ OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI, } } - SmallPtrSet<Constant *, 1> RepValues; + SmallSetVector<Constant *, 1> RepValues; RepValues.insert(NewGV); // If there is a comparison against null, we will insert a global bool to diff --git a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp 
b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp index 6ec3c61..76f8f1a 100644 --- a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -29,7 +29,7 @@ static bool inferAllPrototypeAttributes( // explicitly visited by CGSCC passes in the new pass manager.) if (F.isDeclaration() && !F.hasOptNone()) { if (!F.hasFnAttribute(Attribute::NoBuiltin)) - Changed |= inferLibFuncAttributes(F, GetTLI(F)); + Changed |= inferNonMandatoryLibFuncAttrs(F, GetTLI(F)); Changed |= inferAttributesFromOthers(F); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 4a62afc..39a32e5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -759,7 +759,7 @@ getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) { V = GEP->getOperand(0); Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1)); Index = ConstantExpr::getAdd( - Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType)); + Index, ConstantExpr::getSExtOrTrunc(GEPIndex, IndexType)); continue; } break; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index ab35698..b044b8a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1298,6 +1298,8 @@ static Instruction *foldFDivPowDivisor(BinaryOperator &I, } Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { + Module *M = I.getModule(); + if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) @@ -1363,8 +1365,8 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) && match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X))); - if ((IsTan || IsCot) && - hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) { + if ((IsTan || IsCot) && hasFloatFn(M, &TLI, I.getType(), LibFunc_tan, + LibFunc_tanf, LibFunc_tanl)) { IRBuilder<> B(&I); IRBuilder<>::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(I.getFastMathFlags()); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 0e65a44..780a446 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -408,6 +409,25 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { if (Access.Addr->isSwiftError()) return None; + // Peel off GEPs and BitCasts. + auto *Addr = Access.Addr->stripInBoundsOffsets(); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { + // Do not instrument PGO counter updates. + if (GV->hasSection()) { + StringRef SectionName = GV->getSection(); + // Check if the global is in the PGO counters section. + auto OF = Triple(I->getModule()->getTargetTriple()).getObjectFormat(); + if (SectionName.endswith( + getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false))) + return None; + } + + // Do not instrument accesses to LLVM internal variables. 
+ if (GV->getName().startswith("__llvm")) + return None; + } + const DataLayout &DL = I->getModule()->getDataLayout(); Access.TypeSize = DL.getTypeStoreSizeInBits(Access.AccessTy); return Access; @@ -613,8 +633,6 @@ bool MemProfiler::instrumentFunction(Function &F) { initializeCallbacks(*F.getParent()); - FunctionModified |= insertDynamicShadowAtFunctionEntry(F); - SmallVector<Instruction *, 16> ToInstrument; // Fill the set of memory operations to instrument. @@ -625,6 +643,15 @@ bool MemProfiler::instrumentFunction(Function &F) { } } + if (ToInstrument.empty()) { + LLVM_DEBUG(dbgs() << "MEMPROF done instrumenting: " << FunctionModified + << " " << F << "\n"); + + return FunctionModified; + } + + FunctionModified |= insertDynamicShadowAtFunctionEntry(F); + int NumInstrumented = 0; for (auto *Inst : ToInstrument) { if (ClDebugMin < 0 || ClDebugMax < 0 || diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 5338032..2610ef1 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -774,12 +774,9 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { unsigned NumOrigPreds = Preds.size(); // We can only sink instructions through unconditional branches. - for (auto I = Preds.begin(); I != Preds.end();) { - if ((*I)->getTerminator()->getNumSuccessors() != 1) - I = Preds.erase(I); - else - ++I; - } + llvm::erase_if(Preds, [](BasicBlock *BB) { + return BB->getTerminator()->getNumSuccessors() != 1; + }); LockstepReverseIterator LRI(Preds); SmallVector<SinkingInstructionCandidate, 4> Candidates; diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 11c756c..87202f7 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1100,6 +1100,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) { + Module *M = TheStore->getModule(); Value *SplatValue = isBytewiseValue(StoredVal, *DL); Constant *PatternValue = nullptr; @@ -1182,15 +1183,14 @@ bool LoopIdiomRecognize::processLoopStridedStore( NewCall = Builder.CreateMemSet( BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); - } else { + } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; - Module *M = TheStore->getModule(); StringRef FuncName = "memset_pattern16"; - FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntIdxTy); - inferLibFuncAttributes(M, FuncName, *TLI); + FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, + Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); + inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. 
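The processLoopStridedStore() change above only forms the memset_pattern16 call when the new isLibFuncEmittable() check passes, and routes the declaration through getOrInsertLibFunc() plus inferNonMandatoryLibFuncAttrs(). A minimal caller-side sketch of that pattern follows; it is illustrative only, not code from the patch, and the helper name and locals are assumptions:

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/BuildLibCalls.h"
    using namespace llvm;

    // Only emit memset_pattern16 if the target has it and the module does not
    // already define a clashing symbol with an incompatible prototype.
    static FunctionCallee tryGetMemSetPattern16(Module *M, IRBuilderBase &B,
                                                const TargetLibraryInfo &TLI) {
      if (!isLibFuncEmittable(M, &TLI, LibFunc_memset_pattern16))
        return nullptr;
      const DataLayout &DL = M->getDataLayout();
      Type *Int8PtrTy = B.getInt8PtrTy();
      Type *IntPtrTy = DL.getIntPtrType(M->getContext());
      // getOrInsertLibFunc also attaches any mandatory ABI attributes (e.g.
      // signext/zeroext on i32 arguments) that a plain getOrInsertFunction
      // call would not add.
      FunctionCallee MSP = getOrInsertLibFunc(M, TLI, LibFunc_memset_pattern16,
                                              B.getVoidTy(), Int8PtrTy,
                                              Int8PtrTy, IntPtrTy);
      inferNonMandatoryLibFuncAttrs(M, TLI.getName(LibFunc_memset_pattern16),
                                    TLI);
      return MSP;
    }

Centralizing the mandatory argument-extension attributes in getOrInsertLibFunc() is what lets the per-call-site setArgExtAttr() handling be dropped from the attribute inference code later in this patch.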
@@ -1201,7 +1201,9 @@ bool LoopIdiomRecognize::processLoopStridedStore( GV->setAlignment(Align(16)); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - } + } else + return Changed; + NewCall->setDebugLoc(TheStore->getDebugLoc()); if (MSSAU) { diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index cff8f51..344f89e 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -575,9 +575,11 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { if (OpI->getType()->isVectorTy()) { Scattered[I] = scatter(&CI, OpI); assert(Scattered[I].size() == NumElems && "mismatched call operands"); + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) + Tys.push_back(OpI->getType()->getScalarType()); } else { ScalarOperands[I] = OpI; - if (hasVectorIntrinsicOverloadedScalarOpd(ID, I)) + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) Tys.push_back(OpI->getType()); } } @@ -593,7 +595,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { ScalarCallOps.clear(); for (unsigned J = 0; J != NumArgs; ++J) { - if (hasVectorIntrinsicScalarOpd(ID, J)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) ScalarCallOps.push_back(ScalarOperands[J]); else ScalarCallOps.push_back(Scattered[J][Elem]); diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index b3f1229..14c1fed 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -204,13 +204,19 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, /// branch on a single value. static void buildPartialUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, - BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) { + BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, + Instruction *I, AssumptionCache *AC, DominatorTree &DT) { IRBuilder<> IRB(&BB); - Value *Cond = Direction ? IRB.CreateOr(Invariants) : - IRB.CreateAnd(Invariants); - if (InsertFreeze) - Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr"); + SmallVector<Value *> FrozenInvariants; + for (Value *Inv : Invariants) { + if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(Inv, AC, I, &DT)) + Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr"); + FrozenInvariants.push_back(Inv); + } + + Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants) + : IRB.CreateAnd(FrozenInvariants); IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? &NormalSucc : &UnswitchedSucc); } @@ -572,10 +578,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, " condition!"); buildPartialUnswitchConditionalBranch( *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, - FreezeLoopUnswitchCond && any_of(Invariants, [&](Value *C) { - return !isGuaranteedNotToBeUndefOrPoison(C, nullptr, - OldPH->getTerminator(), &DT); - })); + FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); } // Update the dominator tree with the added edge. 
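The buildPartialUnswitchConditionalBranch() change above moves the freeze from the combined condition onto each individual invariant, and only freezes values that are not already guaranteed to be neither undef nor poison. In isolation the idiom looks like the following sketch; the function name and parameters are assumed for illustration, not taken from the patch:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Freeze only the operands that may be undef or poison, then combine them.
    // Freezing per operand keeps each invariant independently well-defined,
    // instead of freezing the whole or/and once at the end.
    static Value *buildFrozenDisjunction(IRBuilderBase &IRB,
                                         ArrayRef<Value *> Invariants,
                                         AssumptionCache *AC, Instruction *CtxI,
                                         DominatorTree &DT) {
      SmallVector<Value *, 4> Frozen;
      for (Value *Inv : Invariants) {
        if (!isGuaranteedNotToBeUndefOrPoison(Inv, AC, CtxI, &DT))
          Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr");
        Frozen.push_back(Inv);
      }
      return IRB.CreateOr(Frozen);
    }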
@@ -2318,11 +2321,9 @@ static void unswitchNontrivialInvariants( buildPartialInvariantUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); else { - buildPartialUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, - InsertFreeze && any_of(Invariants, [&](Value *C) { - return !isGuaranteedNotToBeUndefOrPoison(C, &AC, BI, &DT); - })); + buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, + *ClonedPH, *LoopPH, InsertFreeze, + BI, &AC, DT); } DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 1f4f1c9..40fd407 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -39,7 +39,6 @@ STATISTIC(NumInaccessibleMemOrArgMemOnly, STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind"); STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture"); STATISTIC(NumWriteOnlyArg, "Number of arguments inferred as writeonly"); -STATISTIC(NumExtArg, "Number of arguments inferred as signext/zeroext."); STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly"); STATISTIC(NumNoAlias, "Number of function returns inferred as noalias"); STATISTIC(NumNoUndef, "Number of function returns inferred as noundef returns"); @@ -147,16 +146,6 @@ static bool setOnlyWritesMemory(Function &F, unsigned ArgNo) { return true; } -static bool setArgExtAttr(Function &F, unsigned ArgNo, - const TargetLibraryInfo &TLI, bool Signed = true) { - Attribute::AttrKind ExtAttr = TLI.getExtAttrForI32Param(Signed); - if (ExtAttr == Attribute::None || F.hasParamAttribute(ArgNo, ExtAttr)) - return false; - F.addParamAttr(ArgNo, ExtAttr); - ++NumExtArg; - return true; -} - static bool setRetNoUndef(Function &F) { if (!F.getReturnType()->isVoidTy() && !F.hasRetAttribute(Attribute::NoUndef)) { @@ -231,6 +220,13 @@ static bool setAlignedAllocParam(Function &F, unsigned ArgNo) { return true; } +static bool setAllocatedPointerParam(Function &F, unsigned ArgNo) { + if (F.hasParamAttribute(ArgNo, Attribute::AllocatedPointer)) + return false; + F.addParamAttr(ArgNo, Attribute::AllocatedPointer); + return true; +} + static bool setAllocSize(Function &F, unsigned ElemSizeArg, Optional<unsigned> NumElemsArg) { if (F.hasFnAttribute(Attribute::AllocSize)) @@ -240,15 +236,23 @@ static bool setAllocSize(Function &F, unsigned ElemSizeArg, return true; } -bool llvm::inferLibFuncAttributes(Module *M, StringRef Name, - const TargetLibraryInfo &TLI) { +static bool setAllocFamily(Function &F, StringRef Family) { + if (F.hasFnAttribute("alloc-family")) + return false; + F.addFnAttr("alloc-family", Family); + return true; +} + +bool llvm::inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, + const TargetLibraryInfo &TLI) { Function *F = M->getFunction(Name); if (!F) return false; - return inferLibFuncAttributes(*F, TLI); + return inferNonMandatoryLibFuncAttrs(*F, TLI); } -bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { +bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, + const TargetLibraryInfo &TLI) { LibFunc TheLibFunc; if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc))) return false; @@ -376,6 +380,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setArgNoUndef(F, 1); LLVM_FALLTHROUGH; case LibFunc_strdup: + Changed |= setAllocFamily(F, "malloc"); Changed |= 
setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); @@ -437,7 +442,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { LLVM_FALLTHROUGH; case LibFunc_valloc: case LibFunc_malloc: + Changed |= setAllocFamily(F, "malloc"); + LLVM_FALLTHROUGH; case LibFunc_vec_malloc: + Changed |= setAllocFamily(F, "vec_malloc"); Changed |= setAllocSize(F, 0, None); Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetAndArgsNoUndef(F); @@ -501,6 +509,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_memalign: + Changed |= setAllocFamily(F, "malloc"); Changed |= setAllocSize(F, 1, None); Changed |= setAlignedAllocParam(F, 0); Changed |= setOnlyAccessesInaccessibleMemory(F); @@ -522,8 +531,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_realloc: - case LibFunc_vec_realloc: case LibFunc_reallocf: + Changed |= setAllocFamily(F, "malloc"); + LLVM_FALLTHROUGH; + case LibFunc_vec_realloc: + Changed |= setAllocFamily(F, "vec_malloc"); + Changed |= setAllocatedPointerParam(F, 0); Changed |= setAllocSize(F, 1, None); Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setRetNoUndef(F); @@ -597,7 +610,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyWritesMemory(F, 0); return Changed; case LibFunc_calloc: + Changed |= setAllocFamily(F, "malloc"); + LLVM_FALLTHROUGH; case LibFunc_vec_calloc: + Changed |= setAllocFamily(F, "vec_malloc"); Changed |= setAllocSize(F, 0, 1); Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetAndArgsNoUndef(F); @@ -656,7 +672,11 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_free: + Changed |= setAllocFamily(F, "malloc"); + LLVM_FALLTHROUGH; case LibFunc_vec_free: + Changed |= setAllocFamily(F, "vec_malloc"); + Changed |= setAllocatedPointerParam(F, 0); Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setArgsNoUndef(F); Changed |= setDoesNotThrow(F); @@ -845,7 +865,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_putchar: case LibFunc_putchar_unlocked: Changed |= setRetAndArgsNoUndef(F); - Changed |= setArgExtAttr(F, 0, TLI); Changed |= setDoesNotThrow(F); return Changed; case LibFunc_popen: @@ -1066,7 +1085,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_ldexp: case LibFunc_ldexpf: case LibFunc_ldexpl: - Changed |= setArgExtAttr(F, 1, TLI); Changed |= setWillReturn(F); return Changed; case LibFunc_abs: @@ -1203,34 +1221,141 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { } } -bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty, +static void setArgExtAttr(Function &F, unsigned ArgNo, + const TargetLibraryInfo &TLI, bool Signed = true) { + Attribute::AttrKind ExtAttr = TLI.getExtAttrForI32Param(Signed); + if (ExtAttr != Attribute::None && !F.hasParamAttribute(ArgNo, ExtAttr)) + F.addParamAttr(ArgNo, ExtAttr); +} + +FunctionCallee llvm::getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, + LibFunc TheLibFunc, FunctionType *T, + AttributeList AttributeList) { + assert(TLI.has(TheLibFunc) && + "Creating call to non-existing library function."); + StringRef 
Name = TLI.getName(TheLibFunc); + FunctionCallee C = M->getOrInsertFunction(Name, T, AttributeList); + + // Make sure any mandatory argument attributes are added. + + // Any outgoing i32 argument should be handled with setArgExtAttr() which + // will add an extension attribute if the target ABI requires it. Adding + // argument extensions is typically done by the front end but when an + // optimizer is building a library call on its own it has to take care of + // this. Each such generated function must be handled here with sign or + // zero extensions as needed. F is retreived with cast<> because we demand + // of the caller to have called isLibFuncEmittable() first. + Function *F = cast<Function>(C.getCallee()); + assert(F->getFunctionType() == T && "Function type does not match."); + switch (TheLibFunc) { + case LibFunc_fputc: + case LibFunc_putchar: + setArgExtAttr(*F, 0, TLI); + break; + case LibFunc_ldexp: + case LibFunc_ldexpf: + case LibFunc_ldexpl: + case LibFunc_memchr: + case LibFunc_strchr: + setArgExtAttr(*F, 1, TLI); + break; + case LibFunc_memccpy: + setArgExtAttr(*F, 2, TLI); + break; + + // These are functions that are known to not need any argument extension + // on any target: A size_t argument (which may be an i32 on some targets) + // should not trigger the assert below. + case LibFunc_bcmp: + case LibFunc_calloc: + case LibFunc_fwrite: + case LibFunc_malloc: + case LibFunc_memcmp: + case LibFunc_memcpy_chk: + case LibFunc_mempcpy: + case LibFunc_memset_pattern16: + case LibFunc_snprintf: + case LibFunc_stpncpy: + case LibFunc_strlcat: + case LibFunc_strlcpy: + case LibFunc_strncat: + case LibFunc_strncmp: + case LibFunc_strncpy: + case LibFunc_vsnprintf: + break; + + default: +#ifndef NDEBUG + for (unsigned i = 0; i < T->getNumParams(); i++) + assert(!isa<IntegerType>(T->getParamType(i)) && + "Unhandled integer argument."); +#endif + break; + } + + return C; +} + +FunctionCallee llvm::getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, + LibFunc TheLibFunc, FunctionType *T) { + return getOrInsertLibFunc(M, TLI, TheLibFunc, T, AttributeList()); +} + +bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, + LibFunc TheLibFunc) { + StringRef FuncName = TLI->getName(TheLibFunc); + if (!TLI->has(TheLibFunc)) + return false; + + // Check if the Module already has a GlobalValue with the same name, in + // which case it must be a Function with the expected type. 
+ if (GlobalValue *GV = M->getNamedValue(FuncName)) { + if (auto *F = dyn_cast<Function>(GV)) + return TLI->isValidProtoForLibFunc(*F->getFunctionType(), TheLibFunc, *M); + return false; + } + + return true; +} + +bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, + StringRef Name) { + LibFunc TheLibFunc; + return TLI->getLibFunc(Name, TheLibFunc) && + isLibFuncEmittable(M, TLI, TheLibFunc); +} + +bool llvm::hasFloatFn(const Module *M, const TargetLibraryInfo *TLI, Type *Ty, LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) { switch (Ty->getTypeID()) { case Type::HalfTyID: return false; case Type::FloatTyID: - return TLI->has(FloatFn); + return isLibFuncEmittable(M, TLI, FloatFn); case Type::DoubleTyID: - return TLI->has(DoubleFn); + return isLibFuncEmittable(M, TLI, DoubleFn); default: - return TLI->has(LongDoubleFn); + return isLibFuncEmittable(M, TLI, LongDoubleFn); } } -StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { - assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && +StringRef llvm::getFloatFn(const Module *M, const TargetLibraryInfo *TLI, + Type *Ty, LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn, LibFunc &TheLibFunc) { + assert(hasFloatFn(M, TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && "Cannot get name for unavailable function!"); switch (Ty->getTypeID()) { case Type::HalfTyID: llvm_unreachable("No name for HalfTy!"); case Type::FloatTyID: + TheLibFunc = FloatFn; return TLI->getName(FloatFn); case Type::DoubleTyID: + TheLibFunc = DoubleFn; return TLI->getName(DoubleFn); default: + TheLibFunc = LongDoubleFn; return TLI->getName(LongDoubleFn); } } @@ -1247,14 +1372,14 @@ static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType, ArrayRef<Value *> Operands, IRBuilderBase &B, const TargetLibraryInfo *TLI, bool IsVaArgs = false) { - if (!TLI->has(TheLibFunc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, TheLibFunc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FuncName = TLI->getName(TheLibFunc); FunctionType *FuncType = FunctionType::get(ReturnType, ParamTypes, IsVaArgs); - FunctionCallee Callee = M->getOrInsertFunction(FuncName, FuncType); - inferLibFuncAttributes(M, FuncName, *TLI); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, FuncType); + inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); CallInst *CI = B.CreateCall(Callee, Operands, FuncName); if (const Function *F = dyn_cast<Function>(Callee.getCallee()->stripPointerCasts())) @@ -1323,16 +1448,16 @@ Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B, Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_memcpy_chk)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_memcpy_chk)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); AttributeList AS; AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - FunctionCallee MemCpy = M->getOrInsertFunction( - "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(), + FunctionCallee MemCpy = getOrInsertLibFunc(M, *TLI, LibFunc_memcpy_chk, + AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), 
DL.getIntPtrType(Context), DL.getIntPtrType(Context)); Dst = castToCStr(Dst, B); @@ -1466,14 +1591,15 @@ static void appendTypeSuffix(Value *Op, StringRef &Name, } } -static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name, - IRBuilderBase &B, - const AttributeList &Attrs) { +static Value *emitUnaryFloatFnCallHelper(Value *Op, LibFunc TheLibFunc, + StringRef Name, IRBuilderBase &B, + const AttributeList &Attrs, + const TargetLibraryInfo *TLI) { assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall"); Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = - M->getOrInsertFunction(Name, Op->getType(), Op->getType()); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, Op->getType(), + Op->getType()); CallInst *CI = B.CreateCall(Callee, Op, Name); // The incoming attribute set may have come from a speculatable intrinsic, but @@ -1488,12 +1614,16 @@ static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name, return CI; } -Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B, +Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, + StringRef Name, IRBuilderBase &B, const AttributeList &Attrs) { SmallString<20> NameBuffer; appendTypeSuffix(Op, Name, NameBuffer); - return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); + LibFunc TheLibFunc; + TLI->getLibFunc(Name, TheLibFunc); + + return emitUnaryFloatFnCallHelper(Op, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, @@ -1501,23 +1631,25 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, LibFunc LongDoubleFn, IRBuilderBase &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. - StringRef Name = getFloatFnName(TLI, Op->getType(), - DoubleFn, FloatFn, LongDoubleFn); + Module *M = B.GetInsertBlock()->getModule(); + LibFunc TheLibFunc; + StringRef Name = getFloatFn(M, TLI, Op->getType(), DoubleFn, FloatFn, + LongDoubleFn, TheLibFunc); - return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); + return emitUnaryFloatFnCallHelper(Op, TheLibFunc, Name, B, Attrs, TLI); } static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, + LibFunc TheLibFunc, StringRef Name, IRBuilderBase &B, const AttributeList &Attrs, - const TargetLibraryInfo *TLI = nullptr) { + const TargetLibraryInfo *TLI) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(), - Op1->getType(), Op2->getType()); - if (TLI != nullptr) - inferLibFuncAttributes(M, Name, *TLI); + FunctionCallee Callee = getOrInsertLibFunc(M, *TLI, TheLibFunc, Op1->getType(), + Op1->getType(), Op2->getType()); + inferNonMandatoryLibFuncAttrs(M, Name, *TLI); CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name); // The incoming attribute set may have come from a speculatable intrinsic, but @@ -1532,15 +1664,19 @@ static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, return CI; } -Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, - IRBuilderBase &B, +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, + const TargetLibraryInfo *TLI, + StringRef Name, IRBuilderBase &B, const AttributeList &Attrs) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); SmallString<20> NameBuffer; appendTypeSuffix(Op1, Name, NameBuffer); - return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); + LibFunc 
TheLibFunc; + TLI->getLibFunc(Name, TheLibFunc); + + return emitBinaryFloatFnCallHelper(Op1, Op2, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, @@ -1549,22 +1685,24 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, LibFunc LongDoubleFn, IRBuilderBase &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. - StringRef Name = getFloatFnName(TLI, Op1->getType(), - DoubleFn, FloatFn, LongDoubleFn); + Module *M = B.GetInsertBlock()->getModule(); + LibFunc TheLibFunc; + StringRef Name = getFloatFn(M, TLI, Op1->getType(), DoubleFn, FloatFn, + LongDoubleFn, TheLibFunc); - return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs, TLI); + return emitBinaryFloatFnCallHelper(Op1, Op2, TheLibFunc, Name, B, Attrs, TLI); } Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_putchar)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_putchar)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef PutCharName = TLI->getName(LibFunc_putchar); - FunctionCallee PutChar = - M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty()); - inferLibFuncAttributes(M, PutCharName, *TLI); + FunctionCallee PutChar = getOrInsertLibFunc(M, *TLI, LibFunc_putchar, + B.getInt32Ty(), B.getInt32Ty()); + inferNonMandatoryLibFuncAttrs(M, PutCharName, *TLI); CallInst *CI = B.CreateCall(PutChar, B.CreateIntCast(Char, B.getInt32Ty(), @@ -1580,14 +1718,14 @@ Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B, Value *llvm::emitPutS(Value *Str, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_puts)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_puts)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef PutsName = TLI->getName(LibFunc_puts); - FunctionCallee PutS = - M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy()); - inferLibFuncAttributes(M, PutsName, *TLI); + FunctionCallee PutS = getOrInsertLibFunc(M, *TLI, LibFunc_puts, B.getInt32Ty(), + B.getInt8PtrTy()); + inferNonMandatoryLibFuncAttrs(M, PutsName, *TLI); CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName); if (const Function *F = dyn_cast<Function>(PutS.getCallee()->stripPointerCasts())) @@ -1597,15 +1735,15 @@ Value *llvm::emitPutS(Value *Str, IRBuilderBase &B, Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fputc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_fputc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FPutcName = TLI->getName(LibFunc_fputc); - FunctionCallee F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(), - B.getInt32Ty(), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fputc, B.getInt32Ty(), + B.getInt32Ty(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FPutcName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FPutcName, *TLI); Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true, "chari"); CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName); @@ -1618,15 +1756,15 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B, Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fputs)) + Module *M = B.GetInsertBlock()->getModule(); + 
if (!isLibFuncEmittable(M, TLI, LibFunc_fputs)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef FPutsName = TLI->getName(LibFunc_fputs); - FunctionCallee F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(), - B.getInt8PtrTy(), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fputs, B.getInt32Ty(), + B.getInt8PtrTy(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FPutsName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FPutsName, *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName); if (const Function *Fn = @@ -1637,18 +1775,18 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B, Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_fwrite)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_fwrite)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); LLVMContext &Context = B.GetInsertBlock()->getContext(); StringRef FWriteName = TLI->getName(LibFunc_fwrite); - FunctionCallee F = M->getOrInsertFunction( - FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(), - DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); + FunctionCallee F = getOrInsertLibFunc(M, *TLI, LibFunc_fwrite, + DL.getIntPtrType(Context), B.getInt8PtrTy(), DL.getIntPtrType(Context), + DL.getIntPtrType(Context), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(M, FWriteName, *TLI); + inferNonMandatoryLibFuncAttrs(M, FWriteName, *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, ConstantInt::get(DL.getIntPtrType(Context), 1), File}); @@ -1661,15 +1799,15 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B, Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_malloc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, TLI, LibFunc_malloc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef MallocName = TLI->getName(LibFunc_malloc); LLVMContext &Context = B.GetInsertBlock()->getContext(); - FunctionCallee Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(), - DL.getIntPtrType(Context)); - inferLibFuncAttributes(M, MallocName, *TLI); + FunctionCallee Malloc = getOrInsertLibFunc(M, *TLI, LibFunc_malloc, + B.getInt8PtrTy(), DL.getIntPtrType(Context)); + inferNonMandatoryLibFuncAttrs(M, MallocName, *TLI); CallInst *CI = B.CreateCall(Malloc, Num, MallocName); if (const Function *F = @@ -1681,16 +1819,16 @@ Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, Value *llvm::emitCalloc(Value *Num, Value *Size, IRBuilderBase &B, const TargetLibraryInfo &TLI) { - if (!TLI.has(LibFunc_calloc)) + Module *M = B.GetInsertBlock()->getModule(); + if (!isLibFuncEmittable(M, &TLI, LibFunc_calloc)) return nullptr; - Module *M = B.GetInsertBlock()->getModule(); StringRef CallocName = TLI.getName(LibFunc_calloc); const DataLayout &DL = M->getDataLayout(); IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext())); - FunctionCallee Calloc = - M->getOrInsertFunction(CallocName, B.getInt8PtrTy(), PtrType, PtrType); - inferLibFuncAttributes(M, CallocName, TLI); + FunctionCallee Calloc = getOrInsertLibFunc(M, TLI, LibFunc_calloc, + B.getInt8PtrTy(), PtrType, PtrType); + inferNonMandatoryLibFuncAttrs(M, CallocName, 
TLI); CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName); if (const auto *F = diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 7a9a272..e72e3ce 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -500,6 +500,13 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, if (isMathLibCallNoop(Call, TLI)) return true; + // Non-volatile atomic loads from constants can be removed. + if (auto *LI = dyn_cast<LoadInst>(I)) + if (auto *GV = dyn_cast<GlobalVariable>( + LI->getPointerOperand()->stripPointerCasts())) + if (!LI->isVolatile() && GV->isConstant()) + return true; + return false; } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 38dca39..0710511 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1190,13 +1190,15 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI, } Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); if (Value *V = optimizeMemCmpBCmpCommon(CI, B)) return V; // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0 // bcmp can be more efficient than memcmp because it only has to know that // there is a difference, not how different one is to the other. - if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) { + if (isLibFuncEmittable(M, TLI, LibFunc_bcmp) && + isOnlyUsedInZeroEqualityComparison(CI)) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -1360,7 +1362,8 @@ static Value *valueHasFloatPrecision(Value *Val) { /// Shrink double -> float functions. static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, - bool isBinary, bool isPrecise = false) { + bool isBinary, const TargetLibraryInfo *TLI, + bool isPrecise = false) { Function *CalleeFn = CI->getCalledFunction(); if (!CI->getType()->isDoubleTy() || !CalleeFn) return nullptr; @@ -1410,22 +1413,25 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); } else { AttributeList CalleeAttrs = CalleeFn->getAttributes(); - R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs) - : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs); + R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], TLI, CalleeName, B, + CalleeAttrs) + : emitUnaryFloatFnCall(V[0], TLI, CalleeName, B, CalleeAttrs); } return B.CreateFPExt(R, B.getDoubleTy()); } /// Shrink double -> float for unary functions. static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B, + const TargetLibraryInfo *TLI, bool isPrecise = false) { - return optimizeDoubleFP(CI, B, false, isPrecise); + return optimizeDoubleFP(CI, B, false, TLI, isPrecise); } /// Shrink double -> float for binary functions. static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B, + const TargetLibraryInfo *TLI, bool isPrecise = false) { - return optimizeDoubleFP(CI, B, true, isPrecise); + return optimizeDoubleFP(CI, B, true, TLI, isPrecise); } // cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z))) @@ -1541,6 +1547,7 @@ static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B, unsigned DstWidth) { /// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x); /// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x). 
Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { + Module *M = Pow->getModule(); Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1); AttributeList Attrs; // Attributes are only meaningful on the original call Module *Mod = Pow->getModule(); @@ -1568,7 +1575,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { Function *CalleeFn = BaseFn->getCalledFunction(); if (CalleeFn && - TLI->getLibFunc(CalleeFn->getName(), LibFn) && TLI->has(LibFn)) { + TLI->getLibFunc(CalleeFn->getName(), LibFn) && + isLibFuncEmittable(M, TLI, LibFn)) { StringRef ExpName; Intrinsic::ID ID; Value *ExpFn; @@ -1620,7 +1628,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // pow(2.0, itofp(x)) -> ldexp(1.0, x) if (match(Base, m_SpecificFP(2.0)) && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) && - hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) return copyFlags(*Pow, emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, @@ -1629,7 +1637,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { } // pow(2.0 ** n, x) -> exp2(n * x) - if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { + if (hasFloatFn(M, TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { APFloat BaseR = APFloat(1.0); BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored); BaseR = BaseR / *BaseF; @@ -1656,7 +1664,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // pow(10.0, x) -> exp10(x) // TODO: There is no exp10() intrinsic yet, but some day there shall be one. if (match(Base, m_SpecificFP(10.0)) && - hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) + hasFloatFn(M, TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) return copyFlags(*Pow, emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l, B, Attrs)); @@ -1681,7 +1689,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( Mod, Intrinsic::exp2, Ty), FMul, "exp2")); - else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) + else if (hasFloatFn(M, TLI, Ty, LibFunc_exp2, LibFunc_exp2f, + LibFunc_exp2l)) return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l, B, Attrs)); @@ -1702,7 +1711,8 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno, } // Otherwise, use the libcall for sqrt(). - if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl)) + if (hasFloatFn(M, TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, + LibFunc_sqrtl)) // TODO: We also should check that the target can in fact lower the sqrt() // libcall. We currently have no way to ask this question, so we ask if // the target has a sqrt() libcall, which is not exactly the same. @@ -1892,8 +1902,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { // Shrink pow() to powf() if the arguments are single precision, // unless the result is expected to be double precision. 
if (UnsafeFPShrink && Name == TLI->getName(LibFunc_pow) && - hasFloatVersion(Name)) { - if (Value *Shrunk = optimizeBinaryDoubleFP(Pow, B, true)) + hasFloatVersion(M, Name)) { + if (Value *Shrunk = optimizeBinaryDoubleFP(Pow, B, TLI, true)) return Shrunk; } @@ -1901,13 +1911,14 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); AttributeList Attrs; // Attributes are only meaningful on the original call StringRef Name = Callee->getName(); Value *Ret = nullptr; if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) && - hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, true); + hasFloatVersion(M, Name)) + Ret = optimizeUnaryDoubleFP(CI, B, TLI, true); Type *Ty = CI->getType(); Value *Op = CI->getArgOperand(0); @@ -1915,7 +1926,7 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= IntSize // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < IntSize if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) && - hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *Exp = getIntToFPVal(Op, B, TLI->getIntSize())) return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, @@ -1926,12 +1937,14 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); + // If we can shrink the call to a float function rather than a double // function, do that first. Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); - if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name)) - if (Value *Ret = optimizeBinaryDoubleFP(CI, B)) + if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(M, Name)) + if (Value *Ret = optimizeBinaryDoubleFP(CI, B, TLI)) return Ret; // The LLVM intrinsics minnum/maxnum correspond to fmin/fmax. Canonicalize to @@ -1962,8 +1975,8 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Type *Ty = Log->getType(); Value *Ret = nullptr; - if (UnsafeFPShrink && hasFloatVersion(LogNm)) - Ret = optimizeUnaryDoubleFP(Log, B, true); + if (UnsafeFPShrink && hasFloatVersion(Mod, LogNm)) + Ret = optimizeUnaryDoubleFP(Log, B, TLI, true); // The earlier call must also be 'fast' in order to do these transforms. CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0)); @@ -2071,7 +2084,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Log->doesNotAccessMemory() ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), Arg->getOperand(0), "log") - : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs); + : emitUnaryFloatFnCall(Arg->getOperand(0), TLI, LogNm, B, Attrs); Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul"); // Since pow() may have side effects, e.g. errno, // dead code elimination may not be trusted to remove it. @@ -2094,7 +2107,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { Value *LogE = Log->doesNotAccessMemory() ? 
B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), Eul, "log") - : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs); + : emitUnaryFloatFnCall(Eul, TLI, LogNm, B, Attrs); Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul"); // Since exp() may have side effects, e.g. errno, // dead code elimination may not be trusted to remove it. @@ -2106,14 +2119,16 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { } Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; // TODO: Once we have a way (other than checking for the existince of the // libcall) to tell whether our target can lower @llvm.sqrt, relax the // condition below. - if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" || - Callee->getIntrinsicID() == Intrinsic::sqrt)) - Ret = optimizeUnaryDoubleFP(CI, B, true); + if (isLibFuncEmittable(M, TLI, LibFunc_sqrtf) && + (Callee->getName() == "sqrt" || + Callee->getIntrinsicID() == Intrinsic::sqrt)) + Ret = optimizeUnaryDoubleFP(CI, B, TLI, true); if (!CI->isFast()) return Ret; @@ -2158,7 +2173,6 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { // If we found a repeated factor, hoist it out of the square root and // replace it with the fabs of that factor. - Module *M = Callee->getParent(); Type *ArgType = I->getType(); Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType); Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs"); @@ -2175,11 +2189,12 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { // TODO: Generalize to handle any trig function and its inverse. Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; StringRef Name = Callee->getName(); - if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, true); + if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(M, Name)) + Ret = optimizeUnaryDoubleFP(CI, B, TLI, true); Value *Op1 = CI->getArgOperand(0); auto *OpC = dyn_cast<CallInst>(Op1); @@ -2195,7 +2210,8 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) { // tanl(atanl(x)) -> x LibFunc Func; Function *F = OpC->getCalledFunction(); - if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + if (F && TLI->getLibFunc(F->getName(), Func) && + isLibFuncEmittable(M, TLI, Func) && ((Func == LibFunc_atan && Callee->getName() == "tan") || (Func == LibFunc_atanf && Callee->getName() == "tanf") || (Func == LibFunc_atanl && Callee->getName() == "tanl"))) @@ -2211,9 +2227,10 @@ static bool isTrigLibCall(CallInst *CI) { CI->hasFnAttr(Attribute::ReadNone); } -static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, +static bool insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, bool UseFloat, Value *&Sin, Value *&Cos, - Value *&SinCos) { + Value *&SinCos, const TargetLibraryInfo *TLI) { + Module *M = OrigCallee->getParent(); Type *ArgTy = Arg->getType(); Type *ResTy; StringRef Name; @@ -2233,9 +2250,12 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, ResTy = StructType::get(ArgTy, ArgTy); } - Module *M = OrigCallee->getParent(); - FunctionCallee Callee = - M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy); + if (!isLibFuncEmittable(M, TLI, Name)) + return false; + LibFunc TheLibFunc; + 
TLI->getLibFunc(Name, TheLibFunc); + FunctionCallee Callee = getOrInsertLibFunc( + M, *TLI, TheLibFunc, OrigCallee->getAttributes(), ResTy, ArgTy); if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { // If the argument is an instruction, it must dominate all uses so put our @@ -2259,6 +2279,8 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1), "cospi"); } + + return true; } Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) { @@ -2286,7 +2308,9 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) { return nullptr; Value *Sin, *Cos, *SinCos; - insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos); + if (!insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, + SinCos, TLI)) + return nullptr; auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls, Value *Res) { @@ -2307,6 +2331,7 @@ void LibCallSimplifier::classifyArgUse( SmallVectorImpl<CallInst *> &CosCalls, SmallVectorImpl<CallInst *> &SinCosCalls) { CallInst *CI = dyn_cast<CallInst>(Val); + Module *M = CI->getModule(); if (!CI || CI->use_empty()) return; @@ -2317,7 +2342,8 @@ void LibCallSimplifier::classifyArgUse( Function *Callee = CI->getCalledFunction(); LibFunc Func; - if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) || + if (!Callee || !TLI->getLibFunc(*Callee, Func) || + !isLibFuncEmittable(M, TLI, Func) || !isTrigLibCall(CI)) return; @@ -2532,6 +2558,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizePrintFString(CI, B)) { @@ -2540,10 +2567,10 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { // printf(format, ...) -> iprintf(format, ...) if no floating point // arguments. - if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee IPrintFFn = - M->getOrInsertFunction("iprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_iprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee IPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_iprintf, FT, + Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(IPrintFFn); B.Insert(New); @@ -2552,11 +2579,10 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) { // printf(format, ...) -> __small_printf(format, ...) if no 128-bit floating point // arguments. 
- if (TLI->has(LibFunc_small_printf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - auto SmallPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_printf), - FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_small_printf) && + !callHasFP128Argument(CI)) { + auto SmallPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_small_printf, FT, + Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(SmallPrintFFn); B.Insert(New); @@ -2655,6 +2681,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, } Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizeSPrintFString(CI, B)) { @@ -2663,10 +2690,10 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating // point arguments. - if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee SIPrintFFn = - M->getOrInsertFunction("siprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_siprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee SIPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_siprintf, + FT, Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(SIPrintFFn); B.Insert(New); @@ -2675,11 +2702,10 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) { // sprintf(str, format, ...) -> __small_sprintf(str, format, ...) if no 128-bit // floating point arguments. - if (TLI->has(LibFunc_small_sprintf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - auto SmallSPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_sprintf), - FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_small_sprintf) && + !callHasFP128Argument(CI)) { + auto SmallSPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_small_sprintf, FT, + Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(SmallSPrintFFn); B.Insert(New); @@ -2835,6 +2861,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, } Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { + Module *M = CI->getModule(); Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); if (Value *V = optimizeFPrintFString(CI, B)) { @@ -2843,10 +2870,10 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no // floating point arguments. - if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - FunctionCallee FIPrintFFn = - M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes()); + if (isLibFuncEmittable(M, TLI, LibFunc_fiprintf) && + !callHasFloatingPointArgument(CI)) { + FunctionCallee FIPrintFFn = getOrInsertLibFunc(M, *TLI, LibFunc_fiprintf, + FT, Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(FIPrintFFn); B.Insert(New); @@ -2855,11 +2882,11 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) { // fprintf(stream, format, ...) 
-> __small_fprintf(stream, format, ...) if no // 128-bit floating point arguments. - if (TLI->has(LibFunc_small_fprintf) && !callHasFP128Argument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); + if (isLibFuncEmittable(M, TLI, LibFunc_small_fprintf) && + !callHasFP128Argument(CI)) { auto SmallFPrintFFn = - M->getOrInsertFunction(TLI->getName(LibFunc_small_fprintf), - FT, Callee->getAttributes()); + getOrInsertLibFunc(M, *TLI, LibFunc_small_fprintf, FT, + Callee->getAttributes()); CallInst *New = cast<CallInst>(CI->clone()); New->setCalledFunction(SmallFPrintFFn); B.Insert(New); @@ -2944,21 +2971,19 @@ Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(2))); } -bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { - LibFunc Func; +bool LibCallSimplifier::hasFloatVersion(const Module *M, StringRef FuncName) { SmallString<20> FloatFuncName = FuncName; FloatFuncName += 'f'; - if (TLI->getLibFunc(FloatFuncName, Func)) - return TLI->has(Func); - return false; + return isLibFuncEmittable(M, TLI, FloatFuncName); } Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, IRBuilderBase &Builder) { + Module *M = CI->getModule(); LibFunc Func; Function *Callee = CI->getCalledFunction(); // Check for string/memory library functions. - if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) { + if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) { // Make sure we never change the calling convention. assert( (ignoreCallingConv(Func) || @@ -3039,6 +3064,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, LibFunc Func, IRBuilderBase &Builder) { + const Module *M = CI->getModule(); + // Don't optimize calls that require strict floating point semantics. if (CI->isStrictFP()) return nullptr; @@ -3117,12 +3144,12 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, case LibFunc_sin: case LibFunc_sinh: case LibFunc_tanh: - if (UnsafeFPShrink && hasFloatVersion(CI->getCalledFunction()->getName())) - return optimizeUnaryDoubleFP(CI, Builder, true); + if (UnsafeFPShrink && hasFloatVersion(M, CI->getCalledFunction()->getName())) + return optimizeUnaryDoubleFP(CI, Builder, TLI, true); return nullptr; case LibFunc_copysign: - if (hasFloatVersion(CI->getCalledFunction()->getName())) - return optimizeBinaryDoubleFP(CI, Builder); + if (hasFloatVersion(M, CI->getCalledFunction()->getName())) + return optimizeBinaryDoubleFP(CI, Builder, TLI); return nullptr; case LibFunc_fminf: case LibFunc_fmin: @@ -3141,6 +3168,7 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, } Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { + Module *M = CI->getModule(); assert(!CI->isMustTailCall() && "These transforms aren't musttail safe."); // TODO: Split out the code below that operates on FP calls so that @@ -3219,7 +3247,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { } // Then check for known library functions. - if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) { + if (TLI->getLibFunc(*Callee, Func) && isLibFuncEmittable(M, TLI, Func)) { // We never change the calling convention. 
if (!ignoreCallingConv(Func) && !IsCallingConvC) return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 5ecee44..d3a944c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -441,6 +441,26 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, return false; } +/// Returns true if A and B have same pointer operands or same SCEVs addresses +static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A, + StoreInst *B) { + // Compare store + if (A == B) + return true; + + // Otherwise Compare pointers + Value *APtr = A->getPointerOperand(); + Value *BPtr = B->getPointerOperand(); + if (APtr == BPtr) + return true; + + // Otherwise compare address SCEVs + if (SE->getSCEV(APtr) == SE->getSCEV(BPtr)) + return true; + + return false; +} + int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, Value *Ptr) const { const ValueToValueMap &Strides = @@ -678,7 +698,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { RecurrenceDescriptor RedDes; if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT)) { + DT, PSE.getSE())) { Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); AllowedExit.insert(RedDes.getLoopExitInstr()); Reductions[Phi] = RedDes; @@ -772,7 +792,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { auto *SE = PSE.getSE(); Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) - if (hasVectorIntrinsicScalarOpd(IntrinID, i)) { + if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", "intrinsic instruction cannot be vectorized", @@ -913,11 +933,66 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (!LAI->canVectorizeMemory()) return false; - if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { - reportVectorizationFailure("Stores to a uniform address", - "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); - return false; + // We can vectorize stores to invariant address when final reduction value is + // guaranteed to be stored at the end of the loop. Also, if decision to + // vectorize loop is made, runtime checks are added so as to make sure that + // invariant address won't alias with any other objects. + if (!LAI->getStoresToInvariantAddresses().empty()) { + // For each invariant address, check its last stored value is unconditional. + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isInvariantStoreOfReduction(SI) && + blockNeedsPredication(SI->getParent())) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write of conditional recurring variant value to a loop " + "invariant address could not be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } + + if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { + // For each invariant address, check its last stored value is the result + // of one of our reductions. + // + // We do not check if dependence with loads exists because they are + // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this + // behaviour changes we have to modify this code. 
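The storeToSameAddress() helper added above compares two stores in three steps: identity, raw pointer operands, and finally SCEV equality of the addresses. Below is a minimal stand-alone model of that check; ToyStore and the string-based CanonicalAddress field are assumptions standing in for StoreInst and ScalarEvolution::getSCEV(), used for illustration only:

#include <string>

// Toy stand-ins for StoreInst and the SCEV of its pointer operand.
struct ToyStore {
  const void *PointerOperand;   // raw pointer operand of the store
  std::string CanonicalAddress; // stand-in for SE->getSCEV(PointerOperand)
};

// Mirrors storeToSameAddress(): same store, same pointer operand, or same
// canonicalized address.
static bool storesToSameAddress(const ToyStore &A, const ToyStore &B) {
  if (&A == &B)
    return true;
  if (A.PointerOperand == B.PointerOperand)
    return true;
  return A.CanonicalAddress == B.CanonicalAddress;
}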
+ ScalarEvolution *SE = PSE.getSE(); + SmallVector<StoreInst *, 4> UnhandledStores; + for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) { + if (isInvariantStoreOfReduction(SI)) { + // Earlier stores to this address are effectively deadcode. + // With opaque pointers it is possible for one pointer to be used with + // different sizes of stored values: + // store i32 0, ptr %x + // store i8 0, ptr %x + // The latest store doesn't complitely overwrite the first one in the + // example. That is why we have to make sure that types of stored + // values are same. + // TODO: Check that bitwidth of unhandled store is smaller then the + // one that overwrites it and add a test. + erase_if(UnhandledStores, [SE, SI](StoreInst *I) { + return storeToSameAddress(SE, SI, I) && + I->getValueOperand()->getType() == + SI->getValueOperand()->getType(); + }); + continue; + } + UnhandledStores.push_back(SI); + } + + bool IsOK = UnhandledStores.empty(); + // TODO: we should also validate against InvariantMemSets. + if (!IsOK) { + reportVectorizationFailure( + "We don't allow storing to uniform addresses", + "write to a loop invariant address could not " + "be vectorized", + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); + return false; + } + } } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -944,13 +1019,34 @@ bool LoopVectorizationLegality::canVectorizeFPMath( // We can now only vectorize if all reductions with Exact FP math also // have the isOrdered flag set, which indicates that we can move the - // reduction operations in-loop. + // reduction operations in-loop, and do not have intermediate store. return (all_of(getReductionVars(), [&](auto &Reduction) -> bool { const RecurrenceDescriptor &RdxDesc = Reduction.second; - return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered(); + return !RdxDesc.hasExactFPMath() || + (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore); })); } +bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) { + return any_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return RdxDesc.IntermediateStore == SI; + }); +} + +bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) { + return any_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + if (!RdxDesc.IntermediateStore) + return false; + + ScalarEvolution *SE = PSE.getSE(); + Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand(); + return V == InvariantAddress || + SE->getSCEV(V) == SE->getSCEV(InvariantAddress); + }); +} + bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3bedf4b..d59abd2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3998,6 +3998,17 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // Set the resume value for this reduction ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); + // If there were stores of the reduction value to a uniform memory address + // inside the loop, create the final store here. 
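The comment above marks the point where the vectorizer now materializes a single store of the final reduction value. A source-level sketch of the overall effect, with illustrative names (sum_before/sum_after are not from the patch) and assuming the loop is known to execute at least once so the store happens in both versions:

// Before: the running sum is stored to the loop-invariant address *Out on
// every iteration.
void sum_before(const int *A, int N, int *Out) {
  int S = 0;
  for (int I = 0; I < N; ++I) {
    S += A[I];
    *Out = S; // intermediate store to an invariant address
  }
}

// After (conceptually): the body only accumulates; fixReduction() emits one
// store of the reduced value in the exit block.
void sum_after(const int *A, int N, int *Out) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += A[I];
  *Out = S; // single final store created outside the loop
}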
+ if (StoreInst *SI = RdxDesc.IntermediateStore) { + StoreInst *NewSI = + Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); + propagateMetadata(NewSI, SI); + + // If the reduction value is used in other places, + // then let the code below create PHI's for that. + } + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -4244,13 +4255,13 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, // Some intrinsics have a scalar argument - don't replace it with a // vector. Value *Arg; - if (!UseVectorIntrinsic || !hasVectorIntrinsicScalarOpd(ID, I.index())) + if (!UseVectorIntrinsic || + !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) Arg = State.get(I.value(), Part); - else { + else Arg = State.get(I.value(), VPIteration(0, 0)); - if (hasVectorIntrinsicOverloadedScalarOpd(ID, I.index())) - TysForDecl.push_back(Arg->getType()); - } + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) + TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); } @@ -7340,6 +7351,16 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + // Find all stores to invariant variables. Since they are going to sink + // outside the loop we do not need calculate cost for them. + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + ValuesToIgnore.insert(&I); + } + // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { @@ -8329,6 +8350,8 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, return nullptr; auto willWiden = [&](ElementCount VF) -> bool { + if (VF.isScalar()) + return false; Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // The following case may be scalarized depending on the VF. // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -8843,6 +8866,13 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( continue; } + // Invariant stores inside loop will be deleted and a single store + // with the final reduction value will be added to the exit block + StoreInst *SI; + if ((SI = dyn_cast<StoreInst>(&I)) && + Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) + continue; + // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. VPBasicBlock *NextVPBB = diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a6b1bb8..4583308 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -641,7 +641,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { - if (hasVectorIntrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) return (CI->getArgOperand(i) == Scalar); } LLVM_FALLTHROUGH; @@ -2042,6 +2042,36 @@ public: DeletedInstructions.insert(I); } + /// Checks if the instruction was already analyzed for being possible + /// reduction root. 
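The new BoUpSLP members cache which instructions were already examined as reduction roots and which reduced-value lists were already checked, so repeated matching in the same block can be skipped cheaply. A stand-alone model of that memoization follows; std::unordered_set and the toy hash combiner stand in for the SmallPtrSet/DenseSet and llvm::hash_value used by the real code:

#include <cstddef>
#include <functional>
#include <unordered_set>
#include <vector>

struct ToyValue {}; // stand-in for llvm::Value

class ReductionAnalysisCache {
  std::unordered_set<const ToyValue *> AnalyzedRoots; // analyzed reduction roots
  std::unordered_set<std::size_t> AnalyzedValLists;   // hashes of checked lists

  // Toy replacement for hash_value(ArrayRef<Value *>).
  static std::size_t hashList(const std::vector<const ToyValue *> &VL) {
    std::size_t H = VL.size();
    for (const ToyValue *V : VL)
      H ^= std::hash<const ToyValue *>()(V) + 0x9e3779b9 + (H << 6) + (H >> 2);
    return H;
  }

public:
  bool isAnalyzedRoot(const ToyValue *I) const { return AnalyzedRoots.count(I); }
  void noteRoot(const ToyValue *I) { AnalyzedRoots.insert(I); }
  bool areAnalyzedVals(const std::vector<const ToyValue *> &VL) const {
    return AnalyzedValLists.count(hashList(VL));
  }
  void noteVals(const std::vector<const ToyValue *> &VL) {
    AnalyzedValLists.insert(hashList(VL));
  }
  // Reset per basic block, as clearReductionData() does.
  void clear() {
    AnalyzedRoots.clear();
    AnalyzedValLists.clear();
  }
};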
+ bool isAnalizedReductionRoot(Instruction *I) const { + return AnalizedReductionsRoots.count(I); + } + /// Register given instruction as already analyzed for being possible + /// reduction root. + void analyzedReductionRoot(Instruction *I) { + AnalizedReductionsRoots.insert(I); + } + /// Checks if the provided list of reduced values was checked already for + /// vectorization. + bool areAnalyzedReductionVals(ArrayRef<Value *> VL) { + return AnalyzedReductionVals.contains(hash_value(VL)); + } + /// Adds the list of reduced values to list of already checked values for the + /// vectorization. + void analyzedReductionVals(ArrayRef<Value *> VL) { + AnalyzedReductionVals.insert(hash_value(VL)); + } + /// Clear the list of the analyzed reduction root instructions. + void clearReductionData() { + AnalizedReductionsRoots.clear(); + AnalyzedReductionVals.clear(); + } + /// Checks if the given value is gathered in one of the nodes. + bool isGathered(Value *V) const { + return MustGather.contains(V); + } + ~BoUpSLP(); private: @@ -2603,6 +2633,12 @@ private: /// previously deleted instruction. DenseSet<Instruction *> DeletedInstructions; + /// Set of the instruction, being analyzed already for reductions. + SmallPtrSet<Instruction *, 16> AnalizedReductionsRoots; + + /// Set of hashes for the list of reduction values already being analyzed. + DenseSet<size_t> AnalyzedReductionVals; + /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User /// can be nullptr, it means that this Internal Scalar will be used later, @@ -4041,6 +4077,83 @@ static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) { } #endif +/// Generates key/subkey pair for the given value to provide effective sorting +/// of the values and better detection of the vectorizable values sequences. The +/// keys/subkeys can be used for better sorting of the values themselves (keys) +/// and in values subgroups (subkeys). +static std::pair<size_t, size_t> generateKeySubkey( + Value *V, const TargetLibraryInfo *TLI, + function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, + bool AllowAlternate) { + hash_code Key = hash_value(V->getValueID() + 2); + hash_code SubKey = hash_value(0); + // Sort the loads by the distance between the pointers. + if (auto *LI = dyn_cast<LoadInst>(V)) { + Key = hash_combine(hash_value(Instruction::Load), Key); + if (LI->isSimple()) + SubKey = hash_value(LoadsSubkeyGenerator(Key, LI)); + else + SubKey = hash_value(LI); + } else if (isVectorLikeInstWithConstOps(V)) { + // Sort extracts by the vector operands. + if (isa<ExtractElementInst, UndefValue>(V)) + Key = hash_value(Value::UndefValueVal + 1); + if (auto *EI = dyn_cast<ExtractElementInst>(V)) { + if (!isUndefVector(EI->getVectorOperand()) && + !isa<UndefValue>(EI->getIndexOperand())) + SubKey = hash_value(EI->getVectorOperand()); + } + } else if (auto *I = dyn_cast<Instruction>(V)) { + // Sort other instructions just by the opcodes except for CMPInst. + // For CMP also sort by the predicate kind. + if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) && + isValidForAlternation(I->getOpcode())) { + if (AllowAlternate) + Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0); + else + Key = hash_combine(hash_value(I->getOpcode()), Key); + SubKey = hash_combine( + hash_value(I->getOpcode()), hash_value(I->getType()), + hash_value(isa<BinaryOperator>(I) + ? 
I->getType() + : cast<CastInst>(I)->getOperand(0)->getType())); + } else if (auto *CI = dyn_cast<CmpInst>(I)) { + CmpInst::Predicate Pred = CI->getPredicate(); + if (CI->isCommutative()) + Pred = std::min(Pred, CmpInst::getInversePredicate(Pred)); + CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred); + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred), + hash_value(SwapPred), + hash_value(CI->getOperand(0)->getType())); + } else if (auto *Call = dyn_cast<CallInst>(I)) { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI); + if (isTriviallyVectorizable(ID)) + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID)); + else if (!VFDatabase(*Call).getMappings(*Call).empty()) + SubKey = hash_combine(hash_value(I->getOpcode()), + hash_value(Call->getCalledFunction())); + else + SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call)); + for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) + SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End), + hash_value(Op.Tag), SubKey); + } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) { + if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1))) + SubKey = hash_value(Gep->getPointerOperand()); + else + SubKey = hash_value(Gep); + } else if (BinaryOperator::isIntDivRem(I->getOpcode()) && + !isa<ConstantInt>(I->getOperand(1))) { + // Do not try to vectorize instructions with potentially high cost. + SubKey = hash_value(I); + } else { + SubKey = hash_value(I->getOpcode()); + } + Key = hash_combine(hash_value(I->getParent()), Key); + } + return std::make_pair(Key, SubKey); +} + void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -4742,7 +4855,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, unsigned NumArgs = CI->arg_size(); SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); for (unsigned j = 0; j != NumArgs; ++j) - if (hasVectorIntrinsicScalarOpd(ID, j)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { CallInst *CI2 = dyn_cast<CallInst>(V); @@ -4761,7 +4874,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Some intrinsics have scalar arguments and should be same in order for // them to be vectorized. for (unsigned j = 0; j != NumArgs; ++j) { - if (hasVectorIntrinsicScalarOpd(ID, j)) { + if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) { Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); @@ -4794,7 +4907,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { // For scalar operands no need to to create an entry since no need to // vectorize it. - if (hasVectorIntrinsicScalarOpd(ID, i)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) continue; ValueList Operands; // Prepare the operand vector. @@ -6238,10 +6351,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { ShuffleMask.emplace_back(VF.back(), UndefMaskElem); // Find the insertvector, vectorized in tree, if any. Value *Base = VU; - while (isa<InsertElementInst>(Base)) { + while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) { // Build the mask for the vectorized insertelement instructions. 
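The generateKeySubkey() helper above assigns every candidate value a (key, subkey) pair so that later code can bucket possible reduced values by kind (opcode, compare predicate, load pointer group, and so on) and try the largest buckets first. A simplified stand-alone model of that bucket-and-sort step; ToyVal and keySubkey() are illustrative stand-ins, not the real hashing:

#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ToyVal {
  std::string Opcode;   // e.g. "load", "add", "icmp eq"
  int PointerGroup = 0; // stand-in for the load-distance subkey
};

// Stand-in for generateKeySubkey(): key = opcode, subkey = pointer group.
static std::pair<std::string, int> keySubkey(const ToyVal &V) {
  return {V.Opcode, V.PointerGroup};
}

// Bucket the candidates by (key, subkey) and return the groups ordered by
// decreasing size, mirroring how reduction candidates are regrouped before
// vectorization is attempted.
static std::vector<std::vector<ToyVal>>
groupReducedVals(const std::vector<ToyVal> &Candidates) {
  std::map<std::pair<std::string, int>, std::vector<ToyVal>> Buckets;
  for (const ToyVal &V : Candidates)
    Buckets[keySubkey(V)].push_back(V);

  std::vector<std::vector<ToyVal>> Groups;
  for (auto &KV : Buckets)
    Groups.push_back(std::move(KV.second));
  std::stable_sort(Groups.begin(), Groups.end(),
                   [](const std::vector<ToyVal> &A, const std::vector<ToyVal> &B) {
                     return A.size() > B.size();
                   });
  return Groups;
}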
- if (const TreeEntry *E = getTreeEntry(Base)) { - VU = cast<InsertElementInst>(Base); + if (const TreeEntry *E = getTreeEntry(IEBase)) { + VU = IEBase; do { int Idx = E->findLaneForValue(Base); ShuffleMask.back()[Idx] = Idx; @@ -6257,8 +6370,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { } else { VecId = std::distance(FirstUsers.begin(), It); } - ShuffleMask[VecId][*InsertIdx] = EU.Lane; - DemandedElts[VecId].setBit(*InsertIdx); + int InIdx = *InsertIdx; + ShuffleMask[VecId][InIdx] = EU.Lane; + DemandedElts[VecId].setBit(InIdx); continue; } } @@ -6459,6 +6573,12 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, } } + if (UsedTEs.empty()) { + assert(all_of(TE->Scalars, UndefValue::classof) && + "Expected vector of undefs only."); + return None; + } + unsigned VF = 0; if (UsedTEs.size() == 1) { // Try to find the perfect match in another gather node at first. @@ -6612,11 +6732,15 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // should not be scheduled. if (E->State != TreeEntry::NeedToGather && doesNotNeedToSchedule(E->Scalars)) { - BasicBlock::iterator InsertPt; + Instruction *InsertInst; if (all_of(E->Scalars, isUsedOutsideBlock)) - InsertPt = FindLastInst()->getIterator(); + InsertInst = FindLastInst(); else - InsertPt = FindFirstInst()->getIterator(); + InsertInst = FindFirstInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa<PHINode>(InsertInst)) + InsertInst = BB->getFirstNonPHI(); + BasicBlock::iterator InsertPt = InsertInst->getIterator(); Builder.SetInsertPoint(BB, InsertPt); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); return; @@ -6658,13 +6782,17 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // not ideal. However, this should be exceedingly rare since it requires that // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). - if (!LastInst) + if (!LastInst) { LastInst = FindLastInst(); + // If the instruction is PHI, set the insert point after all the PHIs. + if (isa<PHINode>(LastInst)) + LastInst = BB->getFirstNonPHI()->getPrevNode(); + } assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. - Builder.SetInsertPoint(BB, ++LastInst->getIterator()); + Builder.SetInsertPoint(BB, std::next(LastInst->getIterator())); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -7358,11 +7486,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be // vectorized. 
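isVectorIntrinsicWithScalarOpAtArg() (the renamed hasVectorIntrinsicScalarOpd) tells the vectorizers which operands of an intrinsic call must be passed through unchanged when the call is widened, for example the scalar exponent of llvm.powi. The toy model below only shows the shape of that decision; all types and names are illustrative, not VPlan or SLP data structures:

#include <cstddef>
#include <functional>
#include <vector>

using ToyScalar = double;
using ToyVector = std::vector<double>; // stand-in for a widened operand

struct WidenedArg {
  bool IsScalar = false; // true: keep the original scalar operand
  ToyScalar Scalar = 0;  // valid when IsScalar
  ToyVector Vector;      // valid otherwise
};

// Widen the argument list of a call: operands flagged as "scalar op at arg"
// are forwarded as-is, everything else gets its widened per-lane value.
static std::vector<WidenedArg>
widenCallArgs(const std::vector<ToyScalar> &ScalarArgs,
              const std::vector<ToyVector> &WidenedOperands,
              const std::function<bool(std::size_t)> &IsScalarOpAtArg) {
  std::vector<WidenedArg> Args;
  for (std::size_t I = 0, E = ScalarArgs.size(); I != E; ++I) {
    WidenedArg A;
    A.IsScalar = IsScalarOpAtArg(I);
    if (A.IsScalar)
      A.Scalar = ScalarArgs[I];      // stays scalar in the vectorized call
    else
      A.Vector = WidenedOperands[I]; // per-lane values from the vector loop
    Args.push_back(A);
  }
  return Args;
}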
- if (UseIntrinsic && hasVectorIntrinsicScalarOpd(IID, j)) { + if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); - if (hasVectorIntrinsicOverloadedScalarOpd(IID, j)) + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) TysForDecl.push_back(ScalarArg->getType()); continue; } @@ -7370,6 +7498,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *OpVec = vectorizeTree(E->getOperand(j)); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); + if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j)) + TysForDecl.push_back(OpVec->getType()); } Function *CF; @@ -8804,6 +8934,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { + // Start new block - clear the list of reduction roots. + R.clearReductionData(); collectSeedInstructions(BB); // Vectorize trees that end at stores. @@ -9273,15 +9405,16 @@ class HorizontalReduction { using ReductionOpsType = SmallVector<Value *, 16>; using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; ReductionOpsListType ReductionOps; - SmallVector<Value *, 32> ReducedVals; + /// List of possibly reduced values. + SmallVector<SmallVector<Value *>> ReducedVals; + /// Maps reduced value to the corresponding reduction operation. + DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps; // Use map vector to make stable output. MapVector<Instruction *, Value *> ExtraArgs; WeakTrackingVH ReductionRoot; /// The type of reduction operation. RecurKind RdxKind; - const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max(); - static bool isCmpSelMinMax(Instruction *I) { return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); @@ -9325,26 +9458,6 @@ class HorizontalReduction { return I->getOperand(Index); } - /// Checks if the ParentStackElem.first should be marked as a reduction - /// operation with an extra argument or as extra argument itself. - void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, - Value *ExtraArg) { - if (ExtraArgs.count(ParentStackElem.first)) { - ExtraArgs[ParentStackElem.first] = nullptr; - // We ran into something like: - // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. - // The whole ParentStackElem.first should be considered as an extra value - // in this case. - // Do not perform analysis of remaining operands of ParentStackElem.first - // instruction, this whole instruction is an extra argument. - ParentStackElem.second = INVALID_OPERAND_INDEX; - } else { - // We ran into something like: - // ParentStackElem.first += ... + ExtraArg + ... - ExtraArgs[ParentStackElem.first] = ExtraArg; - } - } - /// Creates reduction operation with the current opcode. static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, Value *RHS, const Twine &Name, bool UseSelect) { @@ -9429,7 +9542,7 @@ class HorizontalReduction { /// Creates reduction operation with the current opcode with the IR flags /// from \p I. 
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, - Value *RHS, const Twine &Name, Instruction *I) { + Value *RHS, const Twine &Name, Value *I) { auto *SelI = dyn_cast<SelectInst>(I); Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr); if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { @@ -9440,8 +9553,10 @@ class HorizontalReduction { return Op; } - static RecurKind getRdxKind(Instruction *I) { - assert(I && "Expected instruction for reduction matching"); + static RecurKind getRdxKind(Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + return RecurKind::None; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; if (match(I, m_Mul(m_Value(), m_Value()))) @@ -9603,7 +9718,9 @@ public: HorizontalReduction() = default; /// Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) { + bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst, + ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI) { assert((!Phi || is_contained(Phi->operands(), Inst)) && "Phi needs to use the binary operator"); assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || @@ -9647,124 +9764,168 @@ public: ReductionRoot = Inst; - // The opcode for leaf values that we perform a reduction on. - // For example: load(x) + load(y) + load(z) + fptoui(w) - // The leaf opcode for 'w' does not match, so we don't include it as a - // potential candidate for the reduction. - unsigned LeafOpcode = 0; - - // Post-order traverse the reduction tree starting at Inst. We only handle - // true trees containing binary operators or selects. - SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; - Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst))); - initReductionOps(Inst); - while (!Stack.empty()) { - Instruction *TreeN = Stack.back().first; - unsigned EdgeToVisit = Stack.back().second++; - const RecurKind TreeRdxKind = getRdxKind(TreeN); - bool IsReducedValue = TreeRdxKind != RdxKind; - - // Postorder visit. - if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) { - if (IsReducedValue) - ReducedVals.push_back(TreeN); - else { - auto ExtraArgsIter = ExtraArgs.find(TreeN); - if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) { - // Check if TreeN is an extra argument of its parent operation. - if (Stack.size() <= 1) { - // TreeN can't be an extra argument as it is a root reduction - // operation. - return false; - } - // Yes, TreeN is an extra argument, do not add it to a list of - // reduction operations. - // Stack[Stack.size() - 2] always points to the parent operation. - markExtraArg(Stack[Stack.size() - 2], TreeN); - ExtraArgs.erase(TreeN); - } else - addReductionOps(TreeN); + // Iterate through all the operands of the possible reduction tree and + // gather all the reduced values, sorting them by their value id. + BasicBlock *BB = Inst->getParent(); + bool IsCmpSelMinMax = isCmpSelMinMax(Inst); + SmallVector<Instruction *> Worklist(1, Inst); + // Checks if the operands of the \p TreeN instruction are also reduction + // operations or should be treated as reduced values or an extra argument, + // which is not part of the reduction. 
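matchAssociativeReduction() now walks the candidate tree with an explicit worklist, and the CheckOperands lambda documented above classifies each operand as another reduction operation, a possible reduced value, or an extra argument from outside the block. A compact model of that classification; the real code additionally checks use counts, vectorizability, and vector-like constant operands, which are omitted here:

#include <vector>

struct ToyInst {
  int Opcode = 0;
  int BlockId = 0;                 // toy basic-block id
  std::vector<ToyInst *> Operands;
};

struct ClassifiedOperands {
  std::vector<ToyInst *> ReductionOps;        // same-kind ops, keep walking
  std::vector<ToyInst *> PossibleReducedVals; // leaves of the reduction
  std::vector<ToyInst *> ExtraArgs;           // defined in another block
};

// Toy version of CheckOperands(): operands from another block become extra
// arguments, operands with a different opcode become possible reduced
// values, and matching operands are further reduction operations.
static ClassifiedOperands classifyOperands(const ToyInst *TreeN, int RdxOpcode,
                                           int BlockId) {
  ClassifiedOperands C;
  for (ToyInst *Op : TreeN->Operands) {
    if (Op->BlockId != BlockId) {
      C.ExtraArgs.push_back(Op);
      continue;
    }
    if (Op->Opcode != RdxOpcode) {
      C.PossibleReducedVals.push_back(Op);
      continue;
    }
    C.ReductionOps.push_back(Op);
  }
  return C;
}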
+ auto &&CheckOperands = [this, IsCmpSelMinMax, + BB](Instruction *TreeN, + SmallVectorImpl<Value *> &ExtraArgs, + SmallVectorImpl<Value *> &PossibleReducedVals, + SmallVectorImpl<Instruction *> &ReductionOps) { + for (int I = getFirstOperandIndex(TreeN), + End = getNumberOfOperands(TreeN); + I < End; ++I) { + Value *EdgeVal = getRdxOperand(TreeN, I); + ReducedValsToOps[EdgeVal].push_back(TreeN); + auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); + // Edge has wrong parent - mark as an extra argument. + if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) && + !hasSameParent(EdgeInst, BB)) { + ExtraArgs.push_back(EdgeVal); + continue; } - // Retract. - Stack.pop_back(); - continue; - } - - // Visit operands. - Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit); - auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); - if (!EdgeInst) { - // Edge value is not a reduction instruction or a leaf instruction. - // (It may be a constant, function argument, or something else.) - markExtraArg(Stack.back(), EdgeVal); - continue; + // If the edge is not an instruction, or it is different from the main + // reduction opcode or has too many uses - possible reduced value. + if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind || + !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || + !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) { + PossibleReducedVals.push_back(EdgeVal); + continue; + } + ReductionOps.push_back(EdgeInst); } - RecurKind EdgeRdxKind = getRdxKind(EdgeInst); - // Continue analysis if the next operand is a reduction operation or - // (possibly) a leaf value. If the leaf value opcode is not set, - // the first met operation != reduction operation is considered as the - // leaf opcode. - // Only handle trees in the current basic block. - // Each tree node needs to have minimal number of users except for the - // ultimate reduction. - const bool IsRdxInst = EdgeRdxKind == RdxKind; - if (EdgeInst != Phi && EdgeInst != Inst && - hasSameParent(EdgeInst, Inst->getParent()) && - hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) && - (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) { - if (IsRdxInst) { - // We need to be able to reassociate the reduction operations. - if (!isVectorizable(EdgeRdxKind, EdgeInst)) { - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); - continue; - } - } else if (!LeafOpcode) { - LeafOpcode = EdgeInst->getOpcode(); + }; + // Try to regroup reduced values so that it gets more profitable to try to + // reduce them. Values are grouped by their value ids, instructions - by + // instruction op id and/or alternate op id, plus do extra analysis for + // loads (grouping them by the distabce between pointers) and cmp + // instructions (grouping them by the predicate). + MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>> + PossibleReducedVals; + initReductionOps(Inst); + while (!Worklist.empty()) { + Instruction *TreeN = Worklist.pop_back_val(); + SmallVector<Value *> Args; + SmallVector<Value *> PossibleRedVals; + SmallVector<Instruction *> PossibleReductionOps; + CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps); + // If too many extra args - mark the instruction itself as a reduction + // value, not a reduction operation. + if (Args.size() < 2) { + addReductionOps(TreeN); + // Add extra args. 
+ if (!Args.empty()) { + assert(Args.size() == 1 && "Expected only single argument."); + ExtraArgs[TreeN] = Args.front(); } - Stack.push_back( - std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst))); - continue; + // Add reduction values. The values are sorted for better vectorization + // results. + for (Value *V : PossibleRedVals) { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + V, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + for (const auto &LoadData : PossibleReducedVals[Key]) { + auto *RLI = cast<LoadInst>(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), + DL, SE, /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(V, 0)) + .first->second; + } + Worklist.append(PossibleReductionOps.rbegin(), + PossibleReductionOps.rend()); + } else { + size_t Key, Idx; + std::tie(Key, Idx) = generateKeySubkey( + TreeN, &TLI, + [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) { + for (const auto &LoadData : PossibleReducedVals[Key]) { + auto *RLI = cast<LoadInst>(LoadData.second.front().first); + if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(), + LI->getType(), LI->getPointerOperand(), DL, + SE, /*StrictCheck=*/true)) + return hash_value(RLI->getPointerOperand()); + } + return hash_value(LI->getPointerOperand()); + }, + /*AllowAlternate=*/false); + ++PossibleReducedVals[Key][Idx] + .insert(std::make_pair(TreeN, 0)) + .first->second; } - // I is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), EdgeInst); } + auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); + // Sort values by the total number of values kinds to start the reduction + // from the longest possible reduced values sequences. + for (auto &PossibleReducedVals : PossibleReducedValsVect) { + auto PossibleRedVals = PossibleReducedVals.second.takeVector(); + SmallVector<SmallVector<Value *>> PossibleRedValsVect; + for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); + It != E; ++It) { + PossibleRedValsVect.emplace_back(); + auto RedValsVect = It->second.takeVector(); + stable_sort(RedValsVect, [](const auto &P1, const auto &P2) { + return P1.second < P2.second; + }); + for (const std::pair<Value *, unsigned> &Data : RedValsVect) + PossibleRedValsVect.back().append(Data.second, Data.first); + } + stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { + return P1.size() > P2.size(); + }); + ReducedVals.emplace_back(); + for (ArrayRef<Value *> Data : PossibleRedValsVect) + ReducedVals.back().append(Data.rbegin(), Data.rend()); + } + // Sort the reduced values by number of same/alternate opcode and/or pointer + // operand. + stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) { + return P1.size() > P2.size(); + }); return true; } /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + constexpr int ReductionLimit = 4; // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. 
- unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < 4) + unsigned NumReducedVals = std::accumulate( + ReducedVals.begin(), ReducedVals.end(), 0, + [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); }); + if (NumReducedVals < ReductionLimit) return nullptr; - // Intersect the fast-math-flags from all reduction operations. - FastMathFlags RdxFMF; - RdxFMF.set(); - for (ReductionOpsType &RdxOp : ReductionOps) { - for (Value *RdxVal : RdxOp) { - if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) - RdxFMF &= FPMO->getFastMathFlags(); - } - } - IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); - Builder.setFastMathFlags(RdxFMF); + // Track the reduced values in case if they are replaced by extractelement + // because of the vectorization. + DenseMap<Value *, WeakTrackingVH> TrackedVals; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; // The same extra argument may be used several times, so log each attempt // to use it. for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); + TrackedVals.try_emplace(Pair.second, Pair.second); } // The compare instruction of a min/max is the insertion point for new // instructions and may be replaced with a new compare instruction. - auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { assert(isa<SelectInst>(RdxRootInst) && "Expected min/max reduction to have select root instruction"); Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); @@ -9776,141 +9937,289 @@ public: // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; - SmallVector<Value *, 16> IgnoreList; - for (ReductionOpsType &RdxOp : ReductionOps) - IgnoreList.append(RdxOp.begin(), RdxOp.end()); - - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - if (NumReducedVals > ReduxWidth) { - // In the loop below, we are building a tree based on a window of - // 'ReduxWidth' values. - // If the operands of those values have common traits (compare predicate, - // constant operand, etc), then we want to group those together to - // minimize the cost of the reduction. - - // TODO: This should be extended to count common operands for - // compares and binops. - - // Step 1: Count the number of times each compare predicate occurs. - SmallDenseMap<unsigned, unsigned> PredCountMap; - for (Value *RdxVal : ReducedVals) { - CmpInst::Predicate Pred; - if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) - ++PredCountMap[Pred]; - } - // Step 2: Sort the values so the most common predicates come first. - stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { - CmpInst::Predicate PredA, PredB; - if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && - match(B, m_Cmp(PredB, m_Value(), m_Value()))) { - return PredCountMap[PredA] > PredCountMap[PredB]; - } - return false; - }); - } + SmallVector<Value *> IgnoreList; + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) { + if (!RdxOp) + continue; + IgnoreList.push_back(RdxOp); + } + bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot)); + + // Need to track reduced vals, they may be changed during vectorization of + // subvectors. 
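tryToReduce() now counts candidates across all the regrouped value lists and bails out below the ReductionLimit of four. A one-function sketch of that gate using std::accumulate, with plain ints standing in for the reduced values:

#include <cstddef>
#include <numeric>
#include <vector>

constexpr std::size_t ReductionLimit = 4; // mirrors the constant above

static bool worthReducing(const std::vector<std::vector<int>> &Groups) {
  // Sum the sizes of all candidate groups, as the std::accumulate call in
  // tryToReduce() does.
  std::size_t NumReducedVals = std::accumulate(
      Groups.begin(), Groups.end(), std::size_t(0),
      [](std::size_t N, const std::vector<int> &G) { return N + G.size(); });
  return NumReducedVals >= ReductionLimit;
}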
+ for (ArrayRef<Value *> Candidates : ReducedVals) + for (Value *V : Candidates) + TrackedVals.try_emplace(V, V); + DenseMap<Value *, unsigned> VectorizedVals; Value *VectorizedTree = nullptr; - unsigned i = 0; - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { - ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, IgnoreList); - if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) - break; - if (V.isLoadCombineReductionCandidate(RdxKind)) - break; - V.reorderTopToBottom(); - V.reorderBottomToTop(/*IgnoreReorder=*/true); - V.buildExternalUses(ExternallyUsedValues); - - // For a poison-safe boolean logic reduction, do not replace select - // instructions with logic ops. All reduced values will be frozen (see - // below) to prevent leaking poison. - if (isa<SelectInst>(ReductionRoot) && - isBoolLogicOp(cast<Instruction>(ReductionRoot)) && - NumReducedVals != ReduxWidth) - break; + bool CheckForReusedReductionOps = false; + // Try to vectorize elements based on their type. + for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { + ArrayRef<Value *> OrigReducedVals = ReducedVals[I]; + InstructionsState S = getSameOpcode(OrigReducedVals); + SmallVector<Value *> Candidates; + DenseMap<Value *, Value *> TrackedToOrig; + for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { + Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second; + // Check if the reduction value was not overriden by the extractelement + // instruction because of the vectorization and exclude it, if it is not + // compatible with other values. + if (auto *Inst = dyn_cast<Instruction>(RdxVal)) + if (isVectorLikeInstWithConstOps(Inst) && + (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) + continue; + Candidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); + } + bool ShuffledExtracts = false; + // Try to handle shuffled extractelements. + if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && + I + 1 < E) { + InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]); + if (NextS.getOpcode() == Instruction::ExtractElement && + !NextS.isAltShuffle()) { + SmallVector<Value *> CommonCandidates(Candidates); + for (Value *RV : ReducedVals[I + 1]) { + Value *RdxVal = TrackedVals.find(RV)->second; + // Check if the reduction value was not overriden by the + // extractelement instruction because of the vectorization and + // exclude it, if it is not compatible with other values. + if (auto *Inst = dyn_cast<Instruction>(RdxVal)) + if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst)) + continue; + CommonCandidates.push_back(RdxVal); + TrackedToOrig.try_emplace(RdxVal, RV); + } + SmallVector<int> Mask; + if (isFixedVectorShuffle(CommonCandidates, Mask)) { + ++I; + Candidates.swap(CommonCandidates); + ShuffledExtracts = true; + } + } + } + unsigned NumReducedVals = Candidates.size(); + if (NumReducedVals < ReductionLimit) + continue; - V.computeMinimumValueSizes(); + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + unsigned Start = 0; + unsigned Pos = Start; + // Restarts vectorization attempt with lower vector factor. 
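The loop that follows tries a window of ReduxWidth candidates starting at Pos; on a failed attempt AdjustReducedVals() slides the window forward, and once it no longer fits it restarts from the front with half the width. The sketch below models only that failure path (every attempt rejected), with a local powerOf2Floor() standing in for llvm::PowerOf2Floor:

// Largest power of two not exceeding N (stand-in for llvm::PowerOf2Floor).
static unsigned powerOf2Floor(unsigned N) {
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

// Count how many vectorization attempts are made when every window is
// rejected: slide the window by one, and halve the width when it no longer
// fits, until the width drops below the reduction limit.
static unsigned countFailedAttempts(unsigned NumReducedVals,
                                    unsigned ReductionLimit) {
  unsigned Attempts = 0;
  unsigned ReduxWidth = powerOf2Floor(NumReducedVals);
  unsigned Pos = 0;
  while (ReduxWidth >= ReductionLimit && Pos + ReduxWidth <= NumReducedVals) {
    ++Attempts; // one buildTree/cost evaluation per window
    ++Pos;      // AdjustReducedVals: slide the window forward
    if (Pos + ReduxWidth > NumReducedVals) {
      Pos = 0;         // window exhausted: restart from the beginning
      ReduxWidth /= 2; // with a smaller vector factor
    }
  }
  return Attempts;
}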
+ unsigned PrevReduxWidth = ReduxWidth; + bool CheckForReusedReductionOpsLocal = false; + auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals, + &CheckForReusedReductionOpsLocal, + &PrevReduxWidth, &V, + &IgnoreList](bool IgnoreVL = false) { + bool IsAnyRedOpGathered = + !IgnoreVL && any_of(IgnoreList, [&V](Value *RedOp) { + return V.isGathered(RedOp); + }); + if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { + // Check if any of the reduction ops are gathered. If so, worth + // trying again with less number of reduction ops. + CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered; + } + ++Pos; + if (Pos < NumReducedVals - ReduxWidth + 1) + return IsAnyRedOpGathered; + Pos = Start; + ReduxWidth /= 2; + return IsAnyRedOpGathered; + }; + while (Pos < NumReducedVals - ReduxWidth + 1 && + ReduxWidth >= ReductionLimit) { + // Dependency in tree of the reduction ops - drop this attempt, try + // later. + if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth && + Start == 0) { + CheckForReusedReductionOps = true; + break; + } + PrevReduxWidth = ReduxWidth; + ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth); + // Beeing analyzed already - skip. + if (V.areAnalyzedReductionVals(VL)) { + (void)AdjustReducedVals(/*IgnoreVL=*/true); + continue; + } + // Early exit if any of the reduction values were deleted during + // previous vectorization attempts. + if (any_of(VL, [&V](Value *RedVal) { + auto *RedValI = dyn_cast<Instruction>(RedVal); + if (!RedValI) + return false; + return V.isDeleted(RedValI); + })) + break; + V.buildTree(VL, IgnoreList); + if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) { + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } + if (V.isLoadCombineReductionCandidate(RdxKind)) { + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } + V.reorderTopToBottom(); + // No need to reorder the root node at all. + V.reorderBottomToTop(/*IgnoreReorder=*/true); + // Keep extracted other reduction values, if they are used in the + // vectorization trees. + BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( + ExternallyUsedValues); + for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { + if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) + continue; + for_each(ReducedVals[Cnt], + [&LocalExternallyUsedValues, &TrackedVals](Value *V) { + if (isa<Instruction>(V)) + LocalExternallyUsedValues[TrackedVals[V]]; + }); + } + for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { + if (Cnt >= Pos && Cnt < Pos + ReduxWidth) + continue; + if (VectorizedVals.count(Candidates[Cnt])) + continue; + LocalExternallyUsedValues[Candidates[Cnt]]; + } + V.buildExternalUses(LocalExternallyUsedValues); + + V.computeMinimumValueSizes(); + + // Intersect the fast-math-flags from all reduction operations. + FastMathFlags RdxFMF; + RdxFMF.set(); + for (Value *U : IgnoreList) + if (auto *FPMO = dyn_cast<FPMathOperator>(U)) + RdxFMF &= FPMO->getFastMathFlags(); + // Estimate cost. 
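Right above, the fast-math flags for the emitted reduction are computed by starting from a fully-set FastMathFlags and intersecting in the flags of every reduction operation, so only flags common to the whole tree survive. A toy bitmask model of that intersection; ToyFMF is illustrative and not LLVM's FastMathFlags:

#include <cstdint>
#include <vector>

// Toy flag set; FMFAll plays the role of RdxFMF.set().
enum ToyFMF : std::uint8_t { FMFNone = 0x00, FMFAll = 0xFF };

static std::uint8_t intersectFMF(const std::vector<std::uint8_t> &OpFlags) {
  std::uint8_t Rdx = FMFAll; // start with every flag set
  for (std::uint8_t F : OpFlags)
    Rdx &= F;                // RdxFMF &= FPMO->getFastMathFlags()
  return Rdx;
}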
+ InstructionCost TreeCost = V.getTreeCost(VL); + InstructionCost ReductionCost = + getReductionCost(TTI, VL[0], ReduxWidth, RdxFMF); + InstructionCost Cost = TreeCost + ReductionCost; + if (!Cost.isValid()) { + LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); + return nullptr; + } + if (Cost >= -SLPCostThreshold) { + V.getORE()->emit([&]() { + return OptimizationRemarkMissed( + SV_NAME, "HorSLPNotBeneficial", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); + if (!AdjustReducedVals()) + V.analyzedReductionVals(VL); + continue; + } - // Estimate cost. - InstructionCost TreeCost = - V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); - InstructionCost ReductionCost = - getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); - InstructionCost Cost = TreeCost + ReductionCost; - if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); - return nullptr; - } - if (Cost >= -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" + << Cost << ". (HorRdx)\n"); V.getORE()->emit([&]() { - return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", - cast<Instruction>(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " << ore::NV("Cost", Cost) - << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); + return OptimizationRemark( + SV_NAME, "VectorizedHorizontalReduction", + ReducedValsToOps.find(VL[0])->second.front()) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize()); }); - break; - } - LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" - << Cost << ". (HorRdx)\n"); - V.getORE()->emit([&]() { - return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", - cast<Instruction>(VL[0])) - << "Vectorized horizontal reduction with cost " - << ore::NV("Cost", Cost) << " and with tree size " - << ore::NV("TreeSize", V.getTreeSize()); - }); + Builder.setFastMathFlags(RdxFMF); - // Vectorize a tree. - DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); - Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); + // Vectorize a tree. + Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues); - // Emit a reduction. If the root is a select (min/max idiom), the insert - // point is the compare condition of that select. - Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); - if (isCmpSelMinMax(RdxRootInst)) - Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); - else - Builder.SetInsertPoint(RdxRootInst); + // Emit a reduction. If the root is a select (min/max idiom), the insert + // point is the compare condition of that select. + Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (IsCmpSelMinMax) + Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); - // To prevent poison from leaking across what used to be sequential, safe, - // scalar boolean logic operations, the reduction operand must be frozen. 
- if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) - VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); + // To prevent poison from leaking across what used to be sequential, + // safe, scalar boolean logic operations, the reduction operand must be + // frozen. + if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) + VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); - Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + Value *ReducedSubTree = + emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); - if (!VectorizedTree) { - // Initialize the final value in the reduction. - VectorizedTree = ReducedSubTree; - } else { - // Update the final value in the reduction. - Builder.SetCurrentDebugLocation(Loc); - VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - ReducedSubTree, "op.rdx", ReductionOps); + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = ReducedSubTree; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + ReducedSubTree, "op.rdx", ReductionOps); + } + // Count vectorized reduced values to exclude them from final reduction. + for (Value *V : VL) + ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0) + .first->getSecond(); + Pos += ReduxWidth; + Start = Pos; + ReduxWidth = PowerOf2Floor(NumReducedVals - Pos); } - i += ReduxWidth; - ReduxWidth = PowerOf2Floor(NumReducedVals - i); } - if (VectorizedTree) { // Finish the reduction. - for (; i < NumReducedVals; ++i) { - auto *I = cast<Instruction>(ReducedVals[i]); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - VectorizedTree = - createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); + // Need to add extra arguments and not vectorized possible reduction + // values. + SmallPtrSet<Value *, 8> Visited; + for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { + ArrayRef<Value *> Candidates = ReducedVals[I]; + for (Value *RdxVal : Candidates) { + if (!Visited.insert(RdxVal).second) + continue; + Value *StableRdxVal = RdxVal; + auto TVIt = TrackedVals.find(RdxVal); + if (TVIt != TrackedVals.end()) + StableRdxVal = TVIt->second; + unsigned NumOps = 0; + auto It = VectorizedVals.find(RdxVal); + if (It != VectorizedVals.end()) + NumOps = It->second; + for (Instruction *RedOp : + makeArrayRef(ReducedValsToOps.find(RdxVal)->second) + .drop_back(NumOps)) { + Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); + ReductionOpsListType Ops; + if (auto *Sel = dyn_cast<SelectInst>(RedOp)) + Ops.emplace_back().push_back(Sel->getCondition()); + Ops.emplace_back().push_back(RedOp); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, + StableRdxVal, "op.rdx", Ops); + } + } } for (auto &Pair : ExternallyUsedValues) { // Add each externally used value to the final reduction. for (auto *I : Pair.second) { Builder.SetCurrentDebugLocation(I->getDebugLoc()); + ReductionOpsListType Ops; + if (auto *Sel = dyn_cast<SelectInst>(I)) + Ops.emplace_back().push_back(Sel->getCondition()); + Ops.emplace_back().push_back(I); + Value *StableRdxVal = Pair.first; + auto TVIt = TrackedVals.find(Pair.first); + if (TVIt != TrackedVals.end()) + StableRdxVal = TVIt->second; VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, - Pair.first, "op.extra", I); + StableRdxVal, "op.rdx", Ops); } } @@ -9922,20 +10231,30 @@ public: // deletion. 
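After each successful window the reduced subtree is folded into the running VectorizedTree with an "op.rdx" operation, and once all windows are done the leftover scalars (extra arguments and values that were never vectorized) are chained in the same way. A toy model of that final folding, with integer addition standing in for the actual reduction opcode:

#include <vector>

static int finishReduction(const std::vector<int> &ReducedSubTrees,
                           const std::vector<int> &LeftoverScalars) {
  bool HaveTree = false;
  int VectorizedTree = 0;
  auto Chain = [&](int V) {
    if (!HaveTree) {       // initialize the final value in the reduction
      VectorizedTree = V;
      HaveTree = true;
    } else {               // VectorizedTree = createOp(..., "op.rdx", ...)
      VectorizedTree += V; // '+' stands in for the reduction opcode
    }
  };
  for (int Part : ReducedSubTrees)   // results of the vectorized windows
    Chain(Part);
  for (int Scalar : LeftoverScalars) // extra args / not vectorized values
    Chain(Scalar);
  return VectorizedTree;
}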
#ifndef NDEBUG SmallSet<Value *, 4> IgnoreSet; - IgnoreSet.insert(IgnoreList.begin(), IgnoreList.end()); + for (ArrayRef<Value *> RdxOps : ReductionOps) + IgnoreSet.insert(RdxOps.begin(), RdxOps.end()); #endif - for (auto *Ignore : IgnoreList) { + for (ArrayRef<Value *> RdxOps : ReductionOps) { + for (Value *Ignore : RdxOps) { + if (!Ignore) + continue; #ifndef NDEBUG - for (auto *U : Ignore->users()) { - assert(IgnoreSet.count(U)); - } + for (auto *U : Ignore->users()) { + assert(IgnoreSet.count(U) && + "All users must be either in the reduction ops list."); + } #endif - if (!Ignore->use_empty()) { - Value *Undef = UndefValue::get(Ignore->getType()); - Ignore->replaceAllUsesWith(Undef); + if (!Ignore->use_empty()) { + Value *Undef = UndefValue::get(Ignore->getType()); + Ignore->replaceAllUsesWith(Undef); + } + V.eraseInstruction(cast<Instruction>(Ignore)); } - V.eraseInstruction(cast<Instruction>(Ignore)); } + } else if (!CheckForReusedReductionOps) { + for (ReductionOpsType &RdxOps : ReductionOps) + for (Value *RdxOp : RdxOps) + V.analyzedReductionRoot(cast<Instruction>(RdxOp)); } return VectorizedTree; } @@ -10201,7 +10520,8 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { /// performed. static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, + TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL, + const TargetLibraryInfo &TLI, const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) { if (!ShouldVectorizeHor) return false; @@ -10220,7 +10540,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. - // Skip the analysis of CmpInsts.Compiler implements postanalysis of the + // Skip the analysis of CmpInsts. Compiler implements postanalysis of the // CmpInsts so we can skip extra attempts in // tryToVectorizeHorReductionOrInstOperands and save compile time. std::queue<std::pair<Instruction *, unsigned>> Stack; @@ -10228,13 +10548,16 @@ static bool tryToVectorizeHorReductionOrInstOperands( SmallPtrSet<Value *, 8> VisitedInstrs; SmallVector<WeakTrackingVH> PostponedInsts; bool Res = false; - auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0, - Value *&B1) -> Value * { + auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst, + Value *&B0, + Value *&B1) -> Value * { + if (R.isAnalizedReductionRoot(Inst)) + return nullptr; bool IsBinop = matchRdxBop(Inst, B0, B1); bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); if (IsBinop || IsSelect) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst)) + if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI)) return HorRdx.tryToReduce(R, TTI); } return nullptr; @@ -10279,7 +10602,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( // Do not try to vectorize CmpInst operands, this is done separately. // Final attempt for binop args vectorization should happen after the loop // to try to find reductions. - if (!isa<CmpInst>(Inst)) + if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst)) PostponedInsts.push_back(Inst); } @@ -10292,8 +10615,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( if (auto *I = dyn_cast<Instruction>(Op)) // Do not try to vectorize CmpInst operands, this is done // separately. 
- if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) && - I->getParent() == BB) + if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) && + !R.isDeleted(I) && I->getParent() == BB) Stack.emplace(I, Level); } // Try to vectorized binops where reductions were not found. @@ -10317,8 +10640,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { return tryToVectorize(I, R); }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, - ExtraVectorization); + return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL, + *TLI, ExtraVectorization); } bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, @@ -10486,12 +10809,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( for (auto *I : reverse(Instructions)) { if (R.isDeleted(I)) continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) { OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) { OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); - else if (isa<CmpInst>(I)) + } else if (isa<CmpInst>(I)) { PostponedCmps.push_back(I); + continue; + } + // Try to find reductions in buildvector sequnces. + OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI); } if (AtTerminator) { // Try to find reductions first. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 02550dad..21bd231 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1420,6 +1420,9 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, getCondOp()->printAsOperand(O, SlotTracker); } O << ")"; + if (RdxDesc->IntermediateStore) + O << " (with final reduction value stored in invariant address sank " + "outside of loop)"; } void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 05fc8c6..f26babe 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -257,12 +257,12 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { ExtractElementInst *VectorCombine::getShuffleExtract( ExtractElementInst *Ext0, ExtractElementInst *Ext1, unsigned PreferredExtractIndex = InvalidIndex) const { - assert(isa<ConstantInt>(Ext0->getIndexOperand()) && - isa<ConstantInt>(Ext1->getIndexOperand()) && - "Expected constant extract indexes"); + auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand()); + auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand()); + assert(Index0C && Index1C && "Expected constant extract indexes"); - unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue(); - unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue(); + unsigned Index0 = Index0C->getZExtValue(); + unsigned Index1 = Index1C->getZExtValue(); // If the extract indexes are identical, no shuffle is needed. 
if (Index0 == Index1) @@ -308,9 +308,10 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex) { - assert(isa<ConstantInt>(Ext0->getOperand(1)) && - isa<ConstantInt>(Ext1->getOperand(1)) && - "Expected constant extract indexes"); + auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1)); + auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1)); + assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes"); + unsigned Opcode = I.getOpcode(); Type *ScalarTy = Ext0->getType(); auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); @@ -333,8 +334,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // Get cost estimates for the extract elements. These costs will factor into // both sequences. - unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue(); - unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue(); + unsigned Ext0Index = Ext0IndexC->getZExtValue(); + unsigned Ext1Index = Ext1IndexC->getZExtValue(); InstructionCost Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index); |
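The two VectorCombine hunks above replace the pattern "assert(isa<ConstantInt>(...)) followed by a later cast<ConstantInt>" with a single dyn_cast whose result is asserted and then reused, so each index operand is inspected once and the value is still available in release builds where the assert compiles away. A small standalone sketch of that idiom, using a hypothetical getConstantExtractIndex helper, is shown below.

    #include "llvm/IR/Instructions.h"
    #include <cassert>

    using namespace llvm;

    // Hypothetical helper showing the dyn_cast-then-assert idiom: the cast
    // result is computed once, checked when asserts are enabled, and reused.
    static unsigned getConstantExtractIndex(ExtractElementInst *Ext) {
      auto *IdxC = dyn_cast<ConstantInt>(Ext->getIndexOperand());
      // Callers are expected to pass extracts with constant indexes, as in
      // the code above; the assert documents and checks that contract.
      assert(IdxC && "Expected a constant extract index");
      return IdxC->getZExtValue();
    }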