diff options
Diffstat (limited to 'llvm/lib/Transforms')
| -rw-r--r-- | llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 40 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 6 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp | 4 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/LoopVersioning.cpp | 9 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 24 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 11 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 4 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 119 |
14 files changed, 159 insertions, 103 deletions
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 7a95df4..b575d76 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N); // We can't know the precise weights here, as they would depend on the value // distribution of Call->getArgOperand(1). So we just mark it as "unknown". - setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); Type *IndexTy = DL.getIndexType(Call->getType()); SmallVector<DominatorTree::UpdateType, 8> Updates; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index d85e4f7..9bdd8cb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -479,7 +479,7 @@ private: const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr); - setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F); return Sel; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c..b158e0f 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 019536ca..9070d25 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -72,6 +72,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -105,6 +106,7 @@ STATISTIC( STATISTIC(NumShiftUntilZero, "Number of uncountable loops recognized as 'shift until zero' idiom"); +namespace llvm { bool DisableLIRP::All; static cl::opt<bool, true> DisableLIRPAll("disable-" DEBUG_TYPE "-all", @@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic( cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), cl::Hidden); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + +} // namespace llvm + namespace { class LoopIdiomRecognize { @@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { // The loop trip count check. auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, CurLoop->getName() + ".ivcheck"); - Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1)) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. + setBranchWeights(*BI, BranchWeights, + /*IsExpected=*/false); + } + LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. @@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE, /// %start = <...> /// %extraoffset = <...> /// <...> -/// br label %for.cond +/// br label %loop /// /// loop: -/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] +/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ] /// %nbits = add nsw i8 %iv, %extraoffset /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 @@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() { // The loop terminator. Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); - Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (InvertedCond) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. + setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false); + } LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index b9546c5..e902b71 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -393,6 +394,17 @@ private: DTUpdates.push_back({DominatorTree::Insert, Preheader, BB}); ++NumLoopExitsDeleted; } + // We don't really need to add branch weights to DummySwitch, because all + // but one branches are just a temporary artifact - see the comment on top + // of this function. But, it's easy to estimate the weights, and it helps + // maintain a property of the overall compiler - that the branch weights + // don't "just get dropped" accidentally (i.e. profcheck) + if (DummySwitch->getParent()->getParent()->hasProfileData()) { + SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases()); + // default. 100% probability, the rest are dead. + DummyBranchWeights[0] = 1; + setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false); + } assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?"); if (Loop *OuterLoop = LI.getLoopFor(Preheader)) { diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 0577ddb..0f3e664 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -330,8 +330,7 @@ static void buildPartialUnswitchConditionalBranch( HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) : nullptr); if (!HasBranchWeights) - setExplicitlyUnknownBranchWeightsIfProfiled( - *BR, *BR->getParent()->getParent(), DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Copy a set of loop invariant values, and conditionally branch on them. @@ -389,8 +388,7 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? &NormalSucc : &UnswitchedSucc, ProfData); if (!ProfData) - setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -3204,8 +3202,7 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); // We don't know anything about the relation between the limits. - setExplicitlyUnknownBranchWeightsIfProfiled( - *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); Builder.CreateCondBr( diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 573a781..02b73e8 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1283,6 +1283,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_ilogbl: case LibFunc_logf: case LibFunc_logl: + case LibFunc_nextafter: + case LibFunc_nextafterf: + case LibFunc_nextafterl: + case LibFunc_nexttoward: + case LibFunc_nexttowardf: + case LibFunc_nexttowardl: case LibFunc_pow: case LibFunc_powf: case LibFunc_powl: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 6d4436b..dd8706c 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -54,8 +54,8 @@ PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, const DataLayout &DL = M.getDataLayout(); const Triple &TT = M.getTargetTriple(); - for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { - if (Impl == RTLIB::Unsupported) + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (!RTLCI.isAvailable(Impl)) continue; auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index ec2e6c1..9c8b6ef 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -109,8 +110,12 @@ void LoopVersioning::versionLoop( // Insert the conditional branch based on the result of the memchecks. Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); Builder.SetInsertPoint(OrigTerm); - Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader()); + auto *BI = + Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), + VersionedLoop->getLoopPreheader()); + // We don't know what the probability of executing the versioned vs the + // unversioned variants is. + setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. This is now dominated by the diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 3a3e3ad..37c048f 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5214,8 +5214,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, // We don't have any info about this condition. auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB) : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); - setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE); OldTI->eraseFromParent(); @@ -7732,19 +7731,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, // label. The other is those powers of 2 that don't appear in the case // statement. We don't know the distribution of the values coming in, so // the safest is to split 50-50 the original probability to `default`. - uint64_t OrigDenominator = sum_of(map_range( - Weights, [](const auto &V) { return static_cast<uint64_t>(V); })); + uint64_t OrigDenominator = + sum_of(map_range(Weights, StaticCastTo<uint64_t>)); SmallVector<uint64_t> NewWeights(2); NewWeights[1] = Weights[0] / 2; NewWeights[0] = OrigDenominator - NewWeights[1]; setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false); - - // For the original switch, we reduce the weight of the default by the - // amount by which the previous branch contributes to getting to default, - // and then make sure the remaining weights have the same relative ratio - // wrt eachother. + // The probability of executing the default block stays constant. It was + // p_d = Weights[0] / OrigDenominator + // we rewrite as W/D + // We want to find the probability of the default branch of the switch + // statement. Let's call it X. We have W/D = W/2D + X * (1-W/2D) + // i.e. the original probability is the probability we go to the default + // branch from the BI branch, or we take the default branch on the SI. + // Meaning X = W / (2D - W), or (W/2) / (D - W/2) + // This matches using W/2 for the default branch probability numerator and + // D-W/2 as the denominator. + Weights[0] = NewWeights[1]; uint64_t CasesDenominator = OrigDenominator - Weights[0]; - Weights[0] /= 2; for (auto &W : drop_begin(Weights)) W = NewWeights[0] * static_cast<double>(W) / CasesDenominator; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 428a8f4..dd26a05 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -304,18 +304,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { } bool IsSingleScalar = vputils::isSingleScalar(Def); - VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1); - // Check if there is a scalar value for the selected lane. - if (!hasScalarValue(Def, LastLane)) { - // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be a single scalar. - assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe, - VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && - "unexpected recipe found to be invariant"); - IsSingleScalar = true; - LastLane = 0; - } // We need to construct the vector value for a single-scalar value by // broadcasting the scalar to all lanes. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cfe1f1e..22ea083 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1725,7 +1725,9 @@ public: #endif }; -/// A recipe for widening select instructions. +/// A recipe for widening select instructions. Supports both wide vector and +/// single-scalar conditions, matching the behavior of LLVM IR's select +/// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenSelectRecipe(SelectInst &I, ArrayRef<VPValue *> Operands) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1ee405a..f792d0a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -659,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::Select: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); + Value *Cond = + State.get(getOperand(0), + OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0))); Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); return Builder.CreateSelect(Cond, Op1, Op2, Name); @@ -1968,16 +1970,13 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(1)->printAsOperand(O, SlotTracker); O << ", "; getOperand(2)->printAsOperand(O, SlotTracker); - O << (isInvariantCond() ? " (condition is loop invariant)" : ""); + O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)" + : ""); } #endif void VPWidenSelectRecipe::execute(VPTransformState &State) { - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - Value *Cond = State.get(getCond(), isInvariantCond()); + Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond())); Value *Op0 = State.get(getOperand(1)); Value *Op1 = State.get(getOperand(2)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 2588c87..8ad772f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -167,8 +167,7 @@ static bool sinkScalarOperands(VPlan &Plan) { if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) return; - if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || - Candidate->mayReadOrWriteMemory()) + if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate)) return; if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) @@ -1287,6 +1286,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { return; } + // Look through broadcast of single-scalar when used as select conditions; in + // that case the scalar condition can be used directly. + if (match(Def, + m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue())) && + vputils::isSingleScalar(C)) { + Def->setOperand(0, C); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -4175,6 +4183,59 @@ static bool isAlreadyNarrow(VPValue *VPV) { return RepR && RepR->isSingleScalar(); } +// Convert a wide recipe defining a VPValue \p V feeding an interleave group to +// a narrow variant. +static VPValue * +narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) { + auto *R = V->getDefiningRecipe(); + if (!R || NarrowedOps.contains(V)) + return V; + + if (isAlreadyNarrow(V)) + return V; + + if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) { + for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) + WideMember0->setOperand( + Idx, + narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps)); + return V; + } + + if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { + // Narrow interleave group to wide load, as transformed VPlan will only + // process one original iteration. + auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); + auto *L = new VPWidenLoadRecipe( + *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, + /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc()); + L->insertBefore(LoadGroup); + NarrowedOps.insert(L); + return L; + } + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { + assert(RepR->isSingleScalar() && + isa<LoadInst>(RepR->getUnderlyingInstr()) && + "must be a single scalar load"); + NarrowedOps.insert(RepR); + return RepR; + } + + auto *WideLoad = cast<VPWidenLoadRecipe>(R); + VPValue *PtrOp = WideLoad->getAddr(); + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) + PtrOp = VecPtr->getOperand(0); + // Narrow wide load to uniform scalar load, as transformed VPlan will only + // process one original iteration. + auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, + /*IsUniform*/ true, + /*Mask*/ nullptr, *WideLoad); + N->insertBefore(WideLoad); + NarrowedOps.insert(N); + return N; +} + void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); @@ -4276,60 +4337,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. SmallPtrSet<VPValue *, 4> NarrowedOps; - auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * { - auto *R = V->getDefiningRecipe(); - if (!R || NarrowedOps.contains(V)) - return V; - if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { - // Narrow interleave group to wide load, as transformed VPlan will only - // process one original iteration. - auto *LI = - cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); - auto *L = new VPWidenLoadRecipe( - *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, - /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc()); - L->insertBefore(LoadGroup); - NarrowedOps.insert(L); - return L; - } - - if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { - assert(RepR->isSingleScalar() && - isa<LoadInst>(RepR->getUnderlyingInstr()) && - "must be a single scalar load"); - NarrowedOps.insert(RepR); - return RepR; - } - auto *WideLoad = cast<VPWidenLoadRecipe>(R); - - VPValue *PtrOp = WideLoad->getAddr(); - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) - PtrOp = VecPtr->getOperand(0); - // Narrow wide load to uniform scalar load, as transformed VPlan will only - // process one original iteration. - auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, - /*IsUniform*/ true, - /*Mask*/ nullptr, *WideLoad); - N->insertBefore(WideLoad); - NarrowedOps.insert(N); - return N; - }; - // Narrow operation tree rooted at store groups. for (auto *StoreGroup : StoreGroups) { - VPValue *Res = nullptr; - VPValue *Member0 = StoreGroup->getStoredValues()[0]; - if (isAlreadyNarrow(Member0)) { - Res = Member0; - } else if (auto *WideMember0 = - dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) { - for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) - WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx))); - Res = WideMember0; - } else { - Res = NarrowOp(Member0); - } - + VPValue *Res = + narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps); auto *SI = cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()); auto *S = new VPWidenStoreRecipe( |
