Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 60
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6
-rw-r--r-- | llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 35
-rw-r--r-- | llvm/lib/Transforms/Scalar/GVN.cpp | 1
-rw-r--r-- | llvm/lib/Transforms/Scalar/NewGVN.cpp | 1
-rw-r--r-- | llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 2
-rw-r--r-- | llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 939
-rw-r--r-- | llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 31
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 9
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 9
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1
12 files changed, 549 insertions, 553 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index aa030294..127a506 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -60,6 +60,58 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, return true; } +/// Let N = 2 * M. +/// Given an N-bit integer representing a pack of two M-bit integers, +/// we can select one of the packed integers by right-shifting by either +/// zero or M (which is the most straightforward to check if M is a power +/// of 2), and then isolating the lower M bits. In this case, we can +/// represent the shift as a select on whether the shr amount is nonzero. +static Value *simplifyShiftSelectingPackedElement(Instruction *I, + const APInt &DemandedMask, + InstCombinerImpl &IC, + unsigned Depth) { + assert(I->getOpcode() == Instruction::LShr && + "Only lshr instruction supported"); + + uint64_t ShlAmt; + Value *Upper, *Lower; + if (!match(I->getOperand(0), + m_OneUse(m_c_DisjointOr( + m_OneUse(m_Shl(m_Value(Upper), m_ConstantInt(ShlAmt))), + m_Value(Lower))))) + return nullptr; + + if (!isPowerOf2_64(ShlAmt)) + return nullptr; + + const uint64_t DemandedBitWidth = DemandedMask.getActiveBits(); + if (DemandedBitWidth > ShlAmt) + return nullptr; + + // Check that upper demanded bits are not lost from lshift. + if (Upper->getType()->getScalarSizeInBits() < ShlAmt + DemandedBitWidth) + return nullptr; + + KnownBits KnownLowerBits = IC.computeKnownBits(Lower, I, Depth); + if (!KnownLowerBits.getMaxValue().isIntN(ShlAmt)) + return nullptr; + + Value *ShrAmt = I->getOperand(1); + KnownBits KnownShrBits = IC.computeKnownBits(ShrAmt, I, Depth); + + // Verify that ShrAmt is either exactly ShlAmt (which is a power of 2) or + // zero. + if (~KnownShrBits.Zero != ShlAmt) + return nullptr; + + Value *ShrAmtZ = + IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()), + ShrAmt->getName() + ".z"); + Value *Select = IC.Builder.CreateSelect(ShrAmtZ, Lower, Upper); + Select->takeName(I); + return Select; +} + /// Returns the bitwidth of the given scalar or pointer type. For vector types, /// returns the element type's bitwidth. static unsigned getBitWidth(Type *Ty, const DataLayout &DL) { @@ -798,9 +850,13 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, Known >>= ShiftAmt; if (ShiftAmt) Known.Zero.setHighBits(ShiftAmt); // high bits known zero. - } else { - llvm::computeKnownBits(I, Known, Q, Depth); + break; } + if (Value *V = + simplifyShiftSelectingPackedElement(I, DemandedMask, *this, Depth)) + return V; + + llvm::computeKnownBits(I, Known, Q, Depth); break; } case Instruction::AShr: { diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 048cdf4..d56a1af 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1970,12 +1970,6 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, NewPhiValues.push_back(nullptr); OpsToMoveUseToIncomingBB.push_back(i); - // If the InVal is an invoke at the end of the pred block, then we can't - // insert a computation after it without breaking the edge. - if (isa<InvokeInst>(InVal)) - if (cast<Instruction>(InVal)->getParent() == InBB) - return nullptr; - // Do not push the operation across a loop backedge. 
This could result in // an infinite combine loop, and is generally non-profitable (especially // if the operation was originally outside the loop). diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 584cdad..e448230 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -1206,19 +1206,18 @@ private: // value for the new predecessor ClonedBB. The value will either be the same // value from BB or a cloned value. for (BasicBlock *Succ : BlocksToUpdate) { - for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(II); - ++II) { - Value *Incoming = Phi->getIncomingValueForBlock(BB); + for (PHINode &Phi : Succ->phis()) { + Value *Incoming = Phi.getIncomingValueForBlock(BB); if (Incoming) { if (isa<Constant>(Incoming)) { - Phi->addIncoming(Incoming, ClonedBB); + Phi.addIncoming(Incoming, ClonedBB); continue; } Value *ClonedVal = VMap[Incoming]; if (ClonedVal) - Phi->addIncoming(ClonedVal, ClonedBB); + Phi.addIncoming(ClonedVal, ClonedBB); else - Phi->addIncoming(Incoming, ClonedBB); + Phi.addIncoming(Incoming, ClonedBB); } } } @@ -1313,27 +1312,19 @@ private: void cleanPhiNodes(BasicBlock *BB) { // If BB is no longer reachable, remove any remaining phi nodes if (pred_empty(BB)) { - std::vector<PHINode *> PhiToRemove; - for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) { - PhiToRemove.push_back(Phi); - } - for (PHINode *PN : PhiToRemove) { - PN->replaceAllUsesWith(PoisonValue::get(PN->getType())); - PN->eraseFromParent(); + for (PHINode &PN : make_early_inc_range(BB->phis())) { + PN.replaceAllUsesWith(PoisonValue::get(PN.getType())); + PN.eraseFromParent(); } return; } // Remove any incoming values that come from an invalid predecessor - for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) { - std::vector<BasicBlock *> BlocksToRemove; - for (BasicBlock *IncomingBB : Phi->blocks()) { - if (!isPredecessor(BB, IncomingBB)) - BlocksToRemove.push_back(IncomingBB); - } - for (BasicBlock *BB : BlocksToRemove) - Phi->removeIncomingValue(BB); - } + for (PHINode &Phi : BB->phis()) + Phi.removeIncomingValueIf([&](unsigned Index) { + BasicBlock *IncomingBB = Phi.getIncomingBlock(Index); + return !isPredecessor(BB, IncomingBB); + }); } /// Checks if BB was already cloned for a particular next state value. 
If it diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index b9b5b58..638952a 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -699,6 +699,7 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::PtrToInt: + case Instruction::PtrToAddr: case Instruction::IntToPtr: case Instruction::AddrSpaceCast: case Instruction::BitCast: diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index d6b7633..3c1a8ba 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -2066,6 +2066,7 @@ NewGVN::performSymbolicEvaluation(Instruction *I, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::PtrToInt: + case Instruction::PtrToAddr: case Instruction::IntToPtr: case Instruction::Select: case Instruction::ExtractElement: diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 60e5df0..7ffccf7 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -355,6 +355,8 @@ void SimplifyCFGPass::printPipeline( OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;"; OS << (Options.ConvertSwitchRangeToICmp ? "" : "no-") << "switch-range-to-icmp;"; + OS << (Options.ConvertSwitchToArithmetic ? "" : "no-") + << "switch-to-arithmetic;"; OS << (Options.ConvertSwitchToLookupTable ? "" : "no-") << "switch-to-lookup;"; OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;"; diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 7cc9ff8..0c8d6fa 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -45,12 +45,6 @@ STATISTIC(NumInstrsHoisted, "Number of instructions hoisted into loop preheader"); STATISTIC(NumInstrsDuplicated, "Number of instructions cloned into loop preheader"); -STATISTIC(NumRotated, "Number of loops rotated"); - -static cl::opt<bool> - MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden, - cl::desc("Allow loop rotation multiple times in order to reach " - "a better latch exit")); // Probability that a rotated loop has zero trip count / is never entered. static constexpr uint32_t ZeroTripCountWeights[] = {1, 127}; @@ -206,50 +200,6 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) { return false; } -// Check that latch exit is deoptimizing (which means - very unlikely to happen) -// and there is another exit from the loop which is non-deoptimizing. -// If we rotate latch to that exit our loop has a better chance of being fully -// canonical. -// -// It can give false positives in some rare cases. -static bool canRotateDeoptimizingLatchExit(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - assert(Latch && "need latch"); - BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator()); - // Need normal exiting latch. - if (!BI || !BI->isConditional()) - return false; - - BasicBlock *Exit = BI->getSuccessor(1); - if (L->contains(Exit)) - Exit = BI->getSuccessor(0); - - // Latch exit is non-deoptimizing, no need to rotate. - if (!Exit->getPostdominatingDeoptimizeCall()) - return false; - - SmallVector<BasicBlock *, 4> Exits; - L->getUniqueExitBlocks(Exits); - if (!Exits.empty()) { - // There is at least one non-deoptimizing exit. 
- // - // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact, - // as it can conservatively return false for deoptimizing exits with - // complex enough control flow down to deoptimize call. - // - // That means here we can report success for a case where - // all exits are deoptimizing but one of them has complex enough - // control flow (e.g. with loops). - // - // That should be a very rare case and false positives for this function - // have compile-time effect only. - return any_of(Exits, [](const BasicBlock *BB) { - return !BB->getPostdominatingDeoptimizeCall(); - }); - } - return false; -} - static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, bool HasConditionalPreHeader, bool SuccsSwapped) { @@ -387,506 +337,489 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -/// -/// If -loop-rotate-multi is enabled we can do multiple rotations in one go -/// so to reach a suitable (non-deoptimizing) exit. bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; bool Rotated = false; - do { - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); - - BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) - return Rotated; - - // If the loop header is not one of the loop exiting blocks then - // either this loop is already rotated or it is not - // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return Rotated; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return Rotated; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return Rotated; + + // Rotate if the loop latch was just simplified. Or if it makes the loop exit + // count computable. Or if we think it will be profitable. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && + !profitableToRotateLoopExitingLatch(L)) + return Rotated; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. + { + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); + if (Metrics.notDuplicatable) { + LLVM_DEBUG( + dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); return Rotated; - - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. 
- if (!OrigLatch) + } + if (Metrics.Convergence != ConvergenceKind::None) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); return Rotated; - - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. Or if we think it will be profitable. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && - !profitableToRotateLoopExitingLatch(L) && - !canRotateDeoptimizingLatchExit(L)) + } + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" + " with invalid cost: "; + L->dump()); return Rotated; - - // Check size of original header and reject loop if it is very big or we can't - // duplicate blocks inside it. - { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); - if (Metrics.notDuplicatable) { - LLVM_DEBUG( - dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" - << " instructions: "; - L->dump()); - return Rotated; - } - if (Metrics.Convergence != ConvergenceKind::None) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " - "instructions: "; - L->dump()); - return Rotated; - } - if (!Metrics.NumInsts.isValid()) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" - " with invalid cost: "; - L->dump()); - return Rotated; - } - if (Metrics.NumInsts > MaxHeaderSize) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " - << Metrics.NumInsts - << " instructions, which is more than the threshold (" - << MaxHeaderSize << " instructions): "; - L->dump()); - ++NumNotRotatedDueToHeaderSize; - return Rotated; - } - - // When preparing for LTO, avoid rotating loops with calls that could be - // inlined during the LTO stage. - if (PrepareForLTO && Metrics.NumInlineCandidates > 0) - return Rotated; } - - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); - - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader || !L->hasDedicatedExits()) + if (Metrics.NumInsts > MaxHeaderSize) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " + << Metrics.NumInsts + << " instructions, which is more than the threshold (" + << MaxHeaderSize << " instructions): "; + L->dump()); + ++NumNotRotatedDueToHeaderSize; return Rotated; - - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. We should also invalidate - // all outer loops because insertion and deletion of blocks that happens - // during the rotation may violate invariants related to backedge taken - // infos in them. - if (SE) { - SE->forgetTopmostLoop(L); - // We may hoist some instructions out of loop. In case if they were cached - // as "loop variant" or "loop computable", these caches must be dropped. - // We also may fold basic blocks, so cached block dispositions also need - // to be dropped. - SE->forgetBlockAndLoopDispositions(); } - LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. Header's other successor is outside the - // loop. 
Otherwise loop is not suitable for rotation. - BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - bool BISuccsSwapped = L->contains(Exit); - if (BISuccsSwapped) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. - BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap, ValueMapMSSA; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - InsertNewValueIntoMap(ValueMap, PN, - PN->getIncomingValueForBlock(OrigPreheader)); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); - - // Record all debug records preceding LoopEntryBranch to avoid - // duplication. - using DbgHash = - std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; - auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { - auto VarLocOps = D->location_ops(); - return {{hash_combine_range(VarLocOps), D->getVariable()}, - D->getExpression()}; - }; - - SmallDenseSet<DbgHash, 8> DbgRecords; - // Build DbgVariableRecord hashes for DbgVariableRecords attached to the - // terminator. - for (const DbgVariableRecord &DVR : - filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) - DbgRecords.insert(makeHash(&DVR)); - - // Remember the local noalias scope declarations in the header. After the - // rotation, they must be duplicated and the scope must be cloned. This - // avoids unwanted interaction across iterations. - SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; - for (Instruction &I : *OrigHeader) - if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) - NoAliasDeclInstructions.push_back(Decl); - - Module *M = OrigHeader->getModule(); - - // Track the next DbgRecord to clone. If we have a sequence where an - // instruction is hoisted instead of being cloned: - // DbgRecord blah - // %foo = add i32 0, 0 - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once - // attached to %foo, then when %foo his hoisted it will "fall down" onto the - // function call: - // DbgRecord blah - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // causing it to appear attached to the call too. - // - // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from - // here" position to account for this behaviour. We point it at any - // DbgRecords on the next instruction, here labelled xyzzy, before we hoist - // %foo. Later, we only only clone DbgRecords from that position (xyzzy) - // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as - // a range because it gives us a natural way of testing whether - // there were DbgRecords on the next instruction before we hoisted things). - iterator_range<DbgRecord::self_iterator> NextDbgInsts = - (I != E) ? 
I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); - - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). - if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !Inst->isTerminator() && - !isa<AllocaInst>(Inst) && - // It is not safe to hoist the value of these instructions in - // coroutines, as the addresses of otherwise eligible variables (e.g. - // thread-local variables and errno) may change if the coroutine is - // resumed in a different thread.Therefore, we disable this - // optimization for correctness. However, this may block other correct - // optimizations. - // FIXME: This should be reverted once we have a better model for - // memory access in coroutines. - !Inst->getFunction()->isPresplitCoroutine()) { - - if (!NextDbgInsts.empty()) { - auto DbgValueRange = - LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, DbgValueRange, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Erase anything we've seen before. - for (DbgVariableRecord &DVR : - make_early_inc_range(filterDbgVars(DbgValueRange))) - if (DbgRecords.count(makeHash(&DVR))) - DVR.eraseFromParent(); - } - - NextDbgInsts = I->getDbgRecordRange(); - - Inst->moveBefore(LoopEntryBranch->getIterator()); + // When preparing for LTO, avoid rotating loops with calls that could be + // inlined during the LTO stage. + if (PrepareForLTO && Metrics.NumInlineCandidates > 0) + return Rotated; + } - ++NumInstrsHoisted; - continue; - } + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader || !L->hasDedicatedExits()) + return Rotated; + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. We should also invalidate + // all outer loops because insertion and deletion of blocks that happens + // during the rotation may violate invariants related to backedge taken + // infos in them. + if (SE) { + SE->forgetTopmostLoop(L); + // We may hoist some instructions out of loop. In case if they were cached + // as "loop variant" or "loop computable", these caches must be dropped. + // We also may fold basic blocks, so cached block dispositions also need + // to be dropped. + SE->forgetBlockAndLoopDispositions(); + } - // Otherwise, create a duplicate of the instruction. - Instruction *C = Inst->clone(); - if (const DebugLoc &DL = C->getDebugLoc()) - mapAtomInstance(DL, ValueMap); + LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); - C->insertBefore(LoopEntryBranch->getIterator()); + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. 
+ BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + bool BISuccsSwapped = L->contains(Exit); + if (BISuccsSwapped) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. + assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap, ValueMapMSSA; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) + InsertNewValueIntoMap(ValueMap, PN, + PN->getIncomingValueForBlock(OrigPreheader)); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug records preceding LoopEntryBranch to avoid + // duplication. + using DbgHash = + std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; + auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { + auto VarLocOps = D->location_ops(); + return {{hash_combine_range(VarLocOps), D->getVariable()}, + D->getExpression()}; + }; - ++NumInstrsDuplicated; + SmallDenseSet<DbgHash, 8> DbgRecords; + // Build DbgVariableRecord hashes for DbgVariableRecords attached to the + // terminator. + for (const DbgVariableRecord &DVR : + filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) + DbgRecords.insert(makeHash(&DVR)); + + // Remember the local noalias scope declarations in the header. After the + // rotation, they must be duplicated and the scope must be cloned. This + // avoids unwanted interaction across iterations. + SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; + for (Instruction &I : *OrigHeader) + if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) + NoAliasDeclInstructions.push_back(Decl); + + Module *M = OrigHeader->getModule(); + + // Track the next DbgRecord to clone. If we have a sequence where an + // instruction is hoisted instead of being cloned: + // DbgRecord blah + // %foo = add i32 0, 0 + // DbgRecord xyzzy + // %bar = call i32 @foobar() + // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once + // attached to %foo, then when %foo his hoisted it will "fall down" onto the + // function call: + // DbgRecord blah + // DbgRecord xyzzy + // %bar = call i32 @foobar() + // causing it to appear attached to the call too. + // + // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from + // here" position to account for this behaviour. We point it at any + // DbgRecords on the next instruction, here labelled xyzzy, before we hoist + // %foo. Later, we only only clone DbgRecords from that position (xyzzy) + // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as + // a range because it gives us a natural way of testing whether + // there were DbgRecords on the next instruction before we hoisted things). + iterator_range<DbgRecord::self_iterator> NextDbgInsts = + (I != E) ? 
I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); + + while (I != E) { + Instruction *Inst = &*I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !Inst->isTerminator() && + !isa<AllocaInst>(Inst) && + // It is not safe to hoist the value of these instructions in + // coroutines, as the addresses of otherwise eligible variables (e.g. + // thread-local variables and errno) may change if the coroutine is + // resumed in a different thread.Therefore, we disable this + // optimization for correctness. However, this may block other correct + // optimizations. + // FIXME: This should be reverted once we have a better model for + // memory access in coroutines. + !Inst->getFunction()->isPresplitCoroutine()) { if (!NextDbgInsts.empty()) { - auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, Range, ValueMap, + auto DbgValueRange = + LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); + RemapDbgRecordRange(M, DbgValueRange, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - NextDbgInsts = DbgMarker::getEmptyDbgRecordRange(); // Erase anything we've seen before. for (DbgVariableRecord &DVR : - make_early_inc_range(filterDbgVars(Range))) + make_early_inc_range(filterDbgVars(DbgValueRange))) if (DbgRecords.count(makeHash(&DVR))) DVR.eraseFromParent(); } - // Eagerly remap the operands of the instruction. - RemapInstruction(C, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - - // With the operands remapped, see if the instruction constant folds or is - // otherwise simplifyable. This commonly occurs because the entry from PHI - // nodes allows icmps and other instructions to fold. - Value *V = simplifyInstruction(C, SQ); - if (V && LI->replacementPreservesLCSSAForm(C, V)) { - // If so, then delete the temporary instruction and stick the folded value - // in the map. - InsertNewValueIntoMap(ValueMap, Inst, V); - if (!C->mayHaveSideEffects()) { - C->eraseFromParent(); - C = nullptr; - } - } else { - InsertNewValueIntoMap(ValueMap, Inst, C); - } - if (C) { - // Otherwise, stick the new instruction into the new block! - C->setName(Inst->getName()); - - if (auto *II = dyn_cast<AssumeInst>(C)) - AC->registerAssumption(II); - // MemorySSA cares whether the cloned instruction was inserted or not, and - // not whether it can be remapped to a simplified value. - if (MSSAU) - InsertNewValueIntoMap(ValueMapMSSA, Inst, C); - } - } + NextDbgInsts = I->getDbgRecordRange(); - if (!NoAliasDeclInstructions.empty()) { - // There are noalias scope declarations: - // (general): - // Original: OrigPre { OrigHeader NewHeader ... Latch } - // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } - // - // with D: llvm.experimental.noalias.scope.decl, - // U: !noalias or !alias.scope depending on D - // ... { D U1 U2 } can transform into: - // (0) : ... { D U1 U2 } // no relevant rotation for this part - // (1) : ... D' { U1 U2 D } // D is part of OrigHeader - // (2) : ... 
D' U1' { U2 D U1 } // D, U1 are part of OrigHeader - // - // We now want to transform: - // (1) -> : ... D' { D U1 U2 D'' } - // (2) -> : ... D' U1' { D U2 D'' U1'' } - // D: original llvm.experimental.noalias.scope.decl - // D', U1': duplicate with replaced scopes - // D'', U1'': different duplicate with replaced scopes - // This ensures a safe fallback to 'may_alias' introduced by the rotate, - // as U1'' and U1' scopes will not be compatible wrt to the local restrict - - // Clone the llvm.experimental.noalias.decl again for the NewHeader. - BasicBlock::iterator NewHeaderInsertionPoint = - NewHeader->getFirstNonPHIIt(); - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { - LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" - << *NAD << "\n"); - Instruction *NewNAD = NAD->clone(); - NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); - } + Inst->moveBefore(LoopEntryBranch->getIterator()); - // Scopes must now be duplicated, once for OrigHeader and once for - // OrigPreHeader'. - { - auto &Context = NewHeader->getContext(); - - SmallVector<MDNode *, 8> NoAliasDeclScopes; - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) - NoAliasDeclScopes.push_back(NAD->getScopeList()); - - LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, - "h.rot"); - LLVM_DEBUG(OrigHeader->dump()); - - // Keep the compile time impact low by only adapting the inserted block - // of instructions in the OrigPreHeader. This might result in slightly - // more aliasing between these instructions and those that were already - // present, but it will be much faster when the original PreHeader is - // large. - LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); - auto *FirstDecl = - cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); - auto *LastInst = &OrigPreheader->back(); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, - Context, "pre.rot"); - LLVM_DEBUG(OrigPreheader->dump()); - - LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); - LLVM_DEBUG(NewHeader->dump()); - } + ++NumInstrsHoisted; + continue; } - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - for (BasicBlock *SuccBB : successors(OrigHeader)) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - OrigPreheader->flushTerminatorDbgRecords(); - - // Update MemorySSA before the rewrite call below changes the 1:1 - // instruction:cloned_instruction_or_value mapping. - if (MSSAU) { - InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); - MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, - ValueMapMSSA); - } + // Otherwise, create a duplicate of the instruction. 
+ Instruction *C = Inst->clone(); + if (const DebugLoc &DL = C->getDebugLoc()) + mapAtomInstance(DL, ValueMap); - SmallVector<PHINode*, 2> InsertedPHIs; - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, - &InsertedPHIs); - - // Attach debug records to the new phis if that phi uses a value that - // previously had debug metadata attached. This keeps the debug info - // up-to-date in the loop body. - if (!InsertedPHIs.empty()) - insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // Inform DT about changes to the CFG. - if (DT) { - // The OrigPreheader branches to the NewHeader and Exit now. Then, inform - // the DT about the removed edge to the OrigHeader (that got removed). - SmallVector<DominatorTree::UpdateType, 3> Updates = { - {DominatorTree::Insert, OrigPreheader, Exit}, - {DominatorTree::Insert, OrigPreheader, NewHeader}, - {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - - if (MSSAU) { - MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); - if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - } else { - DT->applyUpdates(Updates); - } - } + C->insertBefore(LoopEntryBranch->getIterator()); - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. - BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - const Value *Cond = PHBI->getCondition(); - const bool HasConditionalPreHeader = - !isa<ConstantInt>(Cond) || - PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; - - updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); + ++NumInstrsDuplicated; - if (HasConditionalPreHeader) { - // The conditional branch can't be folded, handle the general case. - // Split edges as necessary to preserve LoopSimplify form. - - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. Note that Exit could be an exit block for multiple - // nested loops, causing both of the edges to now be critical and need to - // be split. - SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); - bool SplitLatchEdge = false; - for (BasicBlock *ExitPred : ExitPreds) { - // We only need to split loop exit edges. 
- Loop *PredLoop = LI->getLoopFor(ExitPred); - if (!PredLoop || PredLoop->contains(Exit) || - isa<IndirectBrInst>(ExitPred->getTerminator())) - continue; - SplitLatchEdge |= L->getLoopLatch() == ExitPred; - BasicBlock *ExitSplit = SplitCriticalEdge( - ExitPred, Exit, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - ExitSplit->moveBefore(Exit); + if (!NextDbgInsts.empty()) { + auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); + RemapDbgRecordRange(M, Range, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + NextDbgInsts = DbgMarker::getEmptyDbgRecordRange(); + // Erase anything we've seen before. + for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range))) + if (DbgRecords.count(makeHash(&DVR))) + DVR.eraseFromParent(); + } + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = simplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + InsertNewValueIntoMap(ValueMap, Inst, V); + if (!C->mayHaveSideEffects()) { + C->eraseFromParent(); + C = nullptr; } - assert(SplitLatchEdge && - "Despite splitting all preds, failed to split latch exit?"); - (void)SplitLatchEdge; } else { - // We can fold the conditional branch in the preheader, this makes things - // simpler. The first step is to remove the extra edge to the Exit block. - Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); - NewBI->setDebugLoc(PHBI->getDebugLoc()); - PHBI->eraseFromParent(); + InsertNewValueIntoMap(ValueMap, Inst, C); + } + if (C) { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + + if (auto *II = dyn_cast<AssumeInst>(C)) + AC->registerAssumption(II); + // MemorySSA cares whether the cloned instruction was inserted or not, and + // not whether it can be remapped to a simplified value. + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); + } + } - // With our CFG finalized, update DomTree if it is available. - if (DT) DT->deleteEdge(OrigPreheader, Exit); + if (!NoAliasDeclInstructions.empty()) { + // There are noalias scope declarations: + // (general): + // Original: OrigPre { OrigHeader NewHeader ... Latch } + // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } + // + // with D: llvm.experimental.noalias.scope.decl, + // U: !noalias or !alias.scope depending on D + // ... { D U1 U2 } can transform into: + // (0) : ... { D U1 U2 } // no relevant rotation for this part + // (1) : ... D' { U1 U2 D } // D is part of OrigHeader + // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader + // + // We now want to transform: + // (1) -> : ... D' { D U1 U2 D'' } + // (2) -> : ... D' U1' { D U2 D'' U1'' } + // D: original llvm.experimental.noalias.scope.decl + // D', U1': duplicate with replaced scopes + // D'', U1'': different duplicate with replaced scopes + // This ensures a safe fallback to 'may_alias' introduced by the rotate, + // as U1'' and U1' scopes will not be compatible wrt to the local restrict + + // Clone the llvm.experimental.noalias.decl again for the NewHeader. 
+ BasicBlock::iterator NewHeaderInsertionPoint = + NewHeader->getFirstNonPHIIt(); + for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { + LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" + << *NAD << "\n"); + Instruction *NewNAD = NAD->clone(); + NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); + } - // Update MSSA too, if available. - if (MSSAU) - MSSAU->removeEdge(OrigPreheader, Exit); + // Scopes must now be duplicated, once for OrigHeader and once for + // OrigPreHeader'. + { + auto &Context = NewHeader->getContext(); + + SmallVector<MDNode *, 8> NoAliasDeclScopes; + for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) + NoAliasDeclScopes.push_back(NAD->getScopeList()); + + LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, + "h.rot"); + LLVM_DEBUG(OrigHeader->dump()); + + // Keep the compile time impact low by only adapting the inserted block + // of instructions in the OrigPreHeader. This might result in slightly + // more aliasing between these instructions and those that were already + // present, but it will be much faster when the original PreHeader is + // large. + LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); + auto *FirstDecl = + cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); + auto *LastInst = &OrigPreheader->back(); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, + Context, "pre.rot"); + LLVM_DEBUG(OrigPreheader->dump()); + + LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); + LLVM_DEBUG(NewHeader->dump()); } + } - assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + for (BasicBlock *SuccBB : successors(OrigHeader)) + for (BasicBlock::iterator BI = SuccBB->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. + LoopEntryBranch->eraseFromParent(); + OrigPreheader->flushTerminatorDbgRecords(); + + // Update MemorySSA before the rewrite call below changes the 1:1 + // instruction:cloned_instruction_or_value mapping. + if (MSSAU) { + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); + MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, + ValueMapMSSA); + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + SmallVector<PHINode *, 2> InsertedPHIs; + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, + &InsertedPHIs); + + // Attach debug records to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); + + // NewHeader is now the header of the loop. 
+ L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector<DominatorTree::UpdateType, 3> Updates = { + {DominatorTree::Insert, OrigPreheader, Exit}, + {DominatorTree::Insert, OrigPreheader, NewHeader}, + {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); - bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); - if (DidMerge) - RemoveRedundantDbgInstrs(PredBB); + if (MSSAU) { + MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); + if (VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + } else { + DT->applyUpdates(Updates); + } + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + const Value *Cond = PHBI->getCondition(); + const bool HasConditionalPreHeader = + !isa<ConstantInt>(Cond) || + PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; + + updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); - LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + if (HasConditionalPreHeader) { + // The conditional branch can't be folded, handle the general case. + // Split edges as necessary to preserve LoopSimplify form. + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. + SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); + bool SplitLatchEdge = false; + for (BasicBlock *ExitPred : ExitPreds) { + // We only need to split loop exit edges. 
+ Loop *PredLoop = LI->getLoopFor(ExitPred); + if (!PredLoop || PredLoop->contains(Exit) || + isa<IndirectBrInst>(ExitPred->getTerminator())) + continue; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; + BasicBlock *ExitSplit = SplitCriticalEdge( + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); + (void)SplitLatchEdge; + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DT) + DT->deleteEdge(OrigPreheader, Exit); + + // Update MSSA too, if available. + if (MSSAU) + MSSAU->removeEdge(OrigPreheader, Exit); + } - ++NumRotated; + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); - Rotated = true; - SimplifiedLatch = false; + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); + bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); + if (DidMerge) + RemoveRedundantDbgInstrs(PredBB); - // Check that new latch is a deoptimizing exit and then repeat rotation if possible. - // Deoptimizing latch exit is not a generally typical case, so we just loop over. - // TODO: if it becomes a performance bottleneck extend rotation algorithm - // to handle multiple rotations in one go. - } while (MultiRotate && canRotateDeoptimizingLatchExit(L)); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index b8cfe3a..155fcc5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6642,6 +6642,9 @@ public: /// Return true if the replacement is a lookup table. bool isLookupTable(); + /// Return true if the replacement is a bit map. + bool isBitMap(); + private: // Depending on the switch, there are different alternatives. enum { @@ -6932,6 +6935,8 @@ Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; } bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; } +bool SwitchReplacement::isBitMap() { return Kind == BitMapKind; } + static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) { // 40% is the default density for building a jump table in optsize/minsize // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this @@ -7097,7 +7102,8 @@ static void reuseTableCompare( /// lookup tables. 
static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, DomTreeUpdater *DTU, const DataLayout &DL, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + bool ConvertSwitchToLookupTable) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); BasicBlock *BB = SI->getParent(); @@ -7262,6 +7268,8 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, bool AnyLookupTables = any_of( PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); }); + bool AnyBitMaps = any_of(PhiToReplacementMap, + [](auto &KV) { return KV.second.isBitMap(); }); // A few conditions prevent the generation of lookup tables: // 1. The target does not support lookup tables. @@ -7274,6 +7282,12 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Fn->getFnAttribute("no-jump-tables").getValueAsBool())) return false; + // In the early optimization pipeline, disable formation of lookup tables, + // bit maps and mask checks, as they may inhibit further optimization. + if (!ConvertSwitchToLookupTable && + (AnyLookupTables || AnyBitMaps || NeedMask)) + return false; + Builder.SetInsertPoint(SI); // TableIndex is the switch condition - TableIndexOffset if we don't // use the condition directly @@ -7929,14 +7943,13 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (Options.ForwardSwitchCondToPhi && forwardSwitchConditionToPHI(SI)) return requestResimplify(); - // The conversion from switch to lookup tables results in difficult-to-analyze - // code and makes pruning branches much harder. This is a problem if the - // switch expression itself can still be restricted as a result of inlining or - // CVP. Therefore, only apply this transformation during late stages of the - // optimisation pipeline. - if (Options.ConvertSwitchToLookupTable && - simplifySwitchLookup(SI, Builder, DTU, DL, TTI)) - return requestResimplify(); + // The conversion of switches to arithmetic or lookup table is disabled in + // the early optimization pipeline, as it may lose information or make the + // resulting code harder to analyze. 
+ if (Options.ConvertSwitchToArithmetic || Options.ConvertSwitchToLookupTable) + if (simplifySwitchLookup(SI, Builder, DTU, DL, TTI, + Options.ConvertSwitchToLookupTable)) + return requestResimplify(); if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI)) return requestResimplify(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index cee08ef..3f16b03 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4264,8 +4264,8 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { if (!Legal->isReductionVariable(&Phi)) return Legal->isFixedOrderRecurrence(&Phi); - RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); - return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum; + return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind()); })) return false; @@ -7282,8 +7282,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( if (!Exit->hasPredecessors()) continue; for (VPRecipeBase &PhiR : Exit->phis()) - SE.forgetLcssaPhiWithNewPredecessor( - OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction())); + SE.forgetLcssaPhiWithNewPredecessor(OrigLoop, + &cast<VPIRPhi>(PhiR).getIRPhi()); } // Forget the original loop and block dispositions. SE.forgetLoop(OrigLoop); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 91c3d42..cfa8d27 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10621,7 +10621,8 @@ class InstructionsCompatibilityAnalysis { /// elements. static bool isSupportedOpcode(const unsigned Opcode) { return Opcode == Instruction::Add || Opcode == Instruction::LShr || - Opcode == Instruction::Shl; + Opcode == Instruction::Shl || Opcode == Instruction::SDiv || + Opcode == Instruction::UDiv; } /// Identifies the best candidate value, which represents main opcode @@ -10939,6 +10940,8 @@ public: case Instruction::Add: case Instruction::LShr: case Instruction::Shl: + case Instruction::SDiv: + case Instruction::UDiv: VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); break; default: @@ -22066,8 +22069,10 @@ bool BoUpSLP::collectValuesToDemote( auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) { assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); return all_of(E.Scalars, [&](Value *V) { - auto *I = cast<Instruction>(V); APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); + if (E.hasCopyableElements() && E.isCopyableElement(V)) + return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)); + auto *I = cast<Instruction>(V); return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) && MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)); }); diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index b36298f..81deba2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -840,8 +840,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { // TODO: Support multiple MaxNum/MinNum reductions and other reductions. 
if (RedPhiR) return false; - if (Cur->getRecurrenceKind() != RecurKind::FMaxNum && - Cur->getRecurrenceKind() != RecurKind::FMinNum) { + if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + Cur->getRecurrenceKind())) { HasUnsupportedPhi = true; continue; } @@ -861,10 +861,9 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { if (!MinMaxOp) return false; - RecurKind RedPhiRK = RedPhiR->getRecurrenceKind(); - assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) && + assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + RedPhiR->getRecurrenceKind()) && "unsupported reduction"); - (void)RedPhiRK; /// Check if the vector loop of \p Plan can early exit and restart /// execution of last vector iteration in the scalar loop. This requires all diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 94e2628..3a9770c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1230,6 +1230,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractLane: case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: + case VPInstruction::ActiveLaneMask: case VPInstruction::FirstActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: |
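
For readers skimming the InstCombineSimplifyDemanded.cpp change above: the new simplifyShiftSelectingPackedElement fold targets an lshr that merely selects one half of a packed pair of integers. A minimal hand-written C++ sketch of the source-level pattern (the names select_packed, upper, lower and amt are illustrative and do not appear in the patch):

    #include <cstdint>

    // Two 32-bit elements packed into a 64-bit value; the 'or' is disjoint
    // because lower occupies only the low 32 bits.
    uint32_t select_packed(uint32_t upper, uint32_t lower, unsigned amt) {
      uint64_t pack = ((uint64_t)upper << 32) | lower;
      // amt is assumed to be either 0 or 32, and only the low 32 bits of the
      // shifted value are used.
      return (uint32_t)(pack >> amt);
    }

    // The fold rewrites the lshr into a select between the two halves,
    // roughly: return amt == 0 ? lower : upper;

At the IR level the fold applies whenever the shl amount is a power of two, the low element is known not to overlap the shifted high element, and the shift amount is provably either zero or that same amount, as checked in the new function.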
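
Similarly, the SimplifyCFG.cpp and SimplifyCFGPass.cpp changes above split switch conversion into two knobs: the new switch-to-arithmetic option (now printed by printPipeline) allows table-free replacements, while lookup tables, bit maps and mask checks stay gated behind switch-to-lookup for the late pipeline. As an illustration only (this snippet is not from the patch), a switch whose results are a linear function of the condition is the kind of case that can be folded to arithmetic without a table:

    // The results form the linear sequence 2*x + 3 for x = 0..3.
    int f(unsigned x) {
      switch (x) {
      case 0: return 3;
      case 1: return 5;
      case 2: return 7;
      case 3: return 9;
      default: return 0;
      }
    }
    // SimplifyCFG can lower the switch to roughly: x < 4 ? 2 * x + 3 : 0

Whether a given pipeline position actually enables switch-to-arithmetic is decided by the pass builder, which is outside the files shown in this diff.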