Diffstat (limited to 'llvm/lib/Transforms/Utils')
-rw-r--r--  llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 939
-rw-r--r--  llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp |  92
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp       |  31
3 files changed, 517 insertions, 545 deletions
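For context on the LoopRotationUtils.cpp diff below: it removes the hidden -loop-rotate-multi mode together with its canRotateDeoptimizingLatchExit helper and the NumRotated statistic, flattening rotateLoop's do-while retry loop into a single straight-line rotation. As background, loop rotation clones the header's exit test into the preheader so the loop becomes bottom-tested. A minimal C++ sketch of the shape change, illustrative only (the pass operates on LLVM IR, and work() here is a placeholder):

#include <cstdio>

void work(int i) { std::printf("%d\n", i); }

// Before rotation: a top-tested loop. The header runs the exit test on
// every iteration and is also the entry point from outside the loop.
void beforeRotation(int n) {
  for (int i = 0; i < n; ++i)
    work(i);
}

// After rotation: the header test is cloned into the preheader as a
// zero-trip-count guard, and the latch performs the exit test, yielding a
// bottom-tested (do-while) loop with an exiting latch.
void afterRotation(int n) {
  int i = 0;
  if (i < n) {       // cloned header test: guards the never-entered case
    do {
      work(i);
      ++i;
    } while (i < n); // exiting latch: the branch updateBranchWeights annotates
  }
}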
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 7cc9ff8..0c8d6fa 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -45,12 +45,6 @@ STATISTIC(NumInstrsHoisted, "Number of instructions hoisted into loop preheader"); STATISTIC(NumInstrsDuplicated, "Number of instructions cloned into loop preheader"); -STATISTIC(NumRotated, "Number of loops rotated"); - -static cl::opt<bool> - MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden, - cl::desc("Allow loop rotation multiple times in order to reach " - "a better latch exit")); // Probability that a rotated loop has zero trip count / is never entered. static constexpr uint32_t ZeroTripCountWeights[] = {1, 127}; @@ -206,50 +200,6 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) { return false; } -// Check that latch exit is deoptimizing (which means - very unlikely to happen) -// and there is another exit from the loop which is non-deoptimizing. -// If we rotate latch to that exit our loop has a better chance of being fully -// canonical. -// -// It can give false positives in some rare cases. -static bool canRotateDeoptimizingLatchExit(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - assert(Latch && "need latch"); - BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator()); - // Need normal exiting latch. - if (!BI || !BI->isConditional()) - return false; - - BasicBlock *Exit = BI->getSuccessor(1); - if (L->contains(Exit)) - Exit = BI->getSuccessor(0); - - // Latch exit is non-deoptimizing, no need to rotate. - if (!Exit->getPostdominatingDeoptimizeCall()) - return false; - - SmallVector<BasicBlock *, 4> Exits; - L->getUniqueExitBlocks(Exits); - if (!Exits.empty()) { - // There is at least one non-deoptimizing exit. - // - // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact, - // as it can conservatively return false for deoptimizing exits with - // complex enough control flow down to deoptimize call. - // - // That means here we can report success for a case where - // all exits are deoptimizing but one of them has complex enough - // control flow (e.g. with loops). - // - // That should be a very rare case and false positives for this function - // have compile-time effect only. - return any_of(Exits, [](const BasicBlock *BB) { - return !BB->getPostdominatingDeoptimizeCall(); - }); - } - return false; -} - static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, bool HasConditionalPreHeader, bool SuccsSwapped) { @@ -387,506 +337,489 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -/// -/// If -loop-rotate-multi is enabled we can do multiple rotations in one go -/// so to reach a suitable (non-deoptimizing) exit. bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. 
if (L->getBlocks().size() == 1) return false; bool Rotated = false; - do { - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); - - BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) - return Rotated; - - // If the loop header is not one of the loop exiting blocks then - // either this loop is already rotated or it is not - // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return Rotated; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return Rotated; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return Rotated; + + // Rotate if the loop latch was just simplified. Or if it makes the loop exit + // count computable. Or if we think it will be profitable. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && + !profitableToRotateLoopExitingLatch(L)) + return Rotated; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. + { + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); + if (Metrics.notDuplicatable) { + LLVM_DEBUG( + dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); return Rotated; - - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. - if (!OrigLatch) + } + if (Metrics.Convergence != ConvergenceKind::None) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); return Rotated; - - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. Or if we think it will be profitable. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && - !profitableToRotateLoopExitingLatch(L) && - !canRotateDeoptimizingLatchExit(L)) + } + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" + " with invalid cost: "; + L->dump()); return Rotated; - - // Check size of original header and reject loop if it is very big or we can't - // duplicate blocks inside it. 
- { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); - if (Metrics.notDuplicatable) { - LLVM_DEBUG( - dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" - << " instructions: "; - L->dump()); - return Rotated; - } - if (Metrics.Convergence != ConvergenceKind::None) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " - "instructions: "; - L->dump()); - return Rotated; - } - if (!Metrics.NumInsts.isValid()) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" - " with invalid cost: "; - L->dump()); - return Rotated; - } - if (Metrics.NumInsts > MaxHeaderSize) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " - << Metrics.NumInsts - << " instructions, which is more than the threshold (" - << MaxHeaderSize << " instructions): "; - L->dump()); - ++NumNotRotatedDueToHeaderSize; - return Rotated; - } - - // When preparing for LTO, avoid rotating loops with calls that could be - // inlined during the LTO stage. - if (PrepareForLTO && Metrics.NumInlineCandidates > 0) - return Rotated; } - - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); - - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader || !L->hasDedicatedExits()) + if (Metrics.NumInsts > MaxHeaderSize) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " + << Metrics.NumInsts + << " instructions, which is more than the threshold (" + << MaxHeaderSize << " instructions): "; + L->dump()); + ++NumNotRotatedDueToHeaderSize; return Rotated; - - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. We should also invalidate - // all outer loops because insertion and deletion of blocks that happens - // during the rotation may violate invariants related to backedge taken - // infos in them. - if (SE) { - SE->forgetTopmostLoop(L); - // We may hoist some instructions out of loop. In case if they were cached - // as "loop variant" or "loop computable", these caches must be dropped. - // We also may fold basic blocks, so cached block dispositions also need - // to be dropped. - SE->forgetBlockAndLoopDispositions(); } - LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. Header's other successor is outside the - // loop. Otherwise loop is not suitable for rotation. - BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - bool BISuccsSwapped = L->contains(Exit); - if (BISuccsSwapped) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. 
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap, ValueMapMSSA; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - InsertNewValueIntoMap(ValueMap, PN, - PN->getIncomingValueForBlock(OrigPreheader)); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); - - // Record all debug records preceding LoopEntryBranch to avoid - // duplication. - using DbgHash = - std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; - auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { - auto VarLocOps = D->location_ops(); - return {{hash_combine_range(VarLocOps), D->getVariable()}, - D->getExpression()}; - }; - - SmallDenseSet<DbgHash, 8> DbgRecords; - // Build DbgVariableRecord hashes for DbgVariableRecords attached to the - // terminator. - for (const DbgVariableRecord &DVR : - filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) - DbgRecords.insert(makeHash(&DVR)); - - // Remember the local noalias scope declarations in the header. After the - // rotation, they must be duplicated and the scope must be cloned. This - // avoids unwanted interaction across iterations. - SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; - for (Instruction &I : *OrigHeader) - if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) - NoAliasDeclInstructions.push_back(Decl); - - Module *M = OrigHeader->getModule(); - - // Track the next DbgRecord to clone. If we have a sequence where an - // instruction is hoisted instead of being cloned: - // DbgRecord blah - // %foo = add i32 0, 0 - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once - // attached to %foo, then when %foo his hoisted it will "fall down" onto the - // function call: - // DbgRecord blah - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // causing it to appear attached to the call too. - // - // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from - // here" position to account for this behaviour. We point it at any - // DbgRecords on the next instruction, here labelled xyzzy, before we hoist - // %foo. Later, we only only clone DbgRecords from that position (xyzzy) - // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as - // a range because it gives us a natural way of testing whether - // there were DbgRecords on the next instruction before we hoisted things). - iterator_range<DbgRecord::self_iterator> NextDbgInsts = - (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); - - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). 
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !Inst->isTerminator() && - !isa<AllocaInst>(Inst) && - // It is not safe to hoist the value of these instructions in - // coroutines, as the addresses of otherwise eligible variables (e.g. - // thread-local variables and errno) may change if the coroutine is - // resumed in a different thread.Therefore, we disable this - // optimization for correctness. However, this may block other correct - // optimizations. - // FIXME: This should be reverted once we have a better model for - // memory access in coroutines. - !Inst->getFunction()->isPresplitCoroutine()) { - - if (!NextDbgInsts.empty()) { - auto DbgValueRange = - LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, DbgValueRange, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Erase anything we've seen before. - for (DbgVariableRecord &DVR : - make_early_inc_range(filterDbgVars(DbgValueRange))) - if (DbgRecords.count(makeHash(&DVR))) - DVR.eraseFromParent(); - } - - NextDbgInsts = I->getDbgRecordRange(); - - Inst->moveBefore(LoopEntryBranch->getIterator()); + // When preparing for LTO, avoid rotating loops with calls that could be + // inlined during the LTO stage. + if (PrepareForLTO && Metrics.NumInlineCandidates > 0) + return Rotated; + } - ++NumInstrsHoisted; - continue; - } + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader || !L->hasDedicatedExits()) + return Rotated; + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. We should also invalidate + // all outer loops because insertion and deletion of blocks that happens + // during the rotation may violate invariants related to backedge taken + // infos in them. + if (SE) { + SE->forgetTopmostLoop(L); + // We may hoist some instructions out of loop. In case if they were cached + // as "loop variant" or "loop computable", these caches must be dropped. + // We also may fold basic blocks, so cached block dispositions also need + // to be dropped. + SE->forgetBlockAndLoopDispositions(); + } - // Otherwise, create a duplicate of the instruction. - Instruction *C = Inst->clone(); - if (const DebugLoc &DL = C->getDebugLoc()) - mapAtomInstance(DL, ValueMap); + LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); - C->insertBefore(LoopEntryBranch->getIterator()); + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. + BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + bool BISuccsSwapped = L->contains(Exit); + if (BISuccsSwapped) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. 
+ assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap, ValueMapMSSA; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) + InsertNewValueIntoMap(ValueMap, PN, + PN->getIncomingValueForBlock(OrigPreheader)); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug records preceding LoopEntryBranch to avoid + // duplication. + using DbgHash = + std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; + auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { + auto VarLocOps = D->location_ops(); + return {{hash_combine_range(VarLocOps), D->getVariable()}, + D->getExpression()}; + }; - ++NumInstrsDuplicated; + SmallDenseSet<DbgHash, 8> DbgRecords; + // Build DbgVariableRecord hashes for DbgVariableRecords attached to the + // terminator. + for (const DbgVariableRecord &DVR : + filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) + DbgRecords.insert(makeHash(&DVR)); + + // Remember the local noalias scope declarations in the header. After the + // rotation, they must be duplicated and the scope must be cloned. This + // avoids unwanted interaction across iterations. + SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; + for (Instruction &I : *OrigHeader) + if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) + NoAliasDeclInstructions.push_back(Decl); + + Module *M = OrigHeader->getModule(); + + // Track the next DbgRecord to clone. If we have a sequence where an + // instruction is hoisted instead of being cloned: + // DbgRecord blah + // %foo = add i32 0, 0 + // DbgRecord xyzzy + // %bar = call i32 @foobar() + // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once + // attached to %foo, then when %foo is hoisted it will "fall down" onto the + // function call: + // DbgRecord blah + // DbgRecord xyzzy + // %bar = call i32 @foobar() + // causing it to appear attached to the call too. + // + // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from + // here" position to account for this behaviour. We point it at any + // DbgRecords on the next instruction, here labelled xyzzy, before we hoist + // %foo. Later, we only clone DbgRecords from that position (xyzzy) + // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as + // a range because it gives us a natural way of testing whether + // there were DbgRecords on the next instruction before we hoisted things). + iterator_range<DbgRecord::self_iterator> NextDbgInsts = + (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); + + while (I != E) { + Instruction *Inst = &*I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop.
This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !Inst->isTerminator() && + !isa<AllocaInst>(Inst) && + // It is not safe to hoist the value of these instructions in + // coroutines, as the addresses of otherwise eligible variables (e.g. + // thread-local variables and errno) may change if the coroutine is + // resumed in a different thread. Therefore, we disable this + // optimization for correctness. However, this may block other correct + // optimizations. + // FIXME: This should be reverted once we have a better model for + // memory access in coroutines. + !Inst->getFunction()->isPresplitCoroutine()) { if (!NextDbgInsts.empty()) { auto DbgValueRange = - LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, DbgValueRange, ValueMap, + LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); + RemapDbgRecordRange(M, DbgValueRange, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); // Erase anything we've seen before. for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(DbgValueRange))) if (DbgRecords.count(makeHash(&DVR))) DVR.eraseFromParent(); } NextDbgInsts = I->getDbgRecordRange(); Inst->moveBefore(LoopEntryBranch->getIterator()); ++NumInstrsHoisted; continue; } // Otherwise, create a duplicate of the instruction.
D' U1' { D U2 D'' U1'' } - // D: original llvm.experimental.noalias.scope.decl - // D', U1': duplicate with replaced scopes - // D'', U1'': different duplicate with replaced scopes - // This ensures a safe fallback to 'may_alias' introduced by the rotate, - // as U1'' and U1' scopes will not be compatible wrt to the local restrict - - // Clone the llvm.experimental.noalias.decl again for the NewHeader. - BasicBlock::iterator NewHeaderInsertionPoint = - NewHeader->getFirstNonPHIIt(); - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { - LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" - << *NAD << "\n"); - Instruction *NewNAD = NAD->clone(); - NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); - } + Inst->moveBefore(LoopEntryBranch->getIterator()); - // Scopes must now be duplicated, once for OrigHeader and once for - // OrigPreHeader'. - { - auto &Context = NewHeader->getContext(); - - SmallVector<MDNode *, 8> NoAliasDeclScopes; - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) - NoAliasDeclScopes.push_back(NAD->getScopeList()); - - LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, - "h.rot"); - LLVM_DEBUG(OrigHeader->dump()); - - // Keep the compile time impact low by only adapting the inserted block - // of instructions in the OrigPreHeader. This might result in slightly - // more aliasing between these instructions and those that were already - // present, but it will be much faster when the original PreHeader is - // large. - LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); - auto *FirstDecl = - cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); - auto *LastInst = &OrigPreheader->back(); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, - Context, "pre.rot"); - LLVM_DEBUG(OrigPreheader->dump()); - - LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); - LLVM_DEBUG(NewHeader->dump()); - } + ++NumInstrsHoisted; + continue; } - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - for (BasicBlock *SuccBB : successors(OrigHeader)) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - OrigPreheader->flushTerminatorDbgRecords(); - - // Update MemorySSA before the rewrite call below changes the 1:1 - // instruction:cloned_instruction_or_value mapping. - if (MSSAU) { - InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); - MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, - ValueMapMSSA); - } + // Otherwise, create a duplicate of the instruction. 
+ Instruction *C = Inst->clone(); + if (const DebugLoc &DL = C->getDebugLoc()) + mapAtomInstance(DL, ValueMap); C->insertBefore(LoopEntryBranch->getIterator()); ++NumInstrsDuplicated; if (!NextDbgInsts.empty()) { auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); RemapDbgRecordRange(M, Range, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); NextDbgInsts = DbgMarker::getEmptyDbgRecordRange(); // Erase anything we've seen before. for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range))) if (DbgRecords.count(makeHash(&DVR))) DVR.eraseFromParent(); } + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifiable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = simplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + InsertNewValueIntoMap(ValueMap, Inst, V); + if (!C->mayHaveSideEffects()) { + C->eraseFromParent(); + C = nullptr; } + } else { + InsertNewValueIntoMap(ValueMap, Inst, C); + } + if (C) { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + + if (auto *II = dyn_cast<AssumeInst>(C)) + AC->registerAssumption(II); + // MemorySSA cares whether the cloned instruction was inserted or not, and + // not whether it can be remapped to a simplified value. + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); + } + } if (!NoAliasDeclInstructions.empty()) { // There are noalias scope declarations: // (general): // Original: OrigPre { OrigHeader NewHeader ... Latch } // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } // // with D: llvm.experimental.noalias.scope.decl, // U: !noalias or !alias.scope depending on D // ... { D U1 U2 } can transform into: // (0) : ... { D U1 U2 } // no relevant rotation for this part // (1) : ... D' { U1 U2 D } // D is part of OrigHeader // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader // // We now want to transform: // (1) -> : ... D' { D U1 U2 D'' } // (2) -> : ... D' U1' { D U2 D'' U1'' } // D: original llvm.experimental.noalias.scope.decl // D', U1': duplicate with replaced scopes // D'', U1'': different duplicate with replaced scopes // This ensures a safe fallback to 'may_alias' introduced by the rotate, // as U1'' and U1' scopes will not be compatible wrt to the local restrict // Clone the llvm.experimental.noalias.decl again for the NewHeader. - BasicBlock::iterator NewHeaderInsertionPoint = - NewHeader->getFirstNonPHIIt(); - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { - LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" - << *NAD << "\n"); - Instruction *NewNAD = NAD->clone(); - NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); - } - - // Scopes must now be duplicated, once for OrigHeader and once for - // OrigPreHeader'. - { - auto &Context = NewHeader->getContext(); - - SmallVector<MDNode *, 8> NoAliasDeclScopes; - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) - NoAliasDeclScopes.push_back(NAD->getScopeList()); - - LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, - "h.rot"); - LLVM_DEBUG(OrigHeader->dump()); - - // Keep the compile time impact low by only adapting the inserted block - // of instructions in the OrigPreHeader. This might result in slightly - // more aliasing between these instructions and those that were already - // present, but it will be much faster when the original PreHeader is - // large. - LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); - auto *FirstDecl = - cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); - auto *LastInst = &OrigPreheader->back(); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, - Context, "pre.rot"); - LLVM_DEBUG(OrigPreheader->dump()); - - LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); - LLVM_DEBUG(NewHeader->dump()); - } - } - - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - for (BasicBlock *SuccBB : successors(OrigHeader)) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - OrigPreheader->flushTerminatorDbgRecords(); - - // Update MemorySSA before the rewrite call below changes the 1:1 - // instruction:cloned_instruction_or_value mapping. - if (MSSAU) { - InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); - MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, - ValueMapMSSA); - } - - SmallVector<PHINode*, 2> InsertedPHIs; - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, - &InsertedPHIs); - - // Attach debug records to the new phis if that phi uses a value that - // previously had debug metadata attached. This keeps the debug info - // up-to-date in the loop body. - if (!InsertedPHIs.empty()) - insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // Inform DT about changes to the CFG. - if (DT) { - // The OrigPreheader branches to the NewHeader and Exit now. Then, inform - // the DT about the removed edge to the OrigHeader (that got removed). - SmallVector<DominatorTree::UpdateType, 3> Updates = { - {DominatorTree::Insert, OrigPreheader, Exit}, - {DominatorTree::Insert, OrigPreheader, NewHeader}, - {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - - if (MSSAU) { - MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); - if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - } else { - DT->applyUpdates(Updates); - } - } - - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. - BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - const Value *Cond = PHBI->getCondition(); - const bool HasConditionalPreHeader = - !isa<ConstantInt>(Cond) || - PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; - - updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); - - if (HasConditionalPreHeader) { - // The conditional branch can't be folded, handle the general case. - // Split edges as necessary to preserve LoopSimplify form. - - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. Note that Exit could be an exit block for multiple - // nested loops, causing both of the edges to now be critical and need to - // be split. - SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); - bool SplitLatchEdge = false; - for (BasicBlock *ExitPred : ExitPreds) { - // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred); - if (!PredLoop || PredLoop->contains(Exit) || - isa<IndirectBrInst>(ExitPred->getTerminator())) - continue; - SplitLatchEdge |= L->getLoopLatch() == ExitPred; - BasicBlock *ExitSplit = SplitCriticalEdge( - ExitPred, Exit, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - ExitSplit->moveBefore(Exit); + if (!NextDbgInsts.empty()) { + auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); + RemapDbgRecordRange(M, Range, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + NextDbgInsts = DbgMarker::getEmptyDbgRecordRange(); + // Erase anything we've seen before. + for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range))) + if (DbgRecords.count(makeHash(&DVR))) + DVR.eraseFromParent(); + } + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = simplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + InsertNewValueIntoMap(ValueMap, Inst, V); + if (!C->mayHaveSideEffects()) { + C->eraseFromParent(); + C = nullptr; } - assert(SplitLatchEdge && - "Despite splitting all preds, failed to split latch exit?"); - (void)SplitLatchEdge; } else { - // We can fold the conditional branch in the preheader, this makes things - // simpler. The first step is to remove the extra edge to the Exit block. - Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); - NewBI->setDebugLoc(PHBI->getDebugLoc()); - PHBI->eraseFromParent(); + InsertNewValueIntoMap(ValueMap, Inst, C); + } + if (C) { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + + if (auto *II = dyn_cast<AssumeInst>(C)) + AC->registerAssumption(II); + // MemorySSA cares whether the cloned instruction was inserted or not, and + // not whether it can be remapped to a simplified value. + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); + } + } - // With our CFG finalized, update DomTree if it is available. - if (DT) DT->deleteEdge(OrigPreheader, Exit); + if (!NoAliasDeclInstructions.empty()) { + // There are noalias scope declarations: + // (general): + // Original: OrigPre { OrigHeader NewHeader ... Latch } + // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } + // + // with D: llvm.experimental.noalias.scope.decl, + // U: !noalias or !alias.scope depending on D + // ... { D U1 U2 } can transform into: + // (0) : ... { D U1 U2 } // no relevant rotation for this part + // (1) : ... D' { U1 U2 D } // D is part of OrigHeader + // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader + // + // We now want to transform: + // (1) -> : ... D' { D U1 U2 D'' } + // (2) -> : ... D' U1' { D U2 D'' U1'' } + // D: original llvm.experimental.noalias.scope.decl + // D', U1': duplicate with replaced scopes + // D'', U1'': different duplicate with replaced scopes + // This ensures a safe fallback to 'may_alias' introduced by the rotate, + // as U1'' and U1' scopes will not be compatible wrt to the local restrict + + // Clone the llvm.experimental.noalias.decl again for the NewHeader. 
+ BasicBlock::iterator NewHeaderInsertionPoint = + NewHeader->getFirstNonPHIIt(); + for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { + LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" + << *NAD << "\n"); + Instruction *NewNAD = NAD->clone(); + NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); + } - // Update MSSA too, if available. - if (MSSAU) - MSSAU->removeEdge(OrigPreheader, Exit); + // Scopes must now be duplicated, once for OrigHeader and once for + // OrigPreHeader'. + { + auto &Context = NewHeader->getContext(); + + SmallVector<MDNode *, 8> NoAliasDeclScopes; + for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) + NoAliasDeclScopes.push_back(NAD->getScopeList()); + + LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, + "h.rot"); + LLVM_DEBUG(OrigHeader->dump()); + + // Keep the compile time impact low by only adapting the inserted block + // of instructions in the OrigPreHeader. This might result in slightly + // more aliasing between these instructions and those that were already + // present, but it will be much faster when the original PreHeader is + // large. + LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); + auto *FirstDecl = + cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); + auto *LastInst = &OrigPreheader->back(); + cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, + Context, "pre.rot"); + LLVM_DEBUG(OrigPreheader->dump()); + + LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); + LLVM_DEBUG(NewHeader->dump()); } + } - assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + for (BasicBlock *SuccBB : successors(OrigHeader)) + for (BasicBlock::iterator BI = SuccBB->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. + LoopEntryBranch->eraseFromParent(); + OrigPreheader->flushTerminatorDbgRecords(); + + // Update MemorySSA before the rewrite call below changes the 1:1 + // instruction:cloned_instruction_or_value mapping. + if (MSSAU) { + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); + MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, + ValueMapMSSA); + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + SmallVector<PHINode *, 2> InsertedPHIs; + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, + &InsertedPHIs); + + // Attach debug records to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); + + // NewHeader is now the header of the loop. 
+ L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector<DominatorTree::UpdateType, 3> Updates = { + {DominatorTree::Insert, OrigPreheader, Exit}, + {DominatorTree::Insert, OrigPreheader, NewHeader}, + {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); - bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); - if (DidMerge) - RemoveRedundantDbgInstrs(PredBB); + if (MSSAU) { + MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); + if (VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + } else { + DT->applyUpdates(Updates); + } + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + const Value *Cond = PHBI->getCondition(); + const bool HasConditionalPreHeader = + !isa<ConstantInt>(Cond) || + PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; + + updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); - LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + if (HasConditionalPreHeader) { + // The conditional branch can't be folded, handle the general case. + // Split edges as necessary to preserve LoopSimplify form. + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. + SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); + bool SplitLatchEdge = false; + for (BasicBlock *ExitPred : ExitPreds) { + // We only need to split loop exit edges. 
+ Loop *PredLoop = LI->getLoopFor(ExitPred); + if (!PredLoop || PredLoop->contains(Exit) || + isa<IndirectBrInst>(ExitPred->getTerminator())) + continue; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; + BasicBlock *ExitSplit = SplitCriticalEdge( + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); + (void)SplitLatchEdge; + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DT) + DT->deleteEdge(OrigPreheader, Exit); + + // Update MSSA too, if available. + if (MSSAU) + MSSAU->removeEdge(OrigPreheader, Exit); + } - ++NumRotated; + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); - Rotated = true; - SimplifiedLatch = false; + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); + bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); + if (DidMerge) + RemoveRedundantDbgInstrs(PredBB); - // Check that new latch is a deoptimizing exit and then repeat rotation if possible. - // Deoptimizing latch exit is not a generally typical case, so we just loop over. - // TODO: if it becomes a performance bottleneck extend rotation algorithm - // to handle multiple rotations in one go. - } while (MultiRotate && canRotateDeoptimizingLatchExit(L)); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); return true; } diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index bf882d7..6312831 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -201,18 +201,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// unroll count is non-zero. /// /// This function performs the following: -/// - Update PHI nodes at the unrolling loop exit and epilog loop exit -/// - Create PHI nodes at the unrolling loop exit to combine -/// values that exit the unrolling loop code and jump around it. +/// - Update PHI nodes at the epilog loop exit +/// - Create PHI nodes at the unrolling loop exit and epilog preheader to +/// combine values that exit the unrolling loop code and jump around it. /// - Update PHI operands in the epilog loop by the new PHI nodes -/// - Branch around the epilog loop if extra iters (ModVal) is zero. +/// - At the unrolling loop exit, branch around the epilog loop if extra iters +// (ModVal) is zero. +/// - At the epilog preheader, add an llvm.assume call that extra iters is +/// non-zero. 
If the unrolling loop exit is the predecessor, the above new +/// branch guarantees that assumption. If the unrolling loop preheader is the +/// predecessor, then the required first iteration from the original loop has +/// yet to be executed, so it must be executed in the epilog loop. If we +/// later unroll the epilog loop, that llvm.assume call somehow enables +/// ScalarEvolution to compute an epilog loop maximum trip count, which enables +/// eliminating the branch at the end of the final unrolled epilog iteration. /// static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *Exit, BasicBlock *PreHeader, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, - unsigned Count) { + unsigned Count, AssumptionCache &AC) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); @@ -231,7 +240,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, // EpilogLatch // Exit (EpilogPN) - // Update PHI nodes at NewExit and Exit. + // Update PHI nodes at Exit. for (PHINode &PN : NewExit->phis()) { // PN should be used in another PHI located in Exit block as // Exit was split by SplitBlockPredecessors into Exit and NewExit @@ -246,15 +255,11 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, // epilogue edges have already been added. // // There is EpilogPreHeader incoming block instead of NewExit as - // NewExit was spilt 1 more time to get EpilogPreHeader. + // NewExit was split 1 more time to get EpilogPreHeader. assert(PN.hasOneUse() && "The phi should have 1 use"); PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser()); assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block"); - // Add incoming PreHeader from branch around the Loop - PN.addIncoming(PoisonValue::get(PN.getType()), PreHeader); - SE.forgetValue(&PN); - Value *V = PN.getIncomingValueForBlock(Latch); Instruction *I = dyn_cast<Instruction>(V); if (I && L->contains(I)) @@ -271,35 +276,52 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, NewExit); // Now PHIs should look like: // NewExit: - // PN = PHI [I, Latch], [poison, PreHeader] + // PN = PHI [I, Latch] // ... // Exit: // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch] } - // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader). - // Update corresponding PHI nodes in epilog loop. + // Create PHI nodes at NewExit (from the unrolling loop Latch) and at + // EpilogPreHeader (from PreHeader and NewExit). Update corresponding PHI + // nodes in epilog loop. for (BasicBlock *Succ : successors(Latch)) { // Skip this as we already updated phis in exit blocks. if (!L->contains(Succ)) continue; + + // Succ here appears to always be just L->getHeader(). Otherwise, how do we + // know its corresponding epilog block (from VMap) is EpilogHeader and thus + // EpilogPreHeader is the right incoming block for VPN, as set below? + // TODO: Can we thus avoid the enclosing loop over successors? + assert(Succ == L->getHeader() && + "Expect the only in-loop successor of latch to be the loop header"); + for (PHINode &PN : Succ->phis()) { - // Add new PHI nodes to the loop exit block and update epilog - // PHIs with the new PHI values.
- PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr"); - NewPN->insertBefore(NewExit->getFirstNonPHIIt()); - // Adding a value to the new PHI node from the unrolling loop preheader. - NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader); - // Adding a value to the new PHI node from the unrolling loop latch. - NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch); + // Add new PHI nodes to the loop exit block. + PHINode *NewPN0 = PHINode::Create(PN.getType(), /*NumReservedValues=*/1, + PN.getName() + ".unr"); + NewPN0->insertBefore(NewExit->getFirstNonPHIIt()); + // Add value to the new PHI node from the unrolling loop latch. + NewPN0->addIncoming(PN.getIncomingValueForBlock(Latch), Latch); + + // Add new PHI nodes to EpilogPreHeader. + PHINode *NewPN1 = PHINode::Create(PN.getType(), /*NumReservedValues=*/2, + PN.getName() + ".epil.init"); + NewPN1->insertBefore(EpilogPreHeader->getFirstNonPHIIt()); + // Add value to the new PHI node from the unrolling loop preheader. + NewPN1->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader); + // Add value to the new PHI node from the epilog loop guard. + NewPN1->addIncoming(NewPN0, NewExit); // Update the existing PHI node operand with the value from the new PHI // node. Corresponding instruction in epilog loop should be PHI. PHINode *VPN = cast<PHINode>(VMap[&PN]); - VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN); + VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN1); } } + // In NewExit, branch around the epilog loop if no extra iters. Instruction *InsertPt = NewExit->getTerminator(); IRBuilder<> B(InsertPt); Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod"); @@ -308,7 +330,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr, PreserveLCSSA); - // Add the branch to the exit block (around the unrolling loop) + // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; if (hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). @@ -322,10 +344,11 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, DT->changeImmediateDominator(Exit, NewDom); } - // Split the main loop exit to maintain canonicalization guarantees. - SmallVector<BasicBlock*, 4> NewExitPreds{Latch}; - SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr, - PreserveLCSSA); + // In EpilogPreHeader, assume extra iters is non-zero. + IRBuilder<> B2(EpilogPreHeader, EpilogPreHeader->getFirstNonPHIIt()); + Value *ModIsNotNull = B2.CreateIsNotNull(ModVal, "lcmp.mod"); + AssumeInst *AI = cast<AssumeInst>(B2.CreateAssumption(ModIsNotNull)); + AC.registerAssumption(AI); } /// Create a clone of the blocks in a loop and connect them together. A new @@ -795,7 +818,8 @@ bool llvm::UnrollRuntimeLoopRemainder( ConstantInt::get(BECount->getType(), Count - 1)) : B.CreateIsNotNull(ModVal, "lcmp.mod"); - BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader; + BasicBlock *RemainderLoop = + UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. 
MDNode *BranchWeights = nullptr; @@ -808,7 +832,7 @@ bool llvm::UnrollRuntimeLoopRemainder( PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) - DT->changeImmediateDominator(NewExit, PreHeader); + DT->changeImmediateDominator(EpilogPreHeader, PreHeader); else DT->changeImmediateDominator(PrologExit, PreHeader); } @@ -880,7 +904,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // from both the original loop and the remainder code reaching the exit // blocks. While the IDom of these exit blocks were from the original loop, // now the IDom is the preheader (which decides whether the original loop or - // remainder code should run). + // remainder code should run) unless the block still has just the original + // predecessor (such as NewExit in the case of an epilog remainder). if (DT && !L->getExitingBlock()) { SmallVector<BasicBlock *, 16> ChildrenToUpdate; // NB! We have to examine the dom children of all loop blocks, not just @@ -891,7 +916,8 @@ bool llvm::UnrollRuntimeLoopRemainder( auto *DomNodeBB = DT->getNode(BB); for (auto *DomChild : DomNodeBB->children()) { auto *DomChildBB = DomChild->getBlock(); - if (!L->contains(LI->getLoopFor(DomChildBB))) + if (!L->contains(LI->getLoopFor(DomChildBB)) && + DomChildBB->getUniquePredecessor() != BB) ChildrenToUpdate.push_back(DomChildBB); } } @@ -930,7 +956,7 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index b8cfe3a..155fcc5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6642,6 +6642,9 @@ public: /// Return true if the replacement is a lookup table. bool isLookupTable(); + /// Return true if the replacement is a bit map. + bool isBitMap(); + private: // Depending on the switch, there are different alternatives. enum { @@ -6932,6 +6935,8 @@ Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; } bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; } +bool SwitchReplacement::isBitMap() { return Kind == BitMapKind; } + static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) { // 40% is the default density for building a jump table in optsize/minsize // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this @@ -7097,7 +7102,8 @@ static void reuseTableCompare( /// lookup tables. static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, DomTreeUpdater *DTU, const DataLayout &DL, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + bool ConvertSwitchToLookupTable) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); BasicBlock *BB = SI->getParent(); @@ -7262,6 +7268,8 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, bool AnyLookupTables = any_of( PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); }); + bool AnyBitMaps = any_of(PhiToReplacementMap, + [](auto &KV) { return KV.second.isBitMap(); }); // A few conditions prevent the generation of lookup tables: // 1. The target does not support lookup tables. 
@@ -7274,6 +7282,12 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Fn->getFnAttribute("no-jump-tables").getValueAsBool())) return false; + // In the early optimization pipeline, disable formation of lookup tables, + // bit maps and mask checks, as they may inhibit further optimization. + if (!ConvertSwitchToLookupTable && + (AnyLookupTables || AnyBitMaps || NeedMask)) + return false; + Builder.SetInsertPoint(SI); // TableIndex is the switch condition - TableIndexOffset if we don't // use the condition directly @@ -7929,14 +7943,13 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (Options.ForwardSwitchCondToPhi && forwardSwitchConditionToPHI(SI)) return requestResimplify(); - // The conversion from switch to lookup tables results in difficult-to-analyze - // code and makes pruning branches much harder. This is a problem if the - // switch expression itself can still be restricted as a result of inlining or - // CVP. Therefore, only apply this transformation during late stages of the - // optimisation pipeline. - if (Options.ConvertSwitchToLookupTable && - simplifySwitchLookup(SI, Builder, DTU, DL, TTI)) - return requestResimplify(); + // The conversion of switches to arithmetic or lookup table is disabled in + // the early optimization pipeline, as it may lose information or make the + // resulting code harder to analyze. + if (Options.ConvertSwitchToArithmetic || Options.ConvertSwitchToLookupTable) + if (simplifySwitchLookup(SI, Builder, DTU, DL, TTI, + Options.ConvertSwitchToLookupTable)) + return requestResimplify(); if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI)) return requestResimplify(); |
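For context on the SimplifyCFG.cpp hunks above: simplifySwitchLookup can replace a dense switch over constants with a load from a constant table (or a bit map for small domains), and the Options.ConvertSwitchToArithmetic / Options.ConvertSwitchToLookupTable split keeps those opaque forms out of the early pipeline. A rough C++ sketch of the before/after shapes, with made-up values (the pass itself rewrites IR, not source):

// A dense switch mapping small integers to constants...
int beforeLowering(unsigned x) {
  switch (x) {
  case 0: return 7;
  case 1: return 13;
  case 2: return 7;
  case 3: return 42;
  default: return 0;
  }
}

// ...can become a bounds check plus a table load. The table hides the
// per-case structure, which is why lookup-table formation is deferred to the
// late pipeline: passes such as correlated-value propagation can still
// reason about individual cases while the switch is intact.
int afterLowering(unsigned x) {
  static const int Table[4] = {7, 13, 7, 42};
  return x < 4 ? Table[x] : 0;
}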