Diffstat (limited to 'llvm/lib/Transforms')
28 files changed, 1446 insertions, 694 deletions
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 0accb22..c89af68 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -689,10 +689,14 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
   DISubprogram *DIS = F.getSubprogram();
   // If there is no DISubprogram for F, it implies the function is compiled
   // without debug info. So we also don't generate debug info for the frame.
-  if (!DIS || !DIS->getUnit() ||
-      !dwarf::isCPlusPlus(
-          (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()) ||
-      DIS->getUnit()->getEmissionKind() != DICompileUnit::DebugEmissionKind::FullDebug)
+
+  if (!DIS || !DIS->getUnit())
+    return;
+
+  if (!dwarf::isCPlusPlus(static_cast<llvm::dwarf::SourceLanguage>(
+          DIS->getUnit()->getSourceLanguage().getUnversionedName())) ||
+      DIS->getUnit()->getEmissionKind() !=
+          DICompileUnit::DebugEmissionKind::FullDebug)
     return;
 
   assert(Shape.ABI == coro::ABI::Switch &&
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 9ca8194..56194fe 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -137,13 +137,10 @@ InstCombinerImpl::isEliminableCastPair(const CastInst *CI1,
   Instruction::CastOps secondOp = CI2->getOpcode();
   Type *SrcIntPtrTy =
       SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
-  Type *MidIntPtrTy =
-      MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr;
   Type *DstIntPtrTy =
       DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr;
   unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
-                                                DstTy, SrcIntPtrTy, MidIntPtrTy,
-                                                DstIntPtrTy);
+                                                DstTy, &DL);
 
   // We don't want to form an inttoptr or ptrtoint that converts to an integer
   // type that differs from the pointer size.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8f60e50..8c8fc69 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3356,7 +3356,10 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
       impliesPoisonOrCond(FalseVal, B, /*Expected=*/false)) {
     // (A || B) || C --> A || (B | C)
     return replaceInstUsesWith(
-        SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal)));
+        SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal), "",
+                                    ProfcheckDisableMetadataFixes
+                                        ? nullptr
+                                        : cast<SelectInst>(CondVal)));
   }
 
   // (A && B) || (C && B) --> (A || C) && B
@@ -3398,7 +3401,10 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
       impliesPoisonOrCond(TrueVal, B, /*Expected=*/true)) {
     // (A && B) && C --> A && (B & C)
     return replaceInstUsesWith(
-        SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal)));
+        SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal), "",
+                                     ProfcheckDisableMetadataFixes
+                                         ? nullptr
+                                         : cast<SelectInst>(CondVal)));
   }
 
   // (A || B) && (C || B) --> (A && C) || B
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index aa030294..127a506 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -60,6 +60,58 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
   return true;
 }
 
+/// Let N = 2 * M.
+/// Given an N-bit integer representing a pack of two M-bit integers,
+/// we can select one of the packed integers by right-shifting by either
+/// zero or M (which is the most straightforward to check if M is a power
+/// of 2), and then isolating the lower M bits. In this case, we can
+/// represent the shift as a select on whether the shr amount is nonzero.
+static Value *simplifyShiftSelectingPackedElement(Instruction *I,
+                                                  const APInt &DemandedMask,
+                                                  InstCombinerImpl &IC,
+                                                  unsigned Depth) {
+  assert(I->getOpcode() == Instruction::LShr &&
+         "Only lshr instruction supported");
+
+  uint64_t ShlAmt;
+  Value *Upper, *Lower;
+  if (!match(I->getOperand(0),
+             m_OneUse(m_c_DisjointOr(
+                 m_OneUse(m_Shl(m_Value(Upper), m_ConstantInt(ShlAmt))),
+                 m_Value(Lower)))))
+    return nullptr;
+
+  if (!isPowerOf2_64(ShlAmt))
+    return nullptr;
+
+  const uint64_t DemandedBitWidth = DemandedMask.getActiveBits();
+  if (DemandedBitWidth > ShlAmt)
+    return nullptr;
+
+  // Check that upper demanded bits are not lost from lshift.
+  if (Upper->getType()->getScalarSizeInBits() < ShlAmt + DemandedBitWidth)
+    return nullptr;
+
+  KnownBits KnownLowerBits = IC.computeKnownBits(Lower, I, Depth);
+  if (!KnownLowerBits.getMaxValue().isIntN(ShlAmt))
+    return nullptr;
+
+  Value *ShrAmt = I->getOperand(1);
+  KnownBits KnownShrBits = IC.computeKnownBits(ShrAmt, I, Depth);
+
+  // Verify that ShrAmt is either exactly ShlAmt (which is a power of 2) or
+  // zero.
+  if (~KnownShrBits.Zero != ShlAmt)
+    return nullptr;
+
+  Value *ShrAmtZ =
+      IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()),
+                              ShrAmt->getName() + ".z");
+  Value *Select = IC.Builder.CreateSelect(ShrAmtZ, Lower, Upper);
+  Select->takeName(I);
+  return Select;
+}
+
 /// Returns the bitwidth of the given scalar or pointer type. For vector types,
 /// returns the element type's bitwidth.
 static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
@@ -798,9 +850,13 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
       Known >>= ShiftAmt;
       if (ShiftAmt)
         Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
-    } else {
-      llvm::computeKnownBits(I, Known, Q, Depth);
+      break;
     }
+    if (Value *V =
+            simplifyShiftSelectingPackedElement(I, DemandedMask, *this, Depth))
+      return V;
+
+    llvm::computeKnownBits(I, Known, Q, Depth);
     break;
   }
   case Instruction::AShr: {
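To make the new InstCombine fold concrete, here is a minimal C++ sketch (a hypothetical example, not part of the patch) of the source pattern simplifyShiftSelectingPackedElement() targets, and the select it becomes. Two 16-bit values packed into a 32-bit integer are selected by a right shift whose amount is provably either 0 or 16:

    #include <cstdint>

    // Before the fold: lshr of a disjoint-or pack by a shift amount that is
    // known to be either 0 or 16, with only the low 16 bits demanded.
    uint16_t selectPacked(uint16_t Lo, uint16_t Hi, bool WantHi) {
      uint32_t Packed = (uint32_t(Hi) << 16) | Lo; // disjoint or
      uint32_t ShrAmt = WantHi ? 16u : 0u;         // ~KnownZero == ShlAmt
      return uint16_t(Packed >> ShrAmt);
    }

    // After the fold the shift is gone; the IR is equivalent to a select on
    // (ShrAmt == 0) that picks Lower when zero and Upper otherwise:
    uint16_t selectPackedFolded(uint16_t Lo, uint16_t Hi, bool WantHi) {
      return WantHi ? Hi : Lo;
    }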
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 048cdf4..d56a1af 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1970,12 +1970,6 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
       NewPhiValues.push_back(nullptr);
       OpsToMoveUseToIncomingBB.push_back(i);
 
-      // If the InVal is an invoke at the end of the pred block, then we can't
-      // insert a computation after it without breaking the edge.
-      if (isa<InvokeInst>(InVal))
-        if (cast<Instruction>(InVal)->getParent() == InBB)
-          return nullptr;
-
       // Do not push the operation across a loop backedge. This could result in
       // an infinite combine loop, and is generally non-profitable (especially
       // if the operation was originally outside the loop).
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index cdae9a7..3704ad7 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -2662,7 +2662,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
       G->eraseFromParent();
       NewGlobals[i] = NewGlobal;
 
-      Constant *ODRIndicator = ConstantPointerNull::get(PtrTy);
+      Constant *ODRIndicator = Constant::getNullValue(IntptrTy);
       GlobalValue *InstrumentedGlobal = NewGlobal;
 
       bool CanUsePrivateAliases =
@@ -2677,8 +2677,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
 
       // ODR should not happen for local linkage.
       if (NewGlobal->hasLocalLinkage()) {
-        ODRIndicator =
-            ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1), PtrTy);
+        ODRIndicator = ConstantInt::get(IntptrTy, -1);
       } else if (UseOdrIndicator) {
         // With local aliases, we need to provide another externally visible
         // symbol __odr_asan_XXX to detect ODR violation.
@@ -2692,7 +2691,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
         ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
         ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
         ODRIndicatorSym->setAlignment(Align(1));
-        ODRIndicator = ODRIndicatorSym;
+        ODRIndicator = ConstantExpr::getPtrToInt(ODRIndicatorSym, IntptrTy);
       }
 
       Constant *Initializer = ConstantStruct::get(
@@ -2703,8 +2702,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
           ConstantExpr::getPointerCast(Name, IntptrTy),
           ConstantExpr::getPointerCast(getOrCreateModuleName(), IntptrTy),
           ConstantInt::get(IntptrTy, MD.IsDynInit),
-          Constant::getNullValue(IntptrTy),
-          ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
+          Constant::getNullValue(IntptrTy), ODRIndicator);
 
       LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
new file mode 100644
index 0000000..40720ae
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
@@ -0,0 +1,548 @@
+//===- AllocToken.cpp - Allocation token instrumentation -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements AllocToken, an instrumentation pass that
+// replaces allocation calls with token-enabled versions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/AllocToken.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/RandomNumberGenerator.h"
+#include "llvm/Support/SipHash.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <variant>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "alloc-token"
+
+namespace {
+
+//===--- Constants --------------------------------------------------------===//
+
+enum class TokenMode : unsigned {
+  /// Incrementally increasing token ID.
+  Increment = 0,
+
+  /// Simple mode that returns a statically-assigned random token ID.
+  Random = 1,
+
+  /// Token ID based on allocated type hash.
+  TypeHash = 2,
+
+  /// Token ID based on allocated type hash, where the top half ID-space is
+  /// reserved for types that contain pointers and the bottom half for types
+  /// that do not contain pointers.
+  TypeHashPointerSplit = 3,
+};
+
+//===--- Command-line options ---------------------------------------------===//
+
+cl::opt<TokenMode> ClMode(
+    "alloc-token-mode", cl::Hidden, cl::desc("Token assignment mode"),
+    cl::init(TokenMode::TypeHashPointerSplit),
+    cl::values(
+        clEnumValN(TokenMode::Increment, "increment",
+                   "Incrementally increasing token ID"),
+        clEnumValN(TokenMode::Random, "random",
+                   "Statically-assigned random token ID"),
+        clEnumValN(TokenMode::TypeHash, "typehash",
+                   "Token ID based on allocated type hash"),
+        clEnumValN(
+            TokenMode::TypeHashPointerSplit, "typehashpointersplit",
+            "Token ID based on allocated type hash, where the top half "
+            "ID-space is reserved for types that contain pointers and the "
+            "bottom half for types that do not contain pointers.")));
+
+cl::opt<std::string> ClFuncPrefix("alloc-token-prefix",
+                                  cl::desc("The allocation function prefix"),
+                                  cl::Hidden, cl::init("__alloc_token_"));
+
+cl::opt<uint64_t> ClMaxTokens("alloc-token-max",
+                              cl::desc("Maximum number of tokens (0 = no max)"),
+                              cl::Hidden, cl::init(0));
+
+cl::opt<bool>
+    ClFastABI("alloc-token-fast-abi",
+              cl::desc("The token ID is encoded in the function name"),
+              cl::Hidden, cl::init(false));
+
+// Instrument libcalls only by default - compatible allocators only need to
+// take care of providing standard allocation functions. With extended
+// coverage, also instrument non-libcall allocation function calls with
+// !alloc_token metadata.
+cl::opt<bool>
+    ClExtended("alloc-token-extended",
+               cl::desc("Extend coverage to custom allocation functions"),
+               cl::Hidden, cl::init(false));
+
+// C++ defines ::operator new (and variants) as replaceable (vs. standard
+// library versions), which are nobuiltin, and are therefore not covered by
+// isAllocationFn(). Cover by default, as users of AllocToken are already
+// required to provide token-aware allocation functions (no defaults).
+cl::opt<bool> ClCoverReplaceableNew("alloc-token-cover-replaceable-new",
+                                    cl::desc("Cover replaceable operator new"),
+                                    cl::Hidden, cl::init(true));
+
+cl::opt<uint64_t> ClFallbackToken(
+    "alloc-token-fallback",
+    cl::desc("The default fallback token where none could be determined"),
+    cl::Hidden, cl::init(0));
+
+//===--- Statistics -------------------------------------------------------===//
+
+STATISTIC(NumFunctionsInstrumented, "Functions instrumented");
+STATISTIC(NumAllocationsInstrumented, "Allocations instrumented");
+
+//===----------------------------------------------------------------------===//
+
+/// Returns the !alloc_token metadata if available.
+///
+/// Expected format is: !{<type-name>, <contains-pointer>}
+MDNode *getAllocTokenMetadata(const CallBase &CB) {
+  MDNode *Ret = CB.getMetadata(LLVMContext::MD_alloc_token);
+  if (!Ret)
+    return nullptr;
+  assert(Ret->getNumOperands() == 2 && "bad !alloc_token");
+  assert(isa<MDString>(Ret->getOperand(0)));
+  assert(isa<ConstantAsMetadata>(Ret->getOperand(1)));
+  return Ret;
+}
+
+bool containsPointer(const MDNode *MD) {
+  ConstantAsMetadata *C = cast<ConstantAsMetadata>(MD->getOperand(1));
+  auto *CI = cast<ConstantInt>(C->getValue());
+  return CI->getValue().getBoolValue();
+}
+
+class ModeBase {
+public:
+  explicit ModeBase(const IntegerType &TokenTy, uint64_t MaxTokens)
+      : MaxTokens(MaxTokens ? MaxTokens : TokenTy.getBitMask()) {
+    assert(MaxTokens <= TokenTy.getBitMask());
+  }
+
+protected:
+  uint64_t boundedToken(uint64_t Val) const {
+    assert(MaxTokens != 0);
+    return Val % MaxTokens;
+  }
+
+  const uint64_t MaxTokens;
+};
+
+/// Implementation for TokenMode::Increment.
+class IncrementMode : public ModeBase {
+public:
+  using ModeBase::ModeBase;
+
+  uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &) {
+    return boundedToken(Counter++);
+  }
+
+private:
+  uint64_t Counter = 0;
+};
+
+/// Implementation for TokenMode::Random.
+class RandomMode : public ModeBase {
+public:
+  RandomMode(const IntegerType &TokenTy, uint64_t MaxTokens,
+             std::unique_ptr<RandomNumberGenerator> RNG)
+      : ModeBase(TokenTy, MaxTokens), RNG(std::move(RNG)) {}
+  uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &) {
+    return boundedToken((*RNG)());
+  }
+
+private:
+  std::unique_ptr<RandomNumberGenerator> RNG;
+};
+
+/// Implementation for TokenMode::TypeHash. The implementation ensures
+/// hashes are stable across different compiler invocations. Uses SipHash as
+/// the hash function.
+class TypeHashMode : public ModeBase {
+public:
+  using ModeBase::ModeBase;
+
+  uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
+    const auto [N, H] = getHash(CB, ORE);
+    return N ? boundedToken(H) : H;
+  }
+
+protected:
+  std::pair<MDNode *, uint64_t> getHash(const CallBase &CB,
+                                        OptimizationRemarkEmitter &ORE) {
+    if (MDNode *N = getAllocTokenMetadata(CB)) {
+      MDString *S = cast<MDString>(N->getOperand(0));
+      return {N, getStableSipHash(S->getString())};
+    }
+    // Fallback.
+    remarkNoMetadata(CB, ORE);
+    return {nullptr, ClFallbackToken};
+  }
+
+  /// Remark that there was no precise type information.
+  static void remarkNoMetadata(const CallBase &CB,
+                               OptimizationRemarkEmitter &ORE) {
+    ORE.emit([&] {
+      ore::NV FuncNV("Function", CB.getParent()->getParent());
+      const Function *Callee = CB.getCalledFunction();
+      ore::NV CalleeNV("Callee", Callee ? Callee->getName() : "<unknown>");
+      return OptimizationRemark(DEBUG_TYPE, "NoAllocToken", &CB)
+             << "Call to '" << CalleeNV << "' in '" << FuncNV
+             << "' without source-level type token";
+    });
+  }
+};
+
+/// Implementation for TokenMode::TypeHashPointerSplit.
+class TypeHashPointerSplitMode : public TypeHashMode {
+public:
+  using TypeHashMode::TypeHashMode;
+
+  uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
+    if (MaxTokens == 1)
+      return 0;
+    const uint64_t HalfTokens = MaxTokens / 2;
+    const auto [N, H] = getHash(CB, ORE);
+    if (!N) {
+      // Pick the fallback token (ClFallbackToken), which by default is 0,
+      // meaning it'll fall into the pointer-less bucket. Override by setting
+      // -alloc-token-fallback if that is the wrong choice.
+      return H;
+    }
+    uint64_t Hash = H % HalfTokens; // base hash
+    if (containsPointer(N))
+      Hash += HalfTokens;
+    return Hash;
+  }
+};
+
+// Apply opt overrides.
+AllocTokenOptions transformOptionsFromCl(AllocTokenOptions Opts) {
+  if (!Opts.MaxTokens.has_value())
+    Opts.MaxTokens = ClMaxTokens;
+  Opts.FastABI |= ClFastABI;
+  Opts.Extended |= ClExtended;
+  return Opts;
+}
+
+class AllocToken {
+public:
+  explicit AllocToken(AllocTokenOptions Opts, Module &M,
+                      ModuleAnalysisManager &MAM)
+      : Options(transformOptionsFromCl(std::move(Opts))), Mod(M),
+        FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
+        Mode(IncrementMode(*IntPtrTy, *Options.MaxTokens)) {
+    switch (ClMode.getValue()) {
+    case TokenMode::Increment:
+      break;
+    case TokenMode::Random:
+      Mode.emplace<RandomMode>(*IntPtrTy, *Options.MaxTokens,
+                               M.createRNG(DEBUG_TYPE));
+      break;
+    case TokenMode::TypeHash:
+      Mode.emplace<TypeHashMode>(*IntPtrTy, *Options.MaxTokens);
+      break;
+    case TokenMode::TypeHashPointerSplit:
+      Mode.emplace<TypeHashPointerSplitMode>(*IntPtrTy, *Options.MaxTokens);
+      break;
+    }
+  }
+
+  bool instrumentFunction(Function &F);
+
+private:
+  /// Returns the LibFunc (or NotLibFunc) if this call should be instrumented.
+  std::optional<LibFunc>
+  shouldInstrumentCall(const CallBase &CB, const TargetLibraryInfo &TLI) const;
+
+  /// Returns true for functions that are eligible for instrumentation.
+  static bool isInstrumentableLibFunc(LibFunc Func, const CallBase &CB,
+                                      const TargetLibraryInfo &TLI);
+
+  /// Returns true for isAllocationFn() functions that we should ignore.
+  static bool ignoreInstrumentableLibFunc(LibFunc Func);
+
+  /// Replace a call/invoke with a call/invoke to the allocation function
+  /// with token ID.
+  bool replaceAllocationCall(CallBase *CB, LibFunc Func,
+                             OptimizationRemarkEmitter &ORE,
+                             const TargetLibraryInfo &TLI);
+
+  /// Return replacement function for a LibFunc that takes a token ID.
+  FunctionCallee getTokenAllocFunction(const CallBase &CB, uint64_t TokenID,
+                                       LibFunc OriginalFunc);
+
+  /// Return the token ID to use for the given call.
+  uint64_t getToken(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
+    return std::visit([&](auto &&Mode) { return Mode(CB, ORE); }, Mode);
+  }
+
+  const AllocTokenOptions Options;
+  Module &Mod;
+  IntegerType *IntPtrTy = Mod.getDataLayout().getIntPtrType(Mod.getContext());
+  FunctionAnalysisManager &FAM;
+  // Cache for replacement functions.
+  DenseMap<std::pair<LibFunc, uint64_t>, FunctionCallee> TokenAllocFunctions;
+  // Selected mode.
+  std::variant<IncrementMode, RandomMode, TypeHashMode,
+               TypeHashPointerSplitMode>
+      Mode;
+};
+
+bool AllocToken::instrumentFunction(Function &F) {
+  // Do not apply any instrumentation for naked functions.
+  if (F.hasFnAttribute(Attribute::Naked))
+    return false;
+  if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
+    return false;
+  // Don't touch available_externally functions, their actual body is elsewhere.
+  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
+    return false;
+  // Only instrument functions that have the sanitize_alloc_token attribute.
+  if (!F.hasFnAttribute(Attribute::SanitizeAllocToken))
+    return false;
+
+  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+  SmallVector<std::pair<CallBase *, LibFunc>, 4> AllocCalls;
+
+  // Collect all allocation calls to avoid iterator invalidation.
+  for (Instruction &I : instructions(F)) {
+    auto *CB = dyn_cast<CallBase>(&I);
+    if (!CB)
+      continue;
+    if (std::optional<LibFunc> Func = shouldInstrumentCall(*CB, TLI))
+      AllocCalls.emplace_back(CB, Func.value());
+  }
+
+  bool Modified = false;
+  for (auto &[CB, Func] : AllocCalls)
+    Modified |= replaceAllocationCall(CB, Func, ORE, TLI);
+
+  if (Modified)
+    NumFunctionsInstrumented++;
+  return Modified;
+}
+
+std::optional<LibFunc>
+AllocToken::shouldInstrumentCall(const CallBase &CB,
+                                 const TargetLibraryInfo &TLI) const {
+  const Function *Callee = CB.getCalledFunction();
+  if (!Callee)
+    return std::nullopt;
+
+  // Ignore nobuiltin of the CallBase, so that we can cover nobuiltin libcalls
+  // if requested via isInstrumentableLibFunc(). Note that isAllocationFn()
+  // returns false for nobuiltin calls.
+  LibFunc Func;
+  if (TLI.getLibFunc(*Callee, Func)) {
+    if (isInstrumentableLibFunc(Func, CB, TLI))
+      return Func;
+  } else if (Options.Extended && getAllocTokenMetadata(CB)) {
+    return NotLibFunc;
+  }
+
+  return std::nullopt;
+}
+
+bool AllocToken::isInstrumentableLibFunc(LibFunc Func, const CallBase &CB,
+                                         const TargetLibraryInfo &TLI) {
+  if (ignoreInstrumentableLibFunc(Func))
+    return false;
+
+  if (isAllocationFn(&CB, &TLI))
+    return true;
+
+  switch (Func) {
+  // These libfuncs don't return normal pointers, and are therefore not
+  // handled by isAllocationFn().
+  case LibFunc_posix_memalign:
+  case LibFunc_size_returning_new:
+  case LibFunc_size_returning_new_hot_cold:
+  case LibFunc_size_returning_new_aligned:
+  case LibFunc_size_returning_new_aligned_hot_cold:
+    return true;
+
+  // See comment above ClCoverReplaceableNew.
+  case LibFunc_Znwj:
+  case LibFunc_ZnwjRKSt9nothrow_t:
+  case LibFunc_ZnwjSt11align_val_t:
+  case LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t:
+  case LibFunc_Znwm:
+  case LibFunc_Znwm12__hot_cold_t:
+  case LibFunc_ZnwmRKSt9nothrow_t:
+  case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_ZnwmSt11align_val_t:
+  case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
+  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
+  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_Znaj:
+  case LibFunc_ZnajRKSt9nothrow_t:
+  case LibFunc_ZnajSt11align_val_t:
+  case LibFunc_ZnajSt11align_val_tRKSt9nothrow_t:
+  case LibFunc_Znam:
+  case LibFunc_Znam12__hot_cold_t:
+  case LibFunc_ZnamRKSt9nothrow_t:
+  case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_ZnamSt11align_val_t:
+  case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
+  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
+  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+    return ClCoverReplaceableNew;
+
+  default:
+    return false;
+  }
+}
+
+bool AllocToken::ignoreInstrumentableLibFunc(LibFunc Func) {
+  switch (Func) {
+  case LibFunc_strdup:
+  case LibFunc_dunder_strdup:
+  case LibFunc_strndup:
+  case LibFunc_dunder_strndup:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool AllocToken::replaceAllocationCall(CallBase *CB, LibFunc Func,
+                                       OptimizationRemarkEmitter &ORE,
+                                       const TargetLibraryInfo &TLI) {
+  uint64_t TokenID = getToken(*CB, ORE);
+
+  FunctionCallee TokenAlloc = getTokenAllocFunction(*CB, TokenID, Func);
+  if (!TokenAlloc)
+    return false;
+  NumAllocationsInstrumented++;
+
+  if (Options.FastABI) {
+    assert(TokenAlloc.getFunctionType()->getNumParams() == CB->arg_size());
+    CB->setCalledFunction(TokenAlloc);
+    return true;
+  }
+
+  IRBuilder<> IRB(CB);
+  // Original args.
+  SmallVector<Value *, 4> NewArgs{CB->args()};
+  // Add token ID, truncated to IntPtrTy width.
+  NewArgs.push_back(ConstantInt::get(IntPtrTy, TokenID));
+  assert(TokenAlloc.getFunctionType()->getNumParams() == NewArgs.size());
+
+  // Preserve invoke vs call semantics for exception handling.
+  CallBase *NewCall;
+  if (auto *II = dyn_cast<InvokeInst>(CB)) {
+    NewCall = IRB.CreateInvoke(TokenAlloc, II->getNormalDest(),
+                               II->getUnwindDest(), NewArgs);
+  } else {
+    NewCall = IRB.CreateCall(TokenAlloc, NewArgs);
+    cast<CallInst>(NewCall)->setTailCall(CB->isTailCall());
+  }
+  NewCall->setCallingConv(CB->getCallingConv());
+  NewCall->copyMetadata(*CB);
+  NewCall->setAttributes(CB->getAttributes());
+
+  // Replace all uses and delete the old call.
+  CB->replaceAllUsesWith(NewCall);
+  CB->eraseFromParent();
+  return true;
+}
+
+FunctionCallee AllocToken::getTokenAllocFunction(const CallBase &CB,
+                                                 uint64_t TokenID,
+                                                 LibFunc OriginalFunc) {
+  std::optional<std::pair<LibFunc, uint64_t>> Key;
+  if (OriginalFunc != NotLibFunc) {
+    Key = std::make_pair(OriginalFunc, Options.FastABI ? TokenID : 0);
+    auto It = TokenAllocFunctions.find(*Key);
+    if (It != TokenAllocFunctions.end())
+      return It->second;
+  }
+
+  const Function *Callee = CB.getCalledFunction();
+  if (!Callee)
+    return FunctionCallee();
+  const FunctionType *OldFTy = Callee->getFunctionType();
+  if (OldFTy->isVarArg())
+    return FunctionCallee();
+  // Copy params, and append token ID type.
+  Type *RetTy = OldFTy->getReturnType();
+  SmallVector<Type *, 4> NewParams{OldFTy->params()};
+  std::string TokenAllocName = ClFuncPrefix;
+  if (Options.FastABI)
+    TokenAllocName += utostr(TokenID) + "_";
+  else
+    NewParams.push_back(IntPtrTy); // token ID
+  TokenAllocName += Callee->getName();
+  FunctionType *NewFTy = FunctionType::get(RetTy, NewParams, false);
+  FunctionCallee TokenAlloc = Mod.getOrInsertFunction(TokenAllocName, NewFTy);
+  if (Function *F = dyn_cast<Function>(TokenAlloc.getCallee()))
+    F->copyAttributesFrom(Callee); // preserve attrs
+
+  if (Key.has_value())
+    TokenAllocFunctions[*Key] = TokenAlloc;
+  return TokenAlloc;
+}
+
+} // namespace
+
+AllocTokenPass::AllocTokenPass(AllocTokenOptions Opts)
+    : Options(std::move(Opts)) {}
+
+PreservedAnalyses AllocTokenPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  AllocToken Pass(Options, M, MAM);
+  bool Modified = false;
+
+  for (Function &F : M) {
+    if (F.empty())
+      continue; // declaration
+    Modified |= Pass.instrumentFunction(F);
+  }
+
+  return Modified ? PreservedAnalyses::none().preserveSet<CFGAnalyses>()
+                  : PreservedAnalyses::all();
+}
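A compact sketch of the default typehashpointersplit token computation implemented by TypeHashPointerSplitMode above. This is illustrative only, under assumptions: MaxTokens is even and greater than 1, and std::hash stands in for the pass's getStableSipHash(), so it is not stable across implementations or bit-compatible with the pass:

    #include <cstdint>
    #include <functional>
    #include <string_view>

    uint64_t tokenFor(std::string_view TypeName, bool ContainsPointer,
                      uint64_t MaxTokens) {
      const uint64_t HalfTokens = MaxTokens / 2;
      // Bottom half of the ID space: pointer-free types.
      uint64_t Token = std::hash<std::string_view>{}(TypeName) % HalfTokens;
      // Top half: types that contain pointers.
      if (ContainsPointer)
        Token += HalfTokens;
      return Token;
    }

With the default ABI the pass then appends the token as an extra IntPtrTy argument, so a call to malloc(n) is redirected to __alloc_token_malloc(n, token); with -alloc-token-fast-abi the token is encoded in the callee name instead (__alloc_token_<token>_malloc(n)) and the argument list is unchanged.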
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 15fd421..80576c6 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_component_library(LLVMInstrumentation
   AddressSanitizer.cpp
+  AllocToken.cpp
   BoundsChecking.cpp
   CGProfile.cpp
   ControlHeightReduction.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 480ff4a..5ba2167 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -261,6 +261,11 @@ static cl::opt<bool> ClIgnorePersonalityRoutine(
              "list, do not create a wrapper for it."),
     cl::Hidden, cl::init(false));
 
+static cl::opt<bool> ClAddGlobalNameSuffix(
+    "dfsan-add-global-name-suffix",
+    cl::desc("Whether to add .dfsan suffix to global names"), cl::Hidden,
+    cl::init(true));
+
 static StringRef getGlobalTypeString(const GlobalValue &G) {
   // Types of GlobalVariables are always pointer types.
   Type *GType = G.getValueType();
@@ -1256,6 +1261,9 @@ DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
 }
 
 void DataFlowSanitizer::addGlobalNameSuffix(GlobalValue *GV) {
+  if (!ClAddGlobalNameSuffix)
+    return;
+
   std::string GVName = std::string(GV->getName()), Suffix = ".dfsan";
   GV->setName(GVName + Suffix);
 
@@ -1784,10 +1792,8 @@ bool DataFlowSanitizer::runImpl(
 }
 
 Value *DFSanFunction::getArgTLS(Type *T, unsigned ArgOffset, IRBuilder<> &IRB) {
-  Value *Base = IRB.CreatePointerCast(DFS.ArgTLS, DFS.IntptrTy);
-  if (ArgOffset)
-    Base = IRB.CreateAdd(Base, ConstantInt::get(DFS.IntptrTy, ArgOffset));
-  return IRB.CreateIntToPtr(Base, PointerType::get(*DFS.Ctx, 0), "_dfsarg");
+  return IRB.CreatePtrAdd(DFS.ArgTLS, ConstantInt::get(DFS.IntptrTy, ArgOffset),
+                          "_dfsarg");
 }
 
 Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) {
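The getArgTLS() change above drops an integer round-trip in favor of a single pointer add. A minimal sketch of the two lowerings (LLVM headers assumed; ArgTLS and IntptrTy stand in for the pass members used in the patch):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    Value *argTLSSlot(IRBuilder<> &IRB, Value *ArgTLS, Type *IntptrTy,
                      unsigned ArgOffset) {
      // Old lowering: ptrtoint %ArgTLS; add <ArgOffset>; inttoptr - three
      // instructions and a pointer/integer round-trip.
      // New lowering: a single byte-offset GEP off the TLS base pointer.
      return IRB.CreatePtrAdd(ArgTLS, ConstantInt::get(IntptrTy, ArgOffset),
                              "_dfsarg");
    }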
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index e9a3e98..e448230 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -89,6 +89,7 @@ STATISTIC(NumTransforms, "Number of transformations done");
 STATISTIC(NumCloned, "Number of blocks cloned");
 STATISTIC(NumPaths, "Number of individual paths threaded");
 
+namespace llvm {
 static cl::opt<bool>
     ClViewCfgBefore("dfa-jump-view-cfg-before",
                     cl::desc("View the CFG before DFA Jump Threading"),
@@ -120,8 +121,17 @@ static cl::opt<unsigned>
                   cl::desc("Maximum cost accepted for the transformation"),
                   cl::Hidden, cl::init(50));
 
-namespace {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
+static cl::opt<double> MaxClonedRate(
+    "dfa-max-cloned-rate",
+    cl::desc(
+        "Maximum cloned instructions rate accepted for the transformation"),
+    cl::Hidden, cl::init(7.5));
 
+namespace {
 class SelectInstToUnfold {
   SelectInst *SI;
   PHINode *SIUse;
@@ -135,10 +145,6 @@ public:
   explicit operator bool() const { return SI && SIUse; }
 };
 
-void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
-            std::vector<SelectInstToUnfold> *NewSIsToUnfold,
-            std::vector<BasicBlock *> *NewBBs);
-
 class DFAJumpThreading {
 public:
   DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
@@ -152,7 +158,8 @@ private:
 
   void unfoldSelectInstrs(DominatorTree *DT,
                           const SmallVector<SelectInstToUnfold, 4> &SelectInsts) {
-    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+    // TODO: Have everything use a single lazy DTU
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
 
     SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts);
     while (!Stack.empty()) {
@@ -167,16 +174,18 @@ private:
     }
   }
 
+  static void unfold(DomTreeUpdater *DTU, LoopInfo *LI,
+                     SelectInstToUnfold SIToUnfold,
+                     std::vector<SelectInstToUnfold> *NewSIsToUnfold,
+                     std::vector<BasicBlock *> *NewBBs);
+
   AssumptionCache *AC;
   DominatorTree *DT;
   LoopInfo *LI;
   TargetTransformInfo *TTI;
   OptimizationRemarkEmitter *ORE;
 };
-
-} // end anonymous namespace
-
-namespace {
+} // namespace
 
 /// Unfold the select instruction held in \p SIToUnfold by replacing it with
 /// control flow.
@@ -185,9 +194,10 @@ namespace {
 /// created basic blocks into \p NewBBs.
 ///
 /// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible.
-void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
-            std::vector<SelectInstToUnfold> *NewSIsToUnfold,
-            std::vector<BasicBlock *> *NewBBs) {
+void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI,
+                              SelectInstToUnfold SIToUnfold,
+                              std::vector<SelectInstToUnfold> *NewSIsToUnfold,
+                              std::vector<BasicBlock *> *NewBBs) {
   SelectInst *SI = SIToUnfold.getInst();
   PHINode *SIUse = SIToUnfold.getUse();
   assert(SI->hasOneUse());
@@ -253,7 +263,11 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
 
     // Insert the real conditional branch based on the original condition.
     StartBlockTerm->eraseFromParent();
-    BranchInst::Create(EndBlock, NewBlock, SI->getCondition(), StartBlock);
+    auto *BI =
+        BranchInst::Create(EndBlock, NewBlock, SI->getCondition(), StartBlock);
+    if (!ProfcheckDisableMetadataFixes)
+      BI->setMetadata(LLVMContext::MD_prof,
+                      SI->getMetadata(LLVMContext::MD_prof));
     DTU->applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock},
                        {DominatorTree::Insert, StartBlock, NewBlock}});
   } else {
@@ -288,7 +302,11 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
     //   (Use)
     BranchInst::Create(EndBlock, NewBlockF);
     // Insert the real conditional branch based on the original condition.
-    BranchInst::Create(EndBlock, NewBlockF, SI->getCondition(), NewBlockT);
+    auto *BI =
+        BranchInst::Create(EndBlock, NewBlockF, SI->getCondition(), NewBlockT);
+    if (!ProfcheckDisableMetadataFixes)
+      BI->setMetadata(LLVMContext::MD_prof,
+                      SI->getMetadata(LLVMContext::MD_prof));
     DTU->applyUpdates({{DominatorTree::Insert, NewBlockT, NewBlockF},
                        {DominatorTree::Insert, NewBlockT, EndBlock},
                        {DominatorTree::Insert, NewBlockF, EndBlock}});
@@ -342,10 +360,12 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
   SI->eraseFromParent();
 }
 
+namespace {
 struct ClonedBlock {
   BasicBlock *BB;
   APInt State; ///< \p State corresponds to the next value of a switch stmnt.
 };
+} // namespace
 
 typedef std::deque<BasicBlock *> PathType;
 typedef std::vector<PathType> PathsType;
@@ -375,6 +395,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) {
   return OS;
 }
 
+namespace {
 /// ThreadingPath is a path in the control flow of a loop that can be threaded
 /// by cloning necessary basic blocks and replacing conditional branches with
 /// unconditional ones. A threading path includes a list of basic blocks, the
@@ -814,11 +835,13 @@ struct TransformDFA {
       : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE),
         EphValues(EphValues) {}
 
-  void run() {
+  bool run() {
     if (isLegalAndProfitableToTransform()) {
       createAllExitPaths();
       NumTransforms++;
+      return true;
     }
+    return false;
   }
 
 private:
@@ -828,6 +851,7 @@ private:
   /// also returns false if it is illegal to clone some required block.
   bool isLegalAndProfitableToTransform() {
    CodeMetrics Metrics;
+    uint64_t NumClonedInst = 0;
    SwitchInst *Switch = SwitchPaths->getSwitchInst();
 
     // Don't thread switch without multiple successors.
@@ -837,7 +861,6 @@ private:
     // Note that DuplicateBlockMap is not being used as intended here. It is
     // just being used to ensure (BB, State) pairs are only counted once.
     DuplicateBlockMap DuplicateMap;
-
     for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
       PathType PathBBs = TPath.getPath();
       APInt NextState = TPath.getExitValue();
@@ -848,6 +871,7 @@ private:
       BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap);
       if (!VisitedBB) {
         Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+        NumClonedInst += BB->sizeWithoutDebug();
         DuplicateMap[BB].push_back({BB, NextState});
       }
 
@@ -865,6 +889,7 @@ private:
         if (VisitedBB)
           continue;
         Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+        NumClonedInst += BB->sizeWithoutDebug();
         DuplicateMap[BB].push_back({BB, NextState});
       }
 
@@ -901,6 +926,22 @@ private:
       }
     }
 
+    // Too many cloned instructions slow down later optimizations, especially
+    // SLPVectorizer.
+    // TODO: Thread the switch partially before reaching the threshold.
+    uint64_t NumOrigInst = 0;
+    for (auto *BB : DuplicateMap.keys())
+      NumOrigInst += BB->sizeWithoutDebug();
+    if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) {
+      LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
+                           "instructions will be cloned\n");
+      ORE->emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+               << "Too many instructions will be cloned.";
+      });
+      return false;
+    }
+
     InstructionCost DuplicationCost = 0;
 
     unsigned JumpTableSize = 0;
@@ -951,8 +992,6 @@ private:
 
   /// Transform each threading path to effectively jump thread the DFA.
   void createAllExitPaths() {
-    DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Eager);
-
     // Move the switch block to the end of the path, since it will be duplicated
     BasicBlock *SwitchBlock = SwitchPaths->getSwitchBlock();
     for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
@@ -969,15 +1008,18 @@ private:
     SmallPtrSet<BasicBlock *, 16> BlocksToClean;
     BlocksToClean.insert_range(successors(SwitchBlock));
 
-    for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
-      createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU);
-      NumPaths++;
-    }
+    {
+      DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
+      for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
+        createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU);
+        NumPaths++;
+      }
 
-    // After all paths are cloned, now update the last successor of the cloned
-    // path so it skips over the switch statement
-    for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
-      updateLastSuccessor(TPath, DuplicateMap, &DTU);
+      // After all paths are cloned, now update the last successor of the cloned
+      // path so it skips over the switch statement
+      for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
+        updateLastSuccessor(TPath, DuplicateMap, &DTU);
+    }
 
     // For each instruction that was cloned and used outside, update its uses
     updateSSA(NewDefs);
@@ -993,7 +1035,7 @@ private:
   /// To remember the correct destination, we have to duplicate blocks
   /// corresponding to each state. Also update the terminating instruction of
   /// the predecessors, and phis in the successor blocks.
-  void createExitPath(DefMap &NewDefs, ThreadingPath &Path,
+  void createExitPath(DefMap &NewDefs, const ThreadingPath &Path,
                       DuplicateBlockMap &DuplicateMap,
                       SmallPtrSet<BasicBlock *, 16> &BlocksToClean,
                       DomTreeUpdater *DTU) {
@@ -1164,19 +1206,18 @@ private:
     // value for the new predecessor ClonedBB. The value will either be the same
     // value from BB or a cloned value.
     for (BasicBlock *Succ : BlocksToUpdate) {
-      for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(II);
-           ++II) {
-        Value *Incoming = Phi->getIncomingValueForBlock(BB);
+      for (PHINode &Phi : Succ->phis()) {
+        Value *Incoming = Phi.getIncomingValueForBlock(BB);
         if (Incoming) {
           if (isa<Constant>(Incoming)) {
-            Phi->addIncoming(Incoming, ClonedBB);
+            Phi.addIncoming(Incoming, ClonedBB);
             continue;
           }
           Value *ClonedVal = VMap[Incoming];
           if (ClonedVal)
-            Phi->addIncoming(ClonedVal, ClonedBB);
+            Phi.addIncoming(ClonedVal, ClonedBB);
           else
-            Phi->addIncoming(Incoming, ClonedBB);
+            Phi.addIncoming(Incoming, ClonedBB);
         }
       }
     }
@@ -1239,7 +1280,7 @@ private:
   ///
   /// Note that this is an optional step and would have been done in later
   /// optimizations, but it makes the CFG significantly easier to work with.
-  void updateLastSuccessor(ThreadingPath &TPath,
+  void updateLastSuccessor(const ThreadingPath &TPath,
                            DuplicateBlockMap &DuplicateMap,
                            DomTreeUpdater *DTU) {
     APInt NextState = TPath.getExitValue();
@@ -1271,27 +1312,19 @@ private:
   void cleanPhiNodes(BasicBlock *BB) {
     // If BB is no longer reachable, remove any remaining phi nodes
     if (pred_empty(BB)) {
-      std::vector<PHINode *> PhiToRemove;
-      for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) {
-        PhiToRemove.push_back(Phi);
-      }
-      for (PHINode *PN : PhiToRemove) {
-        PN->replaceAllUsesWith(PoisonValue::get(PN->getType()));
-        PN->eraseFromParent();
+      for (PHINode &PN : make_early_inc_range(BB->phis())) {
+        PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
+        PN.eraseFromParent();
       }
       return;
     }
 
     // Remove any incoming values that come from an invalid predecessor
-    for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) {
-      std::vector<BasicBlock *> BlocksToRemove;
-      for (BasicBlock *IncomingBB : Phi->blocks()) {
-        if (!isPredecessor(BB, IncomingBB))
-          BlocksToRemove.push_back(IncomingBB);
-      }
-      for (BasicBlock *BB : BlocksToRemove)
-        Phi->removeIncomingValue(BB);
-    }
+    for (PHINode &Phi : BB->phis())
+      Phi.removeIncomingValueIf([&](unsigned Index) {
+        BasicBlock *IncomingBB = Phi.getIncomingBlock(Index);
+        return !isPredecessor(BB, IncomingBB);
+      });
   }
 
   /// Checks if BB was already cloned for a particular next state value. If it
@@ -1336,6 +1369,7 @@ private:
   SmallPtrSet<const Value *, 32> EphValues;
   std::vector<ThreadingPath> TPaths;
 };
+} // namespace
 
 bool DFAJumpThreading::run(Function &F) {
   LLVM_DEBUG(dbgs() << "\nDFA Jump threading: " << F.getName() << "\n");
@@ -1402,9 +1436,8 @@ bool DFAJumpThreading::run(Function &F) {
 
   for (AllSwitchPaths SwitchPaths : ThreadableLoops) {
     TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues);
-    Transform.run();
-    MadeChanges = true;
-    LoopInfoBroken = true;
+    if (Transform.run())
+      MadeChanges = LoopInfoBroken = true;
   }
 
 #ifdef EXPENSIVE_CHECKS
@@ -1415,8 +1448,6 @@ bool DFAJumpThreading::run(Function &F) {
   return MadeChanges;
 }
 
-} // end anonymous namespace
-
 /// Integrate with the new Pass Manager
 PreservedAnalyses DFAJumpThreadingPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
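The new -dfa-max-cloned-rate threshold guards against code-size blowup: instructions that would be cloned are summed over all threading paths (each (BB, state) pair counted once, debug instructions excluded) and compared against the size of the original blocks. A sketch of the arithmetic with the default rate of 7.5:

    #include <cstdint>

    bool exceedsClonedRate(uint64_t NumClonedInst, uint64_t NumOrigInst,
                           double MaxClonedRate = 7.5) {
      // Reject threading when cloning would grow the affected blocks past
      // the rate; this mainly protects later passes such as SLPVectorizer.
      return double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate;
    }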
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index b9b5b58..638952a 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -699,6 +699,7 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
   case Instruction::FPTrunc:
   case Instruction::FPExt:
   case Instruction::PtrToInt:
+  case Instruction::PtrToAddr:
   case Instruction::IntToPtr:
   case Instruction::AddrSpaceCast:
   case Instruction::BitCast:
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index d6b7633..3c1a8ba 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -2066,6 +2066,7 @@ NewGVN::performSymbolicEvaluation(Instruction *I,
   case Instruction::FPTrunc:
   case Instruction::FPExt:
   case Instruction::PtrToInt:
+  case Instruction::PtrToAddr:
   case Instruction::IntToPtr:
   case Instruction::Select:
   case Instruction::ExtractElement:
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 60e5df0..7ffccf7 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -355,6 +355,8 @@ void SimplifyCFGPass::printPipeline(
   OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;";
   OS << (Options.ConvertSwitchRangeToICmp ? "" : "no-")
      << "switch-range-to-icmp;";
+  OS << (Options.ConvertSwitchToArithmetic ? "" : "no-")
+     << "switch-to-arithmetic;";
   OS << (Options.ConvertSwitchToLookupTable ? "" : "no-")
      << "switch-to-lookup;";
   OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;";
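With the printPipeline() change above, the new option round-trips through -print-pipeline-passes alongside the existing switch flags; assuming the matching PassBuilder parameter parsing (not shown in this diff), it should be controllable in a pipeline string as, for example, -passes='simplifycfg<switch-to-arithmetic>' or negated as 'simplifycfg<no-switch-to-arithmetic>'.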
"" : "no-") << "keep-loops;"; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index bbd1ed6..5ba6f95f 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -970,6 +970,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::SanitizeMemTag: case Attribute::SanitizeRealtime: case Attribute::SanitizeRealtimeBlocking: + case Attribute::SanitizeAllocToken: case Attribute::SpeculativeLoadHardening: case Attribute::StackProtect: case Attribute::StackProtectReq: diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 5a09b73..2923633 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -19,6 +19,7 @@ #include "llvm/Config/llvm-config.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -162,8 +163,8 @@ bool llvm::applyDebugifyMetadata( unsigned NextLine = 1; unsigned NextVar = 1; auto File = DIB.createFile(M.getName(), "/"); - auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify", - /*isOptimized=*/true, "", 0); + auto CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C), File, + "debugify", /*isOptimized=*/true, "", 0); // Visit each instruction. for (Function &F : Functions) { diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 21b2652..b6ca52e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3031,6 +3031,13 @@ static void combineMetadata(Instruction *K, const Instruction *J, K->getContext(), MDNode::toCaptureComponents(JMD) | MDNode::toCaptureComponents(KMD))); break; + case LLVMContext::MD_alloc_token: + // Preserve !alloc_token if both K and J have it, and they are equal. + if (KMD == JMD) + K->setMetadata(Kind, JMD); + else + K->setMetadata(Kind, nullptr); + break; } } // Set !invariant.group from J if J has it. If both instructions have it diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 7cc9ff8..0c8d6fa 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -45,12 +45,6 @@ STATISTIC(NumInstrsHoisted, "Number of instructions hoisted into loop preheader"); STATISTIC(NumInstrsDuplicated, "Number of instructions cloned into loop preheader"); -STATISTIC(NumRotated, "Number of loops rotated"); - -static cl::opt<bool> - MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden, - cl::desc("Allow loop rotation multiple times in order to reach " - "a better latch exit")); // Probability that a rotated loop has zero trip count / is never entered. static constexpr uint32_t ZeroTripCountWeights[] = {1, 127}; @@ -206,50 +200,6 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) { return false; } -// Check that latch exit is deoptimizing (which means - very unlikely to happen) -// and there is another exit from the loop which is non-deoptimizing. -// If we rotate latch to that exit our loop has a better chance of being fully -// canonical. -// -// It can give false positives in some rare cases. 
-static bool canRotateDeoptimizingLatchExit(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - assert(Latch && "need latch"); - BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator()); - // Need normal exiting latch. - if (!BI || !BI->isConditional()) - return false; - - BasicBlock *Exit = BI->getSuccessor(1); - if (L->contains(Exit)) - Exit = BI->getSuccessor(0); - - // Latch exit is non-deoptimizing, no need to rotate. - if (!Exit->getPostdominatingDeoptimizeCall()) - return false; - - SmallVector<BasicBlock *, 4> Exits; - L->getUniqueExitBlocks(Exits); - if (!Exits.empty()) { - // There is at least one non-deoptimizing exit. - // - // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact, - // as it can conservatively return false for deoptimizing exits with - // complex enough control flow down to deoptimize call. - // - // That means here we can report success for a case where - // all exits are deoptimizing but one of them has complex enough - // control flow (e.g. with loops). - // - // That should be a very rare case and false positives for this function - // have compile-time effect only. - return any_of(Exits, [](const BasicBlock *BB) { - return !BB->getPostdominatingDeoptimizeCall(); - }); - } - return false; -} - static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, bool HasConditionalPreHeader, bool SuccsSwapped) { @@ -387,506 +337,489 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -/// -/// If -loop-rotate-multi is enabled we can do multiple rotations in one go -/// so to reach a suitable (non-deoptimizing) exit. bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; bool Rotated = false; - do { - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); - - BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) - return Rotated; - - // If the loop header is not one of the loop exiting blocks then - // either this loop is already rotated or it is not - // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return Rotated; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return Rotated; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return Rotated; + + // Rotate if the loop latch was just simplified. Or if it makes the loop exit + // count computable. Or if we think it will be profitable. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && + !profitableToRotateLoopExitingLatch(L)) + return Rotated; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. 
+ { + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); + if (Metrics.notDuplicatable) { + LLVM_DEBUG( + dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); return Rotated; - - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. - if (!OrigLatch) + } + if (Metrics.Convergence != ConvergenceKind::None) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); return Rotated; - - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. Or if we think it will be profitable. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && - !profitableToRotateLoopExitingLatch(L) && - !canRotateDeoptimizingLatchExit(L)) + } + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" + " with invalid cost: "; + L->dump()); return Rotated; - - // Check size of original header and reject loop if it is very big or we can't - // duplicate blocks inside it. - { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO); - if (Metrics.notDuplicatable) { - LLVM_DEBUG( - dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" - << " instructions: "; - L->dump()); - return Rotated; - } - if (Metrics.Convergence != ConvergenceKind::None) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " - "instructions: "; - L->dump()); - return Rotated; - } - if (!Metrics.NumInsts.isValid()) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" - " with invalid cost: "; - L->dump()); - return Rotated; - } - if (Metrics.NumInsts > MaxHeaderSize) { - LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " - << Metrics.NumInsts - << " instructions, which is more than the threshold (" - << MaxHeaderSize << " instructions): "; - L->dump()); - ++NumNotRotatedDueToHeaderSize; - return Rotated; - } - - // When preparing for LTO, avoid rotating loops with calls that could be - // inlined during the LTO stage. - if (PrepareForLTO && Metrics.NumInlineCandidates > 0) - return Rotated; } - - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); - - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader || !L->hasDedicatedExits()) + if (Metrics.NumInsts > MaxHeaderSize) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " + << Metrics.NumInsts + << " instructions, which is more than the threshold (" + << MaxHeaderSize << " instructions): "; + L->dump()); + ++NumNotRotatedDueToHeaderSize; return Rotated; - - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. We should also invalidate - // all outer loops because insertion and deletion of blocks that happens - // during the rotation may violate invariants related to backedge taken - // infos in them. - if (SE) { - SE->forgetTopmostLoop(L); - // We may hoist some instructions out of loop. 
In case if they were cached - // as "loop variant" or "loop computable", these caches must be dropped. - // We also may fold basic blocks, so cached block dispositions also need - // to be dropped. - SE->forgetBlockAndLoopDispositions(); } - LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. Header's other successor is outside the - // loop. Otherwise loop is not suitable for rotation. - BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - bool BISuccsSwapped = L->contains(Exit); - if (BISuccsSwapped) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. - BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap, ValueMapMSSA; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - InsertNewValueIntoMap(ValueMap, PN, - PN->getIncomingValueForBlock(OrigPreheader)); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); - - // Record all debug records preceding LoopEntryBranch to avoid - // duplication. - using DbgHash = - std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>; - auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { - auto VarLocOps = D->location_ops(); - return {{hash_combine_range(VarLocOps), D->getVariable()}, - D->getExpression()}; - }; - - SmallDenseSet<DbgHash, 8> DbgRecords; - // Build DbgVariableRecord hashes for DbgVariableRecords attached to the - // terminator. - for (const DbgVariableRecord &DVR : - filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) - DbgRecords.insert(makeHash(&DVR)); - - // Remember the local noalias scope declarations in the header. After the - // rotation, they must be duplicated and the scope must be cloned. This - // avoids unwanted interaction across iterations. - SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions; - for (Instruction &I : *OrigHeader) - if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I)) - NoAliasDeclInstructions.push_back(Decl); - - Module *M = OrigHeader->getModule(); - - // Track the next DbgRecord to clone. If we have a sequence where an - // instruction is hoisted instead of being cloned: - // DbgRecord blah - // %foo = add i32 0, 0 - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once - // attached to %foo, then when %foo his hoisted it will "fall down" onto the - // function call: - // DbgRecord blah - // DbgRecord xyzzy - // %bar = call i32 @foobar() - // causing it to appear attached to the call too. 
- // - // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from - // here" position to account for this behaviour. We point it at any - // DbgRecords on the next instruction, here labelled xyzzy, before we hoist - // %foo. Later, we only only clone DbgRecords from that position (xyzzy) - // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as - // a range because it gives us a natural way of testing whether - // there were DbgRecords on the next instruction before we hoisted things). - iterator_range<DbgRecord::self_iterator> NextDbgInsts = - (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange(); - - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). - if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !Inst->isTerminator() && - !isa<AllocaInst>(Inst) && - // It is not safe to hoist the value of these instructions in - // coroutines, as the addresses of otherwise eligible variables (e.g. - // thread-local variables and errno) may change if the coroutine is - // resumed in a different thread.Therefore, we disable this - // optimization for correctness. However, this may block other correct - // optimizations. - // FIXME: This should be reverted once we have a better model for - // memory access in coroutines. - !Inst->getFunction()->isPresplitCoroutine()) { - - if (!NextDbgInsts.empty()) { - auto DbgValueRange = - LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin()); - RemapDbgRecordRange(M, DbgValueRange, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Erase anything we've seen before. - for (DbgVariableRecord &DVR : - make_early_inc_range(filterDbgVars(DbgValueRange))) - if (DbgRecords.count(makeHash(&DVR))) - DVR.eraseFromParent(); - } - - NextDbgInsts = I->getDbgRecordRange(); - - Inst->moveBefore(LoopEntryBranch->getIterator()); + // When preparing for LTO, avoid rotating loops with calls that could be + // inlined during the LTO stage. + if (PrepareForLTO && Metrics.NumInlineCandidates > 0) + return Rotated; + } - ++NumInstrsHoisted; - continue; - } + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader || !L->hasDedicatedExits()) + return Rotated; + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. We should also invalidate + // all outer loops because insertion and deletion of blocks that happens + // during the rotation may violate invariants related to backedge taken + // infos in them. + if (SE) { + SE->forgetTopmostLoop(L); + // We may hoist some instructions out of loop. In case if they were cached + // as "loop variant" or "loop computable", these caches must be dropped. + // We also may fold basic blocks, so cached block dispositions also need + // to be dropped. + SE->forgetBlockAndLoopDispositions(); + } - // Otherwise, create a duplicate of the instruction. 
- Instruction *C = Inst->clone();
- if (const DebugLoc &DL = C->getDebugLoc())
- mapAtomInstance(DL, ValueMap);
+ LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
- C->insertBefore(LoopEntryBranch->getIterator());
+ // Find new Loop header. NewHeader is a Header's one and only successor
+ // that is inside loop. Header's other successor is outside the
+ // loop. Otherwise loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ bool BISuccsSwapped = L->contains(Exit);
+ if (BISuccsSwapped)
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap, ValueMapMSSA;
+
+ // For PHI nodes, the value available in OldPreHeader is just the
+ // incoming value from OldPreHeader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ InsertNewValueIntoMap(ValueMap, PN,
+ PN->getIncomingValueForBlock(OrigPreheader));
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OldPreHeader if not.
+ Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug records preceding LoopEntryBranch to avoid
+ // duplication.
+ using DbgHash =
+ std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
+ auto VarLocOps = D->location_ops();
+ return {{hash_combine_range(VarLocOps), D->getVariable()},
+ D->getExpression()};
+ };
- ++NumInstrsDuplicated;
+ SmallDenseSet<DbgHash, 8> DbgRecords;
+ // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
+ // terminator.
+ for (const DbgVariableRecord &DVR :
+ filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
+ DbgRecords.insert(makeHash(&DVR));
+
+ // Remember the local noalias scope declarations in the header. After the
+ // rotation, they must be duplicated and the scope must be cloned. This
+ // avoids unwanted interaction across iterations.
+ SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions;
+ for (Instruction &I : *OrigHeader)
+ if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
+ NoAliasDeclInstructions.push_back(Decl);
+
+ Module *M = OrigHeader->getModule();
+
+ // Track the next DbgRecord to clone. If we have a sequence where an
+ // instruction is hoisted instead of being cloned:
+ // DbgRecord blah
+ // %foo = add i32 0, 0
+ // DbgRecord xyzzy
+ // %bar = call i32 @foobar()
+ // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
+ // attached to %foo, then when %foo is hoisted it will "fall down" onto the
+ // function call:
+ // DbgRecord blah
+ // DbgRecord xyzzy
+ // %bar = call i32 @foobar()
+ // causing it to appear attached to the call too.
+ //
+ // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
+ // here" position to account for this behaviour. We point it at any
+ // DbgRecords on the next instruction, here labelled xyzzy, before we hoist
+ // %foo. Later, we only clone DbgRecords from that position (xyzzy)
+ // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
+ // a range because it gives us a natural way of testing whether
+ // there were DbgRecords on the next instruction before we hoisted things).
+ iterator_range<DbgRecord::self_iterator> NextDbgInsts =
+ (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange();
+
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
+ !isa<AllocaInst>(Inst) &&
+ // It is not safe to hoist the value of these instructions in
+ // coroutines, as the addresses of otherwise eligible variables (e.g.
+ // thread-local variables and errno) may change if the coroutine is
+ // resumed in a different thread. Therefore, we disable this
+ // optimization for correctness. However, this may block other correct
+ // optimizations.
+ // FIXME: This should be reverted once we have a better model for
+ // memory access in coroutines.
+ !Inst->getFunction()->isPresplitCoroutine()) {
 if (!NextDbgInsts.empty()) {
- auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
- RemapDbgRecordRange(M, Range, ValueMap,
+ auto DbgValueRange =
+ LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
+ RemapDbgRecordRange(M, DbgValueRange, ValueMap,
 RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
 // Erase anything we've seen before.
 for (DbgVariableRecord &DVR :
- make_early_inc_range(filterDbgVars(Range)))
+ make_early_inc_range(filterDbgVars(DbgValueRange)))
 if (DbgRecords.count(makeHash(&DVR)))
 DVR.eraseFromParent();
 }
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifyable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = simplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- InsertNewValueIntoMap(ValueMap, Inst, V);
- if (!C->mayHaveSideEffects()) {
- C->eraseFromParent();
- C = nullptr;
- }
- } else {
- InsertNewValueIntoMap(ValueMap, Inst, C);
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
-
- if (auto *II = dyn_cast<AssumeInst>(C))
- AC->registerAssumption(II);
- // MemorySSA cares whether the cloned instruction was inserted or not, and
- // not whether it can be remapped to a simplified value.
- if (MSSAU) - InsertNewValueIntoMap(ValueMapMSSA, Inst, C); - } - } + NextDbgInsts = I->getDbgRecordRange(); - if (!NoAliasDeclInstructions.empty()) { - // There are noalias scope declarations: - // (general): - // Original: OrigPre { OrigHeader NewHeader ... Latch } - // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader } - // - // with D: llvm.experimental.noalias.scope.decl, - // U: !noalias or !alias.scope depending on D - // ... { D U1 U2 } can transform into: - // (0) : ... { D U1 U2 } // no relevant rotation for this part - // (1) : ... D' { U1 U2 D } // D is part of OrigHeader - // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader - // - // We now want to transform: - // (1) -> : ... D' { D U1 U2 D'' } - // (2) -> : ... D' U1' { D U2 D'' U1'' } - // D: original llvm.experimental.noalias.scope.decl - // D', U1': duplicate with replaced scopes - // D'', U1'': different duplicate with replaced scopes - // This ensures a safe fallback to 'may_alias' introduced by the rotate, - // as U1'' and U1' scopes will not be compatible wrt to the local restrict - - // Clone the llvm.experimental.noalias.decl again for the NewHeader. - BasicBlock::iterator NewHeaderInsertionPoint = - NewHeader->getFirstNonPHIIt(); - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) { - LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:" - << *NAD << "\n"); - Instruction *NewNAD = NAD->clone(); - NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint); - } + Inst->moveBefore(LoopEntryBranch->getIterator()); - // Scopes must now be duplicated, once for OrigHeader and once for - // OrigPreHeader'. - { - auto &Context = NewHeader->getContext(); - - SmallVector<MDNode *, 8> NoAliasDeclScopes; - for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) - NoAliasDeclScopes.push_back(NAD->getScopeList()); - - LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n"); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context, - "h.rot"); - LLVM_DEBUG(OrigHeader->dump()); - - // Keep the compile time impact low by only adapting the inserted block - // of instructions in the OrigPreHeader. This might result in slightly - // more aliasing between these instructions and those that were already - // present, but it will be much faster when the original PreHeader is - // large. - LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n"); - auto *FirstDecl = - cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]); - auto *LastInst = &OrigPreheader->back(); - cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst, - Context, "pre.rot"); - LLVM_DEBUG(OrigPreheader->dump()); - - LLVM_DEBUG(dbgs() << " Updated NewHeader:\n"); - LLVM_DEBUG(NewHeader->dump()); - } + ++NumInstrsHoisted; + continue; } - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - for (BasicBlock *SuccBB : successors(OrigHeader)) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. 
- LoopEntryBranch->eraseFromParent(); - OrigPreheader->flushTerminatorDbgRecords(); - - // Update MemorySSA before the rewrite call below changes the 1:1 - // instruction:cloned_instruction_or_value mapping. - if (MSSAU) { - InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); - MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, - ValueMapMSSA); - } + // Otherwise, create a duplicate of the instruction. + Instruction *C = Inst->clone(); + if (const DebugLoc &DL = C->getDebugLoc()) + mapAtomInstance(DL, ValueMap); - SmallVector<PHINode*, 2> InsertedPHIs; - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, - &InsertedPHIs); - - // Attach debug records to the new phis if that phi uses a value that - // previously had debug metadata attached. This keeps the debug info - // up-to-date in the loop body. - if (!InsertedPHIs.empty()) - insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // Inform DT about changes to the CFG. - if (DT) { - // The OrigPreheader branches to the NewHeader and Exit now. Then, inform - // the DT about the removed edge to the OrigHeader (that got removed). - SmallVector<DominatorTree::UpdateType, 3> Updates = { - {DominatorTree::Insert, OrigPreheader, Exit}, - {DominatorTree::Insert, OrigPreheader, NewHeader}, - {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - - if (MSSAU) { - MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); - if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); - } else { - DT->applyUpdates(Updates); - } - } + C->insertBefore(LoopEntryBranch->getIterator()); - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. - BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - const Value *Cond = PHBI->getCondition(); - const bool HasConditionalPreHeader = - !isa<ConstantInt>(Cond) || - PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; - - updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); + ++NumInstrsDuplicated; - if (HasConditionalPreHeader) { - // The conditional branch can't be folded, handle the general case. - // Split edges as necessary to preserve LoopSimplify form. - - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. 
Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit) ||
- isa<IndirectBrInst>(ExitPred->getTerminator()))
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
+ if (!NextDbgInsts.empty()) {
+ auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
+ RemapDbgRecordRange(M, Range, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
+ // Erase anything we've seen before.
+ for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range)))
+ if (DbgRecords.count(makeHash(&DVR)))
+ DVR.eraseFromParent();
+ }
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifiable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = simplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ InsertNewValueIntoMap(ValueMap, Inst, V);
+ if (!C->mayHaveSideEffects()) {
+ C->eraseFromParent();
+ C = nullptr;
 }
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- (void)SplitLatchEdge;
 } else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
+ InsertNewValueIntoMap(ValueMap, Inst, C);
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+
+ if (auto *II = dyn_cast<AssumeInst>(C))
+ AC->registerAssumption(II);
+ // MemorySSA cares whether the cloned instruction was inserted or not, and
+ // not whether it can be remapped to a simplified value.
+ if (MSSAU)
+ InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
+ }
+ }
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
+ if (!NoAliasDeclInstructions.empty()) {
+ // There are noalias scope declarations:
+ // (general):
+ // Original: OrigPre { OrigHeader NewHeader ... Latch }
+ // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader }
+ //
+ // with D: llvm.experimental.noalias.scope.decl,
+ // U: !noalias or !alias.scope depending on D
+ // ... { D U1 U2 } can transform into:
+ // (0) : ... { D U1 U2 } // no relevant rotation for this part
+ // (1) : ... D' { U1 U2 D } // D is part of OrigHeader
+ // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader
+ //
+ // We now want to transform:
+ // (1) -> : ... D' { D U1 U2 D'' }
+ // (2) -> : ... D' U1' { D U2 D'' U1'' }
+ // D: original llvm.experimental.noalias.scope.decl
+ // D', U1': duplicate with replaced scopes
+ // D'', U1'': different duplicate with replaced scopes
+ // This ensures a safe fallback to 'may_alias' introduced by the rotate,
+ // as U1'' and U1' scopes will not be compatible wrt the local restrict
+
+ // Clone the llvm.experimental.noalias.scope.decl again for the NewHeader.
+ BasicBlock::iterator NewHeaderInsertionPoint =
+ NewHeader->getFirstNonPHIIt();
+ for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) {
+ LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:"
+ << *NAD << "\n");
+ Instruction *NewNAD = NAD->clone();
+ NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint);
+ }
- // Update MSSA too, if available.
- if (MSSAU)
- MSSAU->removeEdge(OrigPreheader, Exit);
+ // Scopes must now be duplicated, once for OrigHeader and once for
+ // OrigPreHeader'.
+ {
+ auto &Context = NewHeader->getContext();
+
+ SmallVector<MDNode *, 8> NoAliasDeclScopes;
+ for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions)
+ NoAliasDeclScopes.push_back(NAD->getScopeList());
+
+ LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n");
+ cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context,
+ "h.rot");
+ LLVM_DEBUG(OrigHeader->dump());
+
+ // Keep the compile time impact low by only adapting the inserted block
+ // of instructions in the OrigPreHeader. This might result in slightly
+ // more aliasing between these instructions and those that were already
+ // present, but it will be much faster when the original PreHeader is
+ // large.
+ LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n");
+ auto *FirstDecl =
+ cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]);
+ auto *LastInst = &OrigPreheader->back();
+ cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst,
+ Context, "pre.rot");
+ LLVM_DEBUG(OrigPreheader->dump());
+
+ LLVM_DEBUG(dbgs() << " Updated NewHeader:\n");
+ LLVM_DEBUG(NewHeader->dump());
 }
+ }
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ for (BasicBlock *SuccBB : successors(OrigHeader))
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+ OrigPreheader->flushTerminatorDbgRecords();
+
+ // Update MemorySSA before the rewrite call below changes the 1:1
+ // instruction:cloned_instruction_or_value mapping.
+ if (MSSAU) { + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); + MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, + ValueMapMSSA); + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + SmallVector<PHINode *, 2> InsertedPHIs; + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, + &InsertedPHIs); + + // Attach debug records to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); + + // NewHeader is now the header of the loop. + L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector<DominatorTree::UpdateType, 3> Updates = { + {DominatorTree::Insert, OrigPreheader, Exit}, + {DominatorTree::Insert, OrigPreheader, NewHeader}, + {DominatorTree::Delete, OrigPreheader, OrigHeader}}; - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); - bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); - if (DidMerge) - RemoveRedundantDbgInstrs(PredBB); + if (MSSAU) { + MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true); + if (VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + } else { + DT->applyUpdates(Updates); + } + } - if (MSSAU && VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + const Value *Cond = PHBI->getCondition(); + const bool HasConditionalPreHeader = + !isa<ConstantInt>(Cond) || + PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader; + + updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped); - LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + if (HasConditionalPreHeader) { + // The conditional branch can't be folded, handle the general case. + // Split edges as necessary to preserve LoopSimplify form. + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. 
+ BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. + SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit)); + bool SplitLatchEdge = false; + for (BasicBlock *ExitPred : ExitPreds) { + // We only need to split loop exit edges. + Loop *PredLoop = LI->getLoopFor(ExitPred); + if (!PredLoop || PredLoop->contains(Exit) || + isa<IndirectBrInst>(ExitPred->getTerminator())) + continue; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; + BasicBlock *ExitSplit = SplitCriticalEdge( + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); + (void)SplitLatchEdge; + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator()); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DT) + DT->deleteEdge(OrigPreheader, Exit); + + // Update MSSA too, if available. + if (MSSAU) + MSSAU->removeEdge(OrigPreheader, Exit); + } - ++NumRotated; + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); - Rotated = true; - SimplifiedLatch = false; + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + BasicBlock *PredBB = OrigHeader->getUniquePredecessor(); + bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU); + if (DidMerge) + RemoveRedundantDbgInstrs(PredBB); - // Check that new latch is a deoptimizing exit and then repeat rotation if possible. - // Deoptimizing latch exit is not a generally typical case, so we just loop over. - // TODO: if it becomes a performance bottleneck extend rotation algorithm - // to handle multiple rotations in one go. - } while (MultiRotate && canRotateDeoptimizingLatchExit(L)); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); return true; } diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index bf882d7..6312831 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -201,18 +201,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// unroll count is non-zero. 
///
/// This function performs the following:
-/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
-/// - Create PHI nodes at the unrolling loop exit to combine
-/// values that exit the unrolling loop code and jump around it.
+/// - Update PHI nodes at the epilog loop exit
+/// - Create PHI nodes at the unrolling loop exit and epilog preheader to
+/// combine values that exit the unrolling loop code and jump around it.
 /// - Update PHI operands in the epilog loop by the new PHI nodes
-/// - Branch around the epilog loop if extra iters (ModVal) is zero.
+/// - At the unrolling loop exit, branch around the epilog loop if extra iters
+/// (ModVal) is zero.
+/// - At the epilog preheader, add an llvm.assume call that extra iters is
+/// non-zero. If the unrolling loop exit is the predecessor, the above new
+/// branch guarantees that assumption. If the unrolling loop preheader is the
+/// predecessor, then the required first iteration from the original loop has
+/// yet to be executed, so it must be executed in the epilog loop. If we
+/// later unroll the epilog loop, that llvm.assume call somehow enables
+/// ScalarEvolution to compute an epilog loop maximum trip count, which enables
+/// eliminating the branch at the end of the final unrolled epilog iteration.
 ///
 static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
 BasicBlock *Exit, BasicBlock *PreHeader,
 BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
 ValueToValueMapTy &VMap, DominatorTree *DT,
 LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE,
- unsigned Count) {
+ unsigned Count, AssumptionCache &AC) {
 BasicBlock *Latch = L->getLoopLatch();
 assert(Latch && "Loop must have a latch");
 BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
@@ -231,7 +240,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
 // EpilogLatch
 // Exit (EpilogPN)
- // Update PHI nodes at NewExit and Exit.
+ // Update PHI nodes at Exit.
 for (PHINode &PN : NewExit->phis()) {
 // PN should be used in another PHI located in Exit block as
 // Exit was split by SplitBlockPredecessors into Exit and NewExit
@@ -246,15 +255,11 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
 // epilogue edges have already been added.
 //
 // There is EpilogPreHeader incoming block instead of NewExit as
- // NewExit was spilt 1 more time to get EpilogPreHeader.
+ // NewExit was split 1 more time to get EpilogPreHeader.
 assert(PN.hasOneUse() && "The phi should have 1 use");
 PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
 assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
- // Add incoming PreHeader from branch around the Loop
- PN.addIncoming(PoisonValue::get(PN.getType()), PreHeader);
- SE.forgetValue(&PN);
-
 Value *V = PN.getIncomingValueForBlock(Latch);
 Instruction *I = dyn_cast<Instruction>(V);
 if (I && L->contains(I))
@@ -271,35 +276,52 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
 NewExit);
 // Now PHIs should look like:
 // NewExit:
- // PN = PHI [I, Latch], [poison, PreHeader]
+ // PN = PHI [I, Latch]
 // ...
 // Exit:
 // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
 }
- // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
- // Update corresponding PHI nodes in epilog loop.
+ // Create PHI nodes at NewExit (from the unrolling loop Latch) and at
+ // EpilogPreHeader (from PreHeader and NewExit). Update corresponding PHI
+ // nodes in epilog loop.
for (BasicBlock *Succ : successors(Latch)) { // Skip this as we already updated phis in exit blocks. if (!L->contains(Succ)) continue; + + // Succ here appears to always be just L->getHeader(). Otherwise, how do we + // know its corresponding epilog block (from VMap) is EpilogHeader and thus + // EpilogPreHeader is the right incoming block for VPN, as set below? + // TODO: Can we thus avoid the enclosing loop over successors? + assert(Succ == L->getHeader() && + "Expect the only in-loop successor of latch to be the loop header"); + for (PHINode &PN : Succ->phis()) { - // Add new PHI nodes to the loop exit block and update epilog - // PHIs with the new PHI values. - PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr"); - NewPN->insertBefore(NewExit->getFirstNonPHIIt()); - // Adding a value to the new PHI node from the unrolling loop preheader. - NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader); - // Adding a value to the new PHI node from the unrolling loop latch. - NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch); + // Add new PHI nodes to the loop exit block. + PHINode *NewPN0 = PHINode::Create(PN.getType(), /*NumReservedValues=*/1, + PN.getName() + ".unr"); + NewPN0->insertBefore(NewExit->getFirstNonPHIIt()); + // Add value to the new PHI node from the unrolling loop latch. + NewPN0->addIncoming(PN.getIncomingValueForBlock(Latch), Latch); + + // Add new PHI nodes to EpilogPreHeader. + PHINode *NewPN1 = PHINode::Create(PN.getType(), /*NumReservedValues=*/2, + PN.getName() + ".epil.init"); + NewPN1->insertBefore(EpilogPreHeader->getFirstNonPHIIt()); + // Add value to the new PHI node from the unrolling loop preheader. + NewPN1->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader); + // Add value to the new PHI node from the epilog loop guard. + NewPN1->addIncoming(NewPN0, NewExit); // Update the existing PHI node operand with the value from the new PHI // node. Corresponding instruction in epilog loop should be PHI. PHINode *VPN = cast<PHINode>(VMap[&PN]); - VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN); + VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN1); } } + // In NewExit, branch around the epilog loop if no extra iters. Instruction *InsertPt = NewExit->getTerminator(); IRBuilder<> B(InsertPt); Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod"); @@ -308,7 +330,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr, PreserveLCSSA); - // Add the branch to the exit block (around the unrolling loop) + // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; if (hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). @@ -322,10 +344,11 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, DT->changeImmediateDominator(Exit, NewDom); } - // Split the main loop exit to maintain canonicalization guarantees. - SmallVector<BasicBlock*, 4> NewExitPreds{Latch}; - SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr, - PreserveLCSSA); + // In EpilogPreHeader, assume extra iters is non-zero. 
+ IRBuilder<> B2(EpilogPreHeader, EpilogPreHeader->getFirstNonPHIIt()); + Value *ModIsNotNull = B2.CreateIsNotNull(ModVal, "lcmp.mod"); + AssumeInst *AI = cast<AssumeInst>(B2.CreateAssumption(ModIsNotNull)); + AC.registerAssumption(AI); } /// Create a clone of the blocks in a loop and connect them together. A new @@ -795,7 +818,8 @@ bool llvm::UnrollRuntimeLoopRemainder( ConstantInt::get(BECount->getType(), Count - 1)) : B.CreateIsNotNull(ModVal, "lcmp.mod"); - BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader; + BasicBlock *RemainderLoop = + UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. MDNode *BranchWeights = nullptr; @@ -808,7 +832,7 @@ bool llvm::UnrollRuntimeLoopRemainder( PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) - DT->changeImmediateDominator(NewExit, PreHeader); + DT->changeImmediateDominator(EpilogPreHeader, PreHeader); else DT->changeImmediateDominator(PrologExit, PreHeader); } @@ -880,7 +904,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // from both the original loop and the remainder code reaching the exit // blocks. While the IDom of these exit blocks were from the original loop, // now the IDom is the preheader (which decides whether the original loop or - // remainder code should run). + // remainder code should run) unless the block still has just the original + // predecessor (such as NewExit in the case of an epilog remainder). if (DT && !L->getExitingBlock()) { SmallVector<BasicBlock *, 16> ChildrenToUpdate; // NB! We have to examine the dom children of all loop blocks, not just @@ -891,7 +916,8 @@ bool llvm::UnrollRuntimeLoopRemainder( auto *DomNodeBB = DT->getNode(BB); for (auto *DomChild : DomNodeBB->children()) { auto *DomChildBB = DomChild->getBlock(); - if (!L->contains(LI->getLoopFor(DomChildBB))) + if (!L->contains(LI->getLoopFor(DomChildBB)) && + DomChildBB->getUniquePredecessor() != BB) ChildrenToUpdate.push_back(DomChildBB); } } @@ -930,7 +956,7 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 148bfa8..155fcc5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -4895,9 +4895,8 @@ bool SimplifyCFGOpt::simplifyTerminatorOnSelect(Instruction *OldTerm, // We found both of the successors we were looking for. // Create a conditional branch sharing the condition of the select. 
BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB); - if (TrueWeight != FalseWeight) - setBranchWeights(*NewBI, {TrueWeight, FalseWeight}, - /*IsExpected=*/false, /*ElideAllZero=*/true); + setBranchWeights(*NewBI, {TrueWeight, FalseWeight}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) { // Neither of the selected blocks were successors, so this @@ -4982,9 +4981,15 @@ bool SimplifyCFGOpt::simplifyIndirectBrOnSelect(IndirectBrInst *IBI, BasicBlock *TrueBB = TBA->getBasicBlock(); BasicBlock *FalseBB = FBA->getBasicBlock(); + // The select's profile becomes the profile of the conditional branch that + // replaces the indirect branch. + SmallVector<uint32_t> SelectBranchWeights(2); + if (!ProfcheckDisableMetadataFixes) + extractBranchWeights(*SI, SelectBranchWeights); // Perform the actual simplification. - return simplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0, - 0); + return simplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, + SelectBranchWeights[0], + SelectBranchWeights[1]); } /// This is called when we find an icmp instruction @@ -6637,6 +6642,9 @@ public: /// Return true if the replacement is a lookup table. bool isLookupTable(); + /// Return true if the replacement is a bit map. + bool isBitMap(); + private: // Depending on the switch, there are different alternatives. enum { @@ -6927,6 +6935,8 @@ Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; } bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; } +bool SwitchReplacement::isBitMap() { return Kind == BitMapKind; } + static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) { // 40% is the default density for building a jump table in optsize/minsize // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this @@ -7092,7 +7102,8 @@ static void reuseTableCompare( /// lookup tables. static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, DomTreeUpdater *DTU, const DataLayout &DL, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + bool ConvertSwitchToLookupTable) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); BasicBlock *BB = SI->getParent(); @@ -7257,6 +7268,8 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, bool AnyLookupTables = any_of( PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); }); + bool AnyBitMaps = any_of(PhiToReplacementMap, + [](auto &KV) { return KV.second.isBitMap(); }); // A few conditions prevent the generation of lookup tables: // 1. The target does not support lookup tables. @@ -7269,6 +7282,12 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Fn->getFnAttribute("no-jump-tables").getValueAsBool())) return false; + // In the early optimization pipeline, disable formation of lookup tables, + // bit maps and mask checks, as they may inhibit further optimization. + if (!ConvertSwitchToLookupTable && + (AnyLookupTables || AnyBitMaps || NeedMask)) + return false; + Builder.SetInsertPoint(SI); // TableIndex is the switch condition - TableIndexOffset if we don't // use the condition directly @@ -7924,14 +7943,13 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (Options.ForwardSwitchCondToPhi && forwardSwitchConditionToPHI(SI)) return requestResimplify(); - // The conversion from switch to lookup tables results in difficult-to-analyze - // code and makes pruning branches much harder. 
This is a problem if the - // switch expression itself can still be restricted as a result of inlining or - // CVP. Therefore, only apply this transformation during late stages of the - // optimisation pipeline. - if (Options.ConvertSwitchToLookupTable && - simplifySwitchLookup(SI, Builder, DTU, DL, TTI)) - return requestResimplify(); + // The conversion of switches to arithmetic or lookup table is disabled in + // the early optimization pipeline, as it may lose information or make the + // resulting code harder to analyze. + if (Options.ConvertSwitchToArithmetic || Options.ConvertSwitchToLookupTable) + if (simplifySwitchLookup(SI, Builder, DTU, DL, TTI, + Options.ConvertSwitchToLookupTable)) + return requestResimplify(); if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI)) return requestResimplify(); @@ -7952,19 +7970,27 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { BasicBlock *BB = IBI->getParent(); bool Changed = false; + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = !ProfcheckDisableMetadataFixes && + extractBranchWeights(*IBI, BranchWeights); + + DenseMap<const BasicBlock *, uint64_t> TargetWeight; + if (HasBranchWeights) + for (size_t I = 0, E = IBI->getNumDestinations(); I < E; ++I) + TargetWeight[IBI->getDestination(I)] += BranchWeights[I]; // Eliminate redundant destinations. SmallPtrSet<Value *, 8> Succs; SmallSetVector<BasicBlock *, 8> RemovedSuccs; - for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { - BasicBlock *Dest = IBI->getDestination(i); + for (unsigned I = 0, E = IBI->getNumDestinations(); I != E; ++I) { + BasicBlock *Dest = IBI->getDestination(I); if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) { if (!Dest->hasAddressTaken()) RemovedSuccs.insert(Dest); Dest->removePredecessor(BB); - IBI->removeDestination(i); - --i; - --e; + IBI->removeDestination(I); + --I; + --E; Changed = true; } } @@ -7990,7 +8016,12 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { eraseTerminatorAndDCECond(IBI); return true; } - + if (HasBranchWeights) { + SmallVector<uint64_t> NewBranchWeights(IBI->getNumDestinations()); + for (size_t I = 0, E = IBI->getNumDestinations(); I < E; ++I) + NewBranchWeights[I] += TargetWeight.find(IBI->getDestination(I))->second; + setFittedBranchWeights(*IBI, NewBranchWeights, /*IsExpected=*/false); + } if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) { if (simplifyIndirectBrOnSelect(IBI, SI)) return requestResimplify(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d393a9c..3f16b03 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. 
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, + *CM.PSE.getSE()); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -4262,8 +4264,8 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { if (!Legal->isReductionVariable(&Phi)) return Legal->isFixedOrderRecurrence(&Phi); - RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); - return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum; + return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind()); })) return false; @@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -7279,8 +7282,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( if (!Exit->hasPredecessors()) continue; for (VPRecipeBase &PhiR : Exit->phis()) - SE.forgetLcssaPhiWithNewPredecessor( - OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction())); + SE.forgetLcssaPhiWithNewPredecessor(OrigLoop, + &cast<VPIRPhi>(PhiR).getIRPhi()); } // Forget the original loop and block dispositions. SE.forgetLoop(OrigLoop); @@ -8418,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. 
if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -9874,7 +9878,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 91c3d42..cfa8d27 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10621,7 +10621,8 @@ class InstructionsCompatibilityAnalysis { /// elements. static bool isSupportedOpcode(const unsigned Opcode) { return Opcode == Instruction::Add || Opcode == Instruction::LShr || - Opcode == Instruction::Shl; + Opcode == Instruction::Shl || Opcode == Instruction::SDiv || + Opcode == Instruction::UDiv; } /// Identifies the best candidate value, which represents main opcode @@ -10939,6 +10940,8 @@ public: case Instruction::Add: case Instruction::LShr: case Instruction::Shl: + case Instruction::SDiv: + case Instruction::UDiv: VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); break; default: @@ -22066,8 +22069,10 @@ bool BoUpSLP::collectValuesToDemote( auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) { assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); return all_of(E.Scalars, [&](Value *V) { - auto *I = cast<Instruction>(V); APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); + if (E.hasCopyableElements() && E.isCopyableElement(V)) + return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)); + auto *I = cast<Instruction>(V); return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) && MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)); }); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 07b191a..2555ebe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const { } InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) { + Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF, + bool AlwaysIncludeReplicatingR) { if (VF.isScalar()) return 0; @@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet<const VPValue *, 4> UniqueOperands; SmallVector<Type *> Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) || + if (Op->isLiveIn() || + (!AlwaysIncludeReplicatingR && + isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) || + (isa<VPReplicateRecipe>(Op) && + cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c167dd7..fb696be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2263,8 +2263,7 @@ public: /// debug location \p DL. 
VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") - : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL), - Name(Name.str()) { + : VPSingleDefRecipe(VPDef::VPWidenPHISC, {}, Phi, DL), Name(Name.str()) { if (Start) addOperand(Start); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index c8212af..81deba2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -763,6 +763,8 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( // Add the minimum iteration check for the epilogue vector loop. VPValue *TC = Plan.getOrAddLiveIn(TripCount); VPBuilder Builder(cast<VPBasicBlock>(Plan.getEntry())); + VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount( + TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW)); VPValue *Count = Builder.createNaryOp( Instruction::Sub, {TC, Plan.getOrAddLiveIn(VectorTripCount)}, DebugLoc::getUnknown(), "n.vec.remaining"); @@ -770,9 +772,6 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( // Generate code to check if the loop's trip count is less than VF * UF of // the vector epilogue loop. auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount( - TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW)); - auto *CheckMinIters = Builder.createICmp( P, Count, VFxUF, DebugLoc::getUnknown(), "min.epilog.iters.check"); VPInstruction *Branch = @@ -841,8 +840,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { // TODO: Support multiple MaxNum/MinNum reductions and other reductions. if (RedPhiR) return false; - if (Cur->getRecurrenceKind() != RecurKind::FMaxNum && - Cur->getRecurrenceKind() != RecurKind::FMinNum) { + if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + Cur->getRecurrenceKind())) { HasUnsupportedPhi = true; continue; } @@ -862,10 +861,9 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { if (!MinMaxOp) return false; - RecurKind RedPhiRK = RedPhiR->getRecurrenceKind(); - assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) && + assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( + RedPhiR->getRecurrenceKind()) && "unsupported reduction"); - (void)RedPhiRK; /// Check if the vector loop of \p Plan can early exit and restart /// execution of last vector iteration in the scalar loop. This requires all diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fc1a09e..1580a3b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -349,12 +349,14 @@ struct VPCostContext { LoopVectorizationCostModel &CM; SmallPtrSet<Instruction *, 8> SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; + ScalarEvolution &SE; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) + TargetTransformInfo::TargetCostKind CostKind, + ScalarEvolution &SE) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind) {} + CostKind(CostKind), SE(SE) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. 
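The hunk below adds a ScalarEvolution reference to VPCostContext so that recipe cost code can optionally hand SCEV information to TTI address-cost queries. A minimal sketch of that pattern, assuming an LLVM build environment; CostContextSketch and addressCost are illustrative names that are not part of this patch, while the four-argument getAddressComputationCost shape matches the call used in the VPlanRecipes.cpp hunk further below:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Illustrative sketch: a cost context that carries ScalarEvolution lets each
// cost hook decide, per query, whether the target should see SCEV info.
struct CostContextSketch {
  const TargetTransformInfo &TTI;
  ScalarEvolution &SE; // threaded through, as VPCostContext now does
  TargetTransformInfo::TargetCostKind CostKind;

  InstructionCost addressCost(Type *PtrTy, bool UseSCEV) const {
    // Pass &SE instead of nullptr only when a SCEV-based address cost is
    // wanted; callers that model the address separately pass nullptr.
    return TTI.getAddressComputationCost(PtrTy, UseSCEV ? &SE : nullptr,
                                         /*Ptr=*/nullptr, CostKind);
  }
};

Threading the analysis through the context, rather than passing it to every computeCost overload, keeps the recipe cost signatures stable while still letting individual call sites opt in.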
@@ -374,10 +376,12 @@ struct VPCostContext { /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy /// and \p Operands with \p VF. This is a convenience wrapper for the - /// type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Type *ResultTy, - ArrayRef<const VPValue *> Operands, - ElementCount VF); + /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR + /// is true, always compute the cost of scalarizing replicating operands. + InstructionCost + getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands, + ElementCount VF, + bool AlwaysIncludeReplicatingR = false); }; /// This class can be used to assign names to VPValues. For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 67b9244..600ff8a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -40,6 +40,7 @@ #include <cassert> using namespace llvm; +using namespace llvm::VPlanPatternMatch; using VectorParts = SmallVector<Value *, 2>; @@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, VPRecipeBase *OpR = Op->getDefiningRecipe(); // If the partial reduction is predicated, a select will be operand 0 - using namespace llvm::VPlanPatternMatch; if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { OpR = Op->getDefiningRecipe(); } @@ -1230,6 +1230,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractLane: case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: + case VPInstruction::ActiveLaneMask: case VPInstruction::FirstActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: @@ -1963,7 +1964,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); VPValue *Op0, *Op1; - using namespace llvm::VPlanPatternMatch; if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { @@ -2778,7 +2778,7 @@ VPExpressionRecipe::VPExpressionRecipe( // Recipes in the expression, except the last one, must only be used by // (other) recipes inside the expression. If there are other users, external // to the expression, use a clone of the recipe for external users. - for (VPSingleDefRecipe *R : ExpressionRecipes) { + for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) { if (R != ExpressionRecipes.back() && any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) { return !ExpressionRecipesAsSetOfUsers.contains(U); @@ -3111,6 +3111,62 @@ bool VPReplicateRecipe::shouldPack() const { }); } +/// Returns true if \p Ptr is a pointer computation for which the legacy cost +/// model computes a SCEV expression when computing the address cost. +static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { + auto *PtrR = Ptr->getDefiningRecipe(); + if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) && + cast<VPReplicateRecipe>(PtrR)->getOpcode() == + Instruction::GetElementPtr) || + isa<VPWidenGEPRecipe>(PtrR) || + match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) + return false; + + // We are looking for a GEP where all indices are either loop invariant or + // inductions. 
+ for (VPValue *Opd : drop_begin(PtrR->operands())) { + if (!Opd->isDefinedOutsideLoopRegions() && + !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd)) + return false; + } + + return true; +} + +/// Returns true if \p V is used as part of the address of another load or +/// store. +static bool isUsedByLoadStoreAddress(const VPUser *V) { + SmallPtrSet<const VPUser *, 4> Seen; + SmallVector<const VPUser *> WorkList = {V}; + + while (!WorkList.empty()) { + auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val()); + if (!Cur || !Seen.insert(Cur).second || isa<VPBlendRecipe>(Cur)) + continue; + + for (VPUser *U : Cur->users()) { + if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U)) + if (InterleaveR->getAddr() == Cur) + return true; + if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) { + if (RepR->getOpcode() == Instruction::Load && + RepR->getOperand(0) == Cur) + return true; + if (RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1) == Cur) + return true; + } + if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) { + if (MemR->getAddr() == Cur && MemR->isConsecutive()) + return true; + } + } + + append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users()); + } + return false; +} + InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Instruction *UI = cast<Instruction>(getUnderlyingValue()); @@ -3218,21 +3274,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } case Instruction::Load: case Instruction::Store: { - if (isSingleScalar()) { - bool IsLoad = UI->getOpcode() == Instruction::Load; - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = getLoadStoreAddressSpace(UI); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); - return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); - } + if (VF.isScalable() && !isSingleScalar()) + return InstructionCost::getInvalid(); + // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - break; + const VPRegionBlock *ParentRegion = getParent()->getParent(); + if (ParentRegion && ParentRegion->isReplicator()) + break; + + bool IsLoad = UI->getOpcode() == Instruction::Load; + const VPValue *PtrOp = getOperand(!IsLoad); + // TODO: Handle cases where we need to pass a SCEV to + // getAddressComputationCost. + if (shouldUseAddressAccessSCEV(PtrOp)) + break; + + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); + + Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); + bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); + bool UsedByLoadStoreAddress = + !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this); + InstructionCost ScalarCost = + ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + PtrTy, UsedByLoadStoreAddress ? 
nullptr : &Ctx.SE, + nullptr, Ctx.CostKind); + if (isSingleScalar()) + return ScalarCost; + + SmallVector<const VPValue *> OpsToScalarize; + Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); + // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we + // don't assign scalarization overhead in general, if the target prefers + // vectorized addressing or the loaded value is used as part of an address + // of another load or store. + if (!UsedByLoadStoreAddress) { + bool EfficientVectorLoadStore = + Ctx.TTI.supportsEfficientVectorElementLoadStore(); + if (!(IsLoad && !PreferVectorizedAddressing) && + !(!IsLoad && EfficientVectorLoadStore)) + append_range(OpsToScalarize, operands()); + + if (!EfficientVectorLoadStore) + ResultTy = Ctx.Types.inferScalarType(this); + } + + return (ScalarCost * VF.getFixedValue()) + + Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ebf833e..c8a2d84 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3180,9 +3180,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, DebugLoc::getUnknown(), "induction"); // Create the widened phi of the vector IV. - auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr, + auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init, WidenIVR->getDebugLoc(), "vec.ind"); - WidePHI->addOperand(Init); WidePHI->insertBefore(WidenIVR); // Create the backedge value for the vector IV. @@ -3545,8 +3544,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPValue *A, *B; VPValue *Tmp = nullptr; // Sub reductions could have a sub between the add reduction and vec op. - if (match(VecOp, - m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Tmp)))) { + if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) { Sub = VecOp->getDefiningRecipe(); VecOp = Tmp; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 0599930..66748c5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -71,8 +71,8 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { m_Specific(&Plan.getVF()))) || IsWideCanonicalIV(A)); - return match(V, m_Binary<Instruction::ICmp>(m_VPValue(A), m_VPValue(B))) && - IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount(); + return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) && + B == Plan.getOrCreateBackedgeTakenCount(); } const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
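A note on the VPReplicateRecipe::computeCost hunk above: for a replicated (non-single-scalar) memory operation, the cost is the per-lane scalar cost multiplied by the fixed VF, plus scalarization overhead only when lanes actually need packing or unpacking. A back-of-the-envelope sketch of that shape, with made-up unit costs (assumptions for illustration; the real values come from TargetTransformInfo):

#include <cassert>

// Made-up per-operation unit costs, purely illustrative.
constexpr unsigned ScalarLoadCost = 4; // one scalar load
constexpr unsigned AddrCompCost = 1;   // one scalar address computation
constexpr unsigned InsertEltCost = 1;  // packing one loaded lane into a vector

// Shape of the formula above for a replicated load: (per-lane cost) * VF,
// plus packing overhead unless the loaded values only feed other load/store
// addresses (UsedByLoadStoreAddress), in which case no vector is built.
static unsigned replicatedLoadCost(unsigned VF, bool UsedByLoadStoreAddress) {
  unsigned PerLaneCost = ScalarLoadCost + AddrCompCost;
  unsigned PackOverhead = UsedByLoadStoreAddress ? 0 : VF * InsertEltCost;
  return PerLaneCost * VF + PackOverhead;
}

int main() {
  // VF = 4: four scalar loads plus four address computations...
  assert(replicatedLoadCost(4, /*UsedByLoadStoreAddress=*/true) == 20);
  // ...plus four insertelement-style packing operations otherwise.
  assert(replicatedLoadCost(4, /*UsedByLoadStoreAddress=*/false) == 24);
  return 0;
}

The same shape explains why the hunk returns an invalid cost for scalable, non-single-scalar VFs: VF.getFixedValue() is only defined when the VF is fixed.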