Diffstat (limited to 'llvm/lib/Transforms')
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp                    |  12
 llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp            |   5
 llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp           |  10
 llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp |  60
 llvm/lib/Transforms/InstCombine/InstructionCombining.cpp        |   6
 llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp        |  10
 llvm/lib/Transforms/Instrumentation/AllocToken.cpp              | 548
 llvm/lib/Transforms/Instrumentation/CMakeLists.txt              |   1
 llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp       |  14
 llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp                 | 143
 llvm/lib/Transforms/Scalar/GVN.cpp                              |   1
 llvm/lib/Transforms/Scalar/NewGVN.cpp                           |   1
 llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp                  |   2
 llvm/lib/Transforms/Utils/CodeExtractor.cpp                     |   1
 llvm/lib/Transforms/Utils/Debugify.cpp                          |   5
 llvm/lib/Transforms/Utils/Local.cpp                             |   7
 llvm/lib/Transforms/Utils/LoopRotationUtils.cpp                 | 939
 llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp                 |  92
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp                       |  71
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                 |  24
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp                 |   9
 llvm/lib/Transforms/Vectorize/VPlan.cpp                         |   9
 llvm/lib/Transforms/Vectorize/VPlan.h                           |   3
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp             |  14
 llvm/lib/Transforms/Vectorize/VPlanHelpers.h                    |  16
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp                  | 127
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp               |   6
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp                    |   4
 28 files changed, 1446 insertions(+), 694 deletions(-)
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 0accb22..c89af68 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -689,10 +689,14 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
DISubprogram *DIS = F.getSubprogram();
// If there is no DISubprogram for F, it implies the function is compiled
// without debug info. So we also don't generate debug info for the frame.
- if (!DIS || !DIS->getUnit() ||
- !dwarf::isCPlusPlus(
- (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()) ||
- DIS->getUnit()->getEmissionKind() != DICompileUnit::DebugEmissionKind::FullDebug)
+
+ if (!DIS || !DIS->getUnit())
+ return;
+
+ if (!dwarf::isCPlusPlus(static_cast<llvm::dwarf::SourceLanguage>(
+ DIS->getUnit()->getSourceLanguage().getUnversionedName())) ||
+ DIS->getUnit()->getEmissionKind() !=
+ DICompileUnit::DebugEmissionKind::FullDebug)
return;
assert(Shape.ABI == coro::ABI::Switch &&
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 9ca8194..56194fe 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -137,13 +137,10 @@ InstCombinerImpl::isEliminableCastPair(const CastInst *CI1,
Instruction::CastOps secondOp = CI2->getOpcode();
Type *SrcIntPtrTy =
SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
- Type *MidIntPtrTy =
- MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr;
Type *DstIntPtrTy =
DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr;
unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
- DstTy, SrcIntPtrTy, MidIntPtrTy,
- DstIntPtrTy);
+ DstTy, &DL);
// We don't want to form an inttoptr or ptrtoint that converts to an integer
// type that differs from the pointer size.
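
The hunk above drops the separately computed MidIntPtrTy and instead hands
CastInst::isEliminableCastPair() the DataLayout directly. The property the
DataLayout answers is whether a pointer/integer round-trip is lossless; a
minimal sketch of that check (the helper name is hypothetical, not part of
the patch):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // A ptrtoint/inttoptr pair through IntTy is a no-op only if IntTy is the
    // DataLayout's integer type for that pointer (same width, same vector
    // shape) -- exactly what the removed SrcIntPtrTy/MidIntPtrTy encoded.
    static bool isLosslessPtrIntRoundTrip(const DataLayout &DL, Type *IntTy,
                                          Type *PtrTy) {
      return DL.getIntPtrType(PtrTy) == IntTy;
    }
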
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8f60e50..8c8fc69 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3356,7 +3356,10 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
impliesPoisonOrCond(FalseVal, B, /*Expected=*/false)) {
// (A || B) || C --> A || (B | C)
return replaceInstUsesWith(
- SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal)));
+ SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal), "",
+ ProfcheckDisableMetadataFixes
+ ? nullptr
+ : cast<SelectInst>(CondVal)));
}
// (A && B) || (C && B) --> (A || C) && B
@@ -3398,7 +3401,10 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
impliesPoisonOrCond(TrueVal, B, /*Expected=*/true)) {
// (A && B) && C --> A && (B & C)
return replaceInstUsesWith(
- SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal)));
+ SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal), "",
+ ProfcheckDisableMetadataFixes
+ ? nullptr
+ : cast<SelectInst>(CondVal)));
}
// (A || B) && (C || B) --> (A && C) || B
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index aa030294..127a506 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -60,6 +60,58 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
return true;
}
+/// Let N = 2 * M.
+/// Given an N-bit integer representing a pack of two M-bit integers,
+/// we can select one of the packed integers by right-shifting by either
+/// zero or M (which is the most straightforward to check if M is a power
+/// of 2), and then isolating the lower M bits. In this case, we can
+/// represent the shift as a select on whether the shr amount is nonzero.
+static Value *simplifyShiftSelectingPackedElement(Instruction *I,
+ const APInt &DemandedMask,
+ InstCombinerImpl &IC,
+ unsigned Depth) {
+ assert(I->getOpcode() == Instruction::LShr &&
+ "Only lshr instruction supported");
+
+ uint64_t ShlAmt;
+ Value *Upper, *Lower;
+ if (!match(I->getOperand(0),
+ m_OneUse(m_c_DisjointOr(
+ m_OneUse(m_Shl(m_Value(Upper), m_ConstantInt(ShlAmt))),
+ m_Value(Lower)))))
+ return nullptr;
+
+ if (!isPowerOf2_64(ShlAmt))
+ return nullptr;
+
+ const uint64_t DemandedBitWidth = DemandedMask.getActiveBits();
+ if (DemandedBitWidth > ShlAmt)
+ return nullptr;
+
+ // Check that upper demanded bits are not lost from lshift.
+ if (Upper->getType()->getScalarSizeInBits() < ShlAmt + DemandedBitWidth)
+ return nullptr;
+
+ KnownBits KnownLowerBits = IC.computeKnownBits(Lower, I, Depth);
+ if (!KnownLowerBits.getMaxValue().isIntN(ShlAmt))
+ return nullptr;
+
+ Value *ShrAmt = I->getOperand(1);
+ KnownBits KnownShrBits = IC.computeKnownBits(ShrAmt, I, Depth);
+
+ // Verify that ShrAmt is either exactly ShlAmt (which is a power of 2) or
+ // zero.
+ if (~KnownShrBits.Zero != ShlAmt)
+ return nullptr;
+
+ Value *ShrAmtZ =
+ IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()),
+ ShrAmt->getName() + ".z");
+ Value *Select = IC.Builder.CreateSelect(ShrAmtZ, Lower, Upper);
+ Select->takeName(I);
+ return Select;
+}
+
/// Returns the bitwidth of the given scalar or pointer type. For vector types,
/// returns the element type's bitwidth.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
@@ -798,9 +850,13 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
Known >>= ShiftAmt;
if (ShiftAmt)
Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
- } else {
- llvm::computeKnownBits(I, Known, Q, Depth);
+ break;
}
+ if (Value *V =
+ simplifyShiftSelectingPackedElement(I, DemandedMask, *this, Depth))
+ return V;
+
+ llvm::computeKnownBits(I, Known, Q, Depth);
break;
}
case Instruction::AShr: {
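
A scalar illustration of simplifyShiftSelectingPackedElement() above (this is
the equivalence the fold exploits, not code from the patch): with two 32-bit
values packed into a 64-bit integer and a shift amount known to be either 0
or 32, the lshr is the same as selecting one of the packed elements.

    #include <cassert>
    #include <cstdint>

    // Preconditions the pass checks: ShrAmt is known to be 0 or 32, Lower
    // fits in 32 bits, and only the low 32 bits of the result are demanded.
    uint32_t selectPackedElement(uint32_t Lower, uint32_t Upper,
                                 unsigned ShrAmt) {
      uint64_t Packed = (uint64_t(Upper) << 32) | Lower; // disjoint or
      uint32_t ViaShift = uint32_t(Packed >> ShrAmt);    // original pattern
      uint32_t ViaSelect = ShrAmt == 0 ? Lower : Upper;  // the replacement
      assert(ViaShift == ViaSelect);
      return ViaSelect;
    }
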
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 048cdf4..d56a1af 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1970,12 +1970,6 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
NewPhiValues.push_back(nullptr);
OpsToMoveUseToIncomingBB.push_back(i);
- // If the InVal is an invoke at the end of the pred block, then we can't
- // insert a computation after it without breaking the edge.
- if (isa<InvokeInst>(InVal))
- if (cast<Instruction>(InVal)->getParent() == InBB)
- return nullptr;
-
// Do not push the operation across a loop backedge. This could result in
// an infinite combine loop, and is generally non-profitable (especially
// if the operation was originally outside the loop).
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index cdae9a7..3704ad7 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -2662,7 +2662,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
G->eraseFromParent();
NewGlobals[i] = NewGlobal;
- Constant *ODRIndicator = ConstantPointerNull::get(PtrTy);
+ Constant *ODRIndicator = Constant::getNullValue(IntptrTy);
GlobalValue *InstrumentedGlobal = NewGlobal;
bool CanUsePrivateAliases =
@@ -2677,8 +2677,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
// ODR should not happen for local linkage.
if (NewGlobal->hasLocalLinkage()) {
- ODRIndicator =
- ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1), PtrTy);
+ ODRIndicator = ConstantInt::get(IntptrTy, -1);
} else if (UseOdrIndicator) {
// With local aliases, we need to provide another externally visible
// symbol __odr_asan_XXX to detect ODR violation.
@@ -2692,7 +2691,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
ODRIndicatorSym->setAlignment(Align(1));
- ODRIndicator = ODRIndicatorSym;
+ ODRIndicator = ConstantExpr::getPtrToInt(ODRIndicatorSym, IntptrTy);
}
Constant *Initializer = ConstantStruct::get(
@@ -2703,8 +2702,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
ConstantExpr::getPointerCast(Name, IntptrTy),
ConstantExpr::getPointerCast(getOrCreateModuleName(), IntptrTy),
ConstantInt::get(IntptrTy, MD.IsDynInit),
- Constant::getNullValue(IntptrTy),
- ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
+ Constant::getNullValue(IntptrTy), ODRIndicator);
LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
new file mode 100644
index 0000000..40720ae
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
@@ -0,0 +1,548 @@
+//===- AllocToken.cpp - Allocation token instrumentation ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements AllocToken, an instrumentation pass that
+// replaces allocation calls with token-enabled versions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/AllocToken.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/RandomNumberGenerator.h"
+#include "llvm/Support/SipHash.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <variant>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "alloc-token"
+
+namespace {
+
+//===--- Constants --------------------------------------------------------===//
+
+enum class TokenMode : unsigned {
+ /// Incrementally increasing token ID.
+ Increment = 0,
+
+ /// Simple mode that returns a statically-assigned random token ID.
+ Random = 1,
+
+ /// Token ID based on allocated type hash.
+ TypeHash = 2,
+
+ /// Token ID based on allocated type hash, where the top half ID-space is
+ /// reserved for types that contain pointers and the bottom half for types
+ /// that do not contain pointers.
+ TypeHashPointerSplit = 3,
+};
+
+//===--- Command-line options ---------------------------------------------===//
+
+cl::opt<TokenMode> ClMode(
+ "alloc-token-mode", cl::Hidden, cl::desc("Token assignment mode"),
+ cl::init(TokenMode::TypeHashPointerSplit),
+ cl::values(
+ clEnumValN(TokenMode::Increment, "increment",
+ "Incrementally increasing token ID"),
+ clEnumValN(TokenMode::Random, "random",
+ "Statically-assigned random token ID"),
+ clEnumValN(TokenMode::TypeHash, "typehash",
+ "Token ID based on allocated type hash"),
+ clEnumValN(
+ TokenMode::TypeHashPointerSplit, "typehashpointersplit",
+ "Token ID based on allocated type hash, where the top half "
+ "ID-space is reserved for types that contain pointers and the "
+ "bottom half for types that do not contain pointers. ")));
+
+cl::opt<std::string> ClFuncPrefix("alloc-token-prefix",
+ cl::desc("The allocation function prefix"),
+ cl::Hidden, cl::init("__alloc_token_"));
+
+cl::opt<uint64_t> ClMaxTokens("alloc-token-max",
+ cl::desc("Maximum number of tokens (0 = no max)"),
+ cl::Hidden, cl::init(0));
+
+cl::opt<bool>
+ ClFastABI("alloc-token-fast-abi",
+ cl::desc("The token ID is encoded in the function name"),
+ cl::Hidden, cl::init(false));
+
+// Instrument libcalls only by default - compatible allocators only need to take
+// care of providing standard allocation functions. With extended coverage, also
+// instrument non-libcall allocation function calls with !alloc_token
+// metadata.
+cl::opt<bool>
+ ClExtended("alloc-token-extended",
+ cl::desc("Extend coverage to custom allocation functions"),
+ cl::Hidden, cl::init(false));
+
+// C++ defines ::operator new (and variants) as replaceable (vs. standard
+// library versions), which are nobuiltin, and are therefore not covered by
+// isAllocationFn(). Cover by default, as users of AllocToken are already
+// required to provide token-aware allocation functions (no defaults).
+cl::opt<bool> ClCoverReplaceableNew("alloc-token-cover-replaceable-new",
+ cl::desc("Cover replaceable operator new"),
+ cl::Hidden, cl::init(true));
+
+cl::opt<uint64_t> ClFallbackToken(
+ "alloc-token-fallback",
+ cl::desc("The default fallback token where none could be determined"),
+ cl::Hidden, cl::init(0));
+
+//===--- Statistics -------------------------------------------------------===//
+
+STATISTIC(NumFunctionsInstrumented, "Functions instrumented");
+STATISTIC(NumAllocationsInstrumented, "Allocations instrumented");
+
+//===----------------------------------------------------------------------===//
+
+/// Returns the !alloc_token metadata if available.
+///
+/// Expected format is: !{<type-name>, <contains-pointer>}
+MDNode *getAllocTokenMetadata(const CallBase &CB) {
+ MDNode *Ret = CB.getMetadata(LLVMContext::MD_alloc_token);
+ if (!Ret)
+ return nullptr;
+ assert(Ret->getNumOperands() == 2 && "bad !alloc_token");
+ assert(isa<MDString>(Ret->getOperand(0)));
+ assert(isa<ConstantAsMetadata>(Ret->getOperand(1)));
+ return Ret;
+}
+
+bool containsPointer(const MDNode *MD) {
+ ConstantAsMetadata *C = cast<ConstantAsMetadata>(MD->getOperand(1));
+ auto *CI = cast<ConstantInt>(C->getValue());
+ return CI->getValue().getBoolValue();
+}
+
+class ModeBase {
+public:
+ explicit ModeBase(const IntegerType &TokenTy, uint64_t MaxTokens)
+ : MaxTokens(MaxTokens ? MaxTokens : TokenTy.getBitMask()) {
+ assert(MaxTokens <= TokenTy.getBitMask());
+ }
+
+protected:
+ uint64_t boundedToken(uint64_t Val) const {
+ assert(MaxTokens != 0);
+ return Val % MaxTokens;
+ }
+
+ const uint64_t MaxTokens;
+};
+
+/// Implementation for TokenMode::Increment.
+class IncrementMode : public ModeBase {
+public:
+ using ModeBase::ModeBase;
+
+ uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &) {
+ return boundedToken(Counter++);
+ }
+
+private:
+ uint64_t Counter = 0;
+};
+
+/// Implementation for TokenMode::Random.
+class RandomMode : public ModeBase {
+public:
+ RandomMode(const IntegerType &TokenTy, uint64_t MaxTokens,
+ std::unique_ptr<RandomNumberGenerator> RNG)
+ : ModeBase(TokenTy, MaxTokens), RNG(std::move(RNG)) {}
+ uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &) {
+ return boundedToken((*RNG)());
+ }
+
+private:
+ std::unique_ptr<RandomNumberGenerator> RNG;
+};
+
+/// Implementation for TokenMode::TypeHash. The implementation ensures
+/// hashes are stable across different compiler invocations. Uses SipHash as the
+/// hash function.
+class TypeHashMode : public ModeBase {
+public:
+ using ModeBase::ModeBase;
+
+ uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
+ const auto [N, H] = getHash(CB, ORE);
+ return N ? boundedToken(H) : H;
+ }
+
+protected:
+ std::pair<MDNode *, uint64_t> getHash(const CallBase &CB,
+ OptimizationRemarkEmitter &ORE) {
+ if (MDNode *N = getAllocTokenMetadata(CB)) {
+ MDString *S = cast<MDString>(N->getOperand(0));
+ return {N, getStableSipHash(S->getString())};
+ }
+ // Fallback.
+ remarkNoMetadata(CB, ORE);
+ return {nullptr, ClFallbackToken};
+ }
+
+ /// Remark that there was no precise type information.
+ static void remarkNoMetadata(const CallBase &CB,
+ OptimizationRemarkEmitter &ORE) {
+ ORE.emit([&] {
+ ore::NV FuncNV("Function", CB.getParent()->getParent());
+ const Function *Callee = CB.getCalledFunction();
+ ore::NV CalleeNV("Callee", Callee ? Callee->getName() : "<unknown>");
+ return OptimizationRemark(DEBUG_TYPE, "NoAllocToken", &CB)
+ << "Call to '" << CalleeNV << "' in '" << FuncNV
+ << "' without source-level type token";
+ });
+ }
+};
+
+/// Implementation for TokenMode::TypeHashPointerSplit.
+class TypeHashPointerSplitMode : public TypeHashMode {
+public:
+ using TypeHashMode::TypeHashMode;
+
+ uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
+ if (MaxTokens == 1)
+ return 0;
+ const uint64_t HalfTokens = MaxTokens / 2;
+ const auto [N, H] = getHash(CB, ORE);
+ if (!N) {
+ // Pick the fallback token (ClFallbackToken), which by default is 0,
+ // meaning it'll fall into the pointer-less bucket. Override by setting
+ // -alloc-token-fallback if that is the wrong choice.
+ return H;
+ }
+ uint64_t Hash = H % HalfTokens; // base hash
+ if (containsPointer(N))
+ Hash += HalfTokens;
+ return Hash;
+ }
+};
+
+// Apply opt overrides.
+AllocTokenOptions transformOptionsFromCl(AllocTokenOptions Opts) {
+ if (!Opts.MaxTokens.has_value())
+ Opts.MaxTokens = ClMaxTokens;
+ Opts.FastABI |= ClFastABI;
+ Opts.Extended |= ClExtended;
+ return Opts;
+}
+
+class AllocToken {
+public:
+ explicit AllocToken(AllocTokenOptions Opts, Module &M,
+ ModuleAnalysisManager &MAM)
+ : Options(transformOptionsFromCl(std::move(Opts))), Mod(M),
+ FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
+ Mode(IncrementMode(*IntPtrTy, *Options.MaxTokens)) {
+ switch (ClMode.getValue()) {
+ case TokenMode::Increment:
+ break;
+ case TokenMode::Random:
+ Mode.emplace<RandomMode>(*IntPtrTy, *Options.MaxTokens,
+ M.createRNG(DEBUG_TYPE));
+ break;
+ case TokenMode::TypeHash:
+ Mode.emplace<TypeHashMode>(*IntPtrTy, *Options.MaxTokens);
+ break;
+ case TokenMode::TypeHashPointerSplit:
+ Mode.emplace<TypeHashPointerSplitMode>(*IntPtrTy, *Options.MaxTokens);
+ break;
+ }
+ }
+
+ bool instrumentFunction(Function &F);
+
+private:
+ /// Returns the LibFunc (or NotLibFunc) if this call should be instrumented.
+ std::optional<LibFunc>
+ shouldInstrumentCall(const CallBase &CB, const TargetLibraryInfo &TLI) const;
+
+ /// Returns true for functions that are eligible for instrumentation.
+ static bool isInstrumentableLibFunc(LibFunc Func, const CallBase &CB,
+ const TargetLibraryInfo &TLI);
+
+ /// Returns true for isAllocationFn() functions that we should ignore.
+ static bool ignoreInstrumentableLibFunc(LibFunc Func);
+
+ /// Replace a call/invoke with a call/invoke to the allocation function
+ /// with token ID.
+ bool replaceAllocationCall(CallBase *CB, LibFunc Func,
+ OptimizationRemarkEmitter &ORE,
+ const TargetLibraryInfo &TLI);
+
+ /// Return replacement function for a LibFunc that takes a token ID.
+ FunctionCallee getTokenAllocFunction(const CallBase &CB, uint64_t TokenID,
+ LibFunc OriginalFunc);
+
+ /// Return the token ID from metadata in the call.
+ uint64_t getToken(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
+ return std::visit([&](auto &&Mode) { return Mode(CB, ORE); }, Mode);
+ }
+
+ const AllocTokenOptions Options;
+ Module &Mod;
+ IntegerType *IntPtrTy = Mod.getDataLayout().getIntPtrType(Mod.getContext());
+ FunctionAnalysisManager &FAM;
+ // Cache for replacement functions.
+ DenseMap<std::pair<LibFunc, uint64_t>, FunctionCallee> TokenAllocFunctions;
+ // Selected mode.
+ std::variant<IncrementMode, RandomMode, TypeHashMode,
+ TypeHashPointerSplitMode>
+ Mode;
+};
+
+bool AllocToken::instrumentFunction(Function &F) {
+ // Do not apply any instrumentation for naked functions.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return false;
+ if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
+ return false;
+ // Don't touch available_externally functions; their actual body is elsewhere.
+ if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
+ return false;
+ // Only instrument functions that have the sanitize_alloc_token attribute.
+ if (!F.hasFnAttribute(Attribute::SanitizeAllocToken))
+ return false;
+
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ SmallVector<std::pair<CallBase *, LibFunc>, 4> AllocCalls;
+
+ // Collect all allocation calls to avoid iterator invalidation.
+ for (Instruction &I : instructions(F)) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+ if (std::optional<LibFunc> Func = shouldInstrumentCall(*CB, TLI))
+ AllocCalls.emplace_back(CB, Func.value());
+ }
+
+ bool Modified = false;
+ for (auto &[CB, Func] : AllocCalls)
+ Modified |= replaceAllocationCall(CB, Func, ORE, TLI);
+
+ if (Modified)
+ NumFunctionsInstrumented++;
+ return Modified;
+}
+
+std::optional<LibFunc>
+AllocToken::shouldInstrumentCall(const CallBase &CB,
+ const TargetLibraryInfo &TLI) const {
+ const Function *Callee = CB.getCalledFunction();
+ if (!Callee)
+ return std::nullopt;
+
+ // Ignore nobuiltin of the CallBase, so that we can cover nobuiltin libcalls
+ // if requested via isInstrumentableLibFunc(). Note that isAllocationFn()
+ // returns false for nobuiltin calls.
+ LibFunc Func;
+ if (TLI.getLibFunc(*Callee, Func)) {
+ if (isInstrumentableLibFunc(Func, CB, TLI))
+ return Func;
+ } else if (Options.Extended && getAllocTokenMetadata(CB)) {
+ return NotLibFunc;
+ }
+
+ return std::nullopt;
+}
+
+bool AllocToken::isInstrumentableLibFunc(LibFunc Func, const CallBase &CB,
+ const TargetLibraryInfo &TLI) {
+ if (ignoreInstrumentableLibFunc(Func))
+ return false;
+
+ if (isAllocationFn(&CB, &TLI))
+ return true;
+
+ switch (Func) {
+ // These libfuncs don't return normal pointers, and are therefore not handled
+ // by isAllocationFn().
+ case LibFunc_posix_memalign:
+ case LibFunc_size_returning_new:
+ case LibFunc_size_returning_new_hot_cold:
+ case LibFunc_size_returning_new_aligned:
+ case LibFunc_size_returning_new_aligned_hot_cold:
+ return true;
+
+ // See comment above ClCoverReplaceableNew.
+ case LibFunc_Znwj:
+ case LibFunc_ZnwjRKSt9nothrow_t:
+ case LibFunc_ZnwjSt11align_val_t:
+ case LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t:
+ case LibFunc_Znwm:
+ case LibFunc_Znwm12__hot_cold_t:
+ case LibFunc_ZnwmRKSt9nothrow_t:
+ case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
+ case LibFunc_ZnwmSt11align_val_t:
+ case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
+ case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
+ case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+ case LibFunc_Znaj:
+ case LibFunc_ZnajRKSt9nothrow_t:
+ case LibFunc_ZnajSt11align_val_t:
+ case LibFunc_ZnajSt11align_val_tRKSt9nothrow_t:
+ case LibFunc_Znam:
+ case LibFunc_Znam12__hot_cold_t:
+ case LibFunc_ZnamRKSt9nothrow_t:
+ case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
+ case LibFunc_ZnamSt11align_val_t:
+ case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
+ case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
+ case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+ return ClCoverReplaceableNew;
+
+ default:
+ return false;
+ }
+}
+
+bool AllocToken::ignoreInstrumentableLibFunc(LibFunc Func) {
+ switch (Func) {
+ case LibFunc_strdup:
+ case LibFunc_dunder_strdup:
+ case LibFunc_strndup:
+ case LibFunc_dunder_strndup:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AllocToken::replaceAllocationCall(CallBase *CB, LibFunc Func,
+ OptimizationRemarkEmitter &ORE,
+ const TargetLibraryInfo &TLI) {
+ uint64_t TokenID = getToken(*CB, ORE);
+
+ FunctionCallee TokenAlloc = getTokenAllocFunction(*CB, TokenID, Func);
+ if (!TokenAlloc)
+ return false;
+ NumAllocationsInstrumented++;
+
+ if (Options.FastABI) {
+ assert(TokenAlloc.getFunctionType()->getNumParams() == CB->arg_size());
+ CB->setCalledFunction(TokenAlloc);
+ return true;
+ }
+
+ IRBuilder<> IRB(CB);
+ // Original args.
+ SmallVector<Value *, 4> NewArgs{CB->args()};
+ // Add token ID, truncated to IntPtrTy width.
+ NewArgs.push_back(ConstantInt::get(IntPtrTy, TokenID));
+ assert(TokenAlloc.getFunctionType()->getNumParams() == NewArgs.size());
+
+ // Preserve invoke vs call semantics for exception handling.
+ CallBase *NewCall;
+ if (auto *II = dyn_cast<InvokeInst>(CB)) {
+ NewCall = IRB.CreateInvoke(TokenAlloc, II->getNormalDest(),
+ II->getUnwindDest(), NewArgs);
+ } else {
+ NewCall = IRB.CreateCall(TokenAlloc, NewArgs);
+ cast<CallInst>(NewCall)->setTailCall(CB->isTailCall());
+ }
+ NewCall->setCallingConv(CB->getCallingConv());
+ NewCall->copyMetadata(*CB);
+ NewCall->setAttributes(CB->getAttributes());
+
+ // Replace all uses and delete the old call.
+ CB->replaceAllUsesWith(NewCall);
+ CB->eraseFromParent();
+ return true;
+}
+
+FunctionCallee AllocToken::getTokenAllocFunction(const CallBase &CB,
+ uint64_t TokenID,
+ LibFunc OriginalFunc) {
+ std::optional<std::pair<LibFunc, uint64_t>> Key;
+ if (OriginalFunc != NotLibFunc) {
+ Key = std::make_pair(OriginalFunc, Options.FastABI ? TokenID : 0);
+ auto It = TokenAllocFunctions.find(*Key);
+ if (It != TokenAllocFunctions.end())
+ return It->second;
+ }
+
+ const Function *Callee = CB.getCalledFunction();
+ if (!Callee)
+ return FunctionCallee();
+ const FunctionType *OldFTy = Callee->getFunctionType();
+ if (OldFTy->isVarArg())
+ return FunctionCallee();
+ // Copy params, and append token ID type.
+ Type *RetTy = OldFTy->getReturnType();
+ SmallVector<Type *, 4> NewParams{OldFTy->params()};
+ std::string TokenAllocName = ClFuncPrefix;
+ if (Options.FastABI)
+ TokenAllocName += utostr(TokenID) + "_";
+ else
+ NewParams.push_back(IntPtrTy); // token ID
+ TokenAllocName += Callee->getName();
+ FunctionType *NewFTy = FunctionType::get(RetTy, NewParams, false);
+ FunctionCallee TokenAlloc = Mod.getOrInsertFunction(TokenAllocName, NewFTy);
+ if (Function *F = dyn_cast<Function>(TokenAlloc.getCallee()))
+ F->copyAttributesFrom(Callee); // preserve attrs
+
+ if (Key.has_value())
+ TokenAllocFunctions[*Key] = TokenAlloc;
+ return TokenAlloc;
+}
+
+} // namespace
+
+AllocTokenPass::AllocTokenPass(AllocTokenOptions Opts)
+ : Options(std::move(Opts)) {}
+
+PreservedAnalyses AllocTokenPass::run(Module &M, ModuleAnalysisManager &MAM) {
+ AllocToken Pass(Options, M, MAM);
+ bool Modified = false;
+
+ for (Function &F : M) {
+ if (F.empty())
+ continue; // declaration
+ Modified |= Pass.instrumentFunction(F);
+ }
+
+ return Modified ? PreservedAnalyses::none().preserveSet<CFGAnalyses>()
+ : PreservedAnalyses::all();
+}
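
To make the new pass concrete: a call like malloc(n) carrying
!alloc_token !{!"Foo", i1 1} is rewritten, with the default ABI, into
__alloc_token_malloc(n, <token>), the token appended as an extra IntPtrTy
argument; with -alloc-token-fast-abi the token is baked into the name
instead, e.g. __alloc_token_42_malloc(n). A standalone sketch of the
typehashpointersplit token computation, mirrored from
TypeHashPointerSplitMode::operator() (the free-function name here is
illustrative only):

    #include <cstdint>

    // Hash into half the ID space, then move pointer-containing types into
    // the reserved top half.
    uint64_t pointerSplitToken(uint64_t TypeHash, bool ContainsPointer,
                               uint64_t MaxTokens) {
      if (MaxTokens == 1)
        return 0;
      const uint64_t HalfTokens = MaxTokens / 2;
      uint64_t Token = TypeHash % HalfTokens; // base bucket
      if (ContainsPointer)
        Token += HalfTokens;                  // top half of the ID space
      return Token;
    }
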
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 15fd421..80576c6 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,5 +1,6 @@
add_llvm_component_library(LLVMInstrumentation
AddressSanitizer.cpp
+ AllocToken.cpp
BoundsChecking.cpp
CGProfile.cpp
ControlHeightReduction.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 480ff4a..5ba2167 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -261,6 +261,11 @@ static cl::opt<bool> ClIgnorePersonalityRoutine(
"list, do not create a wrapper for it."),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClAddGlobalNameSuffix(
+ "dfsan-add-global-name-suffix",
+ cl::desc("Whether to add .dfsan suffix to global names"), cl::Hidden,
+ cl::init(true));
+
static StringRef getGlobalTypeString(const GlobalValue &G) {
// Types of GlobalVariables are always pointer types.
Type *GType = G.getValueType();
@@ -1256,6 +1261,9 @@ DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
}
void DataFlowSanitizer::addGlobalNameSuffix(GlobalValue *GV) {
+ if (!ClAddGlobalNameSuffix)
+ return;
+
std::string GVName = std::string(GV->getName()), Suffix = ".dfsan";
GV->setName(GVName + Suffix);
@@ -1784,10 +1792,8 @@ bool DataFlowSanitizer::runImpl(
}
Value *DFSanFunction::getArgTLS(Type *T, unsigned ArgOffset, IRBuilder<> &IRB) {
- Value *Base = IRB.CreatePointerCast(DFS.ArgTLS, DFS.IntptrTy);
- if (ArgOffset)
- Base = IRB.CreateAdd(Base, ConstantInt::get(DFS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(*DFS.Ctx, 0), "_dfsarg");
+ return IRB.CreatePtrAdd(DFS.ArgTLS, ConstantInt::get(DFS.IntptrTy, ArgOffset),
+ "_dfsarg");
}
Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) {
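
The getArgTLS() hunk above replaces a ptrtoint/add/inttoptr sequence with a
single byte-offset pointer add. A side-by-side sketch of the two lowerings
(illustrative IRBuilder usage, not pass code):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Old: round-trip through an integer to add the byte offset.
    Value *argTLSViaIntToPtr(IRBuilder<> &IRB, Value *ArgTLS, Type *IntptrTy,
                             LLVMContext &Ctx, unsigned ArgOffset) {
      Value *Base = IRB.CreatePointerCast(ArgTLS, IntptrTy);
      if (ArgOffset)
        Base = IRB.CreateAdd(Base, ConstantInt::get(IntptrTy, ArgOffset));
      return IRB.CreateIntToPtr(Base, PointerType::get(Ctx, 0), "_dfsarg");
    }

    // New: one i8 GEP; same address, but pointer provenance stays visible
    // to alias analysis and other pointer-based optimizations.
    Value *argTLSViaPtrAdd(IRBuilder<> &IRB, Value *ArgTLS, Type *IntptrTy,
                           unsigned ArgOffset) {
      return IRB.CreatePtrAdd(
          ArgTLS, ConstantInt::get(IntptrTy, ArgOffset), "_dfsarg");
    }
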
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index e9a3e98..e448230 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -89,6 +89,7 @@ STATISTIC(NumTransforms, "Number of transformations done");
STATISTIC(NumCloned, "Number of blocks cloned");
STATISTIC(NumPaths, "Number of individual paths threaded");
+namespace llvm {
static cl::opt<bool>
ClViewCfgBefore("dfa-jump-view-cfg-before",
cl::desc("View the CFG before DFA Jump Threading"),
@@ -120,8 +121,17 @@ static cl::opt<unsigned>
cl::desc("Maximum cost accepted for the transformation"),
cl::Hidden, cl::init(50));
-namespace {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
+static cl::opt<double> MaxClonedRate(
+ "dfa-max-cloned-rate",
+ cl::desc(
+ "Maximum cloned instructions rate accepted for the transformation"),
+ cl::Hidden, cl::init(7.5));
+namespace {
class SelectInstToUnfold {
SelectInst *SI;
PHINode *SIUse;
@@ -135,10 +145,6 @@ public:
explicit operator bool() const { return SI && SIUse; }
};
-void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
- std::vector<SelectInstToUnfold> *NewSIsToUnfold,
- std::vector<BasicBlock *> *NewBBs);
-
class DFAJumpThreading {
public:
DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
@@ -152,7 +158,8 @@ private:
void
unfoldSelectInstrs(DominatorTree *DT,
const SmallVector<SelectInstToUnfold, 4> &SelectInsts) {
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ // TODO: Have everything use a single lazy DTU
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts);
while (!Stack.empty()) {
@@ -167,16 +174,18 @@ private:
}
}
+ static void unfold(DomTreeUpdater *DTU, LoopInfo *LI,
+ SelectInstToUnfold SIToUnfold,
+ std::vector<SelectInstToUnfold> *NewSIsToUnfold,
+ std::vector<BasicBlock *> *NewBBs);
+
AssumptionCache *AC;
DominatorTree *DT;
LoopInfo *LI;
TargetTransformInfo *TTI;
OptimizationRemarkEmitter *ORE;
};
-
-} // end anonymous namespace
-
-namespace {
+} // namespace
/// Unfold the select instruction held in \p SIToUnfold by replacing it with
/// control flow.
@@ -185,9 +194,10 @@ namespace {
/// created basic blocks into \p NewBBs.
///
/// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible.
-void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
- std::vector<SelectInstToUnfold> *NewSIsToUnfold,
- std::vector<BasicBlock *> *NewBBs) {
+void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI,
+ SelectInstToUnfold SIToUnfold,
+ std::vector<SelectInstToUnfold> *NewSIsToUnfold,
+ std::vector<BasicBlock *> *NewBBs) {
SelectInst *SI = SIToUnfold.getInst();
PHINode *SIUse = SIToUnfold.getUse();
assert(SI->hasOneUse());
@@ -253,7 +263,11 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
// Insert the real conditional branch based on the original condition.
StartBlockTerm->eraseFromParent();
- BranchInst::Create(EndBlock, NewBlock, SI->getCondition(), StartBlock);
+ auto *BI =
+ BranchInst::Create(EndBlock, NewBlock, SI->getCondition(), StartBlock);
+ if (!ProfcheckDisableMetadataFixes)
+ BI->setMetadata(LLVMContext::MD_prof,
+ SI->getMetadata(LLVMContext::MD_prof));
DTU->applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock},
{DominatorTree::Insert, StartBlock, NewBlock}});
} else {
@@ -288,7 +302,11 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
// (Use)
BranchInst::Create(EndBlock, NewBlockF);
// Insert the real conditional branch based on the original condition.
- BranchInst::Create(EndBlock, NewBlockF, SI->getCondition(), NewBlockT);
+ auto *BI =
+ BranchInst::Create(EndBlock, NewBlockF, SI->getCondition(), NewBlockT);
+ if (!ProfcheckDisableMetadataFixes)
+ BI->setMetadata(LLVMContext::MD_prof,
+ SI->getMetadata(LLVMContext::MD_prof));
DTU->applyUpdates({{DominatorTree::Insert, NewBlockT, NewBlockF},
{DominatorTree::Insert, NewBlockT, EndBlock},
{DominatorTree::Insert, NewBlockF, EndBlock}});
@@ -342,10 +360,12 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
SI->eraseFromParent();
}
+namespace {
struct ClonedBlock {
BasicBlock *BB;
APInt State; ///< \p State corresponds to the next value of a switch stmnt.
};
+} // namespace
typedef std::deque<BasicBlock *> PathType;
typedef std::vector<PathType> PathsType;
@@ -375,6 +395,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) {
return OS;
}
+namespace {
/// ThreadingPath is a path in the control flow of a loop that can be threaded
/// by cloning necessary basic blocks and replacing conditional branches with
/// unconditional ones. A threading path includes a list of basic blocks, the
@@ -814,11 +835,13 @@ struct TransformDFA {
: SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE),
EphValues(EphValues) {}
- void run() {
+ bool run() {
if (isLegalAndProfitableToTransform()) {
createAllExitPaths();
NumTransforms++;
+ return true;
}
+ return false;
}
private:
@@ -828,6 +851,7 @@ private:
/// also returns false if it is illegal to clone some required block.
bool isLegalAndProfitableToTransform() {
CodeMetrics Metrics;
+ uint64_t NumClonedInst = 0;
SwitchInst *Switch = SwitchPaths->getSwitchInst();
// Don't thread switch without multiple successors.
@@ -837,7 +861,6 @@ private:
// Note that DuplicateBlockMap is not being used as intended here. It is
// just being used to ensure (BB, State) pairs are only counted once.
DuplicateBlockMap DuplicateMap;
-
for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
PathType PathBBs = TPath.getPath();
APInt NextState = TPath.getExitValue();
@@ -848,6 +871,7 @@ private:
BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap);
if (!VisitedBB) {
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ NumClonedInst += BB->sizeWithoutDebug();
DuplicateMap[BB].push_back({BB, NextState});
}
@@ -865,6 +889,7 @@ private:
if (VisitedBB)
continue;
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ NumClonedInst += BB->sizeWithoutDebug();
DuplicateMap[BB].push_back({BB, NextState});
}
@@ -901,6 +926,22 @@ private:
}
}
+ // Too many cloned instructions slow down later optimizations, especially
+ // SLPVectorizer.
+ // TODO: Thread the switch partially before reaching the threshold.
+ uint64_t NumOrigInst = 0;
+ for (auto *BB : DuplicateMap.keys())
+ NumOrigInst += BB->sizeWithoutDebug();
+ if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) {
+ LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too many "
+ "instructions will be cloned\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable", Switch)
+ << "Too many instructions will be cloned.";
+ });
+ return false;
+ }
+
InstructionCost DuplicationCost = 0;
unsigned JumpTableSize = 0;
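
A worked instance of the new MaxClonedRate gate above (numbers invented for
illustration): if the threadable paths span blocks totalling 40 instructions
but the per-state clones would materialize 320, the rate is 320/40 = 8.0 >
7.5 and the transform now declines. The check reduces to:

    #include <cstdint>

    // Sketch of the profitability gate added above; the default rate is 7.5
    // (-dfa-max-cloned-rate). Assumes NumOrigInst != 0.
    bool withinClonedRate(uint64_t NumClonedInst, uint64_t NumOrigInst,
                          double MaxClonedRate = 7.5) {
      return double(NumClonedInst) / double(NumOrigInst) <= MaxClonedRate;
    }
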
@@ -951,8 +992,6 @@ private:
/// Transform each threading path to effectively jump thread the DFA.
void createAllExitPaths() {
- DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Eager);
-
// Move the switch block to the end of the path, since it will be duplicated
BasicBlock *SwitchBlock = SwitchPaths->getSwitchBlock();
for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
@@ -969,15 +1008,18 @@ private:
SmallPtrSet<BasicBlock *, 16> BlocksToClean;
BlocksToClean.insert_range(successors(SwitchBlock));
- for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
- createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU);
- NumPaths++;
- }
+ {
+ DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
+ createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU);
+ NumPaths++;
+ }
- // After all paths are cloned, now update the last successor of the cloned
- // path so it skips over the switch statement
- for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
- updateLastSuccessor(TPath, DuplicateMap, &DTU);
+ // After all paths are cloned, now update the last successor of the cloned
+ // path so it skips over the switch statement
+ for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
+ updateLastSuccessor(TPath, DuplicateMap, &DTU);
+ }
// For each instruction that was cloned and used outside, update its uses
updateSSA(NewDefs);
@@ -993,7 +1035,7 @@ private:
/// To remember the correct destination, we have to duplicate blocks
/// corresponding to each state. Also update the terminating instruction of
/// the predecessors, and phis in the successor blocks.
- void createExitPath(DefMap &NewDefs, ThreadingPath &Path,
+ void createExitPath(DefMap &NewDefs, const ThreadingPath &Path,
DuplicateBlockMap &DuplicateMap,
SmallPtrSet<BasicBlock *, 16> &BlocksToClean,
DomTreeUpdater *DTU) {
@@ -1164,19 +1206,18 @@ private:
// value for the new predecessor ClonedBB. The value will either be the same
// value from BB or a cloned value.
for (BasicBlock *Succ : BlocksToUpdate) {
- for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(II);
- ++II) {
- Value *Incoming = Phi->getIncomingValueForBlock(BB);
+ for (PHINode &Phi : Succ->phis()) {
+ Value *Incoming = Phi.getIncomingValueForBlock(BB);
if (Incoming) {
if (isa<Constant>(Incoming)) {
- Phi->addIncoming(Incoming, ClonedBB);
+ Phi.addIncoming(Incoming, ClonedBB);
continue;
}
Value *ClonedVal = VMap[Incoming];
if (ClonedVal)
- Phi->addIncoming(ClonedVal, ClonedBB);
+ Phi.addIncoming(ClonedVal, ClonedBB);
else
- Phi->addIncoming(Incoming, ClonedBB);
+ Phi.addIncoming(Incoming, ClonedBB);
}
}
}
@@ -1239,7 +1280,7 @@ private:
///
/// Note that this is an optional step and would have been done in later
/// optimizations, but it makes the CFG significantly easier to work with.
- void updateLastSuccessor(ThreadingPath &TPath,
+ void updateLastSuccessor(const ThreadingPath &TPath,
DuplicateBlockMap &DuplicateMap,
DomTreeUpdater *DTU) {
APInt NextState = TPath.getExitValue();
@@ -1271,27 +1312,19 @@ private:
void cleanPhiNodes(BasicBlock *BB) {
// If BB is no longer reachable, remove any remaining phi nodes
if (pred_empty(BB)) {
- std::vector<PHINode *> PhiToRemove;
- for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) {
- PhiToRemove.push_back(Phi);
- }
- for (PHINode *PN : PhiToRemove) {
- PN->replaceAllUsesWith(PoisonValue::get(PN->getType()));
- PN->eraseFromParent();
+ for (PHINode &PN : make_early_inc_range(BB->phis())) {
+ PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
+ PN.eraseFromParent();
}
return;
}
// Remove any incoming values that come from an invalid predecessor
- for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(II); ++II) {
- std::vector<BasicBlock *> BlocksToRemove;
- for (BasicBlock *IncomingBB : Phi->blocks()) {
- if (!isPredecessor(BB, IncomingBB))
- BlocksToRemove.push_back(IncomingBB);
- }
- for (BasicBlock *BB : BlocksToRemove)
- Phi->removeIncomingValue(BB);
- }
+ for (PHINode &Phi : BB->phis())
+ Phi.removeIncomingValueIf([&](unsigned Index) {
+ BasicBlock *IncomingBB = Phi.getIncomingBlock(Index);
+ return !isPredecessor(BB, IncomingBB);
+ });
}
/// Checks if BB was already cloned for a particular next state value. If it
@@ -1336,6 +1369,7 @@ private:
SmallPtrSet<const Value *, 32> EphValues;
std::vector<ThreadingPath> TPaths;
};
+} // namespace
bool DFAJumpThreading::run(Function &F) {
LLVM_DEBUG(dbgs() << "\nDFA Jump threading: " << F.getName() << "\n");
@@ -1402,9 +1436,8 @@ bool DFAJumpThreading::run(Function &F) {
for (AllSwitchPaths SwitchPaths : ThreadableLoops) {
TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues);
- Transform.run();
- MadeChanges = true;
- LoopInfoBroken = true;
+ if (Transform.run())
+ MadeChanges = LoopInfoBroken = true;
}
#ifdef EXPENSIVE_CHECKS
@@ -1415,8 +1448,6 @@ bool DFAJumpThreading::run(Function &F) {
return MadeChanges;
}
-} // end anonymous namespace
-
/// Integrate with the new Pass Manager
PreservedAnalyses DFAJumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index b9b5b58..638952a 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -699,6 +699,7 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::PtrToInt:
+ case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::AddrSpaceCast:
case Instruction::BitCast:
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index d6b7633..3c1a8ba 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -2066,6 +2066,7 @@ NewGVN::performSymbolicEvaluation(Instruction *I,
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::PtrToInt:
+ case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::Select:
case Instruction::ExtractElement:
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 60e5df0..7ffccf7 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -355,6 +355,8 @@ void SimplifyCFGPass::printPipeline(
OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;";
OS << (Options.ConvertSwitchRangeToICmp ? "" : "no-")
<< "switch-range-to-icmp;";
+ OS << (Options.ConvertSwitchToArithmetic ? "" : "no-")
+ << "switch-to-arithmetic;";
OS << (Options.ConvertSwitchToLookupTable ? "" : "no-")
<< "switch-to-lookup;";
OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;";
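
With the hunk above, the new option round-trips through pipeline strings; the
printed fragment looks like, e.g. (other options elided):

    simplifycfg<switch-range-to-icmp;switch-to-arithmetic;no-switch-to-lookup;keep-loops>

Assuming the matching PassBuilder parser change (not shown in this diff), the
same spelling should be accepted in -passes='simplifycfg<...>'.
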
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index bbd1ed6..5ba6f95f 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -970,6 +970,7 @@ Function *CodeExtractor::constructFunctionDeclaration(
case Attribute::SanitizeMemTag:
case Attribute::SanitizeRealtime:
case Attribute::SanitizeRealtimeBlocking:
+ case Attribute::SanitizeAllocToken:
case Attribute::SpeculativeLoadHardening:
case Attribute::StackProtect:
case Attribute::StackProtectReq:
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 5a09b73..2923633 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -19,6 +19,7 @@
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
@@ -162,8 +163,8 @@ bool llvm::applyDebugifyMetadata(
unsigned NextLine = 1;
unsigned NextVar = 1;
auto File = DIB.createFile(M.getName(), "/");
- auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify",
- /*isOptimized=*/true, "", 0);
+ auto CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C), File,
+ "debugify", /*isOptimized=*/true, "", 0);
// Visit each instruction.
for (Function &F : Functions) {
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 21b2652..b6ca52e 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3031,6 +3031,13 @@ static void combineMetadata(Instruction *K, const Instruction *J,
K->getContext(), MDNode::toCaptureComponents(JMD) |
MDNode::toCaptureComponents(KMD)));
break;
+ case LLVMContext::MD_alloc_token:
+ // Preserve !alloc_token if both K and J have it, and they are equal.
+ if (KMD == JMD)
+ K->setMetadata(Kind, JMD);
+ else
+ K->setMetadata(Kind, nullptr);
+ break;
}
}
// Set !invariant.group from J if J has it. If both instructions have it
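
The new case above gives !alloc_token the usual both-or-nothing merge: when
two instructions are combined (e.g. two duplicate calls hoisted into one),
the token metadata survives only if both carry the identical node. Since
MDNodes are uniqued, that reduces to pointer equality; a minimal sketch of
the rule in isolation (the helper name is hypothetical):

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // Keep !alloc_token only when K and J agree on it; otherwise drop it.
    static void mergeAllocToken(Instruction *K, const Instruction *J) {
      MDNode *KMD = K->getMetadata(LLVMContext::MD_alloc_token);
      MDNode *JMD = J->getMetadata(LLVMContext::MD_alloc_token);
      K->setMetadata(LLVMContext::MD_alloc_token, KMD == JMD ? KMD : nullptr);
    }
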
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 7cc9ff8..0c8d6fa 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -45,12 +45,6 @@ STATISTIC(NumInstrsHoisted,
"Number of instructions hoisted into loop preheader");
STATISTIC(NumInstrsDuplicated,
"Number of instructions cloned into loop preheader");
-STATISTIC(NumRotated, "Number of loops rotated");
-
-static cl::opt<bool>
- MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,
- cl::desc("Allow loop rotation multiple times in order to reach "
- "a better latch exit"));
// Probability that a rotated loop has zero trip count / is never entered.
static constexpr uint32_t ZeroTripCountWeights[] = {1, 127};
@@ -206,50 +200,6 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) {
return false;
}
-// Check that latch exit is deoptimizing (which means - very unlikely to happen)
-// and there is another exit from the loop which is non-deoptimizing.
-// If we rotate latch to that exit our loop has a better chance of being fully
-// canonical.
-//
-// It can give false positives in some rare cases.
-static bool canRotateDeoptimizingLatchExit(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- assert(Latch && "need latch");
- BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
- // Need normal exiting latch.
- if (!BI || !BI->isConditional())
- return false;
-
- BasicBlock *Exit = BI->getSuccessor(1);
- if (L->contains(Exit))
- Exit = BI->getSuccessor(0);
-
- // Latch exit is non-deoptimizing, no need to rotate.
- if (!Exit->getPostdominatingDeoptimizeCall())
- return false;
-
- SmallVector<BasicBlock *, 4> Exits;
- L->getUniqueExitBlocks(Exits);
- if (!Exits.empty()) {
- // There is at least one non-deoptimizing exit.
- //
- // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
- // as it can conservatively return false for deoptimizing exits with
- // complex enough control flow down to deoptimize call.
- //
- // That means here we can report success for a case where
- // all exits are deoptimizing but one of them has complex enough
- // control flow (e.g. with loops).
- //
- // That should be a very rare case and false positives for this function
- // have compile-time effect only.
- return any_of(Exits, [](const BasicBlock *BB) {
- return !BB->getPostdominatingDeoptimizeCall();
- });
- }
- return false;
-}
-
static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
bool HasConditionalPreHeader,
bool SuccsSwapped) {
@@ -387,506 +337,489 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
/// rotation. LoopRotate should be repeatable and converge to a canonical
/// form. This property is satisfied because simplifying the loop latch can only
/// happen once across multiple invocations of the LoopRotate pass.
-///
-/// If -loop-rotate-multi is enabled we can do multiple rotations in one go
-/// so to reach a suitable (non-deoptimizing) exit.
bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// If the loop has only one block then there is not much to rotate.
if (L->getBlocks().size() == 1)
return false;
bool Rotated = false;
- do {
- BasicBlock *OrigHeader = L->getHeader();
- BasicBlock *OrigLatch = L->getLoopLatch();
-
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
- return Rotated;
-
- // If the loop header is not one of the loop exiting blocks then
- // either this loop is already rotated or it is not
- // suitable for loop rotation transformations.
- if (!L->isLoopExiting(OrigHeader))
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return Rotated;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return Rotated;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return Rotated;
+
+ // Rotate if the loop latch was just simplified. Or if it makes the loop exit
+ // count computable. Or if we think it will be profitable.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
+ !profitableToRotateLoopExitingLatch(L))
+ return Rotated;
+
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO);
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(
+ dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
return Rotated;
-
- // If the loop latch already contains a branch that leaves the loop then the
- // loop is already rotated.
- if (!OrigLatch)
+ }
+ if (Metrics.Convergence != ConvergenceKind::None) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
return Rotated;
-
- // Rotate if either the loop latch does *not* exit the loop, or if the loop
- // latch was just simplified. Or if we think it will be profitable.
- if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
- !profitableToRotateLoopExitingLatch(L) &&
- !canRotateDeoptimizingLatchExit(L))
+ }
+ if (!Metrics.NumInsts.isValid()) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions"
+ " with invalid cost: ";
+ L->dump());
return Rotated;
-
- // Check size of original header and reject loop if it is very big or we can't
- // duplicate blocks inside it.
- {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- CodeMetrics Metrics;
- Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO);
- if (Metrics.notDuplicatable) {
- LLVM_DEBUG(
- dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: ";
- L->dump());
- return Rotated;
- }
- if (Metrics.Convergence != ConvergenceKind::None) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
- "instructions: ";
- L->dump());
- return Rotated;
- }
- if (!Metrics.NumInsts.isValid()) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions"
- " with invalid cost: ";
- L->dump());
- return Rotated;
- }
- if (Metrics.NumInsts > MaxHeaderSize) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
- << Metrics.NumInsts
- << " instructions, which is more than the threshold ("
- << MaxHeaderSize << " instructions): ";
- L->dump());
- ++NumNotRotatedDueToHeaderSize;
- return Rotated;
- }
-
- // When preparing for LTO, avoid rotating loops with calls that could be
- // inlined during the LTO stage.
- if (PrepareForLTO && Metrics.NumInlineCandidates > 0)
- return Rotated;
}
-
- // Now, this loop is suitable for rotation.
- BasicBlock *OrigPreheader = L->getLoopPreheader();
-
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!OrigPreheader || !L->hasDedicatedExits())
+ if (Metrics.NumInsts > MaxHeaderSize) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
+ << Metrics.NumInsts
+ << " instructions, which is more than the threshold ("
+ << MaxHeaderSize << " instructions): ";
+ L->dump());
+ ++NumNotRotatedDueToHeaderSize;
return Rotated;
-
- // Anything ScalarEvolution may know about this loop or the PHI nodes
- // in its header will soon be invalidated. We should also invalidate
- // all outer loops because insertion and deletion of blocks that happens
- // during the rotation may violate invariants related to backedge taken
- // infos in them.
- if (SE) {
- SE->forgetTopmostLoop(L);
- // We may hoist some instructions out of loop. In case if they were cached
- // as "loop variant" or "loop computable", these caches must be dropped.
- // We also may fold basic blocks, so cached block dispositions also need
- // to be dropped.
- SE->forgetBlockAndLoopDispositions();
}
- LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Find new Loop header. NewHeader is a Header's one and only successor
- // that is inside loop. Header's other successor is outside the
- // loop. Otherwise loop is not suitable for rotation.
- BasicBlock *Exit = BI->getSuccessor(0);
- BasicBlock *NewHeader = BI->getSuccessor(1);
- bool BISuccsSwapped = L->contains(Exit);
- if (BISuccsSwapped)
- std::swap(Exit, NewHeader);
- assert(NewHeader && "Unable to determine new loop header");
- assert(L->contains(NewHeader) && !L->contains(Exit) &&
- "Unable to determine loop header and exit blocks");
-
- // This code assumes that the new header has exactly one predecessor.
- // Remove any single-entry PHI nodes in it.
- assert(NewHeader->getSinglePredecessor() &&
- "New header doesn't have one pred!");
- FoldSingleEntryPHINodes(NewHeader);
-
- // Begin by walking OrigHeader and populating ValueMap with an entry for
- // each Instruction.
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
- ValueToValueMapTy ValueMap, ValueMapMSSA;
-
- // For PHI nodes, the value available in OldPreHeader is just the
- // incoming value from OldPreHeader.
- for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
- InsertNewValueIntoMap(ValueMap, PN,
- PN->getIncomingValueForBlock(OrigPreheader));
-
- // For the rest of the instructions, either hoist to the OrigPreheader if
- // possible or create a clone in the OldPreHeader if not.
- Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
-
- // Record all debug records preceding LoopEntryBranch to avoid
- // duplication.
- using DbgHash =
- std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>;
- auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
- auto VarLocOps = D->location_ops();
- return {{hash_combine_range(VarLocOps), D->getVariable()},
- D->getExpression()};
- };
-
- SmallDenseSet<DbgHash, 8> DbgRecords;
- // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
- // terminator.
- for (const DbgVariableRecord &DVR :
- filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
- DbgRecords.insert(makeHash(&DVR));
-
- // Remember the local noalias scope declarations in the header. After the
- // rotation, they must be duplicated and the scope must be cloned. This
- // avoids unwanted interaction across iterations.
- SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions;
- for (Instruction &I : *OrigHeader)
- if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
- NoAliasDeclInstructions.push_back(Decl);
-
- Module *M = OrigHeader->getModule();
-
- // Track the next DbgRecord to clone. If we have a sequence where an
- // instruction is hoisted instead of being cloned:
- // DbgRecord blah
- // %foo = add i32 0, 0
- // DbgRecord xyzzy
- // %bar = call i32 @foobar()
- // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
- // attached to %foo, then when %foo his hoisted it will "fall down" onto the
- // function call:
- // DbgRecord blah
- // DbgRecord xyzzy
- // %bar = call i32 @foobar()
- // causing it to appear attached to the call too.
- //
- // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
- // here" position to account for this behaviour. We point it at any
- // DbgRecords on the next instruction, here labelled xyzzy, before we hoist
- // %foo. Later, we only only clone DbgRecords from that position (xyzzy)
- // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
- // a range because it gives us a natural way of testing whether
- // there were DbgRecords on the next instruction before we hoisted things).
- iterator_range<DbgRecord::self_iterator> NextDbgInsts =
- (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange();
-
- while (I != E) {
- Instruction *Inst = &*I++;
-
- // If the instruction's operands are invariant and it doesn't read or write
- // memory, then it is safe to hoist. Doing this doesn't change the order of
- // execution in the preheader, but does prevent the instruction from
- // executing in each iteration of the loop. This means it is safe to hoist
- // something that might trap, but isn't safe to hoist something that reads
- // memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
- !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
- !isa<AllocaInst>(Inst) &&
- // It is not safe to hoist the value of these instructions in
- // coroutines, as the addresses of otherwise eligible variables (e.g.
- // thread-local variables and errno) may change if the coroutine is
- // resumed in a different thread.Therefore, we disable this
- // optimization for correctness. However, this may block other correct
- // optimizations.
- // FIXME: This should be reverted once we have a better model for
- // memory access in coroutines.
- !Inst->getFunction()->isPresplitCoroutine()) {
-
- if (!NextDbgInsts.empty()) {
- auto DbgValueRange =
- LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
- RemapDbgRecordRange(M, DbgValueRange, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- // Erase anything we've seen before.
- for (DbgVariableRecord &DVR :
- make_early_inc_range(filterDbgVars(DbgValueRange)))
- if (DbgRecords.count(makeHash(&DVR)))
- DVR.eraseFromParent();
- }
-
- NextDbgInsts = I->getDbgRecordRange();
-
- Inst->moveBefore(LoopEntryBranch->getIterator());
+ // When preparing for LTO, avoid rotating loops with calls that could be
+ // inlined during the LTO stage.
+ if (PrepareForLTO && Metrics.NumInlineCandidates > 0)
+ return Rotated;
+ }
- ++NumInstrsHoisted;
- continue;
- }
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it; just give up.
+ if (!OrigPreheader || !L->hasDedicatedExits())
+ return Rotated;
+
+  // Anything ScalarEvolution may know about this loop or the PHI nodes
+  // in its header will soon be invalidated. We should also invalidate
+  // all outer loops, because the insertion and deletion of blocks that
+  // happens during rotation may violate invariants related to the
+  // backedge-taken info cached for them.
+ if (SE) {
+ SE->forgetTopmostLoop(L);
+    // We may hoist some instructions out of the loop. If they were cached
+    // as "loop variant" or "loop computable", these caches must be dropped.
+    // We may also fold basic blocks, so cached block dispositions also need
+    // to be dropped.
+ SE->forgetBlockAndLoopDispositions();
+ }
- // Otherwise, create a duplicate of the instruction.
- Instruction *C = Inst->clone();
- if (const DebugLoc &DL = C->getDebugLoc())
- mapAtomInstance(DL, ValueMap);
+ LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
- C->insertBefore(LoopEntryBranch->getIterator());
+  // Find the new loop header. NewHeader is the header's one and only
+  // successor that is inside the loop; the header's other successor is
+  // outside the loop. Otherwise the loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ bool BISuccsSwapped = L->contains(Exit);
+ if (BISuccsSwapped)
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap, ValueMapMSSA;
+
+ // For PHI nodes, the value available in OldPreHeader is just the
+ // incoming value from OldPreHeader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ InsertNewValueIntoMap(ValueMap, PN,
+ PN->getIncomingValueForBlock(OrigPreheader));
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OldPreHeader if not.
+ Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug records preceding LoopEntryBranch to avoid
+ // duplication.
+ using DbgHash =
+ std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
+ auto VarLocOps = D->location_ops();
+ return {{hash_combine_range(VarLocOps), D->getVariable()},
+ D->getExpression()};
+ };
- ++NumInstrsDuplicated;
+ SmallDenseSet<DbgHash, 8> DbgRecords;
+ // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
+ // terminator.
+ for (const DbgVariableRecord &DVR :
+ filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
+ DbgRecords.insert(makeHash(&DVR));
+
+ // Remember the local noalias scope declarations in the header. After the
+ // rotation, they must be duplicated and the scope must be cloned. This
+ // avoids unwanted interaction across iterations.
+ SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions;
+ for (Instruction &I : *OrigHeader)
+ if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
+ NoAliasDeclInstructions.push_back(Decl);
+
+ Module *M = OrigHeader->getModule();
+
+ // Track the next DbgRecord to clone. If we have a sequence where an
+ // instruction is hoisted instead of being cloned:
+ // DbgRecord blah
+ // %foo = add i32 0, 0
+ // DbgRecord xyzzy
+ // %bar = call i32 @foobar()
+ // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
+  // attached to %foo, then when %foo is hoisted it will "fall down" onto the
+ // function call:
+ // DbgRecord blah
+ // DbgRecord xyzzy
+ // %bar = call i32 @foobar()
+ // causing it to appear attached to the call too.
+ //
+ // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
+ // here" position to account for this behaviour. We point it at any
+ // DbgRecords on the next instruction, here labelled xyzzy, before we hoist
+  // %foo. Later, we only clone DbgRecords from that position (xyzzy)
+ // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
+ // a range because it gives us a natural way of testing whether
+ // there were DbgRecords on the next instruction before we hoisted things).
+ iterator_range<DbgRecord::self_iterator> NextDbgInsts =
+ (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange();
+
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
+ !isa<AllocaInst>(Inst) &&
+ // It is not safe to hoist the value of these instructions in
+ // coroutines, as the addresses of otherwise eligible variables (e.g.
+ // thread-local variables and errno) may change if the coroutine is
+      // resumed in a different thread. Therefore, we disable this
+ // optimization for correctness. However, this may block other correct
+ // optimizations.
+ // FIXME: This should be reverted once we have a better model for
+ // memory access in coroutines.
+ !Inst->getFunction()->isPresplitCoroutine()) {
if (!NextDbgInsts.empty()) {
- auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
- RemapDbgRecordRange(M, Range, ValueMap,
+ auto DbgValueRange =
+ LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
+ RemapDbgRecordRange(M, DbgValueRange, ValueMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
// Erase anything we've seen before.
for (DbgVariableRecord &DVR :
- make_early_inc_range(filterDbgVars(Range)))
+ make_early_inc_range(filterDbgVars(DbgValueRange)))
if (DbgRecords.count(makeHash(&DVR)))
DVR.eraseFromParent();
}
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifyable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = simplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- InsertNewValueIntoMap(ValueMap, Inst, V);
- if (!C->mayHaveSideEffects()) {
- C->eraseFromParent();
- C = nullptr;
- }
- } else {
- InsertNewValueIntoMap(ValueMap, Inst, C);
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
-
- if (auto *II = dyn_cast<AssumeInst>(C))
- AC->registerAssumption(II);
- // MemorySSA cares whether the cloned instruction was inserted or not, and
- // not whether it can be remapped to a simplified value.
- if (MSSAU)
- InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
- }
- }
+ NextDbgInsts = I->getDbgRecordRange();
- if (!NoAliasDeclInstructions.empty()) {
- // There are noalias scope declarations:
- // (general):
- // Original: OrigPre { OrigHeader NewHeader ... Latch }
- // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader }
- //
- // with D: llvm.experimental.noalias.scope.decl,
- // U: !noalias or !alias.scope depending on D
- // ... { D U1 U2 } can transform into:
- // (0) : ... { D U1 U2 } // no relevant rotation for this part
- // (1) : ... D' { U1 U2 D } // D is part of OrigHeader
- // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader
- //
- // We now want to transform:
- // (1) -> : ... D' { D U1 U2 D'' }
- // (2) -> : ... D' U1' { D U2 D'' U1'' }
- // D: original llvm.experimental.noalias.scope.decl
- // D', U1': duplicate with replaced scopes
- // D'', U1'': different duplicate with replaced scopes
- // This ensures a safe fallback to 'may_alias' introduced by the rotate,
- // as U1'' and U1' scopes will not be compatible wrt to the local restrict
-
- // Clone the llvm.experimental.noalias.decl again for the NewHeader.
- BasicBlock::iterator NewHeaderInsertionPoint =
- NewHeader->getFirstNonPHIIt();
- for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) {
- LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:"
- << *NAD << "\n");
- Instruction *NewNAD = NAD->clone();
- NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint);
- }
+ Inst->moveBefore(LoopEntryBranch->getIterator());
- // Scopes must now be duplicated, once for OrigHeader and once for
- // OrigPreHeader'.
- {
- auto &Context = NewHeader->getContext();
-
- SmallVector<MDNode *, 8> NoAliasDeclScopes;
- for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions)
- NoAliasDeclScopes.push_back(NAD->getScopeList());
-
- LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n");
- cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context,
- "h.rot");
- LLVM_DEBUG(OrigHeader->dump());
-
- // Keep the compile time impact low by only adapting the inserted block
- // of instructions in the OrigPreHeader. This might result in slightly
- // more aliasing between these instructions and those that were already
- // present, but it will be much faster when the original PreHeader is
- // large.
- LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n");
- auto *FirstDecl =
- cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]);
- auto *LastInst = &OrigPreheader->back();
- cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst,
- Context, "pre.rot");
- LLVM_DEBUG(OrigPreheader->dump());
-
- LLVM_DEBUG(dbgs() << " Updated NewHeader:\n");
- LLVM_DEBUG(NewHeader->dump());
- }
+ ++NumInstrsHoisted;
+ continue;
}
- // Along with all the other instructions, we just cloned OrigHeader's
- // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
- // successors by duplicating their incoming values for OrigHeader.
- for (BasicBlock *SuccBB : successors(OrigHeader))
- for (BasicBlock::iterator BI = SuccBB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
- // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
- // OrigPreHeader's old terminator (the original branch into the loop), and
- // remove the corresponding incoming values from the PHI nodes in OrigHeader.
- LoopEntryBranch->eraseFromParent();
- OrigPreheader->flushTerminatorDbgRecords();
-
- // Update MemorySSA before the rewrite call below changes the 1:1
- // instruction:cloned_instruction_or_value mapping.
- if (MSSAU) {
- InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
- MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
- ValueMapMSSA);
- }
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+ if (const DebugLoc &DL = C->getDebugLoc())
+ mapAtomInstance(DL, ValueMap);
- SmallVector<PHINode*, 2> InsertedPHIs;
- // If there were any uses of instructions in the duplicated block outside the
- // loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
- &InsertedPHIs);
-
- // Attach debug records to the new phis if that phi uses a value that
- // previously had debug metadata attached. This keeps the debug info
- // up-to-date in the loop body.
- if (!InsertedPHIs.empty())
- insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
-
- // NewHeader is now the header of the loop.
- L->moveToHeader(NewHeader);
- assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
- // Inform DT about changes to the CFG.
- if (DT) {
- // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
- // the DT about the removed edge to the OrigHeader (that got removed).
- SmallVector<DominatorTree::UpdateType, 3> Updates = {
- {DominatorTree::Insert, OrigPreheader, Exit},
- {DominatorTree::Insert, OrigPreheader, NewHeader},
- {DominatorTree::Delete, OrigPreheader, OrigHeader}};
-
- if (MSSAU) {
- MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- } else {
- DT->applyUpdates(Updates);
- }
- }
+ C->insertBefore(LoopEntryBranch->getIterator());
- // At this point, we've finished our major CFG changes. As part of cloning
- // the loop into the preheader we've simplified instructions and the
- // duplicated conditional branch may now be branching on a constant. If it is
- // branching on a constant and if that constant means that we enter the loop,
- // then we fold away the cond branch to an uncond branch. This simplifies the
- // loop in cases important for nested loops, and it also means we don't have
- // to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
- const Value *Cond = PHBI->getCondition();
- const bool HasConditionalPreHeader =
- !isa<ConstantInt>(Cond) ||
- PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader;
-
- updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped);
+ ++NumInstrsDuplicated;
- if (HasConditionalPreHeader) {
- // The conditional branch can't be folded, handle the general case.
- // Split edges as necessary to preserve LoopSimplify form.
-
- // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore.
- // Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(
- OrigPreheader, NewHeader,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- NewPH->setName(NewHeader->getName() + ".lr.ph");
-
- // Preserve canonical loop form, which means that 'Exit' should have only
- // one predecessor. Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit) ||
- isa<IndirectBrInst>(ExitPred->getTerminator()))
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
+ if (!NextDbgInsts.empty()) {
+ auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
+ RemapDbgRecordRange(M, Range, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
+ // Erase anything we've seen before.
+ for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range)))
+ if (DbgRecords.count(makeHash(&DVR)))
+ DVR.eraseFromParent();
+ }
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // With the operands remapped, see if the instruction constant folds or is
+  // otherwise simplifiable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = simplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ InsertNewValueIntoMap(ValueMap, Inst, V);
+ if (!C->mayHaveSideEffects()) {
+ C->eraseFromParent();
+ C = nullptr;
}
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- (void)SplitLatchEdge;
} else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
+ InsertNewValueIntoMap(ValueMap, Inst, C);
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+
+ if (auto *II = dyn_cast<AssumeInst>(C))
+ AC->registerAssumption(II);
+ // MemorySSA cares whether the cloned instruction was inserted or not, and
+ // not whether it can be remapped to a simplified value.
+ if (MSSAU)
+ InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
+ }
+ }
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
+ if (!NoAliasDeclInstructions.empty()) {
+ // There are noalias scope declarations:
+ // (general):
+ // Original: OrigPre { OrigHeader NewHeader ... Latch }
+ // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader }
+ //
+ // with D: llvm.experimental.noalias.scope.decl,
+ // U: !noalias or !alias.scope depending on D
+ // ... { D U1 U2 } can transform into:
+ // (0) : ... { D U1 U2 } // no relevant rotation for this part
+ // (1) : ... D' { U1 U2 D } // D is part of OrigHeader
+ // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader
+ //
+ // We now want to transform:
+ // (1) -> : ... D' { D U1 U2 D'' }
+ // (2) -> : ... D' U1' { D U2 D'' U1'' }
+ // D: original llvm.experimental.noalias.scope.decl
+ // D', U1': duplicate with replaced scopes
+ // D'', U1'': different duplicate with replaced scopes
+ // This ensures a safe fallback to 'may_alias' introduced by the rotate,
+    // as U1'' and U1' scopes will not be compatible w.r.t. the local restrict
+
+ // Clone the llvm.experimental.noalias.decl again for the NewHeader.
+ BasicBlock::iterator NewHeaderInsertionPoint =
+ NewHeader->getFirstNonPHIIt();
+ for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) {
+ LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:"
+ << *NAD << "\n");
+ Instruction *NewNAD = NAD->clone();
+ NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint);
+ }
- // Update MSSA too, if available.
- if (MSSAU)
- MSSAU->removeEdge(OrigPreheader, Exit);
+ // Scopes must now be duplicated, once for OrigHeader and once for
+ // OrigPreHeader'.
+ {
+ auto &Context = NewHeader->getContext();
+
+ SmallVector<MDNode *, 8> NoAliasDeclScopes;
+ for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions)
+ NoAliasDeclScopes.push_back(NAD->getScopeList());
+
+ LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n");
+ cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context,
+ "h.rot");
+ LLVM_DEBUG(OrigHeader->dump());
+
+ // Keep the compile time impact low by only adapting the inserted block
+ // of instructions in the OrigPreHeader. This might result in slightly
+ // more aliasing between these instructions and those that were already
+ // present, but it will be much faster when the original PreHeader is
+ // large.
+ LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n");
+ auto *FirstDecl =
+ cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]);
+ auto *LastInst = &OrigPreheader->back();
+ cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst,
+ Context, "pre.rot");
+ LLVM_DEBUG(OrigPreheader->dump());
+
+ LLVM_DEBUG(dbgs() << " Updated NewHeader:\n");
+ LLVM_DEBUG(NewHeader->dump());
}
+ }
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ for (BasicBlock *SuccBB : successors(OrigHeader))
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+ OrigPreheader->flushTerminatorDbgRecords();
+
+ // Update MemorySSA before the rewrite call below changes the 1:1
+ // instruction:cloned_instruction_or_value mapping.
+ if (MSSAU) {
+ InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
+ MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
+ ValueMapMSSA);
+ }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ SmallVector<PHINode *, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+  // loop, update them, inserting PHI nodes as required.
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
+ &InsertedPHIs);
+
+ // Attach debug records to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // Inform DT about changes to the CFG.
+ if (DT) {
+    // The OrigPreheader now branches to NewHeader and Exit. Inform the DT
+    // about the edge to OrigHeader that was removed.
+ SmallVector<DominatorTree::UpdateType, 3> Updates = {
+ {DominatorTree::Insert, OrigPreheader, Exit},
+ {DominatorTree::Insert, OrigPreheader, NewHeader},
+ {DominatorTree::Delete, OrigPreheader, OrigHeader}};
- // Now that the CFG and DomTree are in a consistent state again, try to merge
- // the OrigHeader block into OrigLatch. This will succeed if they are
- // connected by an unconditional branch. This is just a cleanup so the
- // emitted code isn't too gross in this common case.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- BasicBlock *PredBB = OrigHeader->getUniquePredecessor();
- bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
- if (DidMerge)
- RemoveRedundantDbgInstrs(PredBB);
+ if (MSSAU) {
+ MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ } else {
+ DT->applyUpdates(Updates);
+ }
+ }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ const Value *Cond = PHBI->getCondition();
+ const bool HasConditionalPreHeader =
+ !isa<ConstantInt>(Cond) ||
+ PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader;
+
+ updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped);
- LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+ if (HasConditionalPreHeader) {
+    // The conditional branch can't be folded; handle the general case.
+ // Split edges as necessary to preserve LoopSimplify form.
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit) ||
+ isa<IndirectBrInst>(ExitPred->getTerminator()))
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ (void)SplitLatchEdge;
+ } else {
+    // We can fold the conditional branch in the preheader; this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT)
+ DT->deleteEdge(OrigPreheader, Exit);
+
+ // Update MSSA too, if available.
+ if (MSSAU)
+ MSSAU->removeEdge(OrigPreheader, Exit);
+ }
- ++NumRotated;
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
- Rotated = true;
- SimplifiedLatch = false;
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ BasicBlock *PredBB = OrigHeader->getUniquePredecessor();
+ bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
+ if (DidMerge)
+ RemoveRedundantDbgInstrs(PredBB);
- // Check that new latch is a deoptimizing exit and then repeat rotation if possible.
- // Deoptimizing latch exit is not a generally typical case, so we just loop over.
- // TODO: if it becomes a performance bottleneck extend rotation algorithm
- // to handle multiple rotations in one go.
- } while (MultiRotate && canRotateDeoptimizingLatchExit(L));
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
return true;
}
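
For orientation, here is a minimal IR sketch of the rotation this function
performs (a hypothetical input, not part of this patch): the header's exit
test is cloned into the preheader as a guard, and the loop becomes a
do-while with the test in the latch.

    ; before rotation: exit test at the top of the header
    define void @sketch(i32 %n) {
    entry:
      br label %header
    header:
      %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
      %cmp = icmp slt i32 %i, %n
      br i1 %cmp, label %body, label %exit
    body:
      br label %latch
    latch:
      %i.next = add nsw i32 %i, 1
      br label %header
    exit:
      ret void
    }

    ; after rotation: cloned guard in the preheader, test in the latch
    entry:
      %guard = icmp slt i32 0, %n          ; clone of the header test
      br i1 %guard, label %body.lr.ph, label %exit
    body.lr.ph:
      br label %body
    body:
      %i = phi i32 [ 0, %body.lr.ph ], [ %i.next, %body ]
      %i.next = add nsw i32 %i, 1
      %cmp = icmp slt i32 %i.next, %n
      br i1 %cmp, label %body, label %exit
    exit:
      ret void

As the code above shows, the pass then splits critical exit edges as needed
to restore LoopSimplify form and tries to merge the old header into the
latch.
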
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index bf882d7..6312831 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -201,18 +201,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
/// unroll count is non-zero.
///
/// This function performs the following:
-/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
-/// - Create PHI nodes at the unrolling loop exit to combine
-/// values that exit the unrolling loop code and jump around it.
+/// - Update PHI nodes at the epilog loop exit
+/// - Create PHI nodes at the unrolling loop exit and epilog preheader to
+/// combine values that exit the unrolling loop code and jump around it.
/// - Update PHI operands in the epilog loop by the new PHI nodes
-/// - Branch around the epilog loop if extra iters (ModVal) is zero.
+/// - At the unrolling loop exit, branch around the epilog loop if extra iters
+///   (ModVal) is zero.
+/// - At the epilog preheader, add an llvm.assume call that extra iters is
+/// non-zero. If the unrolling loop exit is the predecessor, the above new
+/// branch guarantees that assumption. If the unrolling loop preheader is the
+/// predecessor, then the required first iteration from the original loop has
+/// yet to be executed, so it must be executed in the epilog loop. If we
+///   later unroll the epilog loop, that llvm.assume call enables
+///   ScalarEvolution to compute an epilog loop maximum trip count, which lets
+///   us eliminate the branch at the end of the final unrolled epilog iteration.
///
static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
BasicBlock *Exit, BasicBlock *PreHeader,
BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
ValueToValueMapTy &VMap, DominatorTree *DT,
LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE,
- unsigned Count) {
+ unsigned Count, AssumptionCache &AC) {
BasicBlock *Latch = L->getLoopLatch();
assert(Latch && "Loop must have a latch");
BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
@@ -231,7 +240,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
// EpilogLatch
// Exit (EpilogPN)
- // Update PHI nodes at NewExit and Exit.
+ // Update PHI nodes at Exit.
for (PHINode &PN : NewExit->phis()) {
// PN should be used in another PHI located in Exit block as
// Exit was split by SplitBlockPredecessors into Exit and NewExit
@@ -246,15 +255,11 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
// epilogue edges have already been added.
//
// There is EpilogPreHeader incoming block instead of NewExit as
- // NewExit was spilt 1 more time to get EpilogPreHeader.
+ // NewExit was split 1 more time to get EpilogPreHeader.
assert(PN.hasOneUse() && "The phi should have 1 use");
PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
- // Add incoming PreHeader from branch around the Loop
- PN.addIncoming(PoisonValue::get(PN.getType()), PreHeader);
- SE.forgetValue(&PN);
-
Value *V = PN.getIncomingValueForBlock(Latch);
Instruction *I = dyn_cast<Instruction>(V);
if (I && L->contains(I))
@@ -271,35 +276,52 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
NewExit);
// Now PHIs should look like:
// NewExit:
- // PN = PHI [I, Latch], [poison, PreHeader]
+ // PN = PHI [I, Latch]
// ...
// Exit:
// EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
}
- // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
- // Update corresponding PHI nodes in epilog loop.
+ // Create PHI nodes at NewExit (from the unrolling loop Latch) and at
+ // EpilogPreHeader (from PreHeader and NewExit). Update corresponding PHI
+  // nodes in the epilog loop.
for (BasicBlock *Succ : successors(Latch)) {
// Skip this as we already updated phis in exit blocks.
if (!L->contains(Succ))
continue;
+
+ // Succ here appears to always be just L->getHeader(). Otherwise, how do we
+ // know its corresponding epilog block (from VMap) is EpilogHeader and thus
+ // EpilogPreHeader is the right incoming block for VPN, as set below?
+ // TODO: Can we thus avoid the enclosing loop over successors?
+ assert(Succ == L->getHeader() &&
+ "Expect the only in-loop successor of latch to be the loop header");
+
for (PHINode &PN : Succ->phis()) {
- // Add new PHI nodes to the loop exit block and update epilog
- // PHIs with the new PHI values.
- PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr");
- NewPN->insertBefore(NewExit->getFirstNonPHIIt());
- // Adding a value to the new PHI node from the unrolling loop preheader.
- NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
- // Adding a value to the new PHI node from the unrolling loop latch.
- NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
+ // Add new PHI nodes to the loop exit block.
+ PHINode *NewPN0 = PHINode::Create(PN.getType(), /*NumReservedValues=*/1,
+ PN.getName() + ".unr");
+ NewPN0->insertBefore(NewExit->getFirstNonPHIIt());
+ // Add value to the new PHI node from the unrolling loop latch.
+ NewPN0->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
+
+ // Add new PHI nodes to EpilogPreHeader.
+ PHINode *NewPN1 = PHINode::Create(PN.getType(), /*NumReservedValues=*/2,
+ PN.getName() + ".epil.init");
+ NewPN1->insertBefore(EpilogPreHeader->getFirstNonPHIIt());
+ // Add value to the new PHI node from the unrolling loop preheader.
+ NewPN1->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
+ // Add value to the new PHI node from the epilog loop guard.
+ NewPN1->addIncoming(NewPN0, NewExit);
// Update the existing PHI node operand with the value from the new PHI
// node. Corresponding instruction in epilog loop should be PHI.
PHINode *VPN = cast<PHINode>(VMap[&PN]);
- VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN);
+ VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN1);
}
}
+ // In NewExit, branch around the epilog loop if no extra iters.
Instruction *InsertPt = NewExit->getTerminator();
IRBuilder<> B(InsertPt);
Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
@@ -308,7 +330,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr,
PreserveLCSSA);
- // Add the branch to the exit block (around the unrolling loop)
+ // Add the branch to the exit block (around the epilog loop)
MDNode *BranchWeights = nullptr;
if (hasBranchWeightMD(*Latch->getTerminator())) {
// Assume equal distribution in interval [0, Count).
@@ -322,10 +344,11 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
DT->changeImmediateDominator(Exit, NewDom);
}
- // Split the main loop exit to maintain canonicalization guarantees.
- SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
- SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr,
- PreserveLCSSA);
+ // In EpilogPreHeader, assume extra iters is non-zero.
+ IRBuilder<> B2(EpilogPreHeader, EpilogPreHeader->getFirstNonPHIIt());
+ Value *ModIsNotNull = B2.CreateIsNotNull(ModVal, "lcmp.mod");
+ AssumeInst *AI = cast<AssumeInst>(B2.CreateAssumption(ModIsNotNull));
+ AC.registerAssumption(AI);
}
/// Create a clone of the blocks in a loop and connect them together. A new
@@ -795,7 +818,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
ConstantInt::get(BECount->getType(),
Count - 1)) :
B.CreateIsNotNull(ModVal, "lcmp.mod");
- BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
+ BasicBlock *RemainderLoop =
+ UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
// Branch to either remainder (extra iterations) loop or unrolling loop.
MDNode *BranchWeights = nullptr;
@@ -808,7 +832,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
PreHeaderBR->eraseFromParent();
if (DT) {
if (UseEpilogRemainder)
- DT->changeImmediateDominator(NewExit, PreHeader);
+ DT->changeImmediateDominator(EpilogPreHeader, PreHeader);
else
DT->changeImmediateDominator(PrologExit, PreHeader);
}
@@ -880,7 +904,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
// from both the original loop and the remainder code reaching the exit
// blocks. While the IDom of these exit blocks were from the original loop,
// now the IDom is the preheader (which decides whether the original loop or
- // remainder code should run).
+ // remainder code should run) unless the block still has just the original
+ // predecessor (such as NewExit in the case of an epilog remainder).
if (DT && !L->getExitingBlock()) {
SmallVector<BasicBlock *, 16> ChildrenToUpdate;
// NB! We have to examine the dom children of all loop blocks, not just
@@ -891,7 +916,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
auto *DomNodeBB = DT->getNode(BB);
for (auto *DomChild : DomNodeBB->children()) {
auto *DomChildBB = DomChild->getBlock();
- if (!L->contains(LI->getLoopFor(DomChildBB)))
+ if (!L->contains(LI->getLoopFor(DomChildBB)) &&
+ DomChildBB->getUniquePredecessor() != BB)
ChildrenToUpdate.push_back(DomChildBB);
}
}
@@ -930,7 +956,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
// Connect the epilog code to the original loop and update the
// PHI functions.
ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader,
- NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count);
+ NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC);
// Update counter in loop for unrolling.
// Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
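
Concretely, the new EpilogPreHeader logic amounts to the following IR
fragment (hypothetical names, assuming an unroll factor of 4, so %xtraiter
is the trip count modulo 4):

    new.exit:                                 ; unrolled loop exit
      %lcmp.mod = icmp ne i64 %xtraiter, 0
      br i1 %lcmp.mod, label %epil.preheader, label %latch.exit
    epil.preheader:                           ; preds: new.exit, entry guard
      %lcmp.mod1 = icmp ne i64 %xtraiter, 0
      call void @llvm.assume(i1 %lcmp.mod1)   ; extra iters known non-zero
      br label %epil.header

On the edge from new.exit the branch already establishes the fact; on the
edge from the original preheader the required first iteration must still run
in the epilog, so the assumption holds there as well.
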
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 148bfa8..155fcc5 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -4895,9 +4895,8 @@ bool SimplifyCFGOpt::simplifyTerminatorOnSelect(Instruction *OldTerm,
// We found both of the successors we were looking for.
// Create a conditional branch sharing the condition of the select.
BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
- if (TrueWeight != FalseWeight)
- setBranchWeights(*NewBI, {TrueWeight, FalseWeight},
- /*IsExpected=*/false, /*ElideAllZero=*/true);
+ setBranchWeights(*NewBI, {TrueWeight, FalseWeight},
+ /*IsExpected=*/false, /*ElideAllZero=*/true);
}
} else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
// Neither of the selected blocks were successors, so this
@@ -4982,9 +4981,15 @@ bool SimplifyCFGOpt::simplifyIndirectBrOnSelect(IndirectBrInst *IBI,
BasicBlock *TrueBB = TBA->getBasicBlock();
BasicBlock *FalseBB = FBA->getBasicBlock();
+ // The select's profile becomes the profile of the conditional branch that
+ // replaces the indirect branch.
+ SmallVector<uint32_t> SelectBranchWeights(2);
+ if (!ProfcheckDisableMetadataFixes)
+ extractBranchWeights(*SI, SelectBranchWeights);
// Perform the actual simplification.
- return simplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0,
- 0);
+ return simplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB,
+ SelectBranchWeights[0],
+ SelectBranchWeights[1]);
}
/// This is called when we find an icmp instruction
@@ -6637,6 +6642,9 @@ public:
/// Return true if the replacement is a lookup table.
bool isLookupTable();
+ /// Return true if the replacement is a bit map.
+ bool isBitMap();
+
private:
// Depending on the switch, there are different alternatives.
enum {
@@ -6927,6 +6935,8 @@ Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; }
bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; }
+bool SwitchReplacement::isBitMap() { return Kind == BitMapKind; }
+
static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) {
// 40% is the default density for building a jump table in optsize/minsize
// mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this
@@ -7092,7 +7102,8 @@ static void reuseTableCompare(
/// lookup tables.
static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
+ const TargetTransformInfo &TTI,
+ bool ConvertSwitchToLookupTable) {
assert(SI->getNumCases() > 1 && "Degenerate switch?");
BasicBlock *BB = SI->getParent();
@@ -7257,6 +7268,8 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
bool AnyLookupTables = any_of(
PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); });
+ bool AnyBitMaps = any_of(PhiToReplacementMap,
+ [](auto &KV) { return KV.second.isBitMap(); });
// A few conditions prevent the generation of lookup tables:
// 1. The target does not support lookup tables.
@@ -7269,6 +7282,12 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
Fn->getFnAttribute("no-jump-tables").getValueAsBool()))
return false;
+ // In the early optimization pipeline, disable formation of lookup tables,
+ // bit maps and mask checks, as they may inhibit further optimization.
+ if (!ConvertSwitchToLookupTable &&
+ (AnyLookupTables || AnyBitMaps || NeedMask))
+ return false;
+
Builder.SetInsertPoint(SI);
// TableIndex is the switch condition - TableIndexOffset if we don't
// use the condition directly
@@ -7924,14 +7943,13 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (Options.ForwardSwitchCondToPhi && forwardSwitchConditionToPHI(SI))
return requestResimplify();
- // The conversion from switch to lookup tables results in difficult-to-analyze
- // code and makes pruning branches much harder. This is a problem if the
- // switch expression itself can still be restricted as a result of inlining or
- // CVP. Therefore, only apply this transformation during late stages of the
- // optimisation pipeline.
- if (Options.ConvertSwitchToLookupTable &&
- simplifySwitchLookup(SI, Builder, DTU, DL, TTI))
- return requestResimplify();
+  // The conversion of switches to arithmetic or lookup tables is disabled in
+ // the early optimization pipeline, as it may lose information or make the
+ // resulting code harder to analyze.
+ if (Options.ConvertSwitchToArithmetic || Options.ConvertSwitchToLookupTable)
+ if (simplifySwitchLookup(SI, Builder, DTU, DL, TTI,
+ Options.ConvertSwitchToLookupTable))
+ return requestResimplify();
if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI))
return requestResimplify();
@@ -7952,19 +7970,27 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
BasicBlock *BB = IBI->getParent();
bool Changed = false;
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights = !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*IBI, BranchWeights);
+
+ DenseMap<const BasicBlock *, uint64_t> TargetWeight;
+ if (HasBranchWeights)
+ for (size_t I = 0, E = IBI->getNumDestinations(); I < E; ++I)
+ TargetWeight[IBI->getDestination(I)] += BranchWeights[I];
// Eliminate redundant destinations.
SmallPtrSet<Value *, 8> Succs;
SmallSetVector<BasicBlock *, 8> RemovedSuccs;
- for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
- BasicBlock *Dest = IBI->getDestination(i);
+ for (unsigned I = 0, E = IBI->getNumDestinations(); I != E; ++I) {
+ BasicBlock *Dest = IBI->getDestination(I);
if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
if (!Dest->hasAddressTaken())
RemovedSuccs.insert(Dest);
Dest->removePredecessor(BB);
- IBI->removeDestination(i);
- --i;
- --e;
+ IBI->removeDestination(I);
+ --I;
+ --E;
Changed = true;
}
}
@@ -7990,7 +8016,12 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
eraseTerminatorAndDCECond(IBI);
return true;
}
-
+ if (HasBranchWeights) {
+ SmallVector<uint64_t> NewBranchWeights(IBI->getNumDestinations());
+ for (size_t I = 0, E = IBI->getNumDestinations(); I < E; ++I)
+ NewBranchWeights[I] += TargetWeight.find(IBI->getDestination(I))->second;
+ setFittedBranchWeights(*IBI, NewBranchWeights, /*IsExpected=*/false);
+ }
if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
if (simplifyIndirectBrOnSelect(IBI, SI))
return requestResimplify();
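
A small IR sketch of the simplifyIndirectBrOnSelect profile fix above
(hypothetical weights): the select's branch weights now survive onto the
conditional branch that replaces the indirect branch, instead of being
passed as 0, 0.

    ; before: the weights live on the select
    %t = select i1 %c, ptr blockaddress(@f, %bb1),
                       ptr blockaddress(@f, %bb2), !prof !0
    indirectbr ptr %t, [label %bb1, label %bb2]

    ; after: the same weights annotate the replacement branch
    br i1 %c, label %bb1, label %bb2, !prof !0

    !0 = !{!"branch_weights", i32 7, i32 1}
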
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d393a9c..3f16b03 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -4262,8 +4264,8 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
if (!Legal->isReductionVariable(&Phi))
return Legal->isFixedOrderRecurrence(&Phi);
- RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
- return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
+ return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
+ Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind());
}))
return false;
@@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -7279,8 +7282,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
if (!Exit->hasPredecessors())
continue;
for (VPRecipeBase &PhiR : Exit->phis())
- SE.forgetLcssaPhiWithNewPredecessor(
- OrigLoop, cast<PHINode>(&cast<VPIRPhi>(PhiR).getInstruction()));
+ SE.forgetLcssaPhiWithNewPredecessor(OrigLoop,
+ &cast<VPIRPhi>(PhiR).getIRPhi());
}
// Forget the original loop and block dispositions.
SE.forgetLoop(OrigLoop);
@@ -8418,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -9874,7 +9878,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
- CM.CostKind);
+ CM.CostKind, *CM.PSE.getSE());
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 91c3d42..cfa8d27 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10621,7 +10621,8 @@ class InstructionsCompatibilityAnalysis {
/// elements.
static bool isSupportedOpcode(const unsigned Opcode) {
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
- Opcode == Instruction::Shl;
+ Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
+ Opcode == Instruction::UDiv;
}
/// Identifies the best candidate value, which represents main opcode
@@ -10939,6 +10940,8 @@ public:
case Instruction::Add:
case Instruction::LShr:
case Instruction::Shl:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
@@ -22066,8 +22069,10 @@ bool BoUpSLP::collectValuesToDemote(
auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
return all_of(E.Scalars, [&](Value *V) {
- auto *I = cast<Instruction>(V);
APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (E.hasCopyableElements() && E.isCopyableElement(V))
+ return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
+ auto *I = cast<Instruction>(V);
return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
});
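
With SDiv/UDiv accepted by isSupportedOpcode, a scalar bundle like the
following hypothetical fragment can now be picked as a main opcode and
costed via getArithmeticInstrCost:

    %a0 = sdiv i32 %x0, 8
    %a1 = sdiv i32 %x1, 8
    %a2 = sdiv i32 %x2, 8
    %a3 = sdiv i32 %x3, 8
    ; may become, with %vx built from %x0..%x3:
    %va = sdiv <4 x i32> %vx, <i32 8, i32 8, i32 8, i32 8>
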
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 07b191a..2555ebe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
}
InstructionCost VPCostContext::getScalarizationOverhead(
- Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+ Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+ bool AlwaysIncludeReplicatingR) {
if (VF.isScalar())
return 0;
@@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead(
SmallPtrSet<const VPValue *, 4> UniqueOperands;
SmallVector<Type *> Tys;
for (auto *Op : Operands) {
- if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ if (Op->isLiveIn() ||
+ (!AlwaysIncludeReplicatingR &&
+ isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+ (isa<VPReplicateRecipe>(Op) &&
+ cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
!UniqueOperands.insert(Op).second)
continue;
Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
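
For context, the scalarization overhead estimated here models the per-lane
extracts and inserts of a replicated recipe, roughly as follows (VF = 4,
hypothetical values):

    ; one vector op becomes 4 scalar ops, paying an extract per used
    ; operand lane and an insert per produced result lane:
    %e0 = extractelement <4 x i32> %vop, i64 0
    %r0 = udiv i32 %e0, %q
    %v0 = insertelement <4 x i32> poison, i32 %r0, i64 0
    ; ... lanes 1 through 3 analogously

The new AlwaysIncludeReplicatingR flag lets callers count operands defined
by replicating recipes as well, which the default path skips.
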
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c167dd7..fb696be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2263,8 +2263,7 @@ public:
/// debug location \p DL.
VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr,
DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "")
- : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL),
- Name(Name.str()) {
+ : VPSingleDefRecipe(VPDef::VPWidenPHISC, {}, Phi, DL), Name(Name.str()) {
if (Start)
addOperand(Start);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index c8212af..81deba2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -763,6 +763,8 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
// Add the minimum iteration check for the epilogue vector loop.
VPValue *TC = Plan.getOrAddLiveIn(TripCount);
VPBuilder Builder(cast<VPBasicBlock>(Plan.getEntry()));
+ VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount(
+ TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW));
VPValue *Count = Builder.createNaryOp(
Instruction::Sub, {TC, Plan.getOrAddLiveIn(VectorTripCount)},
DebugLoc::getUnknown(), "n.vec.remaining");
@@ -770,9 +772,6 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
// Generate code to check if the loop's trip count is less than VF * UF of
// the vector epilogue loop.
auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
- VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount(
- TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW));
-
auto *CheckMinIters = Builder.createICmp(
P, Count, VFxUF, DebugLoc::getUnknown(), "min.epilog.iters.check");
VPInstruction *Branch =
@@ -841,8 +840,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
// TODO: Support multiple MaxNum/MinNum reductions and other reductions.
if (RedPhiR)
return false;
- if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
- Cur->getRecurrenceKind() != RecurKind::FMinNum) {
+ if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
+ Cur->getRecurrenceKind())) {
HasUnsupportedPhi = true;
continue;
}
@@ -862,10 +861,9 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
if (!MinMaxOp)
return false;
- RecurKind RedPhiRK = RedPhiR->getRecurrenceKind();
- assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) &&
+ assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
+ RedPhiR->getRecurrenceKind()) &&
"unsupported reduction");
- (void)RedPhiRK;
/// Check if the vector loop of \p Plan can early exit and restart
/// execution of last vector iteration in the scalar loop. This requires all
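Note: two independent tweaks above. The VFxUF SCEV expansion is hoisted before the remaining-iteration count; pure code motion, so the expansion is emitted ahead of its use in the min.epilog.iters.check compare. The explicit FMaxNum/FMinNum tests are folded into RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind, which also lets the now-unused RedPhiRK local (and its (void) silencer) go away. Judging from the code it replaces, the predicate is presumably equivalent to the following (RecurKind here is a toy enum standing in for llvm::RecurKind; the real declaration lives with RecurrenceDescriptor):

    enum class RecurKind { FMaxNum, FMinNum, FAdd /* ... */ };

    // Presumable shape of the new helper, matching the checks it replaced.
    static bool isFPMinMaxNumRecurrenceKind(RecurKind Kind) {
      return Kind == RecurKind::FMaxNum || Kind == RecurKind::FMinNum;
    }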
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index fc1a09e..1580a3b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
+ ScalarEvolution &SE;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
- TargetTransformInfo::TargetCostKind CostKind)
+ TargetTransformInfo::TargetCostKind CostKind,
+ ScalarEvolution &SE)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
- CostKind(CostKind) {}
+ CostKind(CostKind), SE(SE) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
/// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
/// and \p Operands with \p VF. This is a convenience wrapper for the
- /// type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Type *ResultTy,
- ArrayRef<const VPValue *> Operands,
- ElementCount VF);
+ /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+ /// is true, always compute the cost of scalarizing replicating operands.
+ InstructionCost
+ getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+ ElementCount VF,
+ bool AlwaysIncludeReplicatingR = false);
};
/// This class can be used to assign names to VPValues. For VPValues without
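Note: VPCostContext now carries a ScalarEvolution reference so recipe costing can hand SE to TTI's address-computation query instead of always passing nullptr. A toy sketch of the plumbing (ToySE, ToyCostContext and addressComputationCost stand in for ScalarEvolution, VPCostContext and the TTI hook; illustrative only):

    struct ToySE {};

    struct ToyCostContext {
      ToySE &SE;
      ToyCostContext(ToySE &SE) : SE(SE) {}
    };

    static int addressComputationCost(ToySE *SE) {
      // With a non-null SE the target could reason about the pointer stride.
      return SE ? 0 : 1;
    }

    static int costFor(ToyCostContext &Ctx, bool UsedByLoadStoreAddress) {
      // Mirrors the call in VPlanRecipes.cpp below: pass SE only when the
      // value is not itself feeding another load/store address.
      return addressComputationCost(UsedByLoadStoreAddress ? nullptr
                                                           : &Ctx.SE);
    }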
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 67b9244..600ff8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -40,6 +40,7 @@
#include <cassert>
using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
using VectorParts = SmallVector<Value *, 2>;
@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
VPRecipeBase *OpR = Op->getDefiningRecipe();
// If the partial reduction is predicated, operand 1 will be a select.
- using namespace llvm::VPlanPatternMatch;
if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
OpR = Op->getDefiningRecipe();
}
@@ -1230,6 +1230,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::ExtractLane:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
@@ -1963,7 +1964,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
VPValue *Op0, *Op1;
- using namespace llvm::VPlanPatternMatch;
if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
(match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -2778,7 +2778,7 @@ VPExpressionRecipe::VPExpressionRecipe(
// Recipes in the expression, except the last one, must only be used by
// (other) recipes inside the expression. If there are other users, external
// to the expression, use a clone of the recipe for external users.
- for (VPSingleDefRecipe *R : ExpressionRecipes) {
+ for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
if (R != ExpressionRecipes.back() &&
any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
return !ExpressionRecipesAsSetOfUsers.contains(U);
@@ -3111,6 +3111,62 @@ bool VPReplicateRecipe::shouldPack() const {
});
}
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+ auto *PtrR = Ptr->getDefiningRecipe();
+ if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+ cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+ Instruction::GetElementPtr) ||
+ isa<VPWidenGEPRecipe>(PtrR) ||
+ match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+ return false;
+
+ // We are looking for a GEP where all indices are either loop invariant or
+ // inductions.
+ for (VPValue *Opd : drop_begin(PtrR->operands())) {
+ if (!Opd->isDefinedOutsideLoopRegions() &&
+ !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+ return false;
+ }
+
+ return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+ SmallPtrSet<const VPUser *, 4> Seen;
+ SmallVector<const VPUser *> WorkList = {V};
+
+ while (!WorkList.empty()) {
+ auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+ if (!Cur || !Seen.insert(Cur).second || isa<VPBlendRecipe>(Cur))
+ continue;
+
+ for (VPUser *U : Cur->users()) {
+ if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+ if (InterleaveR->getAddr() == Cur)
+ return true;
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+ if (RepR->getOpcode() == Instruction::Load &&
+ RepR->getOperand(0) == Cur)
+ return true;
+ if (RepR->getOpcode() == Instruction::Store &&
+ RepR->getOperand(1) == Cur)
+ return true;
+ }
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+ if (MemR->getAddr() == Cur && MemR->isConsecutive())
+ return true;
+ }
+ }
+
+ append_range(WorkList, Cur->users());
+ }
+ return false;
+}
+
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
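Note: isUsedByLoadStoreAddress above is a def-use worklist walk with a Seen set to cut cycles; the walk does not look through blend recipes. Stripped of the recipe-specific address checks, the traversal shape is the standard one below (Node and reachesAddressUse are toy stand-ins, illustrative only):

    #include <unordered_set>
    #include <vector>

    struct Node {
      std::vector<Node *> Users;
      bool IsAddressOfMemOp = false; // stand-in for the recipe checks above
    };

    static bool reachesAddressUse(Node *Start) {
      std::unordered_set<Node *> Seen;
      std::vector<Node *> WorkList{Start};
      while (!WorkList.empty()) {
        Node *Cur = WorkList.back();
        WorkList.pop_back();
        if (!Seen.insert(Cur).second)
          continue; // already visited; def-use chains may revisit nodes
        for (Node *U : Cur->Users) {
          if (U->IsAddressOfMemOp)
            return true;
          WorkList.push_back(U); // keep walking transitive users
        }
      }
      return false;
    }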
@@ -3218,21 +3274,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
}
case Instruction::Load:
case Instruction::Store: {
- if (isSingleScalar()) {
- bool IsLoad = UI->getOpcode() == Instruction::Load;
- Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
- Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
- const Align Alignment = getLoadStoreAlignment(UI);
- unsigned AS = getLoadStoreAddressSpace(UI);
- TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
- InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
- UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
- return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
- ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
- }
+ if (VF.isScalable() && !isSingleScalar())
+ return InstructionCost::getInvalid();
+
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- break;
+ const VPRegionBlock *ParentRegion = getParent()->getParent();
+ if (ParentRegion && ParentRegion->isReplicator())
+ break;
+
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ const VPValue *PtrOp = getOperand(!IsLoad);
+ // TODO: Handle cases where we need to pass a SCEV to
+ // getAddressComputationCost.
+ if (shouldUseAddressAccessSCEV(PtrOp))
+ break;
+
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+ Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+ bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+ bool UsedByLoadStoreAddress =
+ !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
+ InstructionCost ScalarCost =
+ ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
+ nullptr, Ctx.CostKind);
+ if (isSingleScalar())
+ return ScalarCost;
+
+ SmallVector<const VPValue *> OpsToScalarize;
+ Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+ // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
+ // don't assign scalarization overhead if the target prefers vectorized
+ // addressing or if the loaded value is used as part of the address of
+ // another load or store.
+ if (!UsedByLoadStoreAddress) {
+ bool EfficientVectorLoadStore =
+ Ctx.TTI.supportsEfficientVectorElementLoadStore();
+ if (!(IsLoad && !PreferVectorizedAddressing) &&
+ !(!IsLoad && EfficientVectorLoadStore))
+ append_range(OpsToScalarize, operands());
+
+ if (!EfficientVectorLoadStore)
+ ResultTy = Ctx.Types.inferScalarType(this);
+ }
+
+ return (ScalarCost * VF.getFixedValue()) +
+ Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
}
}
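Note: the rewritten Load/Store case now prices non-single-scalar replicated accesses instead of falling back to the legacy model: scalable VFs are invalid, the legacy fallback remains only inside replicate regions or when a SCEV-based address cost would be needed, and otherwise the cost is VF copies of the scalar access plus, when the result does not just feed another address, scalarization overhead. VF.getFixedValue() is safe here because scalable VFs bailed out at the top of the case. Toy arithmetic mirroring the final return (numbers are made up, illustrative only):

    #include <cstdio>

    int main() {
      unsigned VF = 4;                    // fixed vectorization factor
      unsigned ScalarMemOpCost = 1;       // per-lane load/store cost
      unsigned AddrCompCost = 1;          // per-lane address computation
      unsigned ScalarizationOverhead = 4; // extracts/inserts for ops/results

      unsigned ScalarCost = ScalarMemOpCost + AddrCompCost;
      unsigned Total = ScalarCost * VF + ScalarizationOverhead;
      std::printf("replicated mem-op cost at VF=%u: %u\n", VF, Total);
      return 0;
    }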
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ebf833e..c8a2d84 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3180,9 +3180,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
DebugLoc::getUnknown(), "induction");
// Create the widened phi of the vector IV.
- auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
+ auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
WidenIVR->getDebugLoc(), "vec.ind");
- WidePHI->addOperand(Init);
WidePHI->insertBefore(WidenIVR);
// Create the backedge value for the vector IV.
@@ -3545,8 +3544,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPValue *A, *B;
VPValue *Tmp = nullptr;
// Sub reductions could have a sub between the add reduction and vec op.
- if (match(VecOp,
- m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Tmp)))) {
+ if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
Sub = VecOp->getDefiningRecipe();
VecOp = Tmp;
}
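Note: the VPWidenPHIRecipe change is behavior-preserving; the constructor (see the VPlan.h hunk above) already adds a non-null Start as the first operand, so passing Init directly replaces the manual addOperand call. The matcher change swaps the verbose m_Binary<Instruction::Sub>(m_SpecificInt(0), ...) spelling for m_Sub(m_ZeroInt(), ...), recognizing a negation written as (0 - x), which is how a sub-reduction feeds its add reduction. LLVM IR's PatternMatch has the analogous spelling:

    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Recognize a negation written as (0 - X) and return X, mirroring the
    // VPlan matcher above.
    static Value *matchNegatedOperand(Value *V) {
      Value *X;
      if (match(V, m_Sub(m_Zero(), m_Value(X))))
        return X; // V computes -X
      return nullptr;
    }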
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 0599930..66748c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -71,8 +71,8 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
m_Specific(&Plan.getVF()))) ||
IsWideCanonicalIV(A));
- return match(V, m_Binary<Instruction::ICmp>(m_VPValue(A), m_VPValue(B))) &&
- IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount();
+ return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) &&
+ B == Plan.getOrCreateBackedgeTakenCount();
}
const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {