Diffstat (limited to 'llvm/lib/Transforms/Instrumentation')
-rw-r--r--  llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp            |  47
-rw-r--r--  llvm/lib/Transforms/Instrumentation/AllocToken.cpp                  |  76
-rw-r--r--  llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp              |   7
-rw-r--r--  llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp      |   2
-rw-r--r--  llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp           |  12
-rw-r--r--  llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp              |   4
-rw-r--r--  llvm/lib/Transforms/Instrumentation/KCFI.cpp                        |   1
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemProfUse.cpp                  |  96
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp             | 305
-rw-r--r--  llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp |   4
-rw-r--r--  llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp          |   2
-rw-r--r--  llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp           |   3
-rw-r--r--  llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp           |   2
-rw-r--r--  llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp               |   2
14 files changed, 394 insertions, 169 deletions
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 7c364f8..3ea290a7 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
@@ -248,6 +249,11 @@ static cl::opt<bool>
"platforms that support this"),
cl::Hidden, cl::init(true));
+static cl::opt<int>
+ ClShadowAddrSpace("asan-shadow-addr-space",
+ cl::desc("Address space for pointers to the shadow map"),
+ cl::Hidden, cl::init(0));
+
static cl::opt<bool> ClWithIfuncSuppressRemat(
"asan-with-ifunc-suppress-remat",
cl::desc("Suppress rematerialization of dynamic shadow address by passing "
@@ -436,6 +442,15 @@ static cl::opt<AsanDtorKind> ClOverrideDestructorKind(
"Use global destructors")),
cl::init(AsanDtorKind::Invalid), cl::Hidden);
+static SmallSet<unsigned, 8> SrcAddrSpaces;
+static cl::list<unsigned> ClAddrSpaces(
+ "asan-instrument-address-spaces",
+ cl::desc("Only instrument variables in the specified address spaces."),
+ cl::Hidden, cl::CommaSeparated, cl::ZeroOrMore,
+ cl::callback([](const unsigned &AddrSpace) {
+ SrcAddrSpaces.insert(AddrSpace);
+ }));
+
// Debug flags.
static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
@@ -503,6 +518,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
bool IsAMDGPU = TargetTriple.isAMDGPU();
bool IsHaiku = TargetTriple.isOSHaiku();
bool IsWasm = TargetTriple.isWasm();
+ bool IsBPF = TargetTriple.isBPF();
ShadowMapping Mapping;
@@ -579,6 +595,8 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
else if (IsHaiku && IsX86_64)
Mapping.Offset = (kSmallX86_64ShadowOffsetBase &
(kSmallX86_64ShadowOffsetAlignMask << Mapping.Scale));
+ else if (IsBPF)
+ Mapping.Offset = kDynamicShadowSentinel;
else
Mapping.Offset = kDefaultShadowOffset64;
}
@@ -1355,11 +1373,25 @@ static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
static bool isUnsupportedAMDGPUAddrspace(Value *Addr) {
Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
unsigned int AddrSpace = PtrTy->getPointerAddressSpace();
+ // Globals in address space 1 and 4 are supported for AMDGPU.
if (AddrSpace == 3 || AddrSpace == 5)
return true;
return false;
}
+static bool isSupportedAddrspace(const Triple &TargetTriple, Value *Addr) {
+ Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+ unsigned int AddrSpace = PtrTy->getPointerAddressSpace();
+
+ if (!SrcAddrSpaces.empty())
+ return SrcAddrSpaces.count(AddrSpace);
+
+ if (TargetTriple.isAMDGPU())
+ return !isUnsupportedAMDGPUAddrspace(Addr);
+
+ return AddrSpace == 0;
+}
+
Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
// Shadow >> scale
Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
@@ -1423,10 +1455,9 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
}
bool AddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
- // Instrument accesses from different address spaces only for AMDGPU.
- Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0 &&
- !(TargetTriple.isAMDGPU() && !isUnsupportedAMDGPUAddrspace(Ptr)))
+ // Check whether the target supports sanitizing the address space
+ // of the pointer.
+ if (!isSupportedAddrspace(TargetTriple, Ptr))
return true;
// Ignore swifterror addresses.
@@ -1942,7 +1973,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
Type *ShadowTy =
IntegerType::get(*C, std::max(8U, TypeStoreSize >> Mapping.Scale));
- Type *ShadowPtrTy = PointerType::get(*C, 0);
+ Type *ShadowPtrTy = PointerType::get(*C, ClShadowAddrSpace);
Value *ShadowPtr = memToShadow(AddrLong, IRB);
const uint64_t ShadowAlign =
std::max<uint64_t>(Alignment.valueOrOne().value() >> Mapping.Scale, 1);
@@ -2089,9 +2120,7 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
return false;
if (!Ty->isSized()) return false;
if (!G->hasInitializer()) return false;
- // Globals in address space 1 and 4 are supported for AMDGPU.
- if (G->getAddressSpace() &&
- !(TargetTriple.isAMDGPU() && !isUnsupportedAMDGPUAddrspace(G)))
+ if (!isSupportedAddrspace(TargetTriple, G))
return false;
if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
// Two problems with thread-locals:
@@ -2669,7 +2698,7 @@ void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB,
// ODR should not happen for local linkage.
if (NewGlobal->hasLocalLinkage()) {
- ODRIndicator = ConstantInt::get(IntptrTy, -1);
+ ODRIndicator = ConstantInt::getAllOnesValue(IntptrTy);
} else if (UseOdrIndicator) {
// With local aliases, we need to provide another externally visible
// symbol __odr_asan_XXX to detect ODR violation.
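[Editorial sketch, not part of the patch: a minimal standalone model of the address-space filter the AddressSanitizer hunks above introduce. The function name, parameters, and the plain std::set standing in for the static SmallSet are illustrative; the logic mirrors isSupportedAddrspace() and the new -asan-instrument-address-spaces override.]

#include <set>

// Sketch of the selection logic: an explicit list overrides all target
// defaults; otherwise AMDGPU excludes LDS (3) and private (5), and every
// other target only instruments the default address space 0.
static bool isSupportedAddrspaceSketch(bool IsAMDGPU, unsigned AddrSpace,
                                       const std::set<unsigned> &SrcAddrSpaces) {
  if (!SrcAddrSpaces.empty())
    return SrcAddrSpaces.count(AddrSpace) != 0;
  if (IsAMDGPU)
    return AddrSpace != 3 && AddrSpace != 5;
  return AddrSpace == 0;
}

[With an explicit list, e.g. -asan-instrument-address-spaces=0,1, only pointers in those address spaces would be instrumented, regardless of target.]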
diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
index 8181e4e..38eeee2 100644
--- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
@@ -67,9 +67,10 @@ cl::opt<std::string> ClFuncPrefix("alloc-token-prefix",
cl::desc("The allocation function prefix"),
cl::Hidden, cl::init("__alloc_token_"));
-cl::opt<uint64_t> ClMaxTokens("alloc-token-max",
- cl::desc("Maximum number of tokens (0 = no max)"),
- cl::Hidden, cl::init(0));
+cl::opt<uint64_t>
+ ClMaxTokens("alloc-token-max",
+ cl::desc("Maximum number of tokens (0 = target SIZE_MAX)"),
+ cl::Hidden, cl::init(0));
cl::opt<bool>
ClFastABI("alloc-token-fast-abi",
@@ -233,12 +234,31 @@ public:
}
};
-// Apply opt overrides.
-AllocTokenOptions transformOptionsFromCl(AllocTokenOptions Opts) {
- if (!Opts.MaxTokens.has_value())
+// Apply opt overrides and module flags.
+static AllocTokenOptions resolveOptions(AllocTokenOptions Opts,
+ const Module &M) {
+ auto IntModuleFlagOrNull = [&](StringRef Key) {
+ return mdconst::extract_or_null<ConstantInt>(M.getModuleFlag(Key));
+ };
+
+ if (auto *S = dyn_cast_or_null<MDString>(M.getModuleFlag("alloc-token-mode")))
+ if (auto Mode = getAllocTokenModeFromString(S->getString()))
+ Opts.Mode = *Mode;
+ if (auto *Val = IntModuleFlagOrNull("alloc-token-max"))
+ Opts.MaxTokens = Val->getZExtValue();
+ if (auto *Val = IntModuleFlagOrNull("alloc-token-fast-abi"))
+ Opts.FastABI |= Val->isOne();
+ if (auto *Val = IntModuleFlagOrNull("alloc-token-extended"))
+ Opts.Extended |= Val->isOne();
+
+ // Allow overriding options from command line options.
+ if (ClMaxTokens.getNumOccurrences())
Opts.MaxTokens = ClMaxTokens;
- Opts.FastABI |= ClFastABI;
- Opts.Extended |= ClExtended;
+ if (ClFastABI.getNumOccurrences())
+ Opts.FastABI = ClFastABI;
+ if (ClExtended.getNumOccurrences())
+ Opts.Extended = ClExtended;
+
return Opts;
}
@@ -246,21 +266,21 @@ class AllocToken {
public:
explicit AllocToken(AllocTokenOptions Opts, Module &M,
ModuleAnalysisManager &MAM)
- : Options(transformOptionsFromCl(std::move(Opts))), Mod(M),
+ : Options(resolveOptions(std::move(Opts), M)), Mod(M),
FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
- Mode(IncrementMode(*IntPtrTy, *Options.MaxTokens)) {
+ Mode(IncrementMode(*IntPtrTy, Options.MaxTokens)) {
switch (Options.Mode) {
case TokenMode::Increment:
break;
case TokenMode::Random:
- Mode.emplace<RandomMode>(*IntPtrTy, *Options.MaxTokens,
+ Mode.emplace<RandomMode>(*IntPtrTy, Options.MaxTokens,
M.createRNG(DEBUG_TYPE));
break;
case TokenMode::TypeHash:
- Mode.emplace<TypeHashMode>(*IntPtrTy, *Options.MaxTokens);
+ Mode.emplace<TypeHashMode>(*IntPtrTy, Options.MaxTokens);
break;
case TokenMode::TypeHashPointerSplit:
- Mode.emplace<TypeHashPointerSplitMode>(*IntPtrTy, *Options.MaxTokens);
+ Mode.emplace<TypeHashPointerSplitMode>(*IntPtrTy, Options.MaxTokens);
break;
}
}
@@ -317,8 +337,6 @@ bool AllocToken::instrumentFunction(Function &F) {
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
return false;
- auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
SmallVector<std::pair<CallBase *, LibFunc>, 4> AllocCalls;
SmallVector<IntrinsicInst *, 4> IntrinsicInsts;
@@ -327,6 +345,10 @@ bool AllocToken::instrumentFunction(Function &F) {
F.hasFnAttribute(Attribute::SanitizeAllocToken) &&
!F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation);
+ // Get TLI only when required.
+ const TargetLibraryInfo *TLI =
+ InstrumentFunction ? &FAM.getResult<TargetLibraryAnalysis>(F) : nullptr;
+
// Collect all allocation calls to avoid iterator invalidation.
for (Instruction &I : instructions(F)) {
// Collect all alloc_token_* intrinsics.
@@ -342,26 +364,28 @@ bool AllocToken::instrumentFunction(Function &F) {
auto *CB = dyn_cast<CallBase>(&I);
if (!CB)
continue;
- if (std::optional<LibFunc> Func = shouldInstrumentCall(*CB, TLI))
+ if (std::optional<LibFunc> Func = shouldInstrumentCall(*CB, *TLI))
AllocCalls.emplace_back(CB, Func.value());
}
+ // Return early to avoid unnecessarily instantiating the ORE.
+ if (AllocCalls.empty() && IntrinsicInsts.empty())
+ return false;
+
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
bool Modified = false;
- if (!AllocCalls.empty()) {
- for (auto &[CB, Func] : AllocCalls)
- Modified |= replaceAllocationCall(CB, Func, ORE, TLI);
- if (Modified)
- NumFunctionsModified++;
- }
+ for (auto &[CB, Func] : AllocCalls)
+ Modified |= replaceAllocationCall(CB, Func, ORE, *TLI);
- if (!IntrinsicInsts.empty()) {
- for (auto *II : IntrinsicInsts)
- replaceIntrinsicInst(II, ORE);
+ for (auto *II : IntrinsicInsts) {
+ replaceIntrinsicInst(II, ORE);
Modified = true;
- NumFunctionsModified++;
}
+ if (Modified)
+ NumFunctionsModified++;
+
return Modified;
}
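[Editorial sketch, not part of the patch: one way a frontend might emit the module flags that resolveOptions() now consumes. The flag keys come from the hunk above; the "increment" mode string and the concrete values are assumptions made for illustration.]

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"

// Hypothetical frontend helper: record AllocToken options in module flags.
void setAllocTokenModuleFlags(llvm::Module &M) {
  llvm::LLVMContext &Ctx = M.getContext();
  // Mode is read back via getAllocTokenModeFromString(); the exact string is
  // an assumption here.
  M.addModuleFlag(llvm::Module::Error, "alloc-token-mode",
                  llvm::MDString::get(Ctx, "increment"));
  M.addModuleFlag(llvm::Module::Error, "alloc-token-max", 1024);
  M.addModuleFlag(llvm::Module::Error, "alloc-token-fast-abi", 1);
  M.addModuleFlag(llvm::Module::Error, "alloc-token-extended", 1);
}

[Per resolveOptions(), command-line flags that were explicitly passed still override these module flags.]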
diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 9239ae8..b5a8f79 100644
--- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -178,6 +178,8 @@ getRuntimeCallName(const BoundsCheckingPass::Options::Runtime &Opts) {
Name += "_minimal";
if (!Opts.MayReturn)
Name += "_abort";
+ else if (Opts.HandlerPreserveAllRegs)
+ Name += "_preserve";
return Name;
}
@@ -267,7 +269,10 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
TrapCall->setDoesNotReturn();
IRB.CreateUnreachable();
}
-
+ // The preserve-all logic is somewhat duplicated in CGExpr.cpp for
+ // local-bounds. Make sure to change that too.
+ if (Opts.Rt && Opts.Rt->HandlerPreserveAllRegs && MayReturn)
+ TrapCall->setCallingConv(CallingConv::PreserveAll);
if (!MayReturn && SingleTrapBB && !DebugTrapBB)
ReuseTrapBB = TrapBB;
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 0688bc7..726d94b 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -1992,6 +1992,8 @@ void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond,
// Use logical and to avoid propagating poison from later conditions.
MergedCondition = IRB.CreateLogicalAnd(MergedCondition, Cond);
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *cast<Instruction>(MergedCondition), DEBUG_TYPE);
}
void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) {
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index cc53ec2..e984ac4 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -2191,8 +2191,16 @@ std::pair<Value *, Value *> DFSanFunction::loadShadowFast(
// and then the entire shadow for the second origin pointer (which will be
// chosen by combineOrigins() iff the least-significant half of the wide
// shadow was empty but the other half was not).
- Value *WideShadowLo = IRB.CreateShl(
- WideShadow, ConstantInt::get(WideShadowTy, WideShadowBitWidth / 2));
+ Value *WideShadowLo =
+ F->getParent()->getDataLayout().isLittleEndian()
+ ? IRB.CreateShl(
+ WideShadow,
+ ConstantInt::get(WideShadowTy, WideShadowBitWidth / 2))
+ : IRB.CreateAnd(
+ WideShadow,
+ ConstantInt::get(WideShadowTy,
+ (1 - (1 << (WideShadowBitWidth / 2)))
+ << (WideShadowBitWidth / 2)));
Shadows.push_back(WideShadow);
Origins.push_back(DFS.loadNextOrigin(Pos, OriginAlign, &OriginAddr));
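[Editorial sketch, not part of the patch: a small host-side illustration of why the pre-existing little-endian path can simply shift the wide shadow left by half its width. Only the least-significant half survives the shift, so the first origin is chosen exactly when that half carries shadow; the big-endian branch added above selects the corresponding half with a mask instead. The values below are made up for the example.]

#include <cassert>
#include <cstdint>

int main() {
  const unsigned WideShadowBitWidth = 64;
  // Low half clean, high half poisoned (little-endian byte order).
  uint64_t WideShadow = 0xFFFFFFFF00000000ull;
  uint64_t WideShadowLo = WideShadow << (WideShadowBitWidth / 2);
  assert(WideShadowLo == 0); // first origin not chosen
  assert(WideShadow != 0);   // full shadow selects the second origin
  return 0;
}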
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index b5548d4..8c8d16a6 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1944,6 +1944,10 @@ void InstrLowerer::emitNameData() {
NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
GlobalValue::PrivateLinkage, NamesVal,
getInstrProfNamesVarName());
+ if (isGPUProfTarget(M)) {
+ NamesVar->setLinkage(GlobalValue::ExternalLinkage);
+ NamesVar->setVisibility(GlobalValue::ProtectedVisibility);
+ }
NamesSize = CompressedNameStr.size();
setGlobalVariableLargeSection(TT, *NamesVar);
diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp
index f4cb4e2..f06b1d3 100644
--- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp
+++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/xxhash.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index b72d41a..25953f4 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -63,6 +63,11 @@ static cl::opt<bool>
cl::Hidden, cl::init(false));
static cl::opt<bool>
+ PrintFunctionGuids("memprof-print-function-guids",
+ cl::desc("Print function GUIDs computed for matching"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
SalvageStaleProfile("memprof-salvage-stale-profile",
cl::desc("Salvage stale MemProf profile"),
cl::init(false), cl::Hidden);
@@ -454,6 +459,15 @@ handleAllocSite(Instruction &I, CallBase *CI,
InlinedCallStack.size())] = {
AllocInfo->Info.getTotalSize(), AllocType};
}
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "MemProfUse", CI)
+ << ore::NV("AllocationCall", CI) << " in function "
+ << ore::NV("Caller", CI->getFunction())
+ << " matched alloc context with alloc type "
+ << ore::NV("Attribute", getAllocTypeAttributeString(AllocType))
+ << " total size " << ore::NV("Size", AllocInfo->Info.getTotalSize())
+ << " full context id " << ore::NV("Context", FullStackId)
+ << " frame count " << ore::NV("Frames", InlinedCallStack.size()));
}
}
// If the threshold for the percent of cold bytes is less than 100%,
@@ -495,53 +509,59 @@ struct CallSiteEntry {
ArrayRef<Frame> Frames;
// Potential targets for indirect calls.
ArrayRef<GlobalValue::GUID> CalleeGuids;
-
- // Only compare Frame contents.
- // Use pointer-based equality instead of ArrayRef's operator== which does
- // element-wise comparison. We want to check if it's the same slice of the
- // underlying array, not just equivalent content.
- bool operator==(const CallSiteEntry &Other) const {
- return Frames.data() == Other.Frames.data() &&
- Frames.size() == Other.Frames.size();
- }
};
-struct CallSiteEntryHash {
- size_t operator()(const CallSiteEntry &Entry) const {
- return computeFullStackId(Entry.Frames);
- }
-};
-
-static void handleCallSite(
- Instruction &I, const Function *CalledFunction,
- ArrayRef<uint64_t> InlinedCallStack,
- const std::unordered_set<CallSiteEntry, CallSiteEntryHash> &CallSiteEntries,
- Module &M, std::set<std::vector<uint64_t>> &MatchedCallSites) {
+static void handleCallSite(Instruction &I, const Function *CalledFunction,
+ ArrayRef<uint64_t> InlinedCallStack,
+ const std::vector<CallSiteEntry> &CallSiteEntries,
+ Module &M,
+ std::set<std::vector<uint64_t>> &MatchedCallSites,
+ OptimizationRemarkEmitter &ORE) {
auto &Ctx = M.getContext();
+ // Set of Callee GUIDs to attach to indirect calls. We accumulate all of them
+ // to support cases where the instuction's inlined frames match multiple call
+ // site entries, which can happen if the profile was collected from a binary
+ // where this instruction was eventually inlined into multiple callers.
+ SetVector<GlobalValue::GUID> CalleeGuids;
+ bool CallsiteMDAdded = false;
for (const auto &CallSiteEntry : CallSiteEntries) {
// If we found and thus matched all frames on the call, create and
// attach call stack metadata.
if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
InlinedCallStack)) {
NumOfMemProfMatchedCallSites++;
- addCallsiteMetadata(I, InlinedCallStack, Ctx);
-
- // Try to attach indirect call metadata if possible.
- if (!CalledFunction)
- addVPMetadata(M, I, CallSiteEntry.CalleeGuids);
-
// Only need to find one with a matching call stack and add a single
// callsite metadata.
-
- // Accumulate call site matching information upon request.
- if (ClPrintMemProfMatchInfo) {
- std::vector<uint64_t> CallStack;
- append_range(CallStack, InlinedCallStack);
- MatchedCallSites.insert(std::move(CallStack));
+ if (!CallsiteMDAdded) {
+ addCallsiteMetadata(I, InlinedCallStack, Ctx);
+
+ // Accumulate call site matching information upon request.
+ if (ClPrintMemProfMatchInfo) {
+ std::vector<uint64_t> CallStack;
+ append_range(CallStack, InlinedCallStack);
+ MatchedCallSites.insert(std::move(CallStack));
+ }
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemProfUse", &I)
+ << ore::NV("CallSite", &I) << " in function "
+ << ore::NV("Caller", I.getFunction())
+ << " matched callsite with frame count "
+ << ore::NV("Frames", InlinedCallStack.size()));
+
+ // If this is a direct call, we're done.
+ if (CalledFunction)
+ break;
+ CallsiteMDAdded = true;
}
- break;
+
+ assert(!CalledFunction && "Didn't expect direct call");
+
+ // Collect Callee GUIDs from all matching CallSiteEntries.
+ CalleeGuids.insert(CallSiteEntry.CalleeGuids.begin(),
+ CallSiteEntry.CalleeGuids.end());
}
}
+ // Try to attach indirect call metadata if possible.
+ addVPMetadata(M, I, CalleeGuids.getArrayRef());
}
static void readMemprof(Module &M, Function &F,
@@ -562,6 +582,9 @@ static void readMemprof(Module &M, Function &F,
// linkage function.
auto FuncName = F.getName();
auto FuncGUID = Function::getGUIDAssumingExternalLinkage(FuncName);
+ if (PrintFunctionGuids)
+ errs() << "MemProf: Function GUID " << FuncGUID << " is " << FuncName
+ << "\n";
std::optional<memprof::MemProfRecord> MemProfRec;
auto Err = MemProfReader->getMemProfRecord(FuncGUID).moveInto(MemProfRec);
if (Err) {
@@ -616,8 +639,7 @@ static void readMemprof(Module &M, Function &F,
// For the callsites we need to record slices of the frame array (see comments
// below where the map entries are added) along with their CalleeGuids.
- std::map<uint64_t, std::unordered_set<CallSiteEntry, CallSiteEntryHash>>
- LocHashToCallSites;
+ std::map<uint64_t, std::vector<CallSiteEntry>> LocHashToCallSites;
for (auto &AI : MemProfRec->AllocSites) {
NumOfMemProfAllocContextProfiles++;
// Associate the allocation info with the leaf frame. The later matching
@@ -636,7 +658,7 @@ static void readMemprof(Module &M, Function &F,
uint64_t StackId = computeStackId(StackFrame);
ArrayRef<Frame> FrameSlice = ArrayRef<Frame>(CS.Frames).drop_front(Idx++);
ArrayRef<GlobalValue::GUID> CalleeGuids(CS.CalleeGuids);
- LocHashToCallSites[StackId].insert({FrameSlice, CalleeGuids});
+ LocHashToCallSites[StackId].push_back({FrameSlice, CalleeGuids});
ProfileHasColumns |= StackFrame.Column;
// Once we find this function, we can stop recording.
@@ -719,7 +741,7 @@ static void readMemprof(Module &M, Function &F,
// instruction's leaf location in the callsites map and not the
// allocation map.
handleCallSite(I, CalledFunction, InlinedCallStack,
- CallSitesIter->second, M, MatchedCallSites);
+ CallSitesIter->second, M, MatchedCallSites, ORE);
}
}
}
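[Editorial sketch, not part of the patch: the GUID accumulation behaviour that the reworked handleCallSite() relies on. The helper name and the plain uint64_t GUIDs are illustrative. llvm::SetVector drops duplicates while preserving first-seen order, so an indirect call whose inlined frames match several profile entries gets each candidate callee exactly once.]

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdint>

// Hypothetical helper: merge the CalleeGuids of every matching CallSiteEntry.
llvm::SmallVector<uint64_t, 4>
mergeCalleeGuids(llvm::ArrayRef<llvm::ArrayRef<uint64_t>> MatchingEntries) {
  llvm::SetVector<uint64_t> CalleeGuids;
  for (llvm::ArrayRef<uint64_t> Guids : MatchingEntries)
    CalleeGuids.insert(Guids.begin(), Guids.end());
  // The deduplicated, ordered set is what addVPMetadata() would receive.
  return llvm::SmallVector<uint64_t, 4>(CalleeGuids.begin(),
                                        CalleeGuids.end());
}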
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 471c6ec..32ee16c 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2720,34 +2720,55 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// of elements.
//
// For example, suppose we have:
- // VectorA: <a1, a2, a3, a4, a5, a6>
- // VectorB: <b1, b2, b3, b4, b5, b6>
- // ReductionFactor: 3.
+ // VectorA: <a0, a1, a2, a3, a4, a5>
+ // VectorB: <b0, b1, b2, b3, b4, b5>
+ // ReductionFactor: 3
+ // Shards: 1
// The output would be:
- // <a1|a2|a3, a4|a5|a6, b1|b2|b3, b4|b5|b6>
+ // <a0|a1|a2, a3|a4|a5, b0|b1|b2, b3|b4|b5>
+ //
+ // If we have:
+ // VectorA: <a0, a1, a2, a3, a4, a5, a6, a7>
+ // VectorB: <b0, b1, b2, b3, b4, b5, b6, b7>
+ // ReductionFactor: 2
+ // Shards: 2
+ // then a and b each have 2 "shards", resulting in the output being
+ // interleaved:
+ // <a0|a1, a2|a3, b0|b1, b2|b3, a4|a5, a6|a7, b4|b5, b6|b7>
//
// This is convenient for instrumenting horizontal add/sub.
// For bitwise OR on "vertical" pairs, see maybeHandleSimpleNomemIntrinsic().
Value *horizontalReduce(IntrinsicInst &I, unsigned ReductionFactor,
- Value *VectorA, Value *VectorB) {
+ unsigned Shards, Value *VectorA, Value *VectorB) {
assert(isa<FixedVectorType>(VectorA->getType()));
- unsigned TotalNumElems =
+ unsigned NumElems =
cast<FixedVectorType>(VectorA->getType())->getNumElements();
+ [[maybe_unused]] unsigned TotalNumElems = NumElems;
if (VectorB) {
assert(VectorA->getType() == VectorB->getType());
- TotalNumElems = TotalNumElems * 2;
+ TotalNumElems *= 2;
}
- assert(TotalNumElems % ReductionFactor == 0);
+ assert(NumElems % (ReductionFactor * Shards) == 0);
Value *Or = nullptr;
IRBuilder<> IRB(&I);
for (unsigned i = 0; i < ReductionFactor; i++) {
SmallVector<int, 16> Mask;
- for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
- Mask.push_back(X + i);
+
+ for (unsigned j = 0; j < Shards; j++) {
+ unsigned Offset = NumElems / Shards * j;
+
+ for (unsigned X = 0; X < NumElems / Shards; X += ReductionFactor)
+ Mask.push_back(Offset + X + i);
+
+ if (VectorB) {
+ for (unsigned X = 0; X < NumElems / Shards; X += ReductionFactor)
+ Mask.push_back(NumElems + Offset + X + i);
+ }
+ }
Value *Masked;
if (VectorB)
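[Editorial sketch, not part of the patch: a standalone program that reproduces the shuffle masks horizontalReduce() builds for the sharded example documented above (two 8-element operands, ReductionFactor 2, Shards 2).]

#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElems = 8, ReductionFactor = 2, Shards = 2;
  const bool HasVectorB = true;
  for (unsigned i = 0; i < ReductionFactor; i++) {
    std::vector<int> Mask;
    for (unsigned j = 0; j < Shards; j++) {
      unsigned Offset = NumElems / Shards * j;
      for (unsigned X = 0; X < NumElems / Shards; X += ReductionFactor)
        Mask.push_back(Offset + X + i);
      if (HasVectorB)
        for (unsigned X = 0; X < NumElems / Shards; X += ReductionFactor)
          Mask.push_back(NumElems + Offset + X + i);
    }
    // Prints: i=0: 0 2 8 10 4 6 12 14
    //         i=1: 1 3 9 11 5 7 13 15
    // OR-ing the two shuffled vectors yields
    // <a0|a1, a2|a3, b0|b1, b2|b3, a4|a5, a6|a7, b4|b5, b6|b7>.
    std::printf("i=%u:", i);
    for (int M : Mask)
      std::printf(" %d", M);
    std::printf("\n");
  }
  return 0;
}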
@@ -2769,7 +2790,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
///
/// e.g., <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>)
/// <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
- void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I) {
+ void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I, unsigned Shards) {
assert(I.arg_size() == 1 || I.arg_size() == 2);
assert(I.getType()->isVectorTy());
@@ -2792,8 +2813,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (I.arg_size() == 2)
SecondArgShadow = getShadow(&I, 1);
- Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow,
- SecondArgShadow);
+ Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, Shards,
+ FirstArgShadow, SecondArgShadow);
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
@@ -2808,7 +2829,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// conceptually operates on
/// (<4 x i16> [[VAR1]], <4 x i16> [[VAR2]])
/// and can be handled with ReinterpretElemWidth == 16.
- void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I,
+ void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I, unsigned Shards,
int ReinterpretElemWidth) {
assert(I.arg_size() == 1 || I.arg_size() == 2);
@@ -2852,8 +2873,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
SecondArgShadow = IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy);
}
- Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow,
- SecondArgShadow);
+ Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, Shards,
+ FirstArgShadow, SecondArgShadow);
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
@@ -3903,7 +3924,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// adding/"accumulating" %s. "Accumulation" stores the result in one
// of the source registers, but this accumulate vs. add distinction
// is lost when dealing with LLVM intrinsics.)
+ //
+ // ZeroPurifies means that multiplying a known-zero with an uninitialized
+ // value results in an initialized value. This is applicable for integer
+ // multiplication, but not floating-point (counter-example: NaN).
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor,
+ bool ZeroPurifies,
unsigned EltSizeInBits = 0) {
IRBuilder<> IRB(&I);
@@ -3945,7 +3971,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
assert(AccumulatorType == ReturnType);
}
- FixedVectorType *ImplicitReturnType = ReturnType;
+ FixedVectorType *ImplicitReturnType =
+ cast<FixedVectorType>(getShadowTy(ReturnType));
// Step 1: instrument multiplication of corresponding vector elements
if (EltSizeInBits) {
ImplicitReturnType = cast<FixedVectorType>(
@@ -3964,30 +3991,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ReturnType->getNumElements() * ReductionFactor);
}
- // Multiplying an *initialized* zero by an uninitialized element results in
- // an initialized zero element.
- //
- // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
- // results in an unpoisoned value. We can therefore adapt the visitAnd()
- // instrumentation:
- // OutShadow = (SaNonZero & SbNonZero)
- // | (VaNonZero & SbNonZero)
- // | (SaNonZero & VbNonZero)
- // where non-zero is checked on a per-element basis (not per bit).
- Value *SZero = Constant::getNullValue(Va->getType());
- Value *VZero = Constant::getNullValue(Sa->getType());
- Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero);
- Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero);
- Value *VaNonZero = IRB.CreateICmpNE(Va, VZero);
- Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero);
-
- Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
- Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
- Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
-
// Each element of the vector is represented by a single bit (poisoned or
// not) e.g., <8 x i1>.
- Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+ Value *SaNonZero = IRB.CreateIsNotNull(Sa);
+ Value *SbNonZero = IRB.CreateIsNotNull(Sb);
+ Value *And;
+ if (ZeroPurifies) {
+ // Multiplying an *initialized* zero by an uninitialized element results
+ // in an initialized zero element.
+ //
+ // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
+ // results in an unpoisoned value. We can therefore adapt the visitAnd()
+ // instrumentation:
+ // OutShadow = (SaNonZero & SbNonZero)
+ // | (VaNonZero & SbNonZero)
+ // | (SaNonZero & VbNonZero)
+ // where non-zero is checked on a per-element basis (not per bit).
+ Value *VaInt = Va;
+ Value *VbInt = Vb;
+ if (!Va->getType()->isIntegerTy()) {
+ VaInt = CreateAppToShadowCast(IRB, Va);
+ VbInt = CreateAppToShadowCast(IRB, Vb);
+ }
+
+ Value *VaNonZero = IRB.CreateIsNotNull(VaInt);
+ Value *VbNonZero = IRB.CreateIsNotNull(VbInt);
+
+ Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero);
+ Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero);
+ Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero);
+
+ And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero});
+ } else {
+ And = IRB.CreateOr({SaNonZero, SbNonZero});
+ }
// Extend <8 x i1> to <8 x i16>.
// (The real pmadd intrinsic would have computed intermediate values of
@@ -5752,17 +5789,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
case Intrinsic::x86_avx2_pmadd_ub_sw:
case Intrinsic::x86_avx512_pmaddubs_w_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true);
break;
// <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
break;
// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
break;
// AVX Vector Neural Network Instructions: bytes
@@ -5848,71 +5888,144 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx2_vpdpbuuds_128:
case Intrinsic::x86_avx2_vpdpbuuds_256:
case Intrinsic::x86_avx10_vpdpbuuds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
break;
// AVX Vector Neural Network Instructions: words
//
// Multiply and Add Signed Word Integers
// < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
- // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
// < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
- // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
// <16 x i32> @llvm.x86.avx512.vpdpwssd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
//
// Multiply and Add Signed Word Integers With Saturation
// < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
- // (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
// < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
- // (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
// <16 x i32> @llvm.x86.avx512.vpdpwssds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>)
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
+ //
+ // Multiply and Add Signed and Unsigned Word Integers
+ // < 4 x i32> @llvm.x86.avx2.vpdpwsud.128
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpwsud.256
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
+ // <16 x i32> @llvm.x86.avx10.vpdpwsud.512
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
+ //
+ // Multiply and Add Signed and Unsigned Word Integers With Saturation
+ // < 4 x i32> @llvm.x86.avx2.vpdpwsuds.128
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpwsuds.256
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
+ // <16 x i32> @llvm.x86.avx10.vpdpwsuds.512
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
+ //
+ // Multiply and Add Unsigned and Signed Word Integers
+ // < 4 x i32> @llvm.x86.avx2.vpdpwusd.128
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpwusd.256
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
+ // <16 x i32> @llvm.x86.avx10.vpdpwusd.512
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
+ //
+ // Multiply and Add Unsigned and Signed Word Integers With Saturation
+ // < 4 x i32> @llvm.x86.avx2.vpdpwusds.128
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpwusds.256
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
+ // <16 x i32> @llvm.x86.avx10.vpdpwusds.512
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
+ //
+ // Multiply and Add Unsigned and Unsigned Word Integers
+ // < 4 x i32> @llvm.x86.avx2.vpdpwuud.128
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpwuud.256
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
+ // <16 x i32> @llvm.x86.avx10.vpdpwuud.512
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
+ //
+ // Multiply and Add Unsigned and Unsigned Word Integers With Saturation
+ // < 4 x i32> @llvm.x86.avx2.vpdpwuuds.128
+ // (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+ // < 8 x i32> @llvm.x86.avx2.vpdpwuuds.256
+ // (< 8 x i32>, <16 x i16>, <16 x i16>)
+ // <16 x i32> @llvm.x86.avx10.vpdpwuuds.512
+ // (<16 x i32>, <32 x i16>, <32 x i16>)
//
// These intrinsics are auto-upgraded into non-masked forms:
// <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <8 x i16>, <8 x i16>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <8 x i16>, <8 x i16>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <16 x i16>, <16 x i16>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <16 x i16>, <16 x i16>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <32 x i16>, <32 x i16>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <32 x i16>, <32 x i16>, i16)
//
// <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <8 x i16>, <8 x i16>, i8)
// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
- // (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+ // (<4 x i32>, <8 x i16>, <8 x i16>, i8)
// <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <16 x i16>, <16 x i16>, i8)
// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
- // (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+ // (<8 x i32>, <16 x i16>, <16 x i16>, i8)
// <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <32 x i16>, <32 x i16>, i16)
// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
- // (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+ // (<16 x i32>, <32 x i16>, <32 x i16>, i16)
case Intrinsic::x86_avx512_vpdpwssd_128:
case Intrinsic::x86_avx512_vpdpwssd_256:
case Intrinsic::x86_avx512_vpdpwssd_512:
case Intrinsic::x86_avx512_vpdpwssds_128:
case Intrinsic::x86_avx512_vpdpwssds_256:
case Intrinsic::x86_avx512_vpdpwssds_512:
- handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+ case Intrinsic::x86_avx2_vpdpwsud_128:
+ case Intrinsic::x86_avx2_vpdpwsud_256:
+ case Intrinsic::x86_avx10_vpdpwsud_512:
+ case Intrinsic::x86_avx2_vpdpwsuds_128:
+ case Intrinsic::x86_avx2_vpdpwsuds_256:
+ case Intrinsic::x86_avx10_vpdpwsuds_512:
+ case Intrinsic::x86_avx2_vpdpwusd_128:
+ case Intrinsic::x86_avx2_vpdpwusd_256:
+ case Intrinsic::x86_avx10_vpdpwusd_512:
+ case Intrinsic::x86_avx2_vpdpwusds_128:
+ case Intrinsic::x86_avx2_vpdpwusds_256:
+ case Intrinsic::x86_avx10_vpdpwusds_512:
+ case Intrinsic::x86_avx2_vpdpwuud_128:
+ case Intrinsic::x86_avx2_vpdpwuud_256:
+ case Intrinsic::x86_avx10_vpdpwuud_512:
+ case Intrinsic::x86_avx2_vpdpwuuds_128:
+ case Intrinsic::x86_avx2_vpdpwuuds_256:
+ case Intrinsic::x86_avx10_vpdpwuuds_512:
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
break;
- // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single
- // Precision
- // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128
- // (<4 x float>, <8 x bfloat>, <8 x bfloat>)
- // <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256
- // (<8 x float>, <16 x bfloat>, <16 x bfloat>)
- // <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512
- // (<16 x float>, <32 x bfloat>, <32 x bfloat>)
- // handleVectorPmaddIntrinsic() currently only handles integer types.
+ // Dot Product of BF16 Pairs Accumulated Into Packed Single
+ // Precision
+ // <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128
+ // (<4 x float>, <8 x bfloat>, <8 x bfloat>)
+ // <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256
+ // (<8 x float>, <16 x bfloat>, <16 x bfloat>)
+ // <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512
+ // (<16 x float>, <32 x bfloat>, <32 x bfloat>)
+ case Intrinsic::x86_avx512bf16_dpbf16ps_128:
+ case Intrinsic::x86_avx512bf16_dpbf16ps_256:
+ case Intrinsic::x86_avx512bf16_dpbf16ps_512:
+ handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+ /*ZeroPurifies=*/false);
+ break;
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_cmp_sd:
@@ -6010,48 +6123,62 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Packed Horizontal Add/Subtract
case Intrinsic::x86_ssse3_phadd_w:
case Intrinsic::x86_ssse3_phadd_w_128:
- case Intrinsic::x86_avx2_phadd_w:
case Intrinsic::x86_ssse3_phsub_w:
case Intrinsic::x86_ssse3_phsub_w_128:
- case Intrinsic::x86_avx2_phsub_w: {
- handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/16);
+ break;
+
+ case Intrinsic::x86_avx2_phadd_w:
+ case Intrinsic::x86_avx2_phsub_w:
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/2,
+ /*ReinterpretElemWidth=*/16);
break;
- }
// Packed Horizontal Add/Subtract
case Intrinsic::x86_ssse3_phadd_d:
case Intrinsic::x86_ssse3_phadd_d_128:
- case Intrinsic::x86_avx2_phadd_d:
case Intrinsic::x86_ssse3_phsub_d:
case Intrinsic::x86_ssse3_phsub_d_128:
- case Intrinsic::x86_avx2_phsub_d: {
- handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/32);
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/32);
+ break;
+
+ case Intrinsic::x86_avx2_phadd_d:
+ case Intrinsic::x86_avx2_phsub_d:
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/2,
+ /*ReinterpretElemWidth=*/32);
break;
- }
// Packed Horizontal Add/Subtract and Saturate
case Intrinsic::x86_ssse3_phadd_sw:
case Intrinsic::x86_ssse3_phadd_sw_128:
- case Intrinsic::x86_avx2_phadd_sw:
case Intrinsic::x86_ssse3_phsub_sw:
case Intrinsic::x86_ssse3_phsub_sw_128:
- case Intrinsic::x86_avx2_phsub_sw: {
- handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/16);
+ break;
+
+ case Intrinsic::x86_avx2_phadd_sw:
+ case Intrinsic::x86_avx2_phsub_sw:
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/2,
+ /*ReinterpretElemWidth=*/16);
break;
- }
// Packed Single/Double Precision Floating-Point Horizontal Add
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
- case Intrinsic::x86_avx_hadd_pd_256:
- case Intrinsic::x86_avx_hadd_ps_256:
case Intrinsic::x86_sse3_hsub_ps:
case Intrinsic::x86_sse3_hsub_pd:
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1);
+ break;
+
+ case Intrinsic::x86_avx_hadd_pd_256:
+ case Intrinsic::x86_avx_hadd_ps_256:
case Intrinsic::x86_avx_hsub_pd_256:
- case Intrinsic::x86_avx_hsub_ps_256: {
- handlePairwiseShadowOrIntrinsic(I);
+ case Intrinsic::x86_avx_hsub_ps_256:
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/2);
break;
- }
case Intrinsic::x86_avx_maskstore_ps:
case Intrinsic::x86_avx_maskstore_pd:
@@ -6434,7 +6561,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Add Long Pairwise
case Intrinsic::aarch64_neon_saddlp:
case Intrinsic::aarch64_neon_uaddlp: {
- handlePairwiseShadowOrIntrinsic(I);
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1);
break;
}
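[Editorial sketch, not part of the patch: a scalar model of the per-lane shadow rule behind the new ZeroPurifies flag in handleVectorPmaddIntrinsic() above. Sa/Sb are one lane's operand shadows (nonzero means some bits are poisoned), Va/Vb the operand values reinterpreted as integers; all names are illustrative.]

#include <cstdint>

bool laneShadowNonZero(uint64_t Sa, uint64_t Sb, uint64_t Va, uint64_t Vb,
                       bool ZeroPurifies) {
  bool SaNonZero = Sa != 0, SbNonZero = Sb != 0;
  if (!ZeroPurifies)
    // Floating-point (e.g. dpbf16ps): any poisoned operand poisons the lane,
    // since even a zero value cannot purify (counter-example: NaN * 0).
    return SaNonZero || SbNonZero;
  // Integer multiply-add: an initialized zero operand purifies the product,
  // mirroring the visitAnd()-style formula in the hunk above.
  bool VaNonZero = Va != 0, VbNonZero = Vb != 0;
  return (SaNonZero && SbNonZero) || (VaNonZero && SbNonZero) ||
         (SaNonZero && VbNonZero);
}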
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index 80e77e09..66d570b 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -161,7 +161,7 @@ template <char NsanTypeId>
class ShadowTypeConfigImpl : public ShadowTypeConfig {
public:
char getNsanTypeId() const override { return NsanTypeId; }
- static constexpr const char kNsanTypeId = NsanTypeId;
+ static constexpr char kNsanTypeId = NsanTypeId;
};
// `double` (`d`) shadow type.
@@ -811,7 +811,7 @@ static bool shouldCheckArgs(CallBase &CI, const TargetLibraryInfo &TLI,
return false;
const auto ID = Fn->getIntrinsicID();
- LibFunc LFunc = LibFunc::NumLibFuncs;
+ LibFunc LFunc = LibFunc::NotLibFunc;
// Always check args of unknown functions.
if (ID == Intrinsic::ID() && !TLI.getLibFunc(*Fn, LFunc))
return true;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index af53fa0..02f06be 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -734,7 +734,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC();
// Reserve bit 60-63 for other information purpose.
- FunctionHash &= 0x0FFFFFFFFFFFFFFF;
+ FunctionHash &= NamedInstrProfRecord::FUNC_HASH_MASK;
if (IsCS)
NamedInstrProfRecord::setCSFlagInHash(FunctionHash);
LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
diff --git a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp
index 5ef6ffb..667fdb7 100644
--- a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp
@@ -90,6 +90,9 @@ PreservedAnalyses RealtimeSanitizerPass::run(Module &M,
[&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); });
for (Function &F : M) {
+ if (F.empty())
+ continue;
+
if (F.hasFnAttribute(Attribute::SanitizeRealtime))
runSanitizeRealtime(F);
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 09abf6a..d72d216 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -1226,7 +1226,7 @@ void ModuleSanitizerCoverage::createFunctionControlFlow(Function &F) {
if (CB->isIndirectCall()) {
// TODO(navidem): handle indirect calls, for now mark its existence.
CFs.push_back((Constant *)IRB.CreateIntToPtr(
- ConstantInt::get(IntptrTy, -1), PtrTy));
+ ConstantInt::getAllOnesValue(IntptrTy), PtrTy));
} else {
auto CalledF = CB->getCalledFunction();
if (CalledF && !CalledF->isIntrinsic())
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 87eba5f..1c91d83 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -62,7 +62,7 @@ static cl::opt<bool> ClOutlineInstrumentation(
"tysan-outline-instrumentation",
cl::desc("Uses function calls for all TySan instrumentation, reducing "
"ELF size"),
- cl::Hidden, cl::init(false));
+ cl::Hidden, cl::init(true));
static cl::opt<bool> ClVerifyOutlinedInstrumentation(
"tysan-verify-outlined-instrumentation",