//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "RootAutoDetector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_atomic_clang.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch.
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// Keep all the functions for which we collect a flat profile in a linked list.
__sanitizer::atomic_uintptr_t AllFunctionsData = {};

__sanitizer::SpinMutex FlatCtxArenaMutex;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArenaHead = nullptr;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArena = nullptr;

// Set to true when we enter a root, and false when we exit - regardless of
// whether this thread collects a contextual profile for that root.
__thread bool IsUnderContext = false;
__sanitizer::atomic_uint8_t ProfilingStarted = {};

__sanitizer::atomic_uintptr_t RootDetector = {};
RootAutoDetector *getRootDetector() {
  return reinterpret_cast<RootAutoDetector *>(
      __sanitizer::atomic_load_relaxed(&RootDetector));
}

// Utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "Scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory-safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}

// Verify the structural integrity of the context.
bool validate(const ContextRoot *Root) {
  // All contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
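  //
  // For illustration only (the exact contents depend on allocation order), one
  // arena page of a Root is expected to look roughly like:
  //
  //   start()                                      pos()       <end of page>
  //     | ContextNode | ContextNode | ... | ContextNode |   unused tail   |
  //
  // Each node self-reports its size(), so the first pass below hops from one
  // node start to the next and records every valid start address; the second
  // pass then checks that every non-null subcontext pointer is one of the
  // recorded addresses.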
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;
      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NumCounters,
                                     uint32_t NumCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

ContextNode *onContextEnter(ContextNode &Node) {
  ++Node.counters()[0];
  return &Node;
}

} // namespace

// The scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, à la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// Initialize the TLSes.
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}
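
// Illustrative sizing note (the numbers are examples, not extra constraints):
// with kBuffSize = 1 << 20, a request of a few hundred bytes for a ContextNode
// gets a full 1MB arena page from getArenaAllocSize, while an unusually large
// request of, say, 2MB would get a 4MB page (2 * Needed). tryBumpAllocate
// carves nodes out of the current page until it runs out, at which point
// allocateNewArena links a fresh page onto the relevant arena list.
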
// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NumCounters, uint32_t NumCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // If we failed to allocate on the current arena, allocate a new arena, and
    // place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
                            uint32_t NumCounters) {
  if (ContextNode *Existing = Data.FlatCtx)
    return Existing;
  {
    // We could instead try to take the lock and, if that fails, return
    // TheScratchContext. But that could leave message pump loops more sparsely
    // profiled than everything else. Maybe that doesn't matter, and we can
    // optimize this later.
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
    if (ContextNode *Existing = Data.FlatCtx)
      return Existing;

    auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
    char *AllocBuff = nullptr;
    {
      __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
          &FlatCtxArenaMutex);
      if (FlatCtxArena)
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      if (!AllocBuff) {
        FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
                                               FlatCtxArena);
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      }
      if (!FlatCtxArenaHead)
        FlatCtxArenaHead = FlatCtxArena;
    }
    auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
    Data.FlatCtx = Ret;

    Data.EntryAddress = Callee;
    // Lock-free prepend of `Data` to the AllFunctionsData list.
    Data.Next = reinterpret_cast<FunctionData *>(
        __sanitizer::atomic_load_relaxed(&AllFunctionsData));
    while (!__sanitizer::atomic_compare_exchange_strong(
        &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
        reinterpret_cast<uintptr_t>(&Data),
        __sanitizer::memory_order_release)) {
    }
  }

  return Data.FlatCtx;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
                  uint32_t NumCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
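  // Another thread may have completed setup between the unlocked
  // FirstMemBlock check in tryStartContextGivenRoot and our acquiring
  // AllContextsMutex above, hence the double-checked pattern here.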
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NumCounters, NumCallsites);
  AllContextRoots.PushBack(Root);
}

ContextRoot *FunctionData::getOrAllocateContextRoot() {
  auto *Root = CtxRoot;
  if (!canBeRoot(Root))
    return Root;
  if (Root)
    return Root;
  __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Mutex);
  Root = CtxRoot;
  if (!Root) {
    Root = new (__sanitizer::InternalAlloc(sizeof(ContextRoot))) ContextRoot();
    CtxRoot = Root;
  }

  assert(Root);
  return Root;
}

ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
                                      uint32_t Counters, uint32_t Callsites)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  IsUnderContext = true;
  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
                                __sanitizer::memory_order_relaxed);
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return the scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
                                 uint32_t NumCounters, uint32_t NumCallsites,
                                 ContextRoot *CtxRoot) {
  // 1) If we are currently collecting a contextual profile, fetch a
  // ContextNode in the `Unhandled` set. We want to do this regardless of
  // `ProfilingStarted` to (hopefully) offset the penalty of creating these
  // contexts to before profiling.
  //
  // 2) If we are under a root (regardless of whether this thread is collecting
  // a contextual profile for that root), do not collect a flat profile. We
  // want to keep flat profiles only for activations that can't happen under a
  // root, to avoid confusing profiles. We can then, for example, combine
  // flattened and flat profiles meaningfully, as we wouldn't double-count
  // anything.
  //
  // 3) To avoid lengthy startup, don't bother with flat profiles until the
  // profiling has started. We would reset them anyway when profiling starts.
  // HOWEVER. This does lose profiling for message pumps: those functions are
  // entered once and never exit. They should be assumed to be entered before
  // profiling starts - because profiling should start after the server is up
  // and running (which is equivalent to "message pumps are set up").
  if (!CtxRoot) {
    if (auto *RAD = getRootDetector())
      RAD->sample();
    else if (auto *CR = Data.CtxRoot) {
      if (canBeRoot(CR))
        return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites);
    }
    if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
      return TheScratchContext;
    else
      return markAsScratch(
          onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
  }
  auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
  if (Ins)
    Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
                                   NumCounters, 0);
  return markAsScratch(onContextEnter(*Iter->second));
}
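
// A rough sketch (illustrative pseudo-code, not emitted by this file) of the
// caller-side handshake that __llvm_ctx_profile_get_context below relies on.
// The compiler's instrumentation is expected to do, at callsite I of a caller
// whose context is CallerCtx, just before the call:
//
//   __llvm_ctx_profile_expected_callee[0] = (void *)Callee;
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[I];
//   Callee(...);
//
// The callee then consume()s both TLS slots: if either was reset or never set
// (scratch or uninstrumented caller, an intervening signal handler, etc.), it
// falls back to getUnhandledContext.
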
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
                                            GUID Guid, uint32_t NumCounters,
                                            uint32_t NumCallsites) {
  auto *CtxRoot = __llvm_ctx_profile_current_context_root;
  // Fast "out" if we're not even doing contextual collection.
  if (!CtxRoot)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               nullptr);

  // Also fast "out" if the caller is scratch. We can tell the caller is
  // scratch by looking at the interior pointer into the subcontexts vector
  // that the caller provided: if the caller's context is scratch, so is that
  // interior pointer (because all the address calculations use even - or,
  // more precisely, 8-aligned - values).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  // If the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed
  // here. Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but
  // if it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent that it's not worth doing more for
  // that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  auto *Callsite = *CallsiteContext;
  // In the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NumCounters,
                                         NumCallsites);
  if (Ret->callsites_size() != NumCallsites ||
      Ret->counters_size() != NumCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NumCallsites,
                        NumCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}
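
// Rough lifecycle of one activation of a root function, as the instrumentation
// is expected to drive the entry points below (illustrative only; NC and NCS
// stand for the function's counter and callsite counts):
//
//   auto *Ctx = __llvm_ctx_profile_start_context(FData, Guid, NC, NCS);
//   // ... the body runs; callees call __llvm_ctx_profile_get_context and
//   // hang their ContextNodes off Ctx (or off a scratch context) ...
//   __llvm_ctx_profile_release_context(FData);
//
// Only the thread that takes Root->Taken collects into the real context tree;
// concurrent activations of the same root get TheScratchContext instead.
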
ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
                                              uint32_t Counters,
                                              uint32_t Callsites) {
  auto *Root = FData->getOrAllocateContextRoot();
  assert(canBeRoot(Root));
  return tryStartContextGivenRoot(Root, Guid, Counters, Callsites);
}

void __llvm_ctx_profile_release_context(FunctionData *FData)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
  auto *CR = FData->CtxRoot;
  if (!CurrentRoot || CR != CurrentRoot)
    return;

  IsUnderContext = false;
  assert(CR && canBeRoot(CR));
  __llvm_ctx_profile_current_context_root = nullptr;
  CR->Taken.Unlock();
}

void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) {
  size_t NumMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NumMemUnits;

    resetContextNode(*Root->FirstNode);
    if (Root->FirstUnhandledCalleeNode)
      resetContextNode(*Root->FirstUnhandledCalleeNode);
    __sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
  }
  if (AutodetectDuration) {
    // We leak RD intentionally. Knowing when to free it is tricky: there's a
    // race condition with functions observing the `RootDetector` as non-null.
    // This can be addressed, but the alternatives have some added complexity
    // and it's not (yet) worth it.
    auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
        RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration);
    RD->start();
  } else {
    __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
  }
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
}

bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  if (auto *RD = getRootDetector()) {
    __sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
                        "finished well before attempting to fetch a context");
    RD->join();
  }

  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  Writer.startContextSection();
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    Writer.writeContextual(
        *Root->FirstNode, Root->FirstUnhandledCalleeNode,
        __sanitizer::atomic_load_relaxed(&Root->TotalEntries));
  }
  Writer.endContextSection();
  Writer.startFlatSection();
  // The list progresses behind the head, so taking this snapshot allows the
  // list to grow concurrently without causing a race condition with our
  // traversing it.
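  // (New FunctionData nodes are only ever compare-and-swap-prepended at the
  // head - see getFlatProfile - and a node's Next pointer is set before the
  // node is published, so walking the list from this snapshot is safe even
  // while other threads keep prepending.)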
  const auto *Pos = reinterpret_cast<const FunctionData *>(
      __sanitizer::atomic_load_relaxed(&AllFunctionsData));
  for (; Pos; Pos = Pos->Next) {
    const auto *CR = Pos->CtxRoot;
    if (!CR && canBeRoot(CR)) {
      const auto *FP = Pos->FlatCtx;
      Writer.writeFlat(FP->guid(), FP->counters(), FP->counters_size());
    }
  }
  Writer.endFlatSection();
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &AllContextsMutex);
    for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
      for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
        auto *C = A;
        A = A->next();
        __sanitizer::InternalFree(C);
      }
    AllContextRoots.Reset();
  }
  __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &FlatCtxArenaMutex);
    FlatCtxArena = nullptr;
    for (auto *A = FlatCtxArenaHead; A;) {
      auto *C = A;
      A = C->next();
      __sanitizer::InternalFree(C);
    }
    FlatCtxArenaHead = nullptr;
  }
}
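
// Rough end-to-end usage of the public API in this file, as a collecting
// process might drive it (illustrative only; `MyProfileWriter` stands in for
// whatever ProfileWriter implementation the embedder provides):
//
//   __llvm_ctx_profile_start_collection(/*AutodetectDuration=*/0);
//   // ... run the workload; instrumented code calls the start / get /
//   // release entry points above ...
//   MyProfileWriter Writer;
//   __llvm_ctx_profile_fetch(Writer);   // serialize contextual + flat data
//   __llvm_ctx_profile_free();          // release all arenas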