//===- RootAutoDetector.cpp - detect contextual profiling roots -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "RootAutoDetector.h"
#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
#include <assert.h>
#include <dlfcn.h>
#include <pthread.h>
using namespace __ctx_profile;
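// A DenseMap used as a set: only the keys matter, the bool value is ignored.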
template <typename T> using Set = DenseMap<T, bool>;
namespace __sanitizer {
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
bool request_fast, u32 max_depth) {
  // We can't implement the fast variant. The fast variant ends up invoking an
  // external allocator, because of pthread_attr_getstack. If that happens
  // while the program being instrumented is itself mid-allocation, a
  // non-reentrant lock may already be held (this was observed); the allocator
  // called by pthread_attr_getstack would then try to take the same lock,
  // deadlocking.
UnwindSlow(pc, max_depth);
}
} // namespace __sanitizer
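// Register this thread's samples with the parent detector, so the worker
// thread can find them when aggregating.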
RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
Parent.AllSamples.PushBack(this);
}
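// Start the worker thread which, after WaitSeconds, aggregates all sampled
// stacks, decides which functions qualify as roots, and allocates their
// context roots.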
void RootAutoDetector::start() {
atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
pthread_create(
&WorkerThread, nullptr,
+[](void *Ctx) -> void * {
RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
SleepForSeconds(RAD->WaitSeconds);
        // To avoid holding the AllSamplesMutex while processing, take a
        // snapshot of all the thread samples collected so far.
Vector<PerThreadSamples *> SamplesSnapshot;
{
GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
SamplesSnapshot.Resize(RAD->AllSamples.Size());
for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
SamplesSnapshot[I] = RAD->AllSamples[I];
}
DenseMap<uptr, uint64_t> AllRoots;
for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {
          // The lock guard must be named: an unnamed temporary would be
          // destroyed - and the mutex released - immediately.
          GenericScopedLock<SpinMutex> L(&SamplesSnapshot[I]->M);
SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
auto [FAddr, Count] = KVP;
AllRoots[FAddr] += Count;
return true;
});
}
        // FIXME: as a next step, establish a minimum relative number of
        // samples a function must have before it qualifies as a root.
for (auto *FD = reinterpret_cast<FunctionData *>(
atomic_load_relaxed(&RAD->FunctionDataListHead));
FD; FD = FD->Next) {
if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
if (canBeRoot(FD->CtxRoot)) {
FD->getOrAllocateContextRoot();
} else {
              // FIXME: address this by informing the root detection algorithm
              // to skip over such functions and pick the next one down the
              // stack. At that point, this becomes an assert.
Printf("[ctxprof] Root auto-detector selected a musttail "
"function for root (%p). Ignoring\n",
FD->EntryAddress);
}
}
}
atomic_store_relaxed(&RAD->Self, 0);
return nullptr;
},
this);
}
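// Wait for the worker thread to finish root detection.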
void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }
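// Sampling hook: invoked frequently by the runtime, but only every
// SampleRate-th invocation per thread actually unwinds and records a stack;
// reentrant invocations are skipped entirely.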
void RootAutoDetector::sample() {
  // Track reentry in case we want to re-explore fast stack unwinding - which
  // can re-enter the runtime, because pthread_attr_getstack ends up calling
  // the instrumented allocator. See also the notes on UnwindImpl above.
static thread_local bool Entered = false;
static thread_local uint64_t Entries = 0;
if (Entered || (++Entries % SampleRate))
return;
Entered = true;
collectStack();
Entered = false;
}
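// Unwind the current stack and record it in this thread's trie. If the worker
// thread is currently reading the trie, skip this sample rather than block.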
void RootAutoDetector::collectStack() {
GET_CALLER_PC_BP;
BufferedStackTrace CurrentStack;
CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);
  // A stack of up to 2 frames is very unlikely to mean anything: at least the
  // compiler-rt frame - which can't be inlined - should be observable, and it
  // already accounts for one of them. We could be even more aggressive with
  // this threshold.
if (CurrentStack.size <= 2)
return;
static thread_local PerThreadSamples *ThisThreadSamples =
new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
PerThreadSamples(*this);
if (!ThisThreadSamples->M.TryLock())
return;
ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
ThisThreadSamples->M.Unlock();
}
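// Find the start address of the function containing CallsiteAddress, using
// dladdr. Returns 0 if the address can't be resolved.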
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
  // This requires --linkopt=-Wl,--export-dynamic, so that dladdr can resolve
  // symbols in the main executable.
Dl_info Info;
if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0)
return reinterpret_cast<uptr>(Info.dli_saddr);
return 0;
}
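// Record one sampled stack: walk the trie from its root towards the leaves,
// creating nodes for callsites as needed and incrementing counts on the way.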
void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
++TheTrie.Count;
auto *Current = &TheTrie;
  // The unwound stack is stored innermost frame first, so walk it backwards
  // to insert the outermost callsites closest to the trie root.
for (int I = ST.size - 1; I >= 0; --I) {
uptr ChildAddr = ST.trace[I];
auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
++Iter->second.Count;
Current = &Iter->second;
}
}
DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
  // Assuming a message pump design, roots are those functions called by the
  // message pump. The message pump is an infinite loop (for all practical
  // purposes) fetching work items from a queue. The root functions must
  // return - otherwise the message pump couldn't loop. This function detects
  // roots as the first place in the trie (starting from its root) where a
  // function calls two or more functions.
//
// We start with a callsite trie - the nodes are callsites. Different child
// nodes may actually correspond to the same function.
//
// For example: using function(callsite)
// f1(csf1_1) -> f2(csf2_1) -> f3
// -> f2(csf2_2) -> f4
//
// would be represented in our trie as:
// csf1_1 -> csf2_1 -> f3
// -> csf2_2 -> f4
//
  // While we can tell that control flow returns to f2 (we observed two
  // distinct callsites within it), we don't know if it ever returns to f1. f2
  // could be the message pump.
  //
  // We'd need to convert our callsite trie into a function trie. More
  // economically, we can just count how many distinct functions appear at
  // each depth. When that count first exceeds 1, we have reached potential
  // roots, and everything above them is considered a non-root.
DenseMap<uptr, uint64_t> Result;
Set<const Trie *> Worklist;
  Worklist.insert({&TheTrie, true});
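  // Breadth-first traversal, one trie depth level per iteration: Candidates
  // accumulates the total sample count per distinct function at the current
  // level; we stop at the first level exposing more than one function.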
while (!Worklist.empty()) {
Set<const Trie *> NextWorklist;
DenseMap<uptr, uint64_t> Candidates;
Worklist.forEach([&](const auto &KVP) {
auto [Node, _] = KVP;
auto SA = getFctStartAddr(Node->CallsiteAddress);
Candidates[SA] += Node->Count;
Node->Children.forEach([&](auto &ChildKVP) {
NextWorklist.insert({&ChildKVP.second, true});
return true;
});
return true;
});
if (Candidates.size() > 1) {
Result.swap(Candidates);
break;
}
Worklist.swap(NextWorklist);
}
return Result;
}