[memprof] Improve deserialization performance in V3 (#94787)

We call llvm::sort in a couple of places in the V3 encoding: - We sort Frames by FrameIds for stability of the output. - We sort call stacks in the dictionary order to maximize the length of the common prefix between adjacent call stacks. It turns out that we can improve the deserialization performance by modifying the comparison functions -- without changing the format at all. Both places take advantage of the histogram of Frames -- how many times each Frame occurs in the call stacks. - Frames: We serialize popular Frames in the descending order of popularity for improved cache locality. For two equally popular Frames, we break a tie by serializing one that tends to appear earlier in call stacks. Here, "earlier" means a smaller index within llvm::SmallVector<FrameId>. - Call Stacks: We sort the call stacks to reduce the number of times we follow pointers to parents during deserialization. Specifically, instead of comparing two call stacks in the strcmp style -- integer comparisons of FrameIds, we compare two FrameIds F1 and F2 with Histogram[F1] < Histogram[F2] at respective indexes. Since we encode from the end of the sorted list of call stacks, we tend to encode popular call stacks first. Since the two places use the same histogram, we compute it once and share it in the two places. Sorting the call stacks reduces the number of "jumps" by 74% when we deserialize all MemProfRecords. The cycle and instruction counts go down by 10% and 1.5%, respectively. If we sort the Frames in addition to the call stacks, then the cycle and instruction counts go down by 14% and 1.6%, respectively, relative to the same baseline (that is, without this patch).
author: Kazu Hirata <kazu@google.com> 2024-06-07 17:25:57 -0700
committer: GitHub <noreply@github.com> 2024-06-07 17:25:57 -0700
commit: dc3f8c2f587e9647d1ce86426c2cf317c98f453c (patch)
tree: 23b30b42ffc526c74b34740615231aadaf96aa21 /llvm/lib/ProfileData/InstrProfWriter.cpp
parent: 435dd9746107e13c2ad019be3bd34815f7d2360d (diff)
download: llvm-dc3f8c2f587e9647d1ce86426c2cf317c98f453c.zip
llvm-dc3f8c2f587e9647d1ce86426c2cf317c98f453c.tar.gz
llvm-dc3f8c2f587e9647d1ce86426c2cf317c98f453c.tar.bz2
1 files changed, 36 insertions, 7 deletions
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 7d7c980..1a9add1 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -494,17 +494,40 @@ static uint64_t writeMemProfFrames(
 static llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
 writeMemProfFrameArray(
     ProfOStream &OS,
-    llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+    llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
+    llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
   // Mappings from FrameIds to array indexes.
   llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes;
 
-  // Sort the FrameIDs for stability.
+  // Compute the order in which we serialize Frames.  The order does not matter
+  // in terms of correctness, but we still compute it for deserialization
+  // performance.  Specifically, if we serialize frequently used Frames one
+  // after another, we have better cache utilization.  For two Frames that
+  // appear equally frequently, we break a tie by serializing the one that tends
+  // to appear earlier in call stacks.  We implement the tie-breaking mechanism
+  // by computing the sum of indexes within call stacks for each Frame.  If we
+  // still have a tie, then we just resort to compare two FrameIds, which is
+  // just for stability of output.
   std::vector<std::pair<memprof::FrameId, const memprof::Frame *>> FrameIdOrder;
   FrameIdOrder.reserve(MemProfFrameData.size());
   for (const auto &[Id, Frame] : MemProfFrameData)
     FrameIdOrder.emplace_back(Id, &Frame);
   assert(MemProfFrameData.size() == FrameIdOrder.size());
-  llvm::sort(FrameIdOrder);
+  llvm::sort(FrameIdOrder,
+             [&](const std::pair<memprof::FrameId, const memprof::Frame *> &L,
+                 const std::pair<memprof::FrameId, const memprof::Frame *> &R) {
+               const auto &SL = FrameHistogram[L.first];
+               const auto &SR = FrameHistogram[R.first];
+               // Popular FrameIds should come first.
+               if (SL.Count != SR.Count)
+                 return SL.Count > SR.Count;
+               // If they are equally popular, then the one that tends to appear
+               // earlier in call stacks should come first.
+               if (SL.PositionSum != SR.PositionSum)
+                 return SL.PositionSum < SR.PositionSum;
+               // Compare their FrameIds for sort stability.
+               return L.first < R.first;
+             });
 
   // Serialize all frames while creating mappings from linear IDs to FrameIds.
   uint64_t Index = 0;
@@ -543,12 +566,14 @@ writeMemProfCallStackArray(
     llvm::MapVector<memprof::CallStackId, llvm::SmallVector<memprof::FrameId>>
         &MemProfCallStackData,
     llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
-        &MemProfFrameIndexes) {
+        &MemProfFrameIndexes,
+    llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
   llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
       MemProfCallStackIndexes;
 
   memprof::CallStackRadixTreeBuilder Builder;
-  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+  Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+                FrameHistogram);
   for (auto I : Builder.getRadixArray())
     OS.write32(I);
   MemProfCallStackIndexes = Builder.takeCallStackPos();
@@ -704,13 +729,17 @@ static Error writeMemProfV3(ProfOStream &OS,
     Schema = memprof::getFullSchema();
   writeMemProfSchema(OS, Schema);
 
+  llvm::DenseMap<memprof::FrameId, memprof::FrameStat> FrameHistogram =
+      memprof::computeFrameHistogram(MemProfData.CallStackData);
+  assert(MemProfData.FrameData.size() == FrameHistogram.size());
+
   llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes =
-      writeMemProfFrameArray(OS, MemProfData.FrameData);
+      writeMemProfFrameArray(OS, MemProfData.FrameData, FrameHistogram);
 
   uint64_t CallStackPayloadOffset = OS.tell();
   llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
       MemProfCallStackIndexes = writeMemProfCallStackArray(
-          OS, MemProfData.CallStackData, MemProfFrameIndexes);
+          OS, MemProfData.CallStackData, MemProfFrameIndexes, FrameHistogram);
 
   uint64_t RecordPayloadOffset = OS.tell();
   uint64_t RecordTableOffset =
author	Kazu Hirata <kazu@google.com>	2024-06-07 17:25:57 -0700
committer	GitHub <noreply@github.com>	2024-06-07 17:25:57 -0700
commit	dc3f8c2f587e9647d1ce86426c2cf317c98f453c (patch)
tree	23b30b42ffc526c74b34740615231aadaf96aa21 /llvm/lib/ProfileData/InstrProfWriter.cpp
parent	435dd9746107e13c2ad019be3bd34815f7d2360d (diff)
download	llvm-dc3f8c2f587e9647d1ce86426c2cf317c98f453c.zip llvm-dc3f8c2f587e9647d1ce86426c2cf317c98f453c.tar.gz llvm-dc3f8c2f587e9647d1ce86426c2cf317c98f453c.tar.bz2