[RFC][AMDGPU] Add AMDGPU_SUMMARY bitcode block for ThinLTO (branch: users/shiltian/amdgpu-thinlto-summary-block)

With AMDGPU object linking, device functions are compiled separately from the kernels that call them. Without whole-program visibility, the compiler must be conservative about occupancy for every device function, leading to suboptimal resource usage. However, GPU kernels typically carry explicit occupancy control attributes that constrain the launch environment. ThinLTO is the natural place to propagate these kernel attributes to callees: the combined module summary index contains a cross-TU call graph, allowing occupancy information to be propagated top-down from kernels to all reachable device functions. The backend can then generate better code with the propagated constraints, achieving whole-program awareness without the compile-time overhead of full LTO. This patch introduces a dedicated AMDGPU_SUMMARY bitcode block that serializes per-function summary data alongside the standard module summary. The block is scoped to AMDGPU so that non-AMDGPU targets are completely unaffected. A follow-up patch will add the ThinLTO propagation logic that reads these summaries and applies conservative attribute bounds to device functions reachable from multiple kernels.
author: Shilei Tian <i@tianshilei.me> 2026-04-28 00:33:42 -0400
committer: Shilei Tian <i@tianshilei.me> 2026-04-28 21:15:00 -0400
commit: 630bff8a2248da1873f27060d17301b5a5606ebb (patch)
tree: b52eb68f1b6aefc18a1fce7d7d63d12aa62af817 /llvm
parent: 383733ea8d15524517b0f1f15c8380c24f17407d (diff)
download: llvm-users/shiltian/amdgpu-thinlto-summary-block.tar.gz
llvm-users/shiltian/amdgpu-thinlto-summary-block.tar.bz2
llvm-users/shiltian/amdgpu-thinlto-summary-block.zip
7 files changed, 289 insertions, 2 deletions
diff --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h
index 772ca8201927..7ff6f7de8e70 100644
--- a/llvm/include/llvm/Bitcode/BitcodeReader.h
+++ b/llvm/include/llvm/Bitcode/BitcodeReader.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Bitstream/BitCodeEnums.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/AMDGPUSummary.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
@@ -166,6 +167,10 @@ struct ParserCallbacks {
     LLVM_ABI Error
     readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath,
                 std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr);
+
+    /// Scan this module for an AMDGPU_SUMMARY block and, if one is present,
+    /// store each per-function record into \p Summaries keyed by GUID. An
+    /// entry already present in \p Summaries under the same GUID is
+    /// overwritten.
+    ///
+    /// \returns true once the first AMDGPU_SUMMARY block has been parsed,
+    /// false if the module contains no such block, or an Error if the
+    /// bitstream is malformed or the block's version is not supported.
+    LLVM_ABI Expected<bool> readAMDGPUSummary(AMDGPU::SummaryMap &Summaries);
   };
 
   struct BitcodeFileContents {
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 9162754bbfe1..e543966662c9 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -63,6 +63,8 @@ enum BlockIDs {
   SYMTAB_BLOCK_ID,
 
   SYNC_SCOPE_NAMES_BLOCK_ID,
+
+  AMDGPU_SUMMARY_BLOCK_ID,
 };
 
 /// Identification block contains a string that describes the producer details,
@@ -834,6 +836,14 @@ enum SymtabCodes {
   SYMTAB_BLOB = 1,
 };
 
+/// Record codes used inside an AMDGPU_SUMMARY_BLOCK_ID block.
+enum AMDGPUSummaryCodes {
+  // [version]
+  // Format version of the block. BitcodeModule::readAMDGPUSummary rejects
+  // any value other than 1.
+  AMDGPU_SUMMARY_VERSION = 1,
+  // [guid, is_entry, flat_wg_min, flat_wg_max,
+  //  waves_min, waves_max, max_wg_x, max_wg_y, max_wg_z]
+  // One record per summarized function. The reader requires at least these
+  // nine operands.
+  AMDGPU_SUMMARY_ENTRY = 2,
+};
+
 } // End bitc namespace
 } // End llvm namespace
 
diff --git a/llvm/include/llvm/Support/AMDGPUSummary.h b/llvm/include/llvm/Support/AMDGPUSummary.h
new file mode 100644
index 000000000000..8bade6df7dd4
--- /dev/null
+++ b/llvm/include/llvm/Support/AMDGPUSummary.h
@@ -0,0 +1,46 @@
+//===- AMDGPUSummary.h - AMDGPU ThinLTO summary data ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Per-function AMDGPU summary information carried through ThinLTO for
+// cross-TU attribute propagation. Stored in the AMDGPU_SUMMARY bitcode
+// block, separate from the standard module summary, so that non-AMDGPU
+// targets are completely unaffected.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_AMDGPUSUMMARY_H
+#define LLVM_SUPPORT_AMDGPUSUMMARY_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/GlobalValue.h"
+#include <cstdint>
+#include <limits>
+
+namespace llvm {
+namespace AMDGPU {
+
+/// Launch-bound information recorded for a single function.
+///
+/// Each member initializer is the value that gets serialized when the
+/// corresponding function attribute is absent or cannot be parsed.
+struct FunctionSummary {
+  /// True for functions that use a kernel calling convention.
+  bool IsEntry{false};
+
+  /// Bounds taken from "amdgpu-flat-work-group-size" ("min,max").
+  uint32_t FlatWGSizeMin{1};
+  uint32_t FlatWGSizeMax{1024};
+
+  /// Bounds taken from "amdgpu-waves-per-eu" ("min[,max]").
+  uint32_t WavesPerEUMin{1};
+  uint32_t WavesPerEUMax{10};
+
+  /// Per-dimension limits taken from "amdgpu-max-num-workgroups" ("x,y,z").
+  uint32_t MaxNumWGX{UINT32_MAX};
+  uint32_t MaxNumWGY{UINT32_MAX};
+  uint32_t MaxNumWGZ{UINT32_MAX};
+};
+
+using SummaryMap = DenseMap<GlobalValue::GUID, FunctionSummary>;
+
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_AMDGPUSUMMARY_H
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 911ec7501eb8..9a164e0ce1a7 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -81,6 +81,8 @@ GetBlockName(unsigned BlockID, const BitstreamBlockInfo &BlockInfo,
     return "STRTAB_BLOCK";
   case bitc::SYMTAB_BLOCK_ID:
     return "SYMTAB_BLOCK";
+  case bitc::AMDGPU_SUMMARY_BLOCK_ID:
+    return "AMDGPU_SUMMARY_BLOCK";
   }
 }
 
@@ -420,6 +422,15 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
     case bitc::SYMTAB_BLOB:
       return "BLOB";
     }
+  case bitc::AMDGPU_SUMMARY_BLOCK_ID:
+    switch (CodeID) {
+    default:
+      return std::nullopt;
+    case bitc::AMDGPU_SUMMARY_VERSION:
+      return "AMDGPU_SUMMARY_VERSION";
+    case bitc::AMDGPU_SUMMARY_ENTRY:
+      return "AMDGPU_SUMMARY_ENTRY";
+    }
   }
 #undef STRINGIFY_CODE
 }
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index fa7a3b214e46..ddae213bff45 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -8695,6 +8695,88 @@ Error BitcodeModule::readSummary(
   return R.parseModule();
 }
 
+/// Locate and parse the AMDGPU_SUMMARY block of this module.
+///
+/// Walks the direct children of the MODULE block, skipping every record and
+/// every sub-block other than AMDGPU_SUMMARY_BLOCK_ID. Each
+/// AMDGPU_SUMMARY_ENTRY record is stored in \p Summaries under its GUID,
+/// replacing any entry already stored for that GUID. Unknown record codes
+/// inside the block are ignored.
+///
+/// \returns true after the first AMDGPU_SUMMARY block has been consumed,
+/// false if the MODULE block ends without one, or an Error for a malformed
+/// stream, an unsupported version, or an invalid entry record.
+Expected<bool> BitcodeModule::readAMDGPUSummary(AMDGPU::SummaryMap &Summaries) {
+  BitstreamCursor Stream(Buffer);
+  if (Error JumpFailed = Stream.JumpToBit(ModuleBit))
+    return std::move(JumpFailed);
+
+  if (Error Err = Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
+    return std::move(Err);
+
+  // Scan the MODULE block's direct children for AMDGPU_SUMMARY_BLOCK_ID.
+  while (true) {
+    Expected<llvm::BitstreamEntry> MaybeEntry = Stream.advance();
+    if (!MaybeEntry)
+      return MaybeEntry.takeError();
+    BitstreamEntry Entry = MaybeEntry.get();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::Error:
+      return error("Malformed block");
+    case BitstreamEntry::EndBlock:
+      // Reached the end of the MODULE block without seeing the summary.
+      return false;
+    case BitstreamEntry::Record:
+      if (Expected<unsigned> Skipped = Stream.skipRecord(Entry.ID); !Skipped)
+        return Skipped.takeError();
+      continue;
+    case BitstreamEntry::SubBlock:
+      // `break` leaves only the switch and falls through to the parser
+      // below; every other sub-block is skipped and the scan continues.
+      if (Entry.ID == bitc::AMDGPU_SUMMARY_BLOCK_ID)
+        break;
+      if (Error Err = Stream.SkipBlock())
+        return std::move(Err);
+      continue;
+    }
+
+    // Found AMDGPU_SUMMARY_BLOCK_ID: parse it.
+    if (Error Err = Stream.EnterSubBlock(bitc::AMDGPU_SUMMARY_BLOCK_ID))
+      return std::move(Err);
+
+    SmallVector<uint64_t, 16> Record;
+    while (true) {
+      Expected<BitstreamEntry> MaybeRec = Stream.advanceSkippingSubblocks();
+      if (!MaybeRec)
+        return MaybeRec.takeError();
+      BitstreamEntry Rec = MaybeRec.get();
+
+      if (Rec.Kind == BitstreamEntry::EndBlock)
+        return true;
+      if (Rec.Kind != BitstreamEntry::Record)
+        return error("Expected record in AMDGPU_SUMMARY block");
+
+      Record.clear();
+      Expected<unsigned> MaybeCode = Stream.readRecord(Rec.ID, Record);
+      if (!MaybeCode)
+        return MaybeCode.takeError();
+
+      switch (MaybeCode.get()) {
+      case bitc::AMDGPU_SUMMARY_VERSION:
+        if (Record.size() < 1 || Record[0] != 1)
+          return error("Unsupported AMDGPU summary version");
+        break;
+      case bitc::AMDGPU_SUMMARY_ENTRY: {
+        if (Record.size() < 9)
+          return error("Invalid AMDGPU summary entry");
+
+        // Operands 2..8 land in uint32_t fields. Reject wider values rather
+        // than silently truncating them into a different, valid-looking bound.
+        for (unsigned I = 2; I != 9; ++I)
+          if (Record[I] > UINT32_MAX)
+            return error("Invalid AMDGPU summary entry");
+
+        GlobalValue::GUID GUID = Record[0];
+        AMDGPU::FunctionSummary FS;
+        FS.IsEntry = Record[1] != 0;
+        FS.FlatWGSizeMin = static_cast<uint32_t>(Record[2]);
+        FS.FlatWGSizeMax = static_cast<uint32_t>(Record[3]);
+        FS.WavesPerEUMin = static_cast<uint32_t>(Record[4]);
+        FS.WavesPerEUMax = static_cast<uint32_t>(Record[5]);
+        FS.MaxNumWGX = static_cast<uint32_t>(Record[6]);
+        FS.MaxNumWGY = static_cast<uint32_t>(Record[7]);
+        FS.MaxNumWGZ = static_cast<uint32_t>(Record[8]);
+        // Last writer wins if the same GUID was already present.
+        Summaries[GUID] = FS;
+        break;
+      }
+      default:
+        // Unknown record codes are skipped.
+        break;
+      }
+    }
+  }
+}
+
 // Parse the specified bitcode buffer, returning the function info index.
 Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() {
   BitstreamCursor Stream(Buffer);
@@ -8725,7 +8807,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return error("Malformed block");
+      return error("malformed block");
     case BitstreamEntry::EndBlock: {
       // If no flags record found, return both flags as false.
       return std::make_pair(false, false);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 7153b1a0000f..84de4e7575c4 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Object/IRSymtab.h"
 #include "llvm/ProfileData/MemProf.h"
 #include "llvm/ProfileData/MemProfRadixTree.h"
+#include "llvm/Support/AMDGPUSummary.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -450,6 +451,7 @@ private:
                 DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex);
   void writeBlockInfo();
   void writeModuleHash(StringRef View);
+  void writeAMDGPUSummaryBlock();
 
   unsigned getEncodedSyncScopeID(SyncScope::ID SSID) {
     return unsigned(SSID);
@@ -5360,6 +5362,88 @@ void ModuleBitcodeWriter::writeModuleHash(StringRef View) {
   }
 }
 
+/// Emit the AMDGPU_SUMMARY block for the module being written.
+///
+/// Nothing is emitted unless the module targets amdgcn on AMDHSA and defines
+/// at least one function that is a kernel (AMDGPU_KERNEL or SPIR_KERNEL
+/// calling convention) or carries one of "amdgpu-flat-work-group-size",
+/// "amdgpu-waves-per-eu" or "amdgpu-max-num-workgroups". The block holds one
+/// AMDGPU_SUMMARY_VERSION record followed by one AMDGPU_SUMMARY_ENTRY record
+/// per selected function. An attribute that is missing or does not parse
+/// leaves the matching AMDGPU::FunctionSummary defaults in the record.
+void ModuleBitcodeWriter::writeAMDGPUSummaryBlock() {
+  // Bind by reference, as WriteBitcodeToFile does, instead of copying.
+  const Triple &TT = M.getTargetTriple();
+  // Object linking is only supported on AMDHSA platforms.
+  if (TT.getArch() != Triple::amdgcn || TT.getOS() != Triple::AMDHSA)
+    return;
+
+  // Single definition of "entry point", shared by selection and encoding.
+  auto IsKernel = [](const Function &F) {
+    CallingConv::ID CC = F.getCallingConv();
+    return CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
+  };
+
+  SmallVector<const Function *, 8> Worklist;
+  for (const Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    if (IsKernel(F) || F.hasFnAttribute("amdgpu-flat-work-group-size") ||
+        F.hasFnAttribute("amdgpu-waves-per-eu") ||
+        F.hasFnAttribute("amdgpu-max-num-workgroups"))
+      Worklist.push_back(&F);
+  }
+  // Do not emit an empty block.
+  if (Worklist.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::AMDGPU_SUMMARY_BLOCK_ID, 4);
+
+  SmallVector<uint64_t, 10> Record;
+  Record.push_back(1);
+  Stream.EmitRecord(bitc::AMDGPU_SUMMARY_VERSION, Record);
+
+  for (const Function *F : Worklist) {
+    AMDGPU::FunctionSummary FS;
+    FS.IsEntry = IsKernel(*F);
+
+    // "min,max": both halves must parse, otherwise keep the defaults.
+    if (Attribute A = F->getFnAttribute("amdgpu-flat-work-group-size");
+        A.isStringAttribute()) {
+      auto [MinS, MaxS] = A.getValueAsString().split(',');
+      unsigned Min, Max;
+      if (!MinS.trim().getAsInteger(0, Min) &&
+          !MaxS.trim().getAsInteger(0, Max)) {
+        FS.FlatWGSizeMin = Min;
+        FS.FlatWGSizeMax = Max;
+      }
+    }
+
+    // "min[,max]": the maximum is optional and only read if the minimum
+    // parsed.
+    if (Attribute A = F->getFnAttribute("amdgpu-waves-per-eu");
+        A.isStringAttribute()) {
+      auto [MinS, MaxS] = A.getValueAsString().split(',');
+      unsigned Min;
+      if (!MinS.trim().getAsInteger(0, Min)) {
+        FS.WavesPerEUMin = Min;
+        unsigned Max;
+        if (!MaxS.trim().empty() && !MaxS.trim().getAsInteger(0, Max))
+          FS.WavesPerEUMax = Max;
+      }
+    }
+
+    // "x,y,z": exactly three components, all of which must parse.
+    if (Attribute A = F->getFnAttribute("amdgpu-max-num-workgroups");
+        A.isStringAttribute()) {
+      SmallVector<StringRef, 3> Parts;
+      A.getValueAsString().split(Parts, ',');
+      if (Parts.size() == 3) {
+        unsigned X, Y, Z;
+        if (!Parts[0].trim().getAsInteger(0, X) &&
+            !Parts[1].trim().getAsInteger(0, Y) &&
+            !Parts[2].trim().getAsInteger(0, Z)) {
+          FS.MaxNumWGX = X;
+          FS.MaxNumWGY = Y;
+          FS.MaxNumWGZ = Z;
+        }
+      }
+    }
+
+    // The assignment replaces the previous contents, so no clear() is needed.
+    Record = {F->getGUID(),     FS.IsEntry,       FS.FlatWGSizeMin,
+              FS.FlatWGSizeMax, FS.WavesPerEUMin, FS.WavesPerEUMax,
+              FS.MaxNumWGX,     FS.MaxNumWGY,     FS.MaxNumWGZ};
+    Stream.EmitRecord(bitc::AMDGPU_SUMMARY_ENTRY, Record);
+  }
+
+  Stream.ExitBlock();
+}
+
 void ModuleBitcodeWriter::write() {
   writeIdentificationBlock(Stream);
 
@@ -5415,6 +5499,8 @@ void ModuleBitcodeWriter::write() {
   if (Index)
     writePerModuleGlobalValueSummary();
 
+  writeAMDGPUSummaryBlock();
+
   writeGlobalValueSymbolTable(FunctionToBitcodeIndex);
 
   writeModuleHash(Stream.getMarkedBufferAndResumeFlushing());
@@ -5613,7 +5699,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out,
     Writer.writeSymtab();
     Writer.writeStrtab();
   };
-  Triple TT(M.getTargetTriple());
+  const Triple &TT = M.getTargetTriple();
   if (TT.isOSDarwin() || TT.isOSBinFormatMachO()) {
     // If this is darwin or another generic macho target, reserve space for the
     // header. Note that the header is computed *after* the output is known, so
diff --git a/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll
new file mode 100644
index 000000000000..f31a4845d23b
--- /dev/null
+++ b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll
@@ -0,0 +1,47 @@
+; Verify that the AMDGPU_SUMMARY block round-trips through bitcode.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -module-summary %s -o %t.bc
+; RUN: llvm-bcanalyzer -dump %t.bc | FileCheck %s --check-prefix=BLOCK
+
+; Operand layout of every entry record:
+;   op0 = guid, op1 = is_entry,
+;   op2/op3 = flat-work-group-size min/max,
+;   op4/op5 = waves-per-eu min/max,
+;   op6/op7/op8 = max-num-workgroups x/y/z.
+; Entries are expected in the order the functions are defined below.
+
+; All attributes present.
+; BLOCK: <AMDGPU_SUMMARY_BLOCK
+; BLOCK-NEXT: <AMDGPU_SUMMARY_VERSION op0=1/>
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=64 op3=256 op4=2 op5=8 op6=16 op7=16 op8=1/>
+
+; Only flat-work-group-size — waves and max-workgroups use defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=128 op3=512 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/>
+
+; Only waves-per-eu — flat-work-group-size and max-workgroups use defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=4 op5=6 op6=4294967295 op7=4294967295 op8=4294967295/>
+
+; Bare kernel with no attributes — all defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/>
+; BLOCK-NEXT: </AMDGPU_SUMMARY_BLOCK>
+
+define amdgpu_kernel void @kernel_all(ptr %p) #0 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_wg_only(ptr %p) #1 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_waves_only(ptr %p) #2 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_bare(ptr %p) {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+; @device_func is not a kernel and carries none of the three attributes, so
+; it gets no entry. The closing tag must directly follow the fourth entry.
+define void @device_func(ptr %p) {
+  store i32 42, ptr %p
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-waves-per-eu"="2,8" "amdgpu-max-num-workgroups"="16,16,1" }
+attributes #1 = { "amdgpu-flat-work-group-size"="128,512" }
+attributes #2 = { "amdgpu-waves-per-eu"="4,6" }
author	Shilei Tian <i@tianshilei.me>	2026-04-28 00:33:42 -0400
committer	Shilei Tian <i@tianshilei.me>	2026-04-28 21:15:00 -0400
commit	630bff8a2248da1873f27060d17301b5a5606ebb (patch)
tree	b52eb68f1b6aefc18a1fce7d7d63d12aa62af817 /llvm
parent	383733ea8d15524517b0f1f15c8380c24f17407d (diff)
download	llvm-users/shiltian/amdgpu-thinlto-summary-block.tar.gz llvm-users/shiltian/amdgpu-thinlto-summary-block.tar.bz2 llvm-users/shiltian/amdgpu-thinlto-summary-block.zip