7 files changed, 289 insertions, 2 deletions
diff --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h
index 772ca8201927..7ff6f7de8e70 100644
--- a/llvm/include/llvm/Bitcode/BitcodeReader.h
+++ b/llvm/include/llvm/Bitcode/BitcodeReader.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Bitstream/BitCodeEnums.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/AMDGPUSummary.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
@@ -166,6 +167,10 @@ struct ParserCallbacks {
     LLVM_ABI Error
     readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath,
                 std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr);
+
+    /// Read the AMDGPU_SUMMARY block, if present, into \p Summaries; an entry
+    /// replaces any existing one with the same GUID. Returns false if absent.
+    LLVM_ABI Expected<bool> readAMDGPUSummary(AMDGPU::SummaryMap &Summaries);
   };
 
   struct BitcodeFileContents {
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 9162754bbfe1..e543966662c9 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -63,6 +63,8 @@ enum BlockIDs {
   SYMTAB_BLOCK_ID,
 
   SYNC_SCOPE_NAMES_BLOCK_ID,
+
+  AMDGPU_SUMMARY_BLOCK_ID,
 };
 
 /// Identification block contains a string that describes the producer details,
@@ -834,6 +836,14 @@ enum SymtabCodes {
   SYMTAB_BLOB = 1,
 };
 
+enum AMDGPUSummaryCodes {
+  // VERSION: [version]; the writer emits 1 and the reader rejects other values
+  AMDGPU_SUMMARY_VERSION = 1,
+  // ENTRY: [guid, is_entry, flat_wg_min, flat_wg_max,
+  //         waves_min, waves_max, max_wg_x, max_wg_y, max_wg_z]
+  AMDGPU_SUMMARY_ENTRY = 2,
+};
+
 } // End bitc namespace
 } // End llvm namespace
 
diff --git a/llvm/include/llvm/Support/AMDGPUSummary.h b/llvm/include/llvm/Support/AMDGPUSummary.h
new file mode 100644
index 000000000000..8bade6df7dd4
--- /dev/null
+++ b/llvm/include/llvm/Support/AMDGPUSummary.h
@@ -0,0 +1,46 @@
+//===- AMDGPUSummary.h - AMDGPU ThinLTO summary data ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Per-function AMDGPU summary information carried through ThinLTO for
+// cross-TU attribute propagation. Stored in the AMDGPU_SUMMARY bitcode
+// block, separate from the standard module summary, so that non-AMDGPU
+// targets are completely unaffected.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_AMDGPUSUMMARY_H
+#define LLVM_SUPPORT_AMDGPUSUMMARY_H
+
+#include "llvm/ADT/DenseMap.h"
+#include <cstdint>
+#include <limits>
+
+namespace llvm {
+namespace AMDGPU {
+
+struct FunctionSummary {
+  bool IsEntry = false; // AMDGPU_KERNEL or SPIR_KERNEL calling convention.
+  // From "amdgpu-flat-work-group-size"="min,max"; defaults if not parsed.
+  uint32_t FlatWGSizeMin = 1;
+  uint32_t FlatWGSizeMax = 1024;
+  // From "amdgpu-waves-per-eu"="min[,max]"; defaults if not parsed.
+  uint32_t WavesPerEUMin = 1;
+  uint32_t WavesPerEUMax = 10;
+  // From "amdgpu-max-num-workgroups"="x,y,z"; defaults if not parsed.
+  uint32_t MaxNumWGX = std::numeric_limits<uint32_t>::max();
+  uint32_t MaxNumWGY = std::numeric_limits<uint32_t>::max();
+  uint32_t MaxNumWGZ = std::numeric_limits<uint32_t>::max();
+};
+
+// Keyed by function GUID; uint64_t keeps IR headers out of this Support header.
+using SummaryMap = DenseMap<uint64_t, FunctionSummary>;
+
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_AMDGPUSUMMARY_H
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 911ec7501eb8..9a164e0ce1a7 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -81,6 +81,8 @@ GetBlockName(unsigned BlockID, const BitstreamBlockInfo &BlockInfo,
     return "STRTAB_BLOCK";
   case bitc::SYMTAB_BLOCK_ID:
     return "SYMTAB_BLOCK";
+  case bitc::AMDGPU_SUMMARY_BLOCK_ID:
+    return "AMDGPU_SUMMARY_BLOCK";
   }
 }
 
@@ -420,6 +422,15 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
     case bitc::SYMTAB_BLOB:
       return "BLOB";
     }
+  case bitc::AMDGPU_SUMMARY_BLOCK_ID:
+    switch (CodeID) {
+    default:
+      return std::nullopt;
+    case bitc::AMDGPU_SUMMARY_VERSION:
+      return "AMDGPU_SUMMARY_VERSION";
+    case bitc::AMDGPU_SUMMARY_ENTRY:
+      return "AMDGPU_SUMMARY_ENTRY";
+    }
   }
 #undef STRINGIFY_CODE
 }
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index fa7a3b214e46..ddae213bff45 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -8695,6 +8695,88 @@ Error BitcodeModule::readSummary(
   return R.parseModule();
 }
 
+// Locate the AMDGPU_SUMMARY block in this module and decode its ENTRY records.
+Expected<bool> BitcodeModule::readAMDGPUSummary(AMDGPU::SummaryMap &Summaries) {
+  BitstreamCursor Stream(Buffer);
+  if (Error JumpFailed = Stream.JumpToBit(ModuleBit))
+    return std::move(JumpFailed);
+  if (Error Err = Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
+    return std::move(Err);
+
+  // Skip module-level entries until the AMDGPU_SUMMARY block is reached.
+  for (bool Found = false; !Found;) {
+    Expected<BitstreamEntry> MaybeEntry = Stream.advance();
+    if (!MaybeEntry)
+      return MaybeEntry.takeError();
+    switch (MaybeEntry->Kind) {
+    case BitstreamEntry::Error:
+      return error("Malformed block");
+    case BitstreamEntry::EndBlock:
+      return false; // End of the module block: no AMDGPU summary present.
+    case BitstreamEntry::Record:
+      if (Expected<unsigned> Skipped = Stream.skipRecord(MaybeEntry->ID);
+          !Skipped)
+        return Skipped.takeError();
+      break;
+    case BitstreamEntry::SubBlock:
+      Found = MaybeEntry->ID == bitc::AMDGPU_SUMMARY_BLOCK_ID;
+      if (Found)
+        break;
+      if (Error Err = Stream.SkipBlock())
+        return std::move(Err);
+      break;
+    }
+  }
+
+  if (Error Err = Stream.EnterSubBlock(bitc::AMDGPU_SUMMARY_BLOCK_ID))
+    return std::move(Err);
+
+  SmallVector<uint64_t, 16> Record;
+  while (true) {
+    Expected<BitstreamEntry> MaybeRec = Stream.advanceSkippingSubblocks();
+    if (!MaybeRec)
+      return MaybeRec.takeError();
+    if (MaybeRec->Kind == BitstreamEntry::EndBlock)
+      return true;
+    if (MaybeRec->Kind != BitstreamEntry::Record)
+      return error("Expected record in AMDGPU_SUMMARY block");
+
+    Record.clear();
+    Expected<unsigned> MaybeCode = Stream.readRecord(MaybeRec->ID, Record);
+    if (!MaybeCode)
+      return MaybeCode.takeError();
+
+    switch (MaybeCode.get()) {
+    case bitc::AMDGPU_SUMMARY_VERSION:
+      if (Record.size() < 1 || Record[0] != 1)
+        return error("Unsupported AMDGPU summary version");
+      break;
+    case bitc::AMDGPU_SUMMARY_ENTRY: {
+      if (Record.size() < 9)
+        return error("Invalid AMDGPU summary entry");
+      // Operands 2..8 are stored as uint32_t; reject instead of truncating.
+      for (unsigned I = 2; I != 9; ++I) {
+        if (Record[I] > UINT32_MAX)
+          return error("Invalid AMDGPU summary entry");
+      }
+      AMDGPU::FunctionSummary FS;
+      FS.IsEntry = Record[1] != 0;
+      FS.FlatWGSizeMin = Record[2];
+      FS.FlatWGSizeMax = Record[3];
+      FS.WavesPerEUMin = Record[4];
+      FS.WavesPerEUMax = Record[5];
+      FS.MaxNumWGX = Record[6];
+      FS.MaxNumWGY = Record[7];
+      FS.MaxNumWGZ = Record[8];
+      Summaries[Record[0]] = FS;
+      break;
+    }
+    default:
+      break;
+    }
+  }
+}
+
 // Parse the specified bitcode buffer, returning the function info index.
 Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() {
   BitstreamCursor Stream(Buffer);
@@ -8725,7 +8807,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return error("Malformed block");
+      return error("malformed block");
     case BitstreamEntry::EndBlock: {
       // If no flags record found, return both flags as false.
       return std::make_pair(false, false);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 7153b1a0000f..84de4e7575c4 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Object/IRSymtab.h"
 #include "llvm/ProfileData/MemProf.h"
 #include "llvm/ProfileData/MemProfRadixTree.h"
+#include "llvm/Support/AMDGPUSummary.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -450,6 +451,7 @@ private:
                 DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex);
   void writeBlockInfo();
   void writeModuleHash(StringRef View);
+  void writeAMDGPUSummaryBlock();
 
   unsigned getEncodedSyncScopeID(SyncScope::ID SSID) {
     return unsigned(SSID);
@@ -5360,6 +5362,88 @@ void ModuleBitcodeWriter::writeModuleHash(StringRef View) {
   }
 }
 
+/// Emit the AMDGPU_SUMMARY block for amdgcn/AMDHSA modules: a VERSION record
+/// followed by one ENTRY record per defined kernel or attributed function.
+void ModuleBitcodeWriter::writeAMDGPUSummaryBlock() {
+  // Object linking is only supported on AMDHSA platforms.
+  const Triple &TT = M.getTargetTriple();
+  if (TT.getArch() != Triple::amdgcn || TT.getOS() != Triple::AMDHSA)
+    return;
+
+  auto IsKernel = [](const Function &F) {
+    CallingConv::ID CC = F.getCallingConv();
+    return CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
+  };
+
+  // Collect defined kernels and defined functions with a recorded attribute.
+  SmallVector<const Function *, 8> Worklist;
+  for (const Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    if (IsKernel(F) || F.hasFnAttribute("amdgpu-flat-work-group-size") ||
+        F.hasFnAttribute("amdgpu-waves-per-eu") ||
+        F.hasFnAttribute("amdgpu-max-num-workgroups"))
+      Worklist.push_back(&F);
+  }
+  if (Worklist.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::AMDGPU_SUMMARY_BLOCK_ID, 4);
+  SmallVector<uint64_t, 10> Record;
+  Record.push_back(1);
+  Stream.EmitRecord(bitc::AMDGPU_SUMMARY_VERSION, Record);
+
+  for (const Function *F : Worklist) {
+    AMDGPU::FunctionSummary FS;
+    FS.IsEntry = IsKernel(*F);
+
+    // "min,max": both must parse, otherwise the defaults are kept.
+    if (Attribute A = F->getFnAttribute("amdgpu-flat-work-group-size");
+        A.isStringAttribute()) {
+      auto [MinS, MaxS] = A.getValueAsString().split(',');
+      unsigned Min = 0, Max = 0;
+      if (!MinS.trim().getAsInteger(0, Min) &&
+          !MaxS.trim().getAsInteger(0, Max)) {
+        FS.FlatWGSizeMin = Min;
+        FS.FlatWGSizeMax = Max;
+      }
+    }
+
+    // "min[,max]": max is optional and only consulted once min has parsed.
+    if (Attribute A = F->getFnAttribute("amdgpu-waves-per-eu");
+        A.isStringAttribute()) {
+      auto [MinS, MaxS] = A.getValueAsString().split(',');
+      unsigned Min = 0, Max = 0;
+      if (!MinS.trim().getAsInteger(0, Min)) {
+        FS.WavesPerEUMin = Min;
+        if (!MaxS.trim().empty() && !MaxS.trim().getAsInteger(0, Max))
+          FS.WavesPerEUMax = Max;
+      }
+    }
+
+    // "x,y,z": exactly three components, all of which must parse.
+    if (Attribute A = F->getFnAttribute("amdgpu-max-num-workgroups");
+        A.isStringAttribute()) {
+      SmallVector<StringRef, 3> Parts;
+      A.getValueAsString().split(Parts, ',');
+      unsigned X = 0, Y = 0, Z = 0;
+      if (Parts.size() == 3 && !Parts[0].trim().getAsInteger(0, X) &&
+          !Parts[1].trim().getAsInteger(0, Y) &&
+          !Parts[2].trim().getAsInteger(0, Z)) {
+        FS.MaxNumWGX = X;
+        FS.MaxNumWGY = Y;
+        FS.MaxNumWGZ = Z;
+      }
+    }
+
+    Record = {F->getGUID(),     FS.IsEntry,       FS.FlatWGSizeMin,
+              FS.FlatWGSizeMax, FS.WavesPerEUMin, FS.WavesPerEUMax,
+              FS.MaxNumWGX,     FS.MaxNumWGY,     FS.MaxNumWGZ};
+    Stream.EmitRecord(bitc::AMDGPU_SUMMARY_ENTRY, Record);
+  }
+  Stream.ExitBlock();
+}
+
 void ModuleBitcodeWriter::write() {
   writeIdentificationBlock(Stream);
 
@@ -5415,6 +5499,8 @@ void ModuleBitcodeWriter::write() {
   if (Index)
     writePerModuleGlobalValueSummary();
 
+  writeAMDGPUSummaryBlock();
+
   writeGlobalValueSymbolTable(FunctionToBitcodeIndex);
 
   writeModuleHash(Stream.getMarkedBufferAndResumeFlushing());
@@ -5613,7 +5699,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out,
     Writer.writeSymtab();
     Writer.writeStrtab();
   };
-  Triple TT(M.getTargetTriple());
+  const Triple &TT = M.getTargetTriple();
   if (TT.isOSDarwin() || TT.isOSBinFormatMachO()) {
     // If this is darwin or another generic macho target, reserve space for the
     // header. Note that the header is computed *after* the output is known, so
diff --git a/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll
new file mode 100644
index 000000000000..f31a4845d23b
--- /dev/null
+++ b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll
@@ -0,0 +1,47 @@
+; Verify the AMDGPU_SUMMARY block that the bitcode writer emits (llvm-bcanalyzer dump).
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -module-summary %s -o %t.bc
+; RUN: llvm-bcanalyzer -dump %t.bc | FileCheck %s --check-prefix=BLOCK
+
+; All attributes present.
+; BLOCK: <AMDGPU_SUMMARY_BLOCK
+; BLOCK-NEXT: <AMDGPU_SUMMARY_VERSION op0=1/>
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=64 op3=256 op4=2 op5=8 op6=16 op7=16 op8=1/>
+
+; Only flat-work-group-size — waves and max-workgroups use defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=128 op3=512 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/>
+
+; Only waves-per-eu — flat-work-group-size and max-workgroups use defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=4 op5=6 op6=4294967295 op7=4294967295 op8=4294967295/>
+
+; Bare kernel with no attributes — all defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/>
+; BLOCK-NEXT: </AMDGPU_SUMMARY_BLOCK>
+
+define amdgpu_kernel void @kernel_all(ptr %p) #0 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_wg_only(ptr %p) #1 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_waves_only(ptr %p) #2 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_bare(ptr %p) {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define void @device_func(ptr %p) {
+  store i32 42, ptr %p
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-waves-per-eu"="2,8" "amdgpu-max-num-workgroups"="16,16,1" }
+attributes #1 = { "amdgpu-flat-work-group-size"="128,512" }
+attributes #2 = { "amdgpu-waves-per-eu"="4,6" }