aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--llvm/include/llvm/Bitcode/BitcodeReader.h5
-rw-r--r--llvm/include/llvm/Bitcode/LLVMBitCodes.h10
-rw-r--r--llvm/include/llvm/Support/AMDGPUSummary.h46
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp11
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeReader.cpp84
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp88
-rw-r--r--llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll47
7 files changed, 289 insertions, 2 deletions
diff --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h
index 772ca8201927..7ff6f7de8e70 100644
--- a/llvm/include/llvm/Bitcode/BitcodeReader.h
+++ b/llvm/include/llvm/Bitcode/BitcodeReader.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Bitstream/BitCodeEnums.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/AMDGPUSummary.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
@@ -166,6 +167,10 @@ struct ParserCallbacks {
LLVM_ABI Error
readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath,
std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr);
+
+ /// Read the AMDGPU_SUMMARY block (if present) and store its per-function
+ /// occupancy data in \p Summaries, replacing any existing entry that has
+ /// the same GUID. Returns true if the block was found and parsed, false if
+ /// no block was found, and an Error if the bitcode is malformed or the
+ /// summary version is unsupported.
+ LLVM_ABI Expected<bool> readAMDGPUSummary(AMDGPU::SummaryMap &Summaries);
};
struct BitcodeFileContents {
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 9162754bbfe1..e543966662c9 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -63,6 +63,8 @@ enum BlockIDs {
SYMTAB_BLOCK_ID,
SYNC_SCOPE_NAMES_BLOCK_ID,
+
+ AMDGPU_SUMMARY_BLOCK_ID,
};
/// Identification block contains a string that describes the producer details,
@@ -834,6 +836,14 @@ enum SymtabCodes {
SYMTAB_BLOB = 1,
};
+/// Record codes used inside an AMDGPU_SUMMARY_BLOCK_ID block.
+enum AMDGPUSummaryCodes {
+ // [version]
+ // BitcodeModule::readAMDGPUSummary rejects any version other than 1.
+ AMDGPU_SUMMARY_VERSION = 1,
+ // [guid, is_entry, flat_wg_min, flat_wg_max,
+ // waves_min, waves_max, max_wg_x, max_wg_y, max_wg_z]
+ // The reader requires at least these nine operands, in this order.
+ AMDGPU_SUMMARY_ENTRY = 2,
+};
+
} // End bitc namespace
} // End llvm namespace
diff --git a/llvm/include/llvm/Support/AMDGPUSummary.h b/llvm/include/llvm/Support/AMDGPUSummary.h
new file mode 100644
index 000000000000..8bade6df7dd4
--- /dev/null
+++ b/llvm/include/llvm/Support/AMDGPUSummary.h
@@ -0,0 +1,46 @@
+//===- AMDGPUSummary.h - AMDGPU ThinLTO summary data ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Per-function AMDGPU summary information carried through ThinLTO for
+// cross-TU attribute propagation. Stored in the AMDGPU_SUMMARY bitcode
+// block, separate from the standard module summary, so that non-AMDGPU
+// targets are completely unaffected.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_AMDGPUSUMMARY_H
+#define LLVM_SUPPORT_AMDGPUSUMMARY_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/GlobalValue.h"
+#include <cstdint>
+#include <limits>
+
+namespace llvm {
+namespace AMDGPU {
+
+/// Per-function data stored in one AMDGPU_SUMMARY_ENTRY record. The member
+/// initializers are the values the bitcode writer emits when the matching
+/// function attribute is absent or cannot be parsed.
+struct FunctionSummary {
+ /// Set by the bitcode writer for functions whose calling convention is
+ /// AMDGPU_KERNEL or SPIR_KERNEL.
+ bool IsEntry = false;
+
+ /// Minimum/maximum parsed from "amdgpu-flat-work-group-size"="min,max".
+ uint32_t FlatWGSizeMin = 1;
+ uint32_t FlatWGSizeMax = 1024;
+
+ /// Minimum/maximum parsed from "amdgpu-waves-per-eu"="min[,max]".
+ uint32_t WavesPerEUMin = 1;
+ uint32_t WavesPerEUMax = 10;
+
+ /// X/Y/Z values parsed from "amdgpu-max-num-workgroups"="x,y,z".
+ uint32_t MaxNumWGX = std::numeric_limits<uint32_t>::max();
+ uint32_t MaxNumWGY = std::numeric_limits<uint32_t>::max();
+ uint32_t MaxNumWGZ = std::numeric_limits<uint32_t>::max();
+};
+
+/// Per-function summaries keyed by the function's GUID.
+using SummaryMap = DenseMap<GlobalValue::GUID, FunctionSummary>;
+
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_AMDGPUSUMMARY_H
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 911ec7501eb8..9a164e0ce1a7 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -81,6 +81,8 @@ GetBlockName(unsigned BlockID, const BitstreamBlockInfo &BlockInfo,
return "STRTAB_BLOCK";
case bitc::SYMTAB_BLOCK_ID:
return "SYMTAB_BLOCK";
+ case bitc::AMDGPU_SUMMARY_BLOCK_ID:
+ return "AMDGPU_SUMMARY_BLOCK";
}
}
@@ -420,6 +422,15 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
case bitc::SYMTAB_BLOB:
return "BLOB";
}
+ case bitc::AMDGPU_SUMMARY_BLOCK_ID:
+ switch (CodeID) {
+ default:
+ return std::nullopt;
+ case bitc::AMDGPU_SUMMARY_VERSION:
+ return "AMDGPU_SUMMARY_VERSION";
+ case bitc::AMDGPU_SUMMARY_ENTRY:
+ return "AMDGPU_SUMMARY_ENTRY";
+ }
}
#undef STRINGIFY_CODE
}
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index fa7a3b214e46..ddae213bff45 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -8695,6 +8695,88 @@ Error BitcodeModule::readSummary(
return R.parseModule();
}
+/// Locate the AMDGPU_SUMMARY block inside this module's MODULE_BLOCK and copy
+/// every AMDGPU_SUMMARY_ENTRY record into \p Summaries, keyed by GUID. An
+/// existing entry with the same GUID is overwritten.
+///
+/// \returns true once the block has been parsed to its end, false if the
+/// module block ends without containing one, or an Error if the bitstream is
+/// malformed, the version record is not 1, or an entry is too short or holds
+/// a value that does not fit the 32-bit summary fields. Entries stored before
+/// an error is detected remain in \p Summaries.
+Expected<bool> BitcodeModule::readAMDGPUSummary(AMDGPU::SummaryMap &Summaries) {
+  BitstreamCursor Stream(Buffer);
+  if (Error JumpFailed = Stream.JumpToBit(ModuleBit))
+    return std::move(JumpFailed);
+
+  if (Error Err = Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
+    return std::move(Err);
+
+  // Scan the module block's entries for AMDGPU_SUMMARY_BLOCK_ID, skipping
+  // every record and every other sub-block.
+  while (true) {
+    Expected<llvm::BitstreamEntry> MaybeEntry = Stream.advance();
+    if (!MaybeEntry)
+      return MaybeEntry.takeError();
+    BitstreamEntry Entry = MaybeEntry.get();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::Error:
+      return error("Malformed block");
+    case BitstreamEntry::EndBlock:
+      // Reached the end of the module block without finding the summary.
+      return false;
+    case BitstreamEntry::Record:
+      if (Expected<unsigned> Skipped = Stream.skipRecord(Entry.ID); !Skipped)
+        return Skipped.takeError();
+      continue;
+    case BitstreamEntry::SubBlock:
+      if (Entry.ID == bitc::AMDGPU_SUMMARY_BLOCK_ID)
+        break; // Leaves the switch only; parsing continues below.
+      if (Error Err = Stream.SkipBlock())
+        return std::move(Err);
+      continue;
+    }
+
+    // Found AMDGPU_SUMMARY_BLOCK_ID - parse it. Every path through the inner
+    // loop returns, so the outer scan never resumes after this point.
+    if (Error Err = Stream.EnterSubBlock(bitc::AMDGPU_SUMMARY_BLOCK_ID))
+      return std::move(Err);
+
+    SmallVector<uint64_t, 16> Record;
+    while (true) {
+      Expected<BitstreamEntry> MaybeRec = Stream.advanceSkippingSubblocks();
+      if (!MaybeRec)
+        return MaybeRec.takeError();
+      BitstreamEntry Rec = MaybeRec.get();
+
+      if (Rec.Kind == BitstreamEntry::EndBlock)
+        return true;
+      if (Rec.Kind != BitstreamEntry::Record)
+        return error("Expected record in AMDGPU_SUMMARY block");
+
+      Record.clear();
+      Expected<unsigned> MaybeCode = Stream.readRecord(Rec.ID, Record);
+      if (!MaybeCode)
+        return MaybeCode.takeError();
+
+      switch (MaybeCode.get()) {
+      case bitc::AMDGPU_SUMMARY_VERSION:
+        if (Record.size() < 1 || Record[0] != 1)
+          return error("Unsupported AMDGPU summary version");
+        break;
+      case bitc::AMDGPU_SUMMARY_ENTRY: {
+        if (Record.size() < 9)
+          return error("Invalid AMDGPU summary entry");
+
+        // Operands 2..8 land in uint32_t fields; reject anything wider rather
+        // than silently truncating it.
+        for (unsigned Idx = 2; Idx != 9; ++Idx)
+          if (Record[Idx] > UINT32_MAX)
+            return error("Invalid AMDGPU summary entry");
+
+        GlobalValue::GUID GUID = Record[0];
+        AMDGPU::FunctionSummary FS;
+        FS.IsEntry = Record[1] != 0;
+        FS.FlatWGSizeMin = static_cast<uint32_t>(Record[2]);
+        FS.FlatWGSizeMax = static_cast<uint32_t>(Record[3]);
+        FS.WavesPerEUMin = static_cast<uint32_t>(Record[4]);
+        FS.WavesPerEUMax = static_cast<uint32_t>(Record[5]);
+        FS.MaxNumWGX = static_cast<uint32_t>(Record[6]);
+        FS.MaxNumWGY = static_cast<uint32_t>(Record[7]);
+        FS.MaxNumWGZ = static_cast<uint32_t>(Record[8]);
+        Summaries[GUID] = FS;
+        break;
+      }
+      default:
+        // Unknown record codes are ignored.
+        break;
+      }
+    }
+  }
+}
+
// Parse the specified bitcode buffer, returning the function info index.
Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() {
BitstreamCursor Stream(Buffer);
@@ -8725,7 +8807,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- return error("Malformed block");
+ return error("malformed block");
case BitstreamEntry::EndBlock: {
// If no flags record found, return both flags as false.
return std::make_pair(false, false);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 7153b1a0000f..84de4e7575c4 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -64,6 +64,7 @@
#include "llvm/Object/IRSymtab.h"
#include "llvm/ProfileData/MemProf.h"
#include "llvm/ProfileData/MemProfRadixTree.h"
+#include "llvm/Support/AMDGPUSummary.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -450,6 +451,7 @@ private:
DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex);
void writeBlockInfo();
void writeModuleHash(StringRef View);
+ void writeAMDGPUSummaryBlock();
unsigned getEncodedSyncScopeID(SyncScope::ID SSID) {
return unsigned(SSID);
@@ -5360,6 +5362,88 @@ void ModuleBitcodeWriter::writeModuleHash(StringRef View) {
}
}
+/// Emit the AMDGPU_SUMMARY block for modules whose triple is amdgcn with the
+/// AMDHSA OS; other modules get no block.
+///
+/// One AMDGPU_SUMMARY_ENTRY record is written for every defined function that
+/// is a kernel (AMDGPU_KERNEL or SPIR_KERNEL calling convention) or carries
+/// any of the "amdgpu-flat-work-group-size", "amdgpu-waves-per-eu" or
+/// "amdgpu-max-num-workgroups" attributes. Attribute values that are absent
+/// or fail to parse leave the corresponding AMDGPU::FunctionSummary defaults
+/// in place. Nothing is written when no function qualifies.
+void ModuleBitcodeWriter::writeAMDGPUSummaryBlock() {
+  // Bind by reference, as WriteBitcodeToFile does, instead of copying.
+  const Triple &TT = M.getTargetTriple();
+  // Object linking is only supported on AMDHSA platforms.
+  if (TT.getArch() != Triple::amdgcn || TT.getOS() != Triple::AMDHSA)
+    return;
+
+  // Shared by the collection pass and the IsEntry field below.
+  auto IsKernel = [](const Function &F) {
+    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+           F.getCallingConv() == CallingConv::SPIR_KERNEL;
+  };
+
+  SmallVector<const Function *, 8> Worklist;
+  for (const Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    if (IsKernel(F) || F.hasFnAttribute("amdgpu-flat-work-group-size") ||
+        F.hasFnAttribute("amdgpu-waves-per-eu") ||
+        F.hasFnAttribute("amdgpu-max-num-workgroups"))
+      Worklist.push_back(&F);
+  }
+  if (Worklist.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::AMDGPU_SUMMARY_BLOCK_ID, 4);
+
+  SmallVector<uint64_t, 10> Record;
+  Record.push_back(1);
+  Stream.EmitRecord(bitc::AMDGPU_SUMMARY_VERSION, Record);
+
+  for (const Function *F : Worklist) {
+    AMDGPU::FunctionSummary FS;
+    FS.IsEntry = IsKernel(*F);
+
+    // "min,max": both halves must parse, otherwise the defaults are kept.
+    if (Attribute A = F->getFnAttribute("amdgpu-flat-work-group-size");
+        A.isStringAttribute()) {
+      auto [MinS, MaxS] = A.getValueAsString().split(',');
+      unsigned Min, Max;
+      if (!MinS.trim().getAsInteger(0, Min) &&
+          !MaxS.trim().getAsInteger(0, Max)) {
+        FS.FlatWGSizeMin = Min;
+        FS.FlatWGSizeMax = Max;
+      }
+    }
+
+    // "min[,max]": the maximum is optional and is only read once the minimum
+    // has parsed.
+    if (Attribute A = F->getFnAttribute("amdgpu-waves-per-eu");
+        A.isStringAttribute()) {
+      auto [MinS, MaxS] = A.getValueAsString().split(',');
+      unsigned Min;
+      if (!MinS.trim().getAsInteger(0, Min)) {
+        FS.WavesPerEUMin = Min;
+        unsigned Max;
+        if (!MaxS.trim().empty() && !MaxS.trim().getAsInteger(0, Max))
+          FS.WavesPerEUMax = Max;
+      }
+    }
+
+    // "x,y,z": exactly three components, all of which must parse.
+    if (Attribute A = F->getFnAttribute("amdgpu-max-num-workgroups");
+        A.isStringAttribute()) {
+      SmallVector<StringRef, 3> Parts;
+      A.getValueAsString().split(Parts, ',');
+      if (Parts.size() == 3) {
+        unsigned X, Y, Z;
+        if (!Parts[0].trim().getAsInteger(0, X) &&
+            !Parts[1].trim().getAsInteger(0, Y) &&
+            !Parts[2].trim().getAsInteger(0, Z)) {
+          FS.MaxNumWGX = X;
+          FS.MaxNumWGY = Y;
+          FS.MaxNumWGZ = Z;
+        }
+      }
+    }
+
+    // Operand order must match what BitcodeModule::readAMDGPUSummary expects.
+    Record = {F->getGUID(), FS.IsEntry, FS.FlatWGSizeMin,
+              FS.FlatWGSizeMax, FS.WavesPerEUMin, FS.WavesPerEUMax,
+              FS.MaxNumWGX, FS.MaxNumWGY, FS.MaxNumWGZ};
+    Stream.EmitRecord(bitc::AMDGPU_SUMMARY_ENTRY, Record);
+  }
+
+  Stream.ExitBlock();
+}
+
void ModuleBitcodeWriter::write() {
writeIdentificationBlock(Stream);
@@ -5415,6 +5499,8 @@ void ModuleBitcodeWriter::write() {
if (Index)
writePerModuleGlobalValueSummary();
+ writeAMDGPUSummaryBlock();
+
writeGlobalValueSymbolTable(FunctionToBitcodeIndex);
writeModuleHash(Stream.getMarkedBufferAndResumeFlushing());
@@ -5613,7 +5699,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out,
Writer.writeSymtab();
Writer.writeStrtab();
};
- Triple TT(M.getTargetTriple());
+ const Triple &TT = M.getTargetTriple();
if (TT.isOSDarwin() || TT.isOSBinFormatMachO()) {
// If this is darwin or another generic macho target, reserve space for the
// header. Note that the header is computed *after* the output is known, so
diff --git a/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll
new file mode 100644
index 000000000000..f31a4845d23b
--- /dev/null
+++ b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll
@@ -0,0 +1,47 @@
+; Verify that opt emits the AMDGPU_SUMMARY block and that llvm-bcanalyzer dumps the expected records.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -module-summary %s -o %t.bc
+; RUN: llvm-bcanalyzer -dump %t.bc | FileCheck %s --check-prefix=BLOCK
+
+; All attributes present.
+; BLOCK: <AMDGPU_SUMMARY_BLOCK
+; BLOCK-NEXT: <AMDGPU_SUMMARY_VERSION op0=1/>
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=64 op3=256 op4=2 op5=8 op6=16 op7=16 op8=1/>
+
+; Only flat-work-group-size — waves and max-workgroups use defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=128 op3=512 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/>
+
+; Only waves-per-eu — flat-work-group-size and max-workgroups use defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=4 op5=6 op6=4294967295 op7=4294967295 op8=4294967295/>
+
+; Bare kernel with no attributes — all defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/>
+; BLOCK-NEXT: </AMDGPU_SUMMARY_BLOCK>
+
+define amdgpu_kernel void @kernel_all(ptr %p) #0 {
+ call void @device_func(ptr %p)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_wg_only(ptr %p) #1 {
+ call void @device_func(ptr %p)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_waves_only(ptr %p) #2 {
+ call void @device_func(ptr %p)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_bare(ptr %p) {
+ call void @device_func(ptr %p)
+ ret void
+}
+
+define void @device_func(ptr %p) {
+ store i32 42, ptr %p
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-waves-per-eu"="2,8" "amdgpu-max-num-workgroups"="16,16,1" }
+attributes #1 = { "amdgpu-flat-work-group-size"="128,512" }
+attributes #2 = { "amdgpu-waves-per-eu"="4,6" }