diff options
| -rw-r--r-- | llvm/include/llvm/Bitcode/BitcodeReader.h | 5 | ||||
| -rw-r--r-- | llvm/include/llvm/Bitcode/LLVMBitCodes.h | 10 | ||||
| -rw-r--r-- | llvm/include/llvm/Support/AMDGPUSummary.h | 46 | ||||
| -rw-r--r-- | llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 11 | ||||
| -rw-r--r-- | llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 84 | ||||
| -rw-r--r-- | llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 88 | ||||
| -rw-r--r-- | llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll | 47 |
7 files changed, 289 insertions, 2 deletions
diff --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h index 772ca8201927..7ff6f7de8e70 100644 --- a/llvm/include/llvm/Bitcode/BitcodeReader.h +++ b/llvm/include/llvm/Bitcode/BitcodeReader.h @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Bitstream/BitCodeEnums.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/Support/AMDGPUSummary.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" @@ -166,6 +167,10 @@ struct ParserCallbacks { LLVM_ABI Error readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath, std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr); + + /// Read the AMDGPU_SUMMARY block (if present) and merge per-function + /// occupancy data into \p Summaries. Returns false if no block was found. + LLVM_ABI Expected<bool> readAMDGPUSummary(AMDGPU::SummaryMap &Summaries); }; struct BitcodeFileContents { diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 9162754bbfe1..e543966662c9 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -63,6 +63,8 @@ enum BlockIDs { SYMTAB_BLOCK_ID, SYNC_SCOPE_NAMES_BLOCK_ID, + + AMDGPU_SUMMARY_BLOCK_ID, }; /// Identification block contains a string that describes the producer details, @@ -834,6 +836,14 @@ enum SymtabCodes { SYMTAB_BLOB = 1, }; +enum AMDGPUSummaryCodes { + // [version] + AMDGPU_SUMMARY_VERSION = 1, + // [guid, is_entry, flat_wg_min, flat_wg_max, + // waves_min, waves_max, max_wg_x, max_wg_y, max_wg_z] + AMDGPU_SUMMARY_ENTRY = 2, +}; + } // End bitc namespace } // End llvm namespace diff --git a/llvm/include/llvm/Support/AMDGPUSummary.h b/llvm/include/llvm/Support/AMDGPUSummary.h new file mode 100644 index 000000000000..8bade6df7dd4 --- /dev/null +++ b/llvm/include/llvm/Support/AMDGPUSummary.h @@ -0,0 +1,46 @@ +//===- AMDGPUSummary.h - AMDGPU ThinLTO summary data ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Per-function AMDGPU summary information carried through ThinLTO for +// cross-TU attribute propagation. Stored in the AMDGPU_SUMMARY bitcode +// block, separate from the standard module summary, so that non-AMDGPU +// targets are completely unaffected. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_AMDGPUSUMMARY_H +#define LLVM_SUPPORT_AMDGPUSUMMARY_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/GlobalValue.h" +#include <cstdint> +#include <limits> + +namespace llvm { +namespace AMDGPU { + +struct FunctionSummary { + bool IsEntry = false; + + uint32_t FlatWGSizeMin = 1; + uint32_t FlatWGSizeMax = 1024; + + uint32_t WavesPerEUMin = 1; + uint32_t WavesPerEUMax = 10; + + uint32_t MaxNumWGX = std::numeric_limits<uint32_t>::max(); + uint32_t MaxNumWGY = std::numeric_limits<uint32_t>::max(); + uint32_t MaxNumWGZ = std::numeric_limits<uint32_t>::max(); +}; + +using SummaryMap = DenseMap<GlobalValue::GUID, FunctionSummary>; + +} // namespace AMDGPU +} // namespace llvm + +#endif // LLVM_SUPPORT_AMDGPUSUMMARY_H diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index 911ec7501eb8..9a164e0ce1a7 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -81,6 +81,8 @@ GetBlockName(unsigned BlockID, const BitstreamBlockInfo &BlockInfo, return "STRTAB_BLOCK"; case bitc::SYMTAB_BLOCK_ID: return "SYMTAB_BLOCK"; + case bitc::AMDGPU_SUMMARY_BLOCK_ID: + return "AMDGPU_SUMMARY_BLOCK"; } } @@ -420,6 +422,15 @@ GetCodeName(unsigned CodeID, unsigned BlockID, case bitc::SYMTAB_BLOB: return "BLOB"; } + case bitc::AMDGPU_SUMMARY_BLOCK_ID: + switch (CodeID) { + default: + return std::nullopt; + case bitc::AMDGPU_SUMMARY_VERSION: + return "AMDGPU_SUMMARY_VERSION"; + case bitc::AMDGPU_SUMMARY_ENTRY: + return "AMDGPU_SUMMARY_ENTRY"; + } } #undef STRINGIFY_CODE } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index fa7a3b214e46..ddae213bff45 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -8695,6 +8695,88 @@ Error BitcodeModule::readSummary( return R.parseModule(); } +Expected<bool> BitcodeModule::readAMDGPUSummary(AMDGPU::SummaryMap &Summaries) { + BitstreamCursor Stream(Buffer); + if (Error JumpFailed = Stream.JumpToBit(ModuleBit)) + return std::move(JumpFailed); + + if (Error Err = Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) + return std::move(Err); + + // Scan sub-blocks to find AMDGPU_SUMMARY_BLOCK_ID. + while (true) { + Expected<llvm::BitstreamEntry> MaybeEntry = Stream.advance(); + if (!MaybeEntry) + return MaybeEntry.takeError(); + BitstreamEntry Entry = MaybeEntry.get(); + + switch (Entry.Kind) { + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return false; + case BitstreamEntry::Record: + if (Expected<unsigned> Skipped = Stream.skipRecord(Entry.ID); !Skipped) + return Skipped.takeError(); + continue; + case BitstreamEntry::SubBlock: + if (Entry.ID == bitc::AMDGPU_SUMMARY_BLOCK_ID) + break; + if (Error Err = Stream.SkipBlock()) + return std::move(Err); + continue; + } + + // Found AMDGPU_SUMMARY_BLOCK_ID — parse it. + if (Error Err = Stream.EnterSubBlock(bitc::AMDGPU_SUMMARY_BLOCK_ID)) + return std::move(Err); + + SmallVector<uint64_t, 16> Record; + while (true) { + Expected<BitstreamEntry> MaybeRec = Stream.advanceSkippingSubblocks(); + if (!MaybeRec) + return MaybeRec.takeError(); + BitstreamEntry Rec = MaybeRec.get(); + + if (Rec.Kind == BitstreamEntry::EndBlock) + return true; + if (Rec.Kind != BitstreamEntry::Record) + return error("Expected record in AMDGPU_SUMMARY block"); + + Record.clear(); + Expected<unsigned> MaybeCode = Stream.readRecord(Rec.ID, Record); + if (!MaybeCode) + return MaybeCode.takeError(); + + switch (MaybeCode.get()) { + case bitc::AMDGPU_SUMMARY_VERSION: + if (Record.size() < 1 || Record[0] != 1) + return error("Unsupported AMDGPU summary version"); + break; + case bitc::AMDGPU_SUMMARY_ENTRY: { + if (Record.size() < 9) + return error("Invalid AMDGPU summary entry"); + + GlobalValue::GUID GUID = Record[0]; + AMDGPU::FunctionSummary FS; + FS.IsEntry = Record[1] != 0; + FS.FlatWGSizeMin = Record[2]; + FS.FlatWGSizeMax = Record[3]; + FS.WavesPerEUMin = Record[4]; + FS.WavesPerEUMax = Record[5]; + FS.MaxNumWGX = Record[6]; + FS.MaxNumWGY = Record[7]; + FS.MaxNumWGZ = Record[8]; + Summaries[GUID] = FS; + break; + } + default: + break; + } + } + } +} + // Parse the specified bitcode buffer, returning the function info index. Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { BitstreamCursor Stream(Buffer); @@ -8725,7 +8807,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { switch (Entry.Kind) { case BitstreamEntry::SubBlock: // Handled for us already. case BitstreamEntry::Error: - return error("Malformed block"); + return error("malformed block"); case BitstreamEntry::EndBlock: { // If no flags record found, return both flags as false. return std::make_pair(false, false); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 7153b1a0000f..84de4e7575c4 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -64,6 +64,7 @@ #include "llvm/Object/IRSymtab.h" #include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/MemProfRadixTree.h" +#include "llvm/Support/AMDGPUSummary.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -450,6 +451,7 @@ private: DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex); void writeBlockInfo(); void writeModuleHash(StringRef View); + void writeAMDGPUSummaryBlock(); unsigned getEncodedSyncScopeID(SyncScope::ID SSID) { return unsigned(SSID); @@ -5360,6 +5362,88 @@ void ModuleBitcodeWriter::writeModuleHash(StringRef View) { } } +void ModuleBitcodeWriter::writeAMDGPUSummaryBlock() { + Triple TT(M.getTargetTriple()); + // Object linking is only supported on AMDHSA platforms. + if (TT.getArch() != Triple::amdgcn || TT.getOS() != Triple::AMDHSA) + return; + + SmallVector<const Function *, 8> Worklist; + for (const Function &F : M) { + if (F.isDeclaration()) + continue; + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL || + F.hasFnAttribute("amdgpu-flat-work-group-size") || + F.hasFnAttribute("amdgpu-waves-per-eu") || + F.hasFnAttribute("amdgpu-max-num-workgroups")) + Worklist.push_back(&F); + } + if (Worklist.empty()) + return; + + Stream.EnterSubblock(bitc::AMDGPU_SUMMARY_BLOCK_ID, 4); + + SmallVector<uint64_t, 10> Record; + Record.push_back(1); + Stream.EmitRecord(bitc::AMDGPU_SUMMARY_VERSION, Record); + + for (const Function *F : Worklist) { + bool IsEntry = (F->getCallingConv() == CallingConv::AMDGPU_KERNEL || + F->getCallingConv() == CallingConv::SPIR_KERNEL); + + AMDGPU::FunctionSummary FS; + FS.IsEntry = IsEntry; + + if (Attribute A = F->getFnAttribute("amdgpu-flat-work-group-size"); + A.isStringAttribute()) { + auto [MinS, MaxS] = A.getValueAsString().split(','); + unsigned Min, Max; + if (!MinS.trim().getAsInteger(0, Min) && + !MaxS.trim().getAsInteger(0, Max)) { + FS.FlatWGSizeMin = Min; + FS.FlatWGSizeMax = Max; + } + } + + if (Attribute A = F->getFnAttribute("amdgpu-waves-per-eu"); + A.isStringAttribute()) { + auto [MinS, MaxS] = A.getValueAsString().split(','); + unsigned Min; + if (!MinS.trim().getAsInteger(0, Min)) { + FS.WavesPerEUMin = Min; + unsigned Max; + if (!MaxS.trim().empty() && !MaxS.trim().getAsInteger(0, Max)) + FS.WavesPerEUMax = Max; + } + } + + if (Attribute A = F->getFnAttribute("amdgpu-max-num-workgroups"); + A.isStringAttribute()) { + SmallVector<StringRef, 3> Parts; + A.getValueAsString().split(Parts, ','); + if (Parts.size() == 3) { + unsigned X, Y, Z; + if (!Parts[0].trim().getAsInteger(0, X) && + !Parts[1].trim().getAsInteger(0, Y) && + !Parts[2].trim().getAsInteger(0, Z)) { + FS.MaxNumWGX = X; + FS.MaxNumWGY = Y; + FS.MaxNumWGZ = Z; + } + } + } + + Record.clear(); + Record = {F->getGUID(), FS.IsEntry, FS.FlatWGSizeMin, + FS.FlatWGSizeMax, FS.WavesPerEUMin, FS.WavesPerEUMax, + FS.MaxNumWGX, FS.MaxNumWGY, FS.MaxNumWGZ}; + Stream.EmitRecord(bitc::AMDGPU_SUMMARY_ENTRY, Record); + } + + Stream.ExitBlock(); +} + void ModuleBitcodeWriter::write() { writeIdentificationBlock(Stream); @@ -5415,6 +5499,8 @@ void ModuleBitcodeWriter::write() { if (Index) writePerModuleGlobalValueSummary(); + writeAMDGPUSummaryBlock(); + writeGlobalValueSymbolTable(FunctionToBitcodeIndex); writeModuleHash(Stream.getMarkedBufferAndResumeFlushing()); @@ -5613,7 +5699,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out, Writer.writeSymtab(); Writer.writeStrtab(); }; - Triple TT(M.getTargetTriple()); + const Triple &TT = M.getTargetTriple(); if (TT.isOSDarwin() || TT.isOSBinFormatMachO()) { // If this is darwin or another generic macho target, reserve space for the // header. Note that the header is computed *after* the output is known, so diff --git a/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll new file mode 100644 index 000000000000..f31a4845d23b --- /dev/null +++ b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll @@ -0,0 +1,47 @@ +; Verify that the AMDGPU_SUMMARY block round-trips through bitcode. +; RUN: opt -mtriple=amdgcn-amd-amdhsa -module-summary %s -o %t.bc +; RUN: llvm-bcanalyzer -dump %t.bc | FileCheck %s --check-prefix=BLOCK + +; All attributes present. +; BLOCK: <AMDGPU_SUMMARY_BLOCK +; BLOCK-NEXT: <AMDGPU_SUMMARY_VERSION op0=1/> +; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=64 op3=256 op4=2 op5=8 op6=16 op7=16 op8=1/> + +; Only flat-work-group-size — waves and max-workgroups use defaults. +; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=128 op3=512 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/> + +; Only waves-per-eu — flat-work-group-size and max-workgroups use defaults. +; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=4 op5=6 op6=4294967295 op7=4294967295 op8=4294967295/> + +; Bare kernel with no attributes — all defaults. +; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/> +; BLOCK-NEXT: </AMDGPU_SUMMARY_BLOCK> + +define amdgpu_kernel void @kernel_all(ptr %p) #0 { + call void @device_func(ptr %p) + ret void +} + +define amdgpu_kernel void @kernel_wg_only(ptr %p) #1 { + call void @device_func(ptr %p) + ret void +} + +define amdgpu_kernel void @kernel_waves_only(ptr %p) #2 { + call void @device_func(ptr %p) + ret void +} + +define amdgpu_kernel void @kernel_bare(ptr %p) { + call void @device_func(ptr %p) + ret void +} + +define void @device_func(ptr %p) { + store i32 42, ptr %p + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-waves-per-eu"="2,8" "amdgpu-max-num-workgroups"="16,16,1" } +attributes #1 = { "amdgpu-flat-work-group-size"="128,512" } +attributes #2 = { "amdgpu-waves-per-eu"="4,6" } |
