Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp            |  39
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp      |  15
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp        |  25
-rw-r--r--  llvm/lib/Support/CMakeLists.txt                           |   1
-rw-r--r--  llvm/lib/Support/Jobserver.cpp                            | 259
-rw-r--r--  llvm/lib/Support/Parallel.cpp                             |  98
-rw-r--r--  llvm/lib/Support/ThreadPool.cpp                           | 108
-rw-r--r--  llvm/lib/Support/Threading.cpp                            |   5
-rw-r--r--  llvm/lib/Support/Unix/Jobserver.inc                       | 195
-rw-r--r--  llvm/lib/Support/Windows/Jobserver.inc                    |  79
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp               |   9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td                  |  39
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td               |   4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp        |   4
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp  | 178
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp  |   6
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp  |   6
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                 |  55
18 files changed, 1005 insertions, 120 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 95f53fe..6ea2e27 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12698,6 +12698,45 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
   return DAGSize;
 }
 
+void SelectionDAG::getTopologicallyOrderedNodes(
+    SmallVectorImpl<const SDNode *> &SortedNodes) const {
+  SortedNodes.clear();
+  // Node -> remaining number of outstanding operands.
+  DenseMap<const SDNode *, unsigned> RemainingOperands;
+
+  // Put nodes without any operands into SortedNodes first.
+  for (const SDNode &N : allnodes()) {
+    checkForCycles(&N, this);
+    unsigned NumOperands = N.getNumOperands();
+    if (NumOperands == 0)
+      SortedNodes.push_back(&N);
+    else
+      // Record their total number of outstanding operands.
+      RemainingOperands[&N] = NumOperands;
+  }
+
+  // A node is pushed into SortedNodes when all of its operands (predecessors in
+  // the graph) are also in SortedNodes.
+  for (unsigned i = 0U; i < SortedNodes.size(); ++i) {
+    const SDNode *N = SortedNodes[i];
+    for (const SDNode *U : N->users()) {
+      unsigned &NumRemOperands = RemainingOperands[U];
+      assert(NumRemOperands && "Invalid number of remaining operands");
+      --NumRemOperands;
+      if (!NumRemOperands)
+        SortedNodes.push_back(U);
+    }
+  }
+
+  assert(SortedNodes.size() == AllNodes.size() && "Node count mismatch");
+  assert(SortedNodes.front()->getOpcode() == ISD::EntryToken &&
+         "First node in topological sort is not the entry token");
+  assert(SortedNodes.front()->getNumOperands() == 0 &&
+         "First node in topological sort has operands");
+  assert(SortedNodes.back()->use_empty() &&
+         "Last node in topological sort has users");
+}
+
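The new getTopologicallyOrderedNodes() is Kahn's algorithm driven by per-node outstanding-operand counts, with SortedNodes doubling as the worklist. As a minimal standalone sketch of the same scheme on a plain adjacency list (topoSort and Users are illustrative names, not LLVM API):

#include <cassert>
#include <vector>

// Users[N] lists the nodes that consume N's value, mirroring SDNode::users().
std::vector<int> topoSort(const std::vector<std::vector<int>> &Users) {
  std::vector<unsigned> RemainingOperands(Users.size(), 0);
  for (const auto &Us : Users)
    for (int U : Us)
      ++RemainingOperands[U]; // every use edge is one operand of U
  std::vector<int> Sorted;
  for (unsigned N = 0; N != Users.size(); ++N)
    if (RemainingOperands[N] == 0) // operand-less nodes seed the order
      Sorted.push_back((int)N);
  // Sorted itself is the worklist: a node is appended once all of its
  // operands have been appended, exactly as in the patch above.
  for (unsigned I = 0; I != Sorted.size(); ++I)
    for (int U : Users[Sorted[I]])
      if (--RemainingOperands[U] == 0)
        Sorted.push_back(U);
  assert(Sorted.size() == Users.size() && "graph had a cycle");
  return Sorted;
}

int main() {
  // 0 -> 2, 1 -> 2, 2 -> 3: both sources precede 2, which precedes 3.
  std::vector<int> Order = topoSort({{2}, {2}, {3}, {}});
  assert(Order.size() == 4 && Order[2] == 2 && Order[3] == 3);
}

Reusing the output vector as the worklist avoids a separate queue and gives a deterministic order for the debug dump.

 /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
 /// value is produced by SD.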
void SelectionDAG::AddDbgValue(SDDbgValue *DB, bool isParameter) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 4b2a00c..fcfbfe6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -1061,13 +1061,24 @@ static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) { N->dump(G); } -LLVM_DUMP_METHOD void SelectionDAG::dump() const { +LLVM_DUMP_METHOD void SelectionDAG::dump(bool Sorted) const { dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:\n"; - for (const SDNode &N : allnodes()) { + auto dumpEachNode = [this](const SDNode &N) { if (!N.hasOneUse() && &N != getRoot().getNode() && (!shouldPrintInline(N, this) || N.use_empty())) DumpNodes(&N, 2, this); + }; + + if (Sorted) { + SmallVector<const SDNode *> SortedNodes; + SortedNodes.reserve(AllNodes.size()); + getTopologicallyOrderedNodes(SortedNodes); + for (const SDNode *N : SortedNodes) + dumpEachNode(*N); + } else { + for (const SDNode &N : allnodes()) + dumpEachNode(N); } if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index e61558c..c35f29d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -144,6 +144,11 @@ UseMBPI("use-mbpi", cl::init(true), cl::Hidden); #ifndef NDEBUG +static cl::opt<bool> + DumpSortedDAG("dump-sorted-dags", cl::Hidden, + cl::desc("Print DAGs with sorted nodes in debug dump"), + cl::init(false)); + static cl::opt<std::string> FilterDAGBasicBlockName("filter-view-dags", cl::Hidden, cl::desc("Only display the basic block whose name " @@ -932,7 +937,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nInitial selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -952,7 +957,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nOptimized lowered selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -974,7 +979,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nType-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -998,7 +1003,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nOptimized type-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -1016,7 +1021,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nVector-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -1032,7 +1037,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << 
"\nVector/type-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -1052,7 +1057,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nOptimized vector-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -1072,7 +1077,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nLegalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -1092,7 +1097,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nOptimized legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); #if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS if (TTI->hasBranchDivergence()) @@ -1116,7 +1121,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { ISEL_DUMP(dbgs() << "\nSelected selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; - CurDAG->dump()); + CurDAG->dump(DumpSortedDAG)); if (ViewSchedDAGs && MatchFilterBB) CurDAG->viewGraph("scheduler input for " + BlockName); diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 7da972f..42b21b5 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -207,6 +207,7 @@ add_llvm_component_library(LLVMSupport InstructionCost.cpp IntEqClasses.cpp IntervalMap.cpp + Jobserver.cpp JSON.cpp KnownBits.cpp KnownFPClass.cpp diff --git a/llvm/lib/Support/Jobserver.cpp b/llvm/lib/Support/Jobserver.cpp new file mode 100644 index 0000000..9f726eb --- /dev/null +++ b/llvm/lib/Support/Jobserver.cpp @@ -0,0 +1,259 @@ +//===- llvm/Support/Jobserver.cpp - Jobserver Client Implementation -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Jobserver.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" + +#include <atomic> +#include <memory> +#include <mutex> +#include <new> + +#define DEBUG_TYPE "jobserver" + +using namespace llvm; + +namespace { +struct FdPair { + int Read = -1; + int Write = -1; + bool isValid() const { return Read >= 0 && Write >= 0; } +}; + +struct JobserverConfig { + enum Mode { + None, + PosixFifo, + PosixPipe, + Win32Semaphore, + }; + Mode TheMode = None; + std::string Path; + FdPair PipeFDs; +}; + +/// A helper function that checks if `Input` starts with `Prefix`. +/// If it does, it removes the prefix from `Input`, assigns the remainder to +/// `Value`, and returns true. Otherwise, it returns false. 
+bool getPrefixedValue(StringRef Input, StringRef Prefix, StringRef &Value) {
+  if (Input.consume_front(Prefix)) {
+    Value = Input;
+    return true;
+  }
+  return false;
+}
+
+/// A helper function to parse a string in the format "R,W" where R and W are
+/// non-negative integers representing file descriptors. Returns the parsed
+/// FdPair on success, or std::nullopt on failure.
+static std::optional<FdPair> getFileDescriptorPair(StringRef Input) {
+  FdPair FDs;
+  if (Input.consumeInteger(10, FDs.Read))
+    return std::nullopt;
+  if (!Input.consume_front(","))
+    return std::nullopt;
+  if (Input.consumeInteger(10, FDs.Write))
+    return std::nullopt;
+  if (!Input.empty() || !FDs.isValid())
+    return std::nullopt;
+  return FDs;
+}
+
+/// Parses the `MAKEFLAGS` environment variable string to find jobserver
+/// arguments. It splits the string into space-separated arguments and searches
+/// for `--jobserver-auth` or `--jobserver-fds`. Based on the value of these
+/// arguments, it determines the jobserver mode (Pipe, FIFO, or Semaphore) and
+/// connection details (file descriptors or path).
+Expected<JobserverConfig> parseNativeMakeFlags(StringRef MakeFlags) {
+  JobserverConfig Config;
+  if (MakeFlags.empty())
+    return Config;
+
+  // Split the MAKEFLAGS string into arguments.
+  SmallVector<StringRef, 8> Args;
+  SplitString(MakeFlags, Args);
+
+  // If '-n' (dry-run) is present as a legacy flag (not starting with '-'),
+  // disable the jobserver.
+  if (!Args.empty() && !Args[0].starts_with("-") && Args[0].contains('n'))
+    return Config;
+
+  // Iterate through arguments to find jobserver flags.
+  // Note that make may pass multiple --jobserver-auth flags; the last one wins.
+  for (StringRef Arg : Args) {
+    StringRef Value;
+    if (getPrefixedValue(Arg, "--jobserver-auth=", Value)) {
+      // Try to parse as a file descriptor pair first.
+      if (auto FDPair = getFileDescriptorPair(Value)) {
+        Config.TheMode = JobserverConfig::PosixPipe;
+        Config.PipeFDs = *FDPair;
+      } else {
+        StringRef FifoPath;
+        // If not FDs, try to parse as a named pipe (fifo).
+        if (getPrefixedValue(Value, "fifo:", FifoPath)) {
+          Config.TheMode = JobserverConfig::PosixFifo;
+          Config.Path = FifoPath.str();
+        } else {
+          // Otherwise, assume it's a Windows semaphore.
+          Config.TheMode = JobserverConfig::Win32Semaphore;
+          Config.Path = Value.str();
+        }
+      }
+    } else if (getPrefixedValue(Arg, "--jobserver-fds=", Value)) {
+      // This is an alternative, older syntax for the pipe-based server.
+      if (auto FDPair = getFileDescriptorPair(Value)) {
+        Config.TheMode = JobserverConfig::PosixPipe;
+        Config.PipeFDs = *FDPair;
+      } else {
+        return createStringError(inconvertibleErrorCode(),
+                                 "Invalid file descriptor pair in MAKEFLAGS");
+      }
+    }
+  }
+
+  // Perform platform-specific validation.
+#ifdef _WIN32
+  if (Config.TheMode == JobserverConfig::PosixFifo ||
+      Config.TheMode == JobserverConfig::PosixPipe)
+    return createStringError(
+        inconvertibleErrorCode(),
+        "FIFO/Pipe-based jobserver is not supported on Windows");
+#else
+  if (Config.TheMode == JobserverConfig::Win32Semaphore)
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Semaphore-based jobserver is not supported on this platform");
+#endif
+  return Config;
+}
+
+std::once_flag GJobserverOnceFlag;
+JobserverClient *GJobserver = nullptr;
+
+} // namespace
+
+namespace llvm {
+class JobserverClientImpl : public JobserverClient {
+  bool IsInitialized = false;
+  std::atomic<bool> HasImplicitSlot{true};
+  unsigned NumJobs = 0;
+
+public:
+  JobserverClientImpl(const JobserverConfig &Config);
+  ~JobserverClientImpl() override;
+
+  JobSlot tryAcquire() override;
+  void release(JobSlot Slot) override;
+  unsigned getNumJobs() const override { return NumJobs; }
+
+  bool isValid() const { return IsInitialized; }
+
+private:
+#if defined(LLVM_ON_UNIX)
+  int ReadFD = -1;
+  int WriteFD = -1;
+  std::string FifoPath;
+#elif defined(_WIN32)
+  void *Semaphore = nullptr;
+#endif
+};
+} // namespace llvm
+
+// Include the platform-specific parts of the class.
+#if defined(LLVM_ON_UNIX)
+#include "Unix/Jobserver.inc"
+#elif defined(_WIN32)
+#include "Windows/Jobserver.inc"
+#else
+// Dummy implementation for unsupported platforms.
+JobserverClientImpl::JobserverClientImpl(const JobserverConfig &Config) {}
+JobserverClientImpl::~JobserverClientImpl() = default;
+JobSlot JobserverClientImpl::tryAcquire() { return JobSlot(); }
+void JobserverClientImpl::release(JobSlot Slot) {}
+#endif
+
+namespace llvm {
+JobserverClient::~JobserverClient() = default;
+
+uint8_t JobSlot::getExplicitValue() const {
+  assert(isExplicit() && "Cannot get value of implicit or invalid slot");
+  return static_cast<uint8_t>(Value);
+}
+
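To make the accepted MAKEFLAGS shapes concrete, here is a standalone sketch of the same classification rules as parseNativeMakeFlags() above, using only the standard library (ParsedAuth and parseAuth are illustrative stand-ins, not the patch's API):

#include <cassert>
#include <sstream>
#include <string>

struct ParsedAuth {
  enum Kind { None, Pipe, Fifo, Semaphore };
  Kind Mode = None;
  std::string Payload; // "R,W" fd pair, fifo path, or semaphore name
};

// Classify a MAKEFLAGS string the way parseNativeMakeFlags() above does.
ParsedAuth parseAuth(const std::string &MakeFlags) {
  ParsedAuth Result;
  const std::string Key = "--jobserver-auth=";
  std::istringstream Tokens(MakeFlags);
  std::string Arg;
  while (Tokens >> Arg) { // the last --jobserver-auth wins, as in GNU make
    if (Arg.rfind(Key, 0) != 0)
      continue;
    std::string Value = Arg.substr(Key.size());
    if (Value.rfind("fifo:", 0) == 0) {
      Result.Mode = ParsedAuth::Fifo; // named pipe, POSIX only
      Result.Payload = Value.substr(5);
    } else if (Value.find(',') != std::string::npos) {
      Result.Mode = ParsedAuth::Pipe; // "R,W" inherited descriptors
      Result.Payload = Value;
    } else {
      Result.Mode = ParsedAuth::Semaphore; // bare name: Windows semaphore
      Result.Payload = Value;
    }
  }
  return Result;
}

int main() {
  assert(parseAuth("-j4 --jobserver-auth=3,4").Mode == ParsedAuth::Pipe);
  assert(parseAuth("--jobserver-auth=fifo:/tmp/jobs").Payload == "/tmp/jobs");
  assert(parseAuth("--jobserver-auth=gmake_sem").Mode == ParsedAuth::Semaphore);
  // Later flags override earlier ones.
  assert(parseAuth("--jobserver-auth=3,4 --jobserver-auth=fifo:/x").Mode ==
         ParsedAuth::Fifo);
}

+/// This is the main entry point for acquiring a jobserver client. It uses a
+/// std::call_once to ensure the singleton `GJobserver` instance is created
+/// safely in a multi-threaded environment. On first call, it reads the
+/// `MAKEFLAGS` environment variable, parses it, and attempts to construct and
+/// initialize a `JobserverClientImpl`. If successful, the global instance is
+/// stored in `GJobserver`. Subsequent calls will return the existing instance.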
+JobserverClient *JobserverClient::getInstance() {
+  std::call_once(GJobserverOnceFlag, []() {
+    LLVM_DEBUG(
+        dbgs()
+        << "JobserverClient::getInstance() called for the first time.\n");
+    const char *MakeFlagsEnv = getenv("MAKEFLAGS");
+    if (!MakeFlagsEnv) {
+      errs() << "Warning: failed to create jobserver client due to MAKEFLAGS "
+                "environment variable not found\n";
+      return;
+    }
+
+    LLVM_DEBUG(dbgs() << "Found MAKEFLAGS = \"" << MakeFlagsEnv << "\"\n");
+
+    auto ConfigOrErr = parseNativeMakeFlags(MakeFlagsEnv);
+    if (Error Err = ConfigOrErr.takeError()) {
+      errs() << "Warning: failed to create jobserver client due to invalid "
+                "MAKEFLAGS environment variable: "
+             << toString(std::move(Err)) << "\n";
+      return;
+    }
+
+    JobserverConfig Config = *ConfigOrErr;
+    if (Config.TheMode == JobserverConfig::None) {
+      errs() << "Warning: failed to create jobserver client due to jobserver "
+                "mode missing in MAKEFLAGS environment variable\n";
+      return;
+    }
+
+    if (Config.TheMode == JobserverConfig::PosixPipe) {
+#if defined(LLVM_ON_UNIX)
+      if (!areFdsValid(Config.PipeFDs.Read, Config.PipeFDs.Write)) {
+        errs() << "Warning: failed to create jobserver client due to invalid "
+                  "Pipe FDs in MAKEFLAGS environment variable\n";
+        return;
+      }
+#endif
+    }
+
+    auto Client = std::make_unique<JobserverClientImpl>(Config);
+    if (Client->isValid()) {
+      LLVM_DEBUG(dbgs() << "Jobserver client created successfully!\n");
+      GJobserver = Client.release();
+    } else
+      errs() << "Warning: jobserver client initialization failed.\n";
+  });
+  return GJobserver;
+}
+
+/// For testing purposes only. This function resets the singleton instance by
+/// destroying the existing client and re-initializing the `std::once_flag`.
+/// This allows tests to simulate the first-time initialization of the
+/// jobserver client multiple times.
+void JobserverClient::resetForTesting() {
+  delete GJobserver;
+  GJobserver = nullptr;
+  // Re-construct the std::once_flag in place to reset the singleton state.
+  new (&GJobserverOnceFlag) std::once_flag();
+}
+} // namespace llvm
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 3ac6fc7..8e0c724 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -7,12 +7,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Parallel.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/Support/ExponentialBackoff.h"
+#include "llvm/Support/Jobserver.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Threading.h"
 
 #include <atomic>
 #include <future>
+#include <memory>
+#include <mutex>
 #include <thread>
 #include <vector>
@@ -49,6 +54,9 @@ public:
 class ThreadPoolExecutor : public Executor {
 public:
   explicit ThreadPoolExecutor(ThreadPoolStrategy S) {
+    if (S.UseJobserver)
+      TheJobserver = JobserverClient::getInstance();
+
     ThreadCount = S.compute_thread_count();
     // Spawn all but one of the threads in another thread as spawning threads
     // can take a while.
@@ -69,6 +77,10 @@ public:
     });
   }
 
+  // To make sure the thread pool executor can only be created with a parallel
+  // strategy.
+  ThreadPoolExecutor() = delete;
+
   void stop() {
     {
       std::lock_guard<std::mutex> Lock(Mutex);
@@ -111,15 +123,62 @@ private:
   void work(ThreadPoolStrategy S, unsigned ThreadID) {
     threadIndex = ThreadID;
     S.apply_thread_strategy(ThreadID);
+    // Note on jobserver deadlock avoidance:
+    // GNU Make grants each invoked process one implicit job slot. Our
+    // JobserverClient models this by returning an implicit JobSlot on the
+    // first successful tryAcquire() in a process. This guarantees forward
+    // progress without requiring a dedicated "always-on" thread here.
+
+    static thread_local std::unique_ptr<ExponentialBackoff> Backoff;
+
     while (true) {
-      std::unique_lock<std::mutex> Lock(Mutex);
-      Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
-      if (Stop)
-        break;
-      auto Task = std::move(WorkStack.back());
-      WorkStack.pop_back();
-      Lock.unlock();
-      Task();
+      if (TheJobserver) {
+        // Jobserver-mode scheduling:
+        // - Acquire one job slot (with exponential backoff to avoid busy-wait).
+        // - While holding the slot, drain and run tasks from the local queue.
+        // - Release the slot when the queue is empty or when shutting down.
+        // Rationale: Holding a slot amortizes acquire/release overhead over
+        // multiple tasks and avoids requeue/yield churn, while still enforcing
+        // the jobserver's global concurrency limit. With K available slots,
+        // up to K workers run tasks in parallel; within each worker tasks run
+        // sequentially until the local queue is empty.
+        ExponentialBackoff Backoff(std::chrono::hours(24));
+        JobSlot Slot;
+        do {
+          if (Stop)
+            return;
+          Slot = TheJobserver->tryAcquire();
+          if (Slot.isValid())
+            break;
+        } while (Backoff.waitForNextAttempt());
+
+        auto SlotReleaser = llvm::make_scope_exit(
+            [&] { TheJobserver->release(std::move(Slot)); });
+
+        while (true) {
+          std::function<void()> Task;
+          {
+            std::unique_lock<std::mutex> Lock(Mutex);
+            Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+            if (Stop && WorkStack.empty())
+              return;
+            if (WorkStack.empty())
+              break;
+            Task = std::move(WorkStack.back());
+            WorkStack.pop_back();
+          }
+          Task();
+        }
+      } else {
+        std::unique_lock<std::mutex> Lock(Mutex);
+        Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+        if (Stop)
+          break;
+        auto Task = std::move(WorkStack.back());
+        WorkStack.pop_back();
+        Lock.unlock();
+        Task();
+      }
     }
   }
@@ -130,9 +189,20 @@ private:
   std::promise<void> ThreadsCreated;
   std::vector<std::thread> Threads;
   unsigned ThreadCount;
+
+  JobserverClient *TheJobserver = nullptr;
 };
 
-Executor *Executor::getDefaultExecutor() {
+// A global raw pointer to the executor. Lifetime is managed by the
+// objects created within createExecutor().
+static Executor *TheExec = nullptr;
+static std::once_flag Flag;
+
+// This function will be called exactly once to create the executor.
+// It contains the necessary platform-specific logic. Since functions
+// called by std::call_once cannot return a value, we have to set the
+// executor as a global variable.
+void createExecutor() {
 #ifdef _WIN32
   // The ManagedStatic enables the ThreadPoolExecutor to be stopped via
   // llvm_shutdown() which allows a "clean" fast exit, e.g. via _exit(). This
@@ -156,16 +226,22 @@ Executor *Executor::getDefaultExecutor() {
                              ThreadPoolExecutor::Deleter>
       ManagedExec;
   static std::unique_ptr<ThreadPoolExecutor> Exec(&(*ManagedExec));
-  return Exec.get();
+  TheExec = Exec.get();
 #else
   // ManagedStatic is not desired on other platforms. When `Exec` is destroyed
   // by llvm_shutdown(), worker threads will clean up and invoke TLS
   // destructors. This can lead to race conditions if other threads attempt to
   // access TLS objects that have already been destroyed.
   static ThreadPoolExecutor Exec(strategy);
-  return &Exec;
+  TheExec = &Exec;
 #endif
 }
+
+ std::call_once(Flag, createExecutor); + return TheExec; +} } // namespace } // namespace detail diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index c304f0f..6960268 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// // +// // This file implements a crude C++11 based thread pool. // //===----------------------------------------------------------------------===// @@ -14,6 +15,8 @@ #include "llvm/Config/llvm-config.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/Support/ExponentialBackoff.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" @@ -33,7 +36,10 @@ ThreadPoolInterface::~ThreadPoolInterface() = default; #if LLVM_ENABLE_THREADS StdThreadPool::StdThreadPool(ThreadPoolStrategy S) - : Strategy(S), MaxThreadCount(S.compute_thread_count()) {} + : Strategy(S), MaxThreadCount(S.compute_thread_count()) { + if (Strategy.UseJobserver) + TheJobserver = JobserverClient::getInstance(); +} void StdThreadPool::grow(int requested) { llvm::sys::ScopedWriter LockGuard(ThreadsLock); @@ -45,7 +51,15 @@ void StdThreadPool::grow(int requested) { Threads.emplace_back([this, ThreadID] { set_thread_name(formatv("llvm-worker-{0}", ThreadID)); Strategy.apply_thread_strategy(ThreadID); - processTasks(nullptr); + // Note on jobserver deadlock avoidance: + // GNU Make grants each invoked process one implicit job slot. + // JobserverClient::tryAcquire() returns that implicit slot on the first + // successful call in a process, ensuring forward progress without a + // dedicated "always-on" thread. + if (TheJobserver) + processTasksWithJobserver(); + else + processTasks(nullptr); }); } } @@ -133,6 +147,96 @@ void StdThreadPool::processTasks(ThreadPoolTaskGroup *WaitingForGroup) { } } +/// Main loop for worker threads when using a jobserver. +/// This function uses a two-level queue; it first acquires a job slot from the +/// external jobserver, then retrieves a task from the internal queue. +/// This allows the thread pool to cooperate with build systems like `make -j`. +void StdThreadPool::processTasksWithJobserver() { + while (true) { + // Acquire a job slot from the external jobserver. + // This polls for a slot and yields the thread to avoid a high-CPU wait. + JobSlot Slot; + // The timeout for the backoff can be very long, as the shutdown + // is checked on each iteration. The sleep duration is capped by MaxWait + // in ExponentialBackoff, so shutdown latency is not a problem. + ExponentialBackoff Backoff(std::chrono::hours(24)); + bool AcquiredToken = false; + do { + // Return if the thread pool is shutting down. + { + std::unique_lock<std::mutex> LockGuard(QueueLock); + if (!EnableFlag) + return; + } + + Slot = TheJobserver->tryAcquire(); + if (Slot.isValid()) { + AcquiredToken = true; + break; + } + } while (Backoff.waitForNextAttempt()); + + if (!AcquiredToken) { + // This is practically unreachable with a 24h timeout and indicates a + // deeper problem if hit. + report_fatal_error("Timed out waiting for jobserver token."); + } + + // `make_scope_exit` guarantees the job slot is released, even if the + // task throws or we exit early. This prevents deadlocking the build. + auto SlotReleaser = + make_scope_exit([&] { TheJobserver->release(std::move(Slot)); }); + + // While we hold a job slot, process tasks from the internal queue. 
+ while (true) { + std::function<void()> Task; + ThreadPoolTaskGroup *GroupOfTask = nullptr; + + { + std::unique_lock<std::mutex> LockGuard(QueueLock); + + // Wait until a task is available or the pool is shutting down. + QueueCondition.wait(LockGuard, + [&] { return !EnableFlag || !Tasks.empty(); }); + + // If shutting down and the queue is empty, the thread can terminate. + if (!EnableFlag && Tasks.empty()) + return; + + // If the queue is empty, we're done processing tasks for now. + // Break the inner loop to release the job slot. + if (Tasks.empty()) + break; + + // A task is available. Mark it as active before releasing the lock + // to prevent race conditions with `wait()`. + ++ActiveThreads; + Task = std::move(Tasks.front().first); + GroupOfTask = Tasks.front().second; + if (GroupOfTask != nullptr) + ++ActiveGroups[GroupOfTask]; + Tasks.pop_front(); + } // The queue lock is released. + + // Run the task. The job slot remains acquired during execution. + Task(); + + // The task has finished. Update the active count and notify any waiters. + { + std::lock_guard<std::mutex> LockGuard(QueueLock); + --ActiveThreads; + if (GroupOfTask != nullptr) { + auto A = ActiveGroups.find(GroupOfTask); + if (--(A->second) == 0) + ActiveGroups.erase(A); + } + // If all tasks are complete, notify any waiting threads. + if (workCompletedUnlocked(nullptr)) + CompletionCondition.notify_all(); + } + } + } +} bool StdThreadPool::workCompletedUnlocked(ThreadPoolTaskGroup *Group) const { if (Group == nullptr) return !ActiveThreads && Tasks.empty(); diff --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp index 693de0e..9da357a 100644 --- a/llvm/lib/Support/Threading.cpp +++ b/llvm/lib/Support/Threading.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/Threading.h" #include "llvm/Config/config.h" #include "llvm/Config/llvm-config.h" +#include "llvm/Support/Jobserver.h" #include <cassert> #include <optional> @@ -51,6 +52,10 @@ int llvm::get_physical_cores() { return -1; } static int computeHostNumHardwareThreads(); unsigned llvm::ThreadPoolStrategy::compute_thread_count() const { + if (UseJobserver) + if (auto JS = JobserverClient::getInstance()) + return JS->getNumJobs(); + int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads() : get_physical_cores(); if (MaxThreadCount <= 0) diff --git a/llvm/lib/Support/Unix/Jobserver.inc b/llvm/lib/Support/Unix/Jobserver.inc new file mode 100644 index 0000000..53bf7f2 --- /dev/null +++ b/llvm/lib/Support/Unix/Jobserver.inc @@ -0,0 +1,195 @@ +//===- llvm/Support/Unix/Jobserver.inc - Unix Jobserver Impl ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the UNIX-specific parts of the JobserverClient class. +// +//===----------------------------------------------------------------------===// + +#include <atomic> +#include <cassert> +#include <cerrno> +#include <fcntl.h> +#include <string.h> +#include <sys/stat.h> +#include <unistd.h> + +namespace { +/// Returns true if the given file descriptor is a FIFO (named pipe). +bool isFifo(int FD) { + struct stat StatBuf; + if (::fstat(FD, &StatBuf) != 0) + return false; + return S_ISFIFO(StatBuf.st_mode); +} + +/// Returns true if the given file descriptors are valid. 
+bool areFdsValid(int ReadFD, int WriteFD) {
+  if (ReadFD == -1 || WriteFD == -1)
+    return false;
+  // Check if the file descriptors are actually valid by checking their flags.
+  return ::fcntl(ReadFD, F_GETFD) != -1 && ::fcntl(WriteFD, F_GETFD) != -1;
+}
+} // namespace
+
+/// The constructor sets up the client based on the provided configuration.
+/// For pipe-based jobservers, it duplicates the inherited file descriptors,
+/// sets them to close-on-exec, and makes the read descriptor non-blocking.
+/// For FIFO-based jobservers, it opens the named pipe. After setup, it drains
+/// all available tokens from the jobserver to determine the total number of
+/// available jobs (`NumJobs`), then immediately releases them back.
+JobserverClientImpl::JobserverClientImpl(const JobserverConfig &Config) {
+  switch (Config.TheMode) {
+  case JobserverConfig::PosixPipe: {
+    // Duplicate the read and write file descriptors.
+    int NewReadFD = ::dup(Config.PipeFDs.Read);
+    if (NewReadFD < 0)
+      return;
+    int NewWriteFD = ::dup(Config.PipeFDs.Write);
+    if (NewWriteFD < 0) {
+      ::close(NewReadFD);
+      return;
+    }
+    // Set the new descriptors to be closed automatically on exec().
+    if (::fcntl(NewReadFD, F_SETFD, FD_CLOEXEC) == -1 ||
+        ::fcntl(NewWriteFD, F_SETFD, FD_CLOEXEC) == -1) {
+      ::close(NewReadFD);
+      ::close(NewWriteFD);
+      return;
+    }
+    // Set the read descriptor to non-blocking.
+    int flags = ::fcntl(NewReadFD, F_GETFL, 0);
+    if (flags == -1 || ::fcntl(NewReadFD, F_SETFL, flags | O_NONBLOCK) == -1) {
+      ::close(NewReadFD);
+      ::close(NewWriteFD);
+      return;
+    }
+    ReadFD = NewReadFD;
+    WriteFD = NewWriteFD;
+    break;
+  }
+  case JobserverConfig::PosixFifo:
+    // Open the FIFO for reading. It must be non-blocking and close-on-exec.
+    ReadFD = ::open(Config.Path.c_str(), O_RDONLY | O_NONBLOCK | O_CLOEXEC);
+    if (ReadFD < 0 || !isFifo(ReadFD)) {
+      if (ReadFD >= 0)
+        ::close(ReadFD);
+      ReadFD = -1;
+      return;
+    }
+    FifoPath = Config.Path;
+    // The write FD is opened on-demand in release().
+    WriteFD = -1;
+    break;
+  default:
+    return;
+  }
+
+  IsInitialized = true;
+  // Determine the total number of jobs by acquiring all available slots and
+  // then immediately releasing them.
+  SmallVector<JobSlot, 8> Slots;
+  while (true) {
+    auto S = tryAcquire();
+    if (!S.isValid())
+      break;
+    Slots.push_back(std::move(S));
+  }
+  NumJobs = Slots.size();
+  assert(NumJobs >= 1 && "Invalid number of jobs");
+  for (auto &S : Slots)
+    release(std::move(S));
+}
+
+/// The destructor closes any open file descriptors.
+JobserverClientImpl::~JobserverClientImpl() {
+  if (ReadFD >= 0)
+    ::close(ReadFD);
+  if (WriteFD >= 0)
+    ::close(WriteFD);
+}
+
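The tryAcquire()/release() pair below implements GNU make's pipe protocol: a job slot is one byte read from the shared pipe, and releasing it writes the same byte back. A deliberately simplified, blocking sketch of just that wire protocol (acquireToken and releaseToken are illustrative helpers, not part of the patch, which uses non-blocking reads plus the implicit first slot):

#include <cerrno>
#include <unistd.h>

// Acquire: read one byte (a "token") from the jobserver's read end.
// Blocks until another process returns a token to the pool.
bool acquireToken(int ReadFD, char &Token) {
  ssize_t Ret;
  do {
    Ret = ::read(ReadFD, &Token, 1);
  } while (Ret < 0 && errno == EINTR); // retry on interrupted syscall
  return Ret == 1;
}

// Release: write the same byte back so other processes can claim the slot.
bool releaseToken(int WriteFD, char Token) {
  ssize_t Ret;
  do {
    Ret = ::write(WriteFD, &Token, 1);
  } while (Ret < 0 && errno == EINTR);
  return Ret == 1;
}

+/// Tries to acquire a job slot. The first call to this function will always
+/// successfully acquire the single "implicit" slot that is granted to every
+/// process started by `make`. Subsequent calls attempt to read a one-byte
+/// token from the jobserver's read pipe. A successful read grants one
+/// explicit job slot. The read is non-blocking; if no token is available,
+/// it fails and returns an invalid JobSlot.
+JobSlot JobserverClientImpl::tryAcquire() {
+  if (!IsInitialized)
+    return JobSlot();
+
+  // The first acquisition is always for the implicit slot.
+  if (HasImplicitSlot.exchange(false, std::memory_order_acquire)) {
+    LLVM_DEBUG(dbgs() << "Acquired implicit job slot.\n");
+    return JobSlot::createImplicit();
+  }
+
+  char Token;
+  ssize_t Ret;
+  LLVM_DEBUG(dbgs() << "Attempting to read token from FD " << ReadFD << ".\n");
+  // Loop to retry on EINTR (interrupted system call).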
+ do { + Ret = ::read(ReadFD, &Token, 1); + } while (Ret < 0 && errno == EINTR); + + if (Ret == 1) { + LLVM_DEBUG(dbgs() << "Acquired explicit token '" << Token << "'.\n"); + return JobSlot::createExplicit(static_cast<uint8_t>(Token)); + } + + LLVM_DEBUG(dbgs() << "Failed to acquire job slot, read returned " << Ret + << ".\n"); + return JobSlot(); +} + +/// Releases a job slot back to the pool. If the slot is implicit, it simply +/// resets a flag. If the slot is explicit, it writes the character token +/// associated with the slot back into the jobserver's write pipe. For FIFO +/// jobservers, this may require opening the FIFO for writing if it hasn't +/// been already. +void JobserverClientImpl::release(JobSlot Slot) { + if (!Slot.isValid()) + return; + + // Releasing the implicit slot just makes it available for the next acquire. + if (Slot.isImplicit()) { + LLVM_DEBUG(dbgs() << "Released implicit job slot.\n"); + [[maybe_unused]] bool was_already_released = + HasImplicitSlot.exchange(true, std::memory_order_release); + assert(!was_already_released && "Implicit slot released twice"); + return; + } + + uint8_t Token = Slot.getExplicitValue(); + LLVM_DEBUG(dbgs() << "Releasing explicit token '" << (char)Token << "' to FD " + << WriteFD << ".\n"); + + // For FIFO-based jobservers, the write FD might not be open yet. + // Open it on the first release. + if (WriteFD < 0) { + LLVM_DEBUG(dbgs() << "WriteFD is invalid, opening FIFO: " << FifoPath + << "\n"); + WriteFD = ::open(FifoPath.c_str(), O_WRONLY | O_CLOEXEC); + if (WriteFD < 0) { + LLVM_DEBUG(dbgs() << "Failed to open FIFO for writing.\n"); + return; + } + LLVM_DEBUG(dbgs() << "Opened FIFO as new WriteFD: " << WriteFD << "\n"); + } + + ssize_t Written; + // Loop to retry on EINTR (interrupted system call). + do { + Written = ::write(WriteFD, &Token, 1); + } while (Written < 0 && errno == EINTR); + + if (Written <= 0) { + LLVM_DEBUG(dbgs() << "Failed to write token to pipe, write returned " + << Written << "\n"); + } +} diff --git a/llvm/lib/Support/Windows/Jobserver.inc b/llvm/lib/Support/Windows/Jobserver.inc new file mode 100644 index 0000000..79028ee --- /dev/null +++ b/llvm/lib/Support/Windows/Jobserver.inc @@ -0,0 +1,79 @@ +//==- llvm/Support/Windows/Jobserver.inc - Windows Jobserver Impl -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Windows-specific parts of the JobserverClient class. +// On Windows, the jobserver is implemented using a named semaphore. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Windows/WindowsSupport.h" +#include <atomic> +#include <cassert> + +namespace llvm { +/// The constructor for the Windows jobserver client. It attempts to open a +/// handle to an existing named semaphore, the name of which is provided by +/// GNU make in the --jobserver-auth argument. If the semaphore is opened +/// successfully, the client is marked as initialized. +JobserverClientImpl::JobserverClientImpl(const JobserverConfig &Config) { + Semaphore = (void *)::OpenSemaphoreA(SEMAPHORE_MODIFY_STATE | SYNCHRONIZE, + FALSE, Config.Path.c_str()); + if (Semaphore != nullptr) + IsInitialized = true; +} + +/// The destructor closes the handle to the semaphore, releasing the resource. 
+JobserverClientImpl::~JobserverClientImpl() { + if (Semaphore != nullptr) + ::CloseHandle((HANDLE)Semaphore); +} + +/// Tries to acquire a job slot. The first call always returns the implicit +/// slot. Subsequent calls use a non-blocking wait on the semaphore +/// (`WaitForSingleObject` with a timeout of 0). If the wait succeeds, the +/// semaphore's count is decremented, and an explicit job slot is acquired. +/// If the wait times out, it means no slots are available, and an invalid +/// slot is returned. +JobSlot JobserverClientImpl::tryAcquire() { + if (!IsInitialized) + return JobSlot(); + + // First, grant the implicit slot. + if (HasImplicitSlot.exchange(false, std::memory_order_acquire)) { + return JobSlot::createImplicit(); + } + + // Try to acquire a slot from the semaphore without blocking. + if (::WaitForSingleObject((HANDLE)Semaphore, 0) == WAIT_OBJECT_0) { + // The explicit token value is arbitrary on Windows, as the semaphore + // count is the real resource. + return JobSlot::createExplicit(1); + } + + return JobSlot(); // Invalid slot +} + +/// Releases a job slot back to the pool. If the slot is implicit, it simply +/// resets a flag. For an explicit slot, it increments the semaphore's count +/// by one using `ReleaseSemaphore`, making the slot available to other +/// processes. +void JobserverClientImpl::release(JobSlot Slot) { + if (!IsInitialized || !Slot.isValid()) + return; + + if (Slot.isImplicit()) { + [[maybe_unused]] bool was_already_released = + HasImplicitSlot.exchange(true, std::memory_order_release); + assert(!was_already_released && "Implicit slot released twice"); + return; + } + + // Release the slot by incrementing the semaphore count. + (void)::ReleaseSemaphore((HANDLE)Semaphore, 1, NULL); +} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9dd64e0..cb49936 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1224,12 +1224,9 @@ static bool inlineAsmUsesAGPRs(const InlineAsm *IA) { } // TODO: Migrate to range merge of amdgpu-agpr-alloc. -// FIXME: Why is this using Attribute::NoUnwind? -struct AAAMDGPUNoAGPR - : public IRAttribute<Attribute::NoUnwind, - StateWrapper<BooleanState, AbstractAttribute>, - AAAMDGPUNoAGPR> { - AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {} +struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> { + using Base = StateWrapper<BooleanState, AbstractAttribute>; + AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : Base(IRP) {} static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP, Attributor &A) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 5630580..82fc240 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -367,19 +367,6 @@ def SCC_CLASS : SIRegisterClass<"AMDGPU", [i1], 1, (add SCC)> { let BaseClassOrder = 10000; } -def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> { - let CopyCost = 1; - let Size = 16; - let isAllocatable = 0; - let HasSGPR = 1; -} - // TODO: Do we need to set DwarfRegAlias on register tuples? 
def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, @@ -774,12 +761,6 @@ def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32, let BaseClassOrder = 10000; } -def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32, - (add LDS_DIRECT)> { - let isAllocatable = 0; - let CopyCost = -1; -} - let GeneratePressureSet = 0, HasSGPR = 1 in { // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. @@ -797,7 +778,7 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16, SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16, - EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16, SRC_FLAT_SCRATCH_BASE_LO_LO16, + EXEC_LO_LO16, EXEC_HI_LO16, M0_LO16, SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> { let Size = 16; let isAllocatable = 0; @@ -805,7 +786,7 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, } def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, - (add SReg_32_XM0_XEXEC, M0_CLASS)> { + (add SReg_32_XM0_XEXEC, M0)> { let AllocationPriority = 0; } @@ -830,7 +811,7 @@ def APERTURE_Class : SIRegisterClass<"AMDGPU", Reg64Types.types, 32, // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, - (add SReg_32_XM0, M0_CLASS)> { + (add SReg_32_XM0, M0)> { let AllocationPriority = 0; let HasSGPR = 1; let BaseClassOrder = 32; @@ -842,7 +823,7 @@ def SGPR_NULL256 : SIReg<"null">; let GeneratePressureSet = 0 in { def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add SReg_32, LDS_DIRECT_CLASS)> { + (add SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasSGPR = 1; let Size = 32; @@ -981,7 +962,7 @@ defm "" : SRegClass<32, Reg1024Types.types, SGPR_1024Regs>; } def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add VGPR_32, LDS_DIRECT_CLASS)> { + (add VGPR_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let Size = 32; @@ -1096,21 +1077,21 @@ def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> { } def VS_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, - (add VGPR_16, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_16, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let Size = 16; } def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, - (add VGPR_16_Lo128, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_16_Lo128, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let Size = 16; } def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_32, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; @@ -1118,7 +1099,7 @@ def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v } def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_32_Lo128, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; @@ -1126,7 +1107,7 @@ def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2 } def VS_32_Lo256 : 
SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, - (add VGPR_32_Lo256, SReg_32, LDS_DIRECT_CLASS)> { + (add VGPR_32_Lo256, SReg_32, LDS_DIRECT)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index efdbd12..447f05c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1417,9 +1417,9 @@ class SelectQCbi<CondCode Cond, DAGOperand InTyImm, Pseudo OpNode > let Predicates = [HasVendorXqciac, IsRV32] in { def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; -def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)), +def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; -def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)), +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index ebd957c..e8c849e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -195,9 +195,9 @@ class SPIRVLegalizePointerCast : public FunctionPass { if (DstType->getElementType() != SrcType->getElementType()) { // Support bitcast between vectors of different sizes only if // the total bitwidth is the same. - auto dstBitWidth = + [[maybe_unused]] auto dstBitWidth = DstType->getElementType()->getScalarSizeInBits() * dstNumElements; - auto srcBitWidth = + [[maybe_unused]] auto srcBitWidth = SrcType->getElementType()->getScalarSizeInBits() * srcNumElements; assert(dstBitWidth == srcBitWidth && "Unsupported bitcast between vectors of different sizes."); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index ddb95a4..faeab95 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -40,6 +41,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/InterleavedRange.h" +#include "llvm/Support/SHA1.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" @@ -60,6 +62,9 @@ STATISTIC(FunctionClonesThinBackend, "Number of function clones created during ThinLTO backend"); STATISTIC(FunctionsClonedThinBackend, "Number of functions that had clones created during ThinLTO backend"); +STATISTIC( + FunctionCloneDuplicatesThinBackend, + "Number of function clone duplicates detected during ThinLTO backend"); STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly " "cloned) during whole program analysis"); STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) " @@ -5186,19 +5191,127 @@ bool 
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { return Changed; } +// Compute a SHA1 hash of the callsite and alloc version information of clone I +// in the summary, to use in detection of duplicate clones. +uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) { + SHA1 Hasher; + // Update hash with any callsites that call non-default (non-zero) callee + // versions. + for (auto &SN : FS->callsites()) { + // In theory all callsites and allocs in this function should have the same + // number of clone entries, but handle any discrepancies gracefully below + // for NDEBUG builds. + assert( + SN.Clones.size() > I && + "Callsite summary has fewer entries than other summaries in function"); + if (SN.Clones.size() <= I || !SN.Clones[I]) + continue; + uint8_t Data[sizeof(SN.Clones[I])]; + support::endian::write32le(Data, SN.Clones[I]); + Hasher.update(Data); + } + // Update hash with any allocs that have non-default (non-None) hints. + for (auto &AN : FS->allocs()) { + // In theory all callsites and allocs in this function should have the same + // number of clone entries, but handle any discrepancies gracefully below + // for NDEBUG builds. + assert(AN.Versions.size() > I && + "Alloc summary has fewer entries than other summaries in function"); + if (AN.Versions.size() <= I || + (AllocationType)AN.Versions[I] == AllocationType::None) + continue; + Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1)); + } + return support::endian::read64le(Hasher.result().data()); +} + static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> - &FuncToAliasMap) { + &FuncToAliasMap, + FunctionSummary *FS) { + auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) { + // We might have created this when adjusting callsite in another + // function. It should be a declaration. + assert(DeclGV->isDeclaration()); + NewGV->takeName(DeclGV); + DeclGV->replaceAllUsesWith(NewGV); + DeclGV->eraseFromParent(); + }; + + // Handle aliases to this function, and create analogous alias clones to the + // provided clone of this function. + auto CloneFuncAliases = [&](Function *NewF, unsigned I) { + if (!FuncToAliasMap.count(&F)) + return; + for (auto *A : FuncToAliasMap[&F]) { + std::string AliasName = getMemProfFuncName(A->getName(), I); + auto *PrevA = M.getNamedAlias(AliasName); + auto *NewA = GlobalAlias::create(A->getValueType(), + A->getType()->getPointerAddressSpace(), + A->getLinkage(), AliasName, NewF); + NewA->copyAttributesFrom(A); + if (PrevA) + TakeDeclNameAndReplace(PrevA, NewA); + } + }; + // The first "clone" is the original copy, we should only call this if we // needed to create new clones. assert(NumClones > 1); SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; VMaps.reserve(NumClones - 1); FunctionsClonedThinBackend++; + + // Map of hash of callsite/alloc versions to the instantiated function clone + // (possibly the original) implementing those calls. Used to avoid + // instantiating duplicate function clones. + // FIXME: Ideally the thin link would not generate such duplicate clones to + // start with, but right now it happens due to phase ordering in the function + // assignment and possible new clones that produces. 
We simply make each + // duplicate an alias to the matching instantiated clone recorded in the map + // (except for available_externally which are made declarations as they would + // be aliases in the prevailing module, and available_externally aliases are + // not well supported right now). + DenseMap<uint64_t, Function *> HashToFunc; + + // Save the hash of the original function version. + HashToFunc[ComputeHash(FS, 0)] = &F; + for (unsigned I = 1; I < NumClones; I++) { VMaps.emplace_back(std::make_unique<ValueToValueMapTy>()); + std::string Name = getMemProfFuncName(F.getName(), I); + auto Hash = ComputeHash(FS, I); + // If this clone would duplicate a previously seen clone, don't generate the + // duplicate clone body, just make an alias to satisfy any (potentially + // cross-module) references. + if (HashToFunc.contains(Hash)) { + FunctionCloneDuplicatesThinBackend++; + auto *Func = HashToFunc[Hash]; + if (Func->hasAvailableExternallyLinkage()) { + // Skip these as EliminateAvailableExternallyPass does not handle + // available_externally aliases correctly and we end up with an + // available_externally alias to a declaration. Just create a + // declaration for now as we know we will have a definition in another + // module. + auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType()); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) + << "created clone decl " << ore::NV("Decl", Decl.getCallee())); + continue; + } + auto *PrevF = M.getFunction(Name); + auto *Alias = GlobalAlias::create(Name, Func); + if (PrevF) + TakeDeclNameAndReplace(PrevF, Alias); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) + << "created clone alias " << ore::NV("Alias", Alias)); + + // Now handle aliases to this function, and clone those as well. + CloneFuncAliases(Func, I); + continue; + } auto *NewF = CloneFunction(&F, *VMaps.back()); + HashToFunc[Hash] = NewF; FunctionClonesThinBackend++; // Strip memprof and callsite metadata from clone as they are no longer // needed. @@ -5208,40 +5321,17 @@ static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( Inst.setMetadata(LLVMContext::MD_callsite, nullptr); } } - std::string Name = getMemProfFuncName(F.getName(), I); auto *PrevF = M.getFunction(Name); - if (PrevF) { - // We might have created this when adjusting callsite in another - // function. It should be a declaration. - assert(PrevF->isDeclaration()); - NewF->takeName(PrevF); - PrevF->replaceAllUsesWith(NewF); - PrevF->eraseFromParent(); - } else + if (PrevF) + TakeDeclNameAndReplace(PrevF, NewF); + else NewF->setName(Name); updateSubprogramLinkageName(NewF, Name); ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) << "created clone " << ore::NV("NewFunction", NewF)); // Now handle aliases to this function, and clone those as well. - if (!FuncToAliasMap.count(&F)) - continue; - for (auto *A : FuncToAliasMap[&F]) { - std::string Name = getMemProfFuncName(A->getName(), I); - auto *PrevA = M.getNamedAlias(Name); - auto *NewA = GlobalAlias::create(A->getValueType(), - A->getType()->getPointerAddressSpace(), - A->getLinkage(), Name, NewF); - NewA->copyAttributesFrom(A); - if (PrevA) { - // We might have created this when adjusting callsite in another - // function. It should be a declaration. 
- assert(PrevA->isDeclaration()); - NewA->takeName(PrevA); - PrevA->replaceAllUsesWith(NewA); - PrevA->eraseFromParent(); - } - } + CloneFuncAliases(NewF, I); } return VMaps; } @@ -5401,7 +5491,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; bool ClonesCreated = false; unsigned NumClonesCreated = 0; - auto CloneFuncIfNeeded = [&](unsigned NumClones) { + auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) { // We should at least have version 0 which is the original copy. assert(NumClones > 0); // If only one copy needed use original. @@ -5415,7 +5505,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { assert(NumClonesCreated == NumClones); return; } - VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); + VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS); // The first "clone" is the original copy, which doesn't have a VMap. assert(VMaps.size() == NumClones - 1); Changed = true; @@ -5424,9 +5514,9 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { }; auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, - Function *CalledFunction) { + Function *CalledFunction, FunctionSummary *FS) { // Perform cloning if not yet done. - CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); + CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS); assert(!isMemProfClone(*CalledFunction)); @@ -5448,6 +5538,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { // below. auto CalleeOrigName = CalledFunction->getName(); for (unsigned J = 0; J < StackNode.Clones.size(); J++) { + // If the VMap is empty, this clone was a duplicate of another and was + // created as an alias or a declaration. + if (J > 0 && VMaps[J - 1]->empty()) + continue; // Do nothing if this version calls the original version of its // callee. if (!StackNode.Clones[J]) @@ -5591,7 +5685,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { #endif // Perform cloning if not yet done. - CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); + CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS); OrigAllocsThinBackend++; AllocVersionsThinBackend += AllocNode.Versions.size(); @@ -5624,6 +5718,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { // Update the allocation types per the summary info. for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { + // If the VMap is empty, this clone was a duplicate of another and + // was created as an alias or a declaration. + if (J > 0 && VMaps[J - 1]->empty()) + continue; // Ignore any that didn't get an assigned allocation type. if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) continue; @@ -5670,7 +5768,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { // we don't need to do ICP, but might need to clone this // function as it is the target of other cloned calls. 
if (NumClones) - CloneFuncIfNeeded(NumClones); + CloneFuncIfNeeded(NumClones, FS); } else { @@ -5690,7 +5788,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { } #endif - CloneCallsite(StackNode, CB, CalledFunction); + CloneCallsite(StackNode, CB, CalledFunction, FS); } } else if (CB->isTailCall() && CalledFunction) { // Locate the synthesized callsite info for the callee VI, if any was @@ -5700,7 +5798,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) { auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI); assert(Callsite != MapTailCallCalleeVIToCallsite.end()); - CloneCallsite(Callsite->second, CB, CalledFunction); + CloneCallsite(Callsite->second, CB, CalledFunction, FS); } } } @@ -5846,6 +5944,10 @@ void MemProfContextDisambiguation::performICP( // check. CallBase *CBClone = CB; for (unsigned J = 0; J < NumClones; J++) { + // If the VMap is empty, this clone was a duplicate of another and was + // created as an alias or a declaration. + if (J > 0 && VMaps[J - 1]->empty()) + continue; // Copy 0 is the original function. if (J > 0) CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); @@ -5891,6 +5993,10 @@ void MemProfContextDisambiguation::performICP( // TotalCount and the number promoted. CallBase *CBClone = CB; for (unsigned J = 0; J < NumClones; J++) { + // If the VMap is empty, this clone was a duplicate of another and was + // created as an alias or a declaration. + if (J > 0 && VMaps[J - 1]->empty()) + continue; // Copy 0 is the original function. if (J > 0) CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index cba282c..a2e8c69 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -15,13 +15,12 @@ using namespace llvm; -namespace { /// Return true if and only if the given instruction does not modify the memory /// location referenced. Note that an idemptent atomicrmw may still have /// ordering effects on nearby instructions, or be volatile. /// TODO: Common w/ the version in AtomicExpandPass, and change the term used. /// Idemptotent is confusing in this context. -bool isIdempotentRMW(AtomicRMWInst& RMWI) { +static bool isIdempotentRMW(AtomicRMWInst &RMWI) { if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand())) switch(RMWI.getOperation()) { case AtomicRMWInst::FAdd: // -0.0 @@ -59,7 +58,7 @@ bool isIdempotentRMW(AtomicRMWInst& RMWI) { /// Return true if the given instruction always produces a value in memory /// equivalent to its value operand. 
-bool isSaturating(AtomicRMWInst& RMWI) { +static bool isSaturating(AtomicRMWInst &RMWI) { if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand())) switch (RMWI.getOperation()) { case AtomicRMWInst::FMax: @@ -98,7 +97,6 @@ bool isSaturating(AtomicRMWInst& RMWI) { return C->isMaxValue(false); }; } -} // namespace Instruction *InstCombinerImpl::visitAtomicRMWInst(AtomicRMWInst &RMWI) { diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 917004c..048cdf4 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -132,8 +132,6 @@ STATISTIC(NumReassoc , "Number of reassociations"); DEBUG_COUNTER(VisitCounter, "instcombine-visit", "Controls which instructions are visited"); -namespace llvm { - static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true)); @@ -146,7 +144,9 @@ static cl::opt<unsigned> MaxArraySize("instcombine-maxarray-size", cl::init(1024), cl::desc("Maximum array size considered when doing a combine")); +namespace llvm { extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // end namespace llvm // FIXME: Remove this flag when it is no longer necessary to convert // llvm.dbg.declare to avoid inaccurate debug info. Setting this to false @@ -158,8 +158,6 @@ extern cl::opt<bool> ProfcheckDisableMetadataFixes; static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare", cl::Hidden, cl::init(true)); -} // end namespace llvm - std::optional<Instruction *> InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) { // Handle target specific intrinsics diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 8bba634..48055ad 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5152,14 +5152,18 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, if (ExtraCase && Values.size() < 2) return false; - // TODO: Preserve branch weight metadata, similarly to how - // foldValueComparisonIntoPredecessors preserves it. + SmallVector<uint32_t> BranchWeights; + const bool HasProfile = !ProfcheckDisableMetadataFixes && + extractBranchWeights(*BI, BranchWeights); // Figure out which block is which destination. BasicBlock *DefaultBB = BI->getSuccessor(1); BasicBlock *EdgeBB = BI->getSuccessor(0); - if (!TrueWhenEqual) + if (!TrueWhenEqual) { std::swap(DefaultBB, EdgeBB); + if (HasProfile) + std::swap(BranchWeights[0], BranchWeights[1]); + } BasicBlock *BB = BI->getParent(); @@ -5190,10 +5194,11 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, if (!isGuaranteedNotToBeUndefOrPoison(ExtraCase, AC, BI, nullptr)) ExtraCase = Builder.CreateFreeze(ExtraCase); - if (TrueWhenEqual) - Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB); - else - Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); + // We don't have any info about this condition. + auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB) + : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); + setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(), + DEBUG_TYPE); OldTI->eraseFromParent(); @@ -5220,6 +5225,17 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, // Create the new switch instruction now. SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); + if (HasProfile) { + // We know the weight of the default case. 
We don't know the weight of the + // other cases, but rather than completely lose profiling info, we split + // the remaining probability equally over them. + SmallVector<uint32_t> NewWeights(Values.size() + 1); + NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if + // TrueWhenEqual. + for (auto &V : drop_begin(NewWeights)) + V = BranchWeights[0] / Values.size(); + setBranchWeights(*New, NewWeights, /*IsExpected=*/false); + } // Add all of the 'cases' to the switch instruction. for (ConstantInt *Val : Values) @@ -7211,6 +7227,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest); BranchInst *RangeCheckBranch = nullptr; + BranchInst *CondBranch = nullptr; Builder.SetInsertPoint(SI); const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize); @@ -7225,6 +7242,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize)); RangeCheckBranch = Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); + CondBranch = RangeCheckBranch; if (DTU) Updates.push_back({DominatorTree::Insert, BB, LookupBB}); } @@ -7263,7 +7281,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted"); Value *LoBit = Builder.CreateTrunc( Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit"); - Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest()); + CondBranch = Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest()); if (DTU) { Updates.push_back({DominatorTree::Insert, MaskBB, LookupBB}); Updates.push_back({DominatorTree::Insert, MaskBB, SI->getDefaultDest()}); @@ -7303,19 +7321,32 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, if (DTU) Updates.push_back({DominatorTree::Insert, LookupBB, CommonDest}); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = CondBranch && !ProfcheckDisableMetadataFixes && + extractBranchWeights(*SI, BranchWeights); + uint64_t ToLookupWeight = 0; + uint64_t ToDefaultWeight = 0; + // Remove the switch. SmallPtrSet<BasicBlock *, 8> RemovedSuccessors; - for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) { - BasicBlock *Succ = SI->getSuccessor(i); + for (unsigned I = 0, E = SI->getNumSuccessors(); I < E; ++I) { + BasicBlock *Succ = SI->getSuccessor(I); - if (Succ == SI->getDefaultDest()) + if (Succ == SI->getDefaultDest()) { + if (HasBranchWeights) + ToDefaultWeight += BranchWeights[I]; continue; + } Succ->removePredecessor(BB); if (DTU && RemovedSuccessors.insert(Succ).second) Updates.push_back({DominatorTree::Delete, BB, Succ}); + if (HasBranchWeights) + ToLookupWeight += BranchWeights[I]; } SI->eraseFromParent(); - + if (HasBranchWeights) + setFittedBranchWeights(*CondBranch, {ToLookupWeight, ToDefaultWeight}, + /*IsExpected=*/false); if (DTU) DTU->applyUpdates(Updates);
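The two SimplifyCFG changes above apply the same weight-conservation idea in opposite directions: simplifyBranchOnICmpChain expands one weighted branch into a switch, keeping the default weight and splitting the edge weight evenly across the new cases, while simplifySwitchLookup collapses a weighted switch into a single conditional branch, summing the case weights into the lookup arm and the default weight into the other. A standalone sketch of both redistributions, assuming raw integers in place of LLVM branch_weights metadata; splitOverCases and foldIntoCondBr are illustrative names, not the pass's API:

#include <cstdint>
#include <vector>

// simplifyBranchOnICmpChain direction: a two-way branch with weights
// {ToEdge, ToDefault} becomes a switch over N case values. The profile does
// not say which comparison in the chain fired, so ToEdge is split evenly.
static std::vector<uint32_t> splitOverCases(uint32_t ToEdge,
                                            uint32_t ToDefault, unsigned N) {
  std::vector<uint32_t> W(N + 1);
  W[0] = ToDefault; // slot 0 is the switch's default destination
  for (unsigned I = 1; I <= N; ++I)
    W[I] = ToEdge / N; // equal split; the rounding loss is accepted
  return W;
}

// simplifySwitchLookup direction: every non-default case weight flows to the
// branch arm guarding the lookup table; the default weight flows to the
// default arm.
static void foldIntoCondBr(const std::vector<uint32_t> &CaseWeights,
                           unsigned DefaultIdx, uint64_t &ToLookup,
                           uint64_t &ToDefault) {
  ToLookup = 0;
  ToDefault = 0;
  for (unsigned I = 0; I < CaseWeights.size(); ++I)
    (I == DefaultIdx ? ToDefault : ToLookup) += CaseWeights[I];
}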