Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp           |  39
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp     |  15
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp       |  25
-rw-r--r--  llvm/lib/Support/CMakeLists.txt                          |   1
-rw-r--r--  llvm/lib/Support/Jobserver.cpp                           | 259
-rw-r--r--  llvm/lib/Support/Parallel.cpp                            |  98
-rw-r--r--  llvm/lib/Support/ThreadPool.cpp                          | 108
-rw-r--r--  llvm/lib/Support/Threading.cpp                           |   5
-rw-r--r--  llvm/lib/Support/Unix/Jobserver.inc                      | 195
-rw-r--r--  llvm/lib/Support/Windows/Jobserver.inc                   |  79
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp              |   9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td                 |  39
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td              |   4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp       |   4
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 178
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp |   6
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp |   6
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp                |  55
18 files changed, 1005 insertions, 120 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 95f53fe..6ea2e27 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12698,6 +12698,45 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
return DAGSize;
}
+void SelectionDAG::getTopologicallyOrderedNodes(
+ SmallVectorImpl<const SDNode *> &SortedNodes) const {
+ SortedNodes.clear();
+ // Node -> remaining number of outstanding operands.
+ DenseMap<const SDNode *, unsigned> RemainingOperands;
+
+ // Put nodes without any operands into SortedNodes first.
+ for (const SDNode &N : allnodes()) {
+ checkForCycles(&N, this);
+ unsigned NumOperands = N.getNumOperands();
+ if (NumOperands == 0)
+ SortedNodes.push_back(&N);
+ else
+ // Record their total number of outstanding operands.
+ RemainingOperands[&N] = NumOperands;
+ }
+
+ // A node is pushed into SortedNodes when all of its operands (predecessors in
+ // the graph) are also in SortedNodes.
+  for (unsigned I = 0U; I < SortedNodes.size(); ++I) {
+    const SDNode *N = SortedNodes[I];
+ for (const SDNode *U : N->users()) {
+ unsigned &NumRemOperands = RemainingOperands[U];
+ assert(NumRemOperands && "Invalid number of remaining operands");
+ --NumRemOperands;
+ if (!NumRemOperands)
+ SortedNodes.push_back(U);
+ }
+ }
+
+ assert(SortedNodes.size() == AllNodes.size() && "Node count mismatch");
+ assert(SortedNodes.front()->getOpcode() == ISD::EntryToken &&
+ "First node in topological sort is not the entry token");
+ assert(SortedNodes.front()->getNumOperands() == 0 &&
+ "First node in topological sort has operands");
+ assert(SortedNodes.back()->use_empty() &&
+         "Last node in topological sort has users");
+}
+
/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
/// value is produced by SD.
void SelectionDAG::AddDbgValue(SDDbgValue *DB, bool isParameter) {
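
For reference, the new getTopologicallyOrderedNodes() is a Kahn-style worklist:
nodes with no operands seed the output, and a user is appended once its last
remaining operand has been emitted. A minimal standalone sketch of the same
scheme on a generic adjacency list (illustrative names, not part of the patch):

  #include <cassert>
  #include <cstddef>
  #include <unordered_map>
  #include <vector>

  std::vector<int> topoSort(int NumNodes,
                            const std::vector<std::vector<int>> &Users,
                            const std::vector<int> &NumOperands) {
    std::vector<int> Order;
    std::unordered_map<int, int> Remaining;
    // Seed the order with operand-free nodes; remember the rest.
    for (int N = 0; N < NumNodes; ++N) {
      if (NumOperands[N] == 0)
        Order.push_back(N);
      else
        Remaining[N] = NumOperands[N];
    }
    // A node becomes ready once all of its operands are in Order.
    for (size_t I = 0; I < Order.size(); ++I)
      for (int U : Users[Order[I]])
        if (--Remaining[U] == 0)
          Order.push_back(U);
    assert(Order.size() == size_t(NumNodes) && "graph has a cycle");
    return Order;
  }
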
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 4b2a00c..fcfbfe6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -1061,13 +1061,24 @@ static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
N->dump(G);
}
-LLVM_DUMP_METHOD void SelectionDAG::dump() const {
+LLVM_DUMP_METHOD void SelectionDAG::dump(bool Sorted) const {
dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:\n";
- for (const SDNode &N : allnodes()) {
+ auto dumpEachNode = [this](const SDNode &N) {
if (!N.hasOneUse() && &N != getRoot().getNode() &&
(!shouldPrintInline(N, this) || N.use_empty()))
DumpNodes(&N, 2, this);
+ };
+
+ if (Sorted) {
+ SmallVector<const SDNode *> SortedNodes;
+ SortedNodes.reserve(AllNodes.size());
+ getTopologicallyOrderedNodes(SortedNodes);
+ for (const SDNode *N : SortedNodes)
+ dumpEachNode(*N);
+ } else {
+ for (const SDNode &N : allnodes())
+ dumpEachNode(N);
}
if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index e61558c..c35f29d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -144,6 +144,11 @@ UseMBPI("use-mbpi",
cl::init(true), cl::Hidden);
#ifndef NDEBUG
+static cl::opt<bool>
+ DumpSortedDAG("dump-sorted-dags", cl::Hidden,
+ cl::desc("Print DAGs with sorted nodes in debug dump"),
+ cl::init(false));
+
static cl::opt<std::string>
FilterDAGBasicBlockName("filter-view-dags", cl::Hidden,
cl::desc("Only display the basic block whose name "
@@ -932,7 +937,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nInitial selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -952,7 +957,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nOptimized lowered selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -974,7 +979,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nType-legalized selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -998,7 +1003,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nOptimized type-legalized selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -1016,7 +1021,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nVector-legalized selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -1032,7 +1037,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nVector/type-legalized selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -1052,7 +1057,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nOptimized vector-legalized selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -1072,7 +1077,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nLegalized selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -1092,7 +1097,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nOptimized legalized selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
if (TTI->hasBranchDivergence())
@@ -1116,7 +1121,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
ISEL_DUMP(dbgs() << "\nSelected selection DAG: "
<< printMBBReference(*FuncInfo->MBB) << " '" << BlockName
<< "'\n";
- CurDAG->dump());
+ CurDAG->dump(DumpSortedDAG));
if (ViewSchedDAGs && MatchFilterBB)
CurDAG->viewGraph("scheduler input for " + BlockName);
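
The flag above only affects debug dumps: each ISEL_DUMP phase banner now calls
the new dump(bool) overload, so passing -dump-sorted-dags to a debug build of
llc prints every phase's DAG in topological order (e.g. together with
-debug-only=isel-dump; both are hidden, NDEBUG-only options). A minimal sketch
of the programmatic equivalent, assuming the overload defaults to the old
behavior:

  #ifndef NDEBUG
    CurDAG->dump(/*Sorted=*/true); // nodes printed in topological order
    CurDAG->dump();                // unchanged: allnodes() order
  #endif
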
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 7da972f..42b21b5 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -207,6 +207,7 @@ add_llvm_component_library(LLVMSupport
InstructionCost.cpp
IntEqClasses.cpp
IntervalMap.cpp
+ Jobserver.cpp
JSON.cpp
KnownBits.cpp
KnownFPClass.cpp
diff --git a/llvm/lib/Support/Jobserver.cpp b/llvm/lib/Support/Jobserver.cpp
new file mode 100644
index 0000000..9f726eb
--- /dev/null
+++ b/llvm/lib/Support/Jobserver.cpp
@@ -0,0 +1,259 @@
+//===- llvm/Support/Jobserver.cpp - Jobserver Client Implementation -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Jobserver.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <new>
+
+#define DEBUG_TYPE "jobserver"
+
+using namespace llvm;
+
+namespace {
+struct FdPair {
+ int Read = -1;
+ int Write = -1;
+ bool isValid() const { return Read >= 0 && Write >= 0; }
+};
+
+struct JobserverConfig {
+ enum Mode {
+ None,
+ PosixFifo,
+ PosixPipe,
+ Win32Semaphore,
+ };
+ Mode TheMode = None;
+ std::string Path;
+ FdPair PipeFDs;
+};
+
+/// A helper function that checks if `Input` starts with `Prefix`.
+/// If it does, it removes the prefix from `Input`, assigns the remainder to
+/// `Value`, and returns true. Otherwise, it returns false.
+bool getPrefixedValue(StringRef Input, StringRef Prefix, StringRef &Value) {
+ if (Input.consume_front(Prefix)) {
+ Value = Input;
+ return true;
+ }
+ return false;
+}
+
+/// A helper function to parse a string in the format "R,W" where R and W are
+/// non-negative integers representing file descriptors. Returns the parsed
+/// pair on success, or std::nullopt if the input is malformed.
+static std::optional<FdPair> getFileDescriptorPair(StringRef Input) {
+ FdPair FDs;
+ if (Input.consumeInteger(10, FDs.Read))
+ return std::nullopt;
+ if (!Input.consume_front(","))
+ return std::nullopt;
+ if (Input.consumeInteger(10, FDs.Write))
+ return std::nullopt;
+ if (!Input.empty() || !FDs.isValid())
+ return std::nullopt;
+ return FDs;
+}
+
+/// Parses the `MAKEFLAGS` environment variable string to find jobserver
+/// arguments. It splits the string into space-separated arguments and searches
+/// for `--jobserver-auth` or `--jobserver-fds`. Based on the value of these
+/// arguments, it determines the jobserver mode (Pipe, FIFO, or Semaphore) and
+/// connection details (file descriptors or path).
+Expected<JobserverConfig> parseNativeMakeFlags(StringRef MakeFlags) {
+ JobserverConfig Config;
+ if (MakeFlags.empty())
+ return Config;
+
+ // Split the MAKEFLAGS string into arguments.
+ SmallVector<StringRef, 8> Args;
+ SplitString(MakeFlags, Args);
+
+  // GNU make may bundle single-letter options in the first word without a
+  // leading '-'. If that bundle contains 'n' (dry-run), disable the
+  // jobserver.
+ if (!Args.empty() && !Args[0].starts_with("-") && Args[0].contains('n'))
+ return Config;
+
+ // Iterate through arguments to find jobserver flags.
+ // Note that make may pass multiple --jobserver-auth flags; the last one wins.
+ for (StringRef Arg : Args) {
+ StringRef Value;
+ if (getPrefixedValue(Arg, "--jobserver-auth=", Value)) {
+ // Try to parse as a file descriptor pair first.
+ if (auto FDPair = getFileDescriptorPair(Value)) {
+ Config.TheMode = JobserverConfig::PosixPipe;
+ Config.PipeFDs = *FDPair;
+ } else {
+ StringRef FifoPath;
+ // If not FDs, try to parse as a named pipe (fifo).
+ if (getPrefixedValue(Value, "fifo:", FifoPath)) {
+ Config.TheMode = JobserverConfig::PosixFifo;
+ Config.Path = FifoPath.str();
+ } else {
+ // Otherwise, assume it's a Windows semaphore.
+ Config.TheMode = JobserverConfig::Win32Semaphore;
+ Config.Path = Value.str();
+ }
+ }
+ } else if (getPrefixedValue(Arg, "--jobserver-fds=", Value)) {
+ // This is an alternative, older syntax for the pipe-based server.
+ if (auto FDPair = getFileDescriptorPair(Value)) {
+ Config.TheMode = JobserverConfig::PosixPipe;
+ Config.PipeFDs = *FDPair;
+ } else {
+ return createStringError(inconvertibleErrorCode(),
+ "Invalid file descriptor pair in MAKEFLAGS");
+ }
+ }
+ }
+
+// Perform platform-specific validation.
+#ifdef _WIN32
+ if (Config.TheMode == JobserverConfig::PosixFifo ||
+ Config.TheMode == JobserverConfig::PosixPipe)
+ return createStringError(
+ inconvertibleErrorCode(),
+ "FIFO/Pipe-based jobserver is not supported on Windows");
+#else
+ if (Config.TheMode == JobserverConfig::Win32Semaphore)
+ return createStringError(
+ inconvertibleErrorCode(),
+ "Semaphore-based jobserver is not supported on this platform");
+#endif
+ return Config;
+}
+
+std::once_flag GJobserverOnceFlag;
+JobserverClient *GJobserver = nullptr;
+
+} // namespace
+
+namespace llvm {
+class JobserverClientImpl : public JobserverClient {
+ bool IsInitialized = false;
+ std::atomic<bool> HasImplicitSlot{true};
+ unsigned NumJobs = 0;
+
+public:
+ JobserverClientImpl(const JobserverConfig &Config);
+ ~JobserverClientImpl() override;
+
+ JobSlot tryAcquire() override;
+ void release(JobSlot Slot) override;
+ unsigned getNumJobs() const override { return NumJobs; }
+
+ bool isValid() const { return IsInitialized; }
+
+private:
+#if defined(LLVM_ON_UNIX)
+ int ReadFD = -1;
+ int WriteFD = -1;
+ std::string FifoPath;
+#elif defined(_WIN32)
+ void *Semaphore = nullptr;
+#endif
+};
+} // namespace llvm
+
+// Include the platform-specific parts of the class.
+#if defined(LLVM_ON_UNIX)
+#include "Unix/Jobserver.inc"
+#elif defined(_WIN32)
+#include "Windows/Jobserver.inc"
+#else
+// Dummy implementation for unsupported platforms.
+JobserverClientImpl::JobserverClientImpl(const JobserverConfig &Config) {}
+JobserverClientImpl::~JobserverClientImpl() = default;
+JobSlot JobserverClientImpl::tryAcquire() { return JobSlot(); }
+void JobserverClientImpl::release(JobSlot Slot) {}
+#endif
+
+namespace llvm {
+JobserverClient::~JobserverClient() = default;
+
+uint8_t JobSlot::getExplicitValue() const {
+ assert(isExplicit() && "Cannot get value of implicit or invalid slot");
+ return static_cast<uint8_t>(Value);
+}
+
+/// This is the main entry point for acquiring a jobserver client. It uses
+/// std::call_once to ensure the singleton `GJobserver` instance is created
+/// safely in a multi-threaded environment. On first call, it reads the
+/// `MAKEFLAGS` environment variable, parses it, and attempts to construct and
+/// initialize a `JobserverClientImpl`. If successful, the global instance is
+/// stored in `GJobserver`. Subsequent calls will return the existing instance.
+JobserverClient *JobserverClient::getInstance() {
+ std::call_once(GJobserverOnceFlag, []() {
+ LLVM_DEBUG(
+ dbgs()
+ << "JobserverClient::getInstance() called for the first time.\n");
+ const char *MakeFlagsEnv = getenv("MAKEFLAGS");
+ if (!MakeFlagsEnv) {
+      errs() << "Warning: failed to create jobserver client because the "
+                "MAKEFLAGS environment variable is not set\n";
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Found MAKEFLAGS = \"" << MakeFlagsEnv << "\"\n");
+
+ auto ConfigOrErr = parseNativeMakeFlags(MakeFlagsEnv);
+ if (Error Err = ConfigOrErr.takeError()) {
+ errs() << "Warning: failed to create jobserver client due to invalid "
+ "MAKEFLAGS environment variable: "
+ << toString(std::move(Err)) << "\n";
+ return;
+ }
+
+ JobserverConfig Config = *ConfigOrErr;
+ if (Config.TheMode == JobserverConfig::None) {
+      errs() << "Warning: failed to create jobserver client because MAKEFLAGS "
+                "does not specify a jobserver mode\n";
+ return;
+ }
+
+ if (Config.TheMode == JobserverConfig::PosixPipe) {
+#if defined(LLVM_ON_UNIX)
+ if (!areFdsValid(Config.PipeFDs.Read, Config.PipeFDs.Write)) {
+ errs() << "Warning: failed to create jobserver client due to invalid "
+ "Pipe FDs in MAKEFLAGS environment variable\n";
+ return;
+ }
+#endif
+ }
+
+ auto Client = std::make_unique<JobserverClientImpl>(Config);
+ if (Client->isValid()) {
+ LLVM_DEBUG(dbgs() << "Jobserver client created successfully!\n");
+ GJobserver = Client.release();
+    } else {
+      errs() << "Warning: jobserver client initialization failed.\n";
+    }
+ });
+ return GJobserver;
+}
+
+/// For testing purposes only. This function resets the singleton instance by
+/// destroying the existing client and re-initializing the `std::once_flag`.
+/// This allows tests to simulate the first-time initialization of the
+/// jobserver client multiple times.
+void JobserverClient::resetForTesting() {
+ delete GJobserver;
+ GJobserver = nullptr;
+ // Re-construct the std::once_flag in place to reset the singleton state.
+ new (&GJobserverOnceFlag) std::once_flag();
+}
+} // namespace llvm
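
For context, GNU make advertises the jobserver to child processes through
MAKEFLAGS; values this parser accepts look like "--jobserver-auth=3,4" (an
inherited pipe FD pair), "--jobserver-auth=fifo:/tmp/make-abc" (a named pipe),
or a semaphore name on Windows. A hedged sketch of client usage built only
from the API added here (error handling elided):

  #include "llvm/Support/Jobserver.h"

  void runOneUnitOfWork() {
    // getInstance() returns null if MAKEFLAGS is absent or malformed.
    llvm::JobserverClient *JS = llvm::JobserverClient::getInstance();
    if (!JS)
      return;
    llvm::JobSlot Slot = JS->tryAcquire(); // non-blocking
    if (!Slot.isValid())
      return; // caller should back off and retry
    // ... do work while holding the slot ...
    JS->release(std::move(Slot));
  }
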
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 3ac6fc7..8e0c724 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -7,12 +7,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Parallel.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/ExponentialBackoff.h"
+#include "llvm/Support/Jobserver.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Threading.h"
#include <atomic>
#include <future>
+#include <memory>
+#include <mutex>
#include <thread>
#include <vector>
@@ -49,6 +54,9 @@ public:
class ThreadPoolExecutor : public Executor {
public:
explicit ThreadPoolExecutor(ThreadPoolStrategy S) {
+ if (S.UseJobserver)
+ TheJobserver = JobserverClient::getInstance();
+
ThreadCount = S.compute_thread_count();
// Spawn all but one of the threads in another thread as spawning threads
// can take a while.
@@ -69,6 +77,10 @@ public:
});
}
+  // Ensure the thread pool executor can only be created with an explicit
+  // parallel strategy.
+ ThreadPoolExecutor() = delete;
+
void stop() {
{
std::lock_guard<std::mutex> Lock(Mutex);
@@ -111,15 +123,62 @@ private:
void work(ThreadPoolStrategy S, unsigned ThreadID) {
threadIndex = ThreadID;
S.apply_thread_strategy(ThreadID);
+ // Note on jobserver deadlock avoidance:
+ // GNU Make grants each invoked process one implicit job slot. Our
+ // JobserverClient models this by returning an implicit JobSlot on the
+ // first successful tryAcquire() in a process. This guarantees forward
+ // progress without requiring a dedicated "always-on" thread here.
+
while (true) {
- std::unique_lock<std::mutex> Lock(Mutex);
- Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
- if (Stop)
- break;
- auto Task = std::move(WorkStack.back());
- WorkStack.pop_back();
- Lock.unlock();
- Task();
+ if (TheJobserver) {
+ // Jobserver-mode scheduling:
+ // - Acquire one job slot (with exponential backoff to avoid busy-wait).
+ // - While holding the slot, drain and run tasks from the local queue.
+ // - Release the slot when the queue is empty or when shutting down.
+ // Rationale: Holding a slot amortizes acquire/release overhead over
+      // the jobserver's global concurrency limit. With K available slots,
+ // the jobserver’s global concurrency limit. With K available slots,
+ // up to K workers run tasks in parallel; within each worker tasks run
+ // sequentially until the local queue is empty.
+ ExponentialBackoff Backoff(std::chrono::hours(24));
+ JobSlot Slot;
+ do {
+ if (Stop)
+ return;
+ Slot = TheJobserver->tryAcquire();
+ if (Slot.isValid())
+ break;
+ } while (Backoff.waitForNextAttempt());
+
+ auto SlotReleaser = llvm::make_scope_exit(
+ [&] { TheJobserver->release(std::move(Slot)); });
+
+ while (true) {
+ std::function<void()> Task;
+ {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+ if (Stop && WorkStack.empty())
+ return;
+ if (WorkStack.empty())
+ break;
+ Task = std::move(WorkStack.back());
+ WorkStack.pop_back();
+ }
+ Task();
+ }
+ } else {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+ if (Stop)
+ break;
+ auto Task = std::move(WorkStack.back());
+ WorkStack.pop_back();
+ Lock.unlock();
+ Task();
+ }
}
}
@@ -130,9 +189,20 @@ private:
std::promise<void> ThreadsCreated;
std::vector<std::thread> Threads;
unsigned ThreadCount;
+
+ JobserverClient *TheJobserver = nullptr;
};
-Executor *Executor::getDefaultExecutor() {
+// A global raw pointer to the executor. Lifetime is managed by the
+// objects created within createExecutor().
+static Executor *TheExec = nullptr;
+static std::once_flag Flag;
+
+// This function is called exactly once to create the executor. It contains
+// the necessary platform-specific logic. Since a function invoked by
+// std::call_once cannot return a value, the executor is stored in a global
+// variable instead.
+void createExecutor() {
#ifdef _WIN32
// The ManagedStatic enables the ThreadPoolExecutor to be stopped via
// llvm_shutdown() which allows a "clean" fast exit, e.g. via _exit(). This
@@ -156,16 +226,22 @@ Executor *Executor::getDefaultExecutor() {
ThreadPoolExecutor::Deleter>
ManagedExec;
static std::unique_ptr<ThreadPoolExecutor> Exec(&(*ManagedExec));
- return Exec.get();
+ TheExec = Exec.get();
#else
// ManagedStatic is not desired on other platforms. When `Exec` is destroyed
// by llvm_shutdown(), worker threads will clean up and invoke TLS
// destructors. This can lead to race conditions if other threads attempt to
// access TLS objects that have already been destroyed.
static ThreadPoolExecutor Exec(strategy);
- return &Exec;
+ TheExec = &Exec;
#endif
}
+
+Executor *Executor::getDefaultExecutor() {
+ // Use std::call_once to lazily and safely initialize the executor.
+ std::call_once(Flag, createExecutor);
+ return TheExec;
+}
} // namespace
} // namespace detail
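
Because the executor is created lazily via std::call_once, the strategy must
request jobserver mode before the first parallel algorithm runs. A minimal
sketch, assuming the UseJobserver field added by this patch is set on the
global parallel::strategy declared in llvm/Support/Parallel.h:

  #include "llvm/Support/Parallel.h"
  #include <vector>

  void doubleAll(std::vector<int> &V) {
    // Must happen before the first parallel call in the process.
    llvm::parallel::strategy.UseJobserver = true;
    llvm::parallelForEach(V, [](int &X) { X *= 2; });
  }
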
diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp
index c304f0f..6960268 100644
--- a/llvm/lib/Support/ThreadPool.cpp
+++ b/llvm/lib/Support/ThreadPool.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
// This file implements a crude C++11 based thread pool.
//
//===----------------------------------------------------------------------===//
@@ -14,6 +15,8 @@
#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Support/ExponentialBackoff.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
@@ -33,7 +36,10 @@ ThreadPoolInterface::~ThreadPoolInterface() = default;
#if LLVM_ENABLE_THREADS
StdThreadPool::StdThreadPool(ThreadPoolStrategy S)
- : Strategy(S), MaxThreadCount(S.compute_thread_count()) {}
+ : Strategy(S), MaxThreadCount(S.compute_thread_count()) {
+ if (Strategy.UseJobserver)
+ TheJobserver = JobserverClient::getInstance();
+}
void StdThreadPool::grow(int requested) {
llvm::sys::ScopedWriter LockGuard(ThreadsLock);
@@ -45,7 +51,15 @@ void StdThreadPool::grow(int requested) {
Threads.emplace_back([this, ThreadID] {
set_thread_name(formatv("llvm-worker-{0}", ThreadID));
Strategy.apply_thread_strategy(ThreadID);
- processTasks(nullptr);
+ // Note on jobserver deadlock avoidance:
+ // GNU Make grants each invoked process one implicit job slot.
+ // JobserverClient::tryAcquire() returns that implicit slot on the first
+ // successful call in a process, ensuring forward progress without a
+ // dedicated "always-on" thread.
+ if (TheJobserver)
+ processTasksWithJobserver();
+ else
+ processTasks(nullptr);
});
}
}
@@ -133,6 +147,96 @@ void StdThreadPool::processTasks(ThreadPoolTaskGroup *WaitingForGroup) {
}
}
+/// Main loop for worker threads when using a jobserver.
+/// This function uses a two-level scheme: it first acquires a job slot from
+/// the external jobserver, then retrieves a task from the internal queue.
+/// This allows the thread pool to cooperate with build systems like `make -j`.
+void StdThreadPool::processTasksWithJobserver() {
+ while (true) {
+ // Acquire a job slot from the external jobserver.
+ // This polls for a slot and yields the thread to avoid a high-CPU wait.
+ JobSlot Slot;
+ // The timeout for the backoff can be very long, as the shutdown
+ // is checked on each iteration. The sleep duration is capped by MaxWait
+ // in ExponentialBackoff, so shutdown latency is not a problem.
+ ExponentialBackoff Backoff(std::chrono::hours(24));
+ bool AcquiredToken = false;
+ do {
+ // Return if the thread pool is shutting down.
+ {
+ std::unique_lock<std::mutex> LockGuard(QueueLock);
+ if (!EnableFlag)
+ return;
+ }
+
+ Slot = TheJobserver->tryAcquire();
+ if (Slot.isValid()) {
+ AcquiredToken = true;
+ break;
+ }
+ } while (Backoff.waitForNextAttempt());
+
+ if (!AcquiredToken) {
+ // This is practically unreachable with a 24h timeout and indicates a
+ // deeper problem if hit.
+ report_fatal_error("Timed out waiting for jobserver token.");
+ }
+
+ // `make_scope_exit` guarantees the job slot is released, even if the
+ // task throws or we exit early. This prevents deadlocking the build.
+ auto SlotReleaser =
+ make_scope_exit([&] { TheJobserver->release(std::move(Slot)); });
+
+ // While we hold a job slot, process tasks from the internal queue.
+ while (true) {
+ std::function<void()> Task;
+ ThreadPoolTaskGroup *GroupOfTask = nullptr;
+
+ {
+ std::unique_lock<std::mutex> LockGuard(QueueLock);
+
+ // Wait until a task is available or the pool is shutting down.
+ QueueCondition.wait(LockGuard,
+ [&] { return !EnableFlag || !Tasks.empty(); });
+
+ // If shutting down and the queue is empty, the thread can terminate.
+ if (!EnableFlag && Tasks.empty())
+ return;
+
+ // If the queue is empty, we're done processing tasks for now.
+ // Break the inner loop to release the job slot.
+ if (Tasks.empty())
+ break;
+
+ // A task is available. Mark it as active before releasing the lock
+ // to prevent race conditions with `wait()`.
+ ++ActiveThreads;
+ Task = std::move(Tasks.front().first);
+ GroupOfTask = Tasks.front().second;
+ if (GroupOfTask != nullptr)
+ ++ActiveGroups[GroupOfTask];
+ Tasks.pop_front();
+ } // The queue lock is released.
+
+ // Run the task. The job slot remains acquired during execution.
+ Task();
+
+ // The task has finished. Update the active count and notify any waiters.
+ {
+ std::lock_guard<std::mutex> LockGuard(QueueLock);
+ --ActiveThreads;
+ if (GroupOfTask != nullptr) {
+ auto A = ActiveGroups.find(GroupOfTask);
+ if (--(A->second) == 0)
+ ActiveGroups.erase(A);
+ }
+ // If all tasks are complete, notify any waiting threads.
+ if (workCompletedUnlocked(nullptr))
+ CompletionCondition.notify_all();
+ }
+ }
+ }
+}
bool StdThreadPool::workCompletedUnlocked(ThreadPoolTaskGroup *Group) const {
if (Group == nullptr)
return !ActiveThreads && Tasks.empty();
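
A hedged usage sketch for the pool itself: with UseJobserver set on the
strategy (the field is declared on ThreadPoolStrategy outside this diff's
scope), at most one task runs per acquired jobserver token, regardless of how
many worker threads exist:

  #include "llvm/Support/ThreadPool.h"
  #include "llvm/Support/Threading.h"

  void runJobs() {
    llvm::ThreadPoolStrategy S = llvm::hardware_concurrency();
    S.UseJobserver = true; // workers gate task execution on job slots
    llvm::StdThreadPool Pool(S);
    for (int I = 0; I < 100; ++I)
      Pool.async([I] { /* one unit of work, runs while holding a slot */ });
    Pool.wait();
  }
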
diff --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp
index 693de0e..9da357a 100644
--- a/llvm/lib/Support/Threading.cpp
+++ b/llvm/lib/Support/Threading.cpp
@@ -14,6 +14,7 @@
#include "llvm/Support/Threading.h"
#include "llvm/Config/config.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Jobserver.h"
#include <cassert>
#include <optional>
@@ -51,6 +52,10 @@ int llvm::get_physical_cores() { return -1; }
static int computeHostNumHardwareThreads();
unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+ if (UseJobserver)
+ if (auto JS = JobserverClient::getInstance())
+ return JS->getNumJobs();
+
int MaxThreadCount =
UseHyperThreads ? computeHostNumHardwareThreads() : get_physical_cores();
if (MaxThreadCount <= 0)
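
In other words, under "make -j8" a jobserver-aware strategy sizes pools to 8
(the implicit slot plus seven explicit tokens) instead of the machine's
hardware concurrency. A small sketch of the query path added above:

  llvm::ThreadPoolStrategy S;
  S.UseJobserver = true;
  unsigned N = S.compute_thread_count(); // == JobserverClient::getNumJobs()
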
diff --git a/llvm/lib/Support/Unix/Jobserver.inc b/llvm/lib/Support/Unix/Jobserver.inc
new file mode 100644
index 0000000..53bf7f2
--- /dev/null
+++ b/llvm/lib/Support/Unix/Jobserver.inc
@@ -0,0 +1,195 @@
+//===- llvm/Support/Unix/Jobserver.inc - Unix Jobserver Impl ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the UNIX-specific parts of the JobserverClient class.
+//
+//===----------------------------------------------------------------------===//
+
+#include <atomic>
+#include <cassert>
+#include <cerrno>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace {
+/// Returns true if the given file descriptor is a FIFO (named pipe).
+bool isFifo(int FD) {
+ struct stat StatBuf;
+ if (::fstat(FD, &StatBuf) != 0)
+ return false;
+ return S_ISFIFO(StatBuf.st_mode);
+}
+
+/// Returns true if the given file descriptors are valid.
+bool areFdsValid(int ReadFD, int WriteFD) {
+ if (ReadFD == -1 || WriteFD == -1)
+ return false;
+ // Check if the file descriptors are actually valid by checking their flags.
+ return ::fcntl(ReadFD, F_GETFD) != -1 && ::fcntl(WriteFD, F_GETFD) != -1;
+}
+} // namespace
+
+/// The constructor sets up the client based on the provided configuration.
+/// For pipe-based jobservers, it duplicates the inherited file descriptors,
+/// sets them to close-on-exec, and makes the read descriptor non-blocking.
+/// For FIFO-based jobservers, it opens the named pipe. After setup, it drains
+/// all available tokens from the jobserver to determine the total number of
+/// available jobs (`NumJobs`), then immediately releases them back.
+JobserverClientImpl::JobserverClientImpl(const JobserverConfig &Config) {
+ switch (Config.TheMode) {
+ case JobserverConfig::PosixPipe: {
+ // Duplicate the read and write file descriptors.
+ int NewReadFD = ::dup(Config.PipeFDs.Read);
+ if (NewReadFD < 0)
+ return;
+ int NewWriteFD = ::dup(Config.PipeFDs.Write);
+ if (NewWriteFD < 0) {
+ ::close(NewReadFD);
+ return;
+ }
+ // Set the new descriptors to be closed automatically on exec().
+ if (::fcntl(NewReadFD, F_SETFD, FD_CLOEXEC) == -1 ||
+ ::fcntl(NewWriteFD, F_SETFD, FD_CLOEXEC) == -1) {
+ ::close(NewReadFD);
+ ::close(NewWriteFD);
+ return;
+ }
+ // Set the read descriptor to non-blocking.
+    int Flags = ::fcntl(NewReadFD, F_GETFL, 0);
+    if (Flags == -1 || ::fcntl(NewReadFD, F_SETFL, Flags | O_NONBLOCK) == -1) {
+ ::close(NewReadFD);
+ ::close(NewWriteFD);
+ return;
+ }
+ ReadFD = NewReadFD;
+ WriteFD = NewWriteFD;
+ break;
+ }
+ case JobserverConfig::PosixFifo:
+ // Open the FIFO for reading. It must be non-blocking and close-on-exec.
+ ReadFD = ::open(Config.Path.c_str(), O_RDONLY | O_NONBLOCK | O_CLOEXEC);
+ if (ReadFD < 0 || !isFifo(ReadFD)) {
+ if (ReadFD >= 0)
+ ::close(ReadFD);
+ ReadFD = -1;
+ return;
+ }
+ FifoPath = Config.Path;
+ // The write FD is opened on-demand in release().
+ WriteFD = -1;
+ break;
+ default:
+ return;
+ }
+
+ IsInitialized = true;
+ // Determine the total number of jobs by acquiring all available slots and
+ // then immediately releasing them.
+ SmallVector<JobSlot, 8> Slots;
+ while (true) {
+ auto S = tryAcquire();
+ if (!S.isValid())
+ break;
+ Slots.push_back(std::move(S));
+ }
+ NumJobs = Slots.size();
+ assert(NumJobs >= 1 && "Invalid number of jobs");
+ for (auto &S : Slots)
+ release(std::move(S));
+}
+
+/// The destructor closes any open file descriptors.
+JobserverClientImpl::~JobserverClientImpl() {
+ if (ReadFD >= 0)
+ ::close(ReadFD);
+ if (WriteFD >= 0)
+ ::close(WriteFD);
+}
+
+/// Tries to acquire a job slot. The first call to this function will always
+/// successfully acquire the single "implicit" slot that is granted to every
+/// process started by `make`. Subsequent calls attempt to read a one-byte
+/// token from the jobserver's read pipe. A successful read grants one
+/// explicit job slot. The read is non-blocking; if no token is available,
+/// it fails and returns an invalid JobSlot.
+JobSlot JobserverClientImpl::tryAcquire() {
+ if (!IsInitialized)
+ return JobSlot();
+
+ // The first acquisition is always for the implicit slot.
+ if (HasImplicitSlot.exchange(false, std::memory_order_acquire)) {
+ LLVM_DEBUG(dbgs() << "Acquired implicit job slot.\n");
+ return JobSlot::createImplicit();
+ }
+
+ char Token;
+ ssize_t Ret;
+ LLVM_DEBUG(dbgs() << "Attempting to read token from FD " << ReadFD << ".\n");
+ // Loop to retry on EINTR (interrupted system call).
+ do {
+ Ret = ::read(ReadFD, &Token, 1);
+ } while (Ret < 0 && errno == EINTR);
+
+ if (Ret == 1) {
+ LLVM_DEBUG(dbgs() << "Acquired explicit token '" << Token << "'.\n");
+ return JobSlot::createExplicit(static_cast<uint8_t>(Token));
+ }
+
+ LLVM_DEBUG(dbgs() << "Failed to acquire job slot, read returned " << Ret
+ << ".\n");
+ return JobSlot();
+}
+
+/// Releases a job slot back to the pool. If the slot is implicit, it simply
+/// resets a flag. If the slot is explicit, it writes the character token
+/// associated with the slot back into the jobserver's write pipe. For FIFO
+/// jobservers, this may require opening the FIFO for writing if it hasn't
+/// been already.
+void JobserverClientImpl::release(JobSlot Slot) {
+ if (!Slot.isValid())
+ return;
+
+ // Releasing the implicit slot just makes it available for the next acquire.
+ if (Slot.isImplicit()) {
+ LLVM_DEBUG(dbgs() << "Released implicit job slot.\n");
+    [[maybe_unused]] bool WasAlreadyReleased =
+        HasImplicitSlot.exchange(true, std::memory_order_release);
+    assert(!WasAlreadyReleased && "Implicit slot released twice");
+ return;
+ }
+
+ uint8_t Token = Slot.getExplicitValue();
+ LLVM_DEBUG(dbgs() << "Releasing explicit token '" << (char)Token << "' to FD "
+ << WriteFD << ".\n");
+
+ // For FIFO-based jobservers, the write FD might not be open yet.
+ // Open it on the first release.
+ if (WriteFD < 0) {
+ LLVM_DEBUG(dbgs() << "WriteFD is invalid, opening FIFO: " << FifoPath
+ << "\n");
+ WriteFD = ::open(FifoPath.c_str(), O_WRONLY | O_CLOEXEC);
+ if (WriteFD < 0) {
+ LLVM_DEBUG(dbgs() << "Failed to open FIFO for writing.\n");
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "Opened FIFO as new WriteFD: " << WriteFD << "\n");
+ }
+
+ ssize_t Written;
+ // Loop to retry on EINTR (interrupted system call).
+ do {
+ Written = ::write(WriteFD, &Token, 1);
+ } while (Written < 0 && errno == EINTR);
+
+ if (Written <= 0) {
+ LLVM_DEBUG(dbgs() << "Failed to write token to pipe, write returned "
+ << Written << "\n");
+ }
+}
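
To exercise this implementation without a real make invocation, a test can
emulate the jobserver the same way "make -jN" does: create a pipe, preload it
with N-1 one-byte tokens (each child also owns one implicit slot), and
advertise the FDs via MAKEFLAGS. A hedged sketch (POSIX only, error handling
elided):

  #include <cstdio>
  #include <stdlib.h>
  #include <unistd.h>

  void setUpFakeJobserver(unsigned N) {
    int FDs[2];
    if (pipe(FDs) != 0)
      return;
    for (unsigned I = 0; I + 1 < N; ++I)
      (void)write(FDs[1], "+", 1); // token bytes are arbitrary
    char Buf[64];
    std::snprintf(Buf, sizeof(Buf), "--jobserver-auth=%d,%d", FDs[0], FDs[1]);
    setenv("MAKEFLAGS", Buf, /*overwrite=*/1);
  }
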
diff --git a/llvm/lib/Support/Windows/Jobserver.inc b/llvm/lib/Support/Windows/Jobserver.inc
new file mode 100644
index 0000000..79028ee
--- /dev/null
+++ b/llvm/lib/Support/Windows/Jobserver.inc
@@ -0,0 +1,79 @@
+//==- llvm/Support/Windows/Jobserver.inc - Windows Jobserver Impl -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Windows-specific parts of the JobserverClient class.
+// On Windows, the jobserver is implemented using a named semaphore.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Windows/WindowsSupport.h"
+#include <atomic>
+#include <cassert>
+
+namespace llvm {
+/// The constructor for the Windows jobserver client. It attempts to open a
+/// handle to an existing named semaphore, the name of which is provided by
+/// GNU make in the --jobserver-auth argument. If the semaphore is opened
+/// successfully, the client is marked as initialized.
+JobserverClientImpl::JobserverClientImpl(const JobserverConfig &Config) {
+ Semaphore = (void *)::OpenSemaphoreA(SEMAPHORE_MODIFY_STATE | SYNCHRONIZE,
+ FALSE, Config.Path.c_str());
+ if (Semaphore != nullptr)
+ IsInitialized = true;
+}
+
+/// The destructor closes the handle to the semaphore, releasing the resource.
+JobserverClientImpl::~JobserverClientImpl() {
+ if (Semaphore != nullptr)
+ ::CloseHandle((HANDLE)Semaphore);
+}
+
+/// Tries to acquire a job slot. The first call always returns the implicit
+/// slot. Subsequent calls use a non-blocking wait on the semaphore
+/// (`WaitForSingleObject` with a timeout of 0). If the wait succeeds, the
+/// semaphore's count is decremented, and an explicit job slot is acquired.
+/// If the wait times out, it means no slots are available, and an invalid
+/// slot is returned.
+JobSlot JobserverClientImpl::tryAcquire() {
+ if (!IsInitialized)
+ return JobSlot();
+
+ // First, grant the implicit slot.
+ if (HasImplicitSlot.exchange(false, std::memory_order_acquire)) {
+ return JobSlot::createImplicit();
+ }
+
+ // Try to acquire a slot from the semaphore without blocking.
+ if (::WaitForSingleObject((HANDLE)Semaphore, 0) == WAIT_OBJECT_0) {
+ // The explicit token value is arbitrary on Windows, as the semaphore
+ // count is the real resource.
+ return JobSlot::createExplicit(1);
+ }
+
+ return JobSlot(); // Invalid slot
+}
+
+/// Releases a job slot back to the pool. If the slot is implicit, it simply
+/// resets a flag. For an explicit slot, it increments the semaphore's count
+/// by one using `ReleaseSemaphore`, making the slot available to other
+/// processes.
+void JobserverClientImpl::release(JobSlot Slot) {
+ if (!IsInitialized || !Slot.isValid())
+ return;
+
+ if (Slot.isImplicit()) {
+    [[maybe_unused]] bool WasAlreadyReleased =
+        HasImplicitSlot.exchange(true, std::memory_order_release);
+    assert(!WasAlreadyReleased && "Implicit slot released twice");
+ return;
+ }
+
+ // Release the slot by incrementing the semaphore count.
+ (void)::ReleaseSemaphore((HANDLE)Semaphore, 1, NULL);
+}
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 9dd64e0..cb49936 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1224,12 +1224,9 @@ static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
}
// TODO: Migrate to range merge of amdgpu-agpr-alloc.
-// FIXME: Why is this using Attribute::NoUnwind?
-struct AAAMDGPUNoAGPR
- : public IRAttribute<Attribute::NoUnwind,
- StateWrapper<BooleanState, AbstractAttribute>,
- AAAMDGPUNoAGPR> {
- AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
+struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
Attributor &A) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 5630580..82fc240 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -367,19 +367,6 @@ def SCC_CLASS : SIRegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
let BaseClassOrder = 10000;
}
-def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> {
- let CopyCost = 1;
- let isAllocatable = 0;
- let HasSGPR = 1;
-}
-
-def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> {
- let CopyCost = 1;
- let Size = 16;
- let isAllocatable = 0;
- let HasSGPR = 1;
-}
-
// TODO: Do we need to set DwarfRegAlias on register tuples?
def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
@@ -774,12 +761,6 @@ def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32,
let BaseClassOrder = 10000;
}
-def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
- (add LDS_DIRECT)> {
- let isAllocatable = 0;
- let CopyCost = -1;
-}
-
let GeneratePressureSet = 0, HasSGPR = 1 in {
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
@@ -797,7 +778,7 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16,
SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16,
SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16,
- EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16, SRC_FLAT_SCRATCH_BASE_LO_LO16,
+ EXEC_LO_LO16, EXEC_HI_LO16, M0_LO16, SRC_FLAT_SCRATCH_BASE_LO_LO16,
SRC_FLAT_SCRATCH_BASE_HI_LO16)> {
let Size = 16;
let isAllocatable = 0;
@@ -805,7 +786,7 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
}
def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
- (add SReg_32_XM0_XEXEC, M0_CLASS)> {
+ (add SReg_32_XM0_XEXEC, M0)> {
let AllocationPriority = 0;
}
@@ -830,7 +811,7 @@ def APERTURE_Class : SIRegisterClass<"AMDGPU", Reg64Types.types, 32,
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
- (add SReg_32_XM0, M0_CLASS)> {
+ (add SReg_32_XM0, M0)> {
let AllocationPriority = 0;
let HasSGPR = 1;
let BaseClassOrder = 32;
@@ -842,7 +823,7 @@ def SGPR_NULL256 : SIReg<"null">;
let GeneratePressureSet = 0 in {
def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
- (add SReg_32, LDS_DIRECT_CLASS)> {
+ (add SReg_32, LDS_DIRECT)> {
let isAllocatable = 0;
let HasSGPR = 1;
let Size = 32;
@@ -981,7 +962,7 @@ defm "" : SRegClass<32, Reg1024Types.types, SGPR_1024Regs>;
}
def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
- (add VGPR_32, LDS_DIRECT_CLASS)> {
+ (add VGPR_32, LDS_DIRECT)> {
let isAllocatable = 0;
let HasVGPR = 1;
let Size = 32;
@@ -1096,21 +1077,21 @@ def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> {
}
def VS_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
- (add VGPR_16, SReg_32, LDS_DIRECT_CLASS)> {
+ (add VGPR_16, SReg_32, LDS_DIRECT)> {
let isAllocatable = 0;
let HasVGPR = 1;
let Size = 16;
}
def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
- (add VGPR_16_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
+ (add VGPR_16_Lo128, SReg_32, LDS_DIRECT)> {
let isAllocatable = 0;
let HasVGPR = 1;
let Size = 16;
}
def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
- (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
+ (add VGPR_32, SReg_32, LDS_DIRECT)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
@@ -1118,7 +1099,7 @@ def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v
}
def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
- (add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
+ (add VGPR_32_Lo128, SReg_32, LDS_DIRECT)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
@@ -1126,7 +1107,7 @@ def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2
}
def VS_32_Lo256 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
- (add VGPR_32_Lo256, SReg_32, LDS_DIRECT_CLASS)> {
+ (add VGPR_32_Lo256, SReg_32, LDS_DIRECT)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index efdbd12..447f05c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1417,9 +1417,9 @@ class SelectQCbi<CondCode Cond, DAGOperand InTyImm, Pseudo OpNode >
let Predicates = [HasVendorXqciac, IsRV32] in {
def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))),
(QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>;
-def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)),
+def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)),
(QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
-def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)),
+def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)),
(QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
} // Predicates = [HasVendorXqciac, IsRV32]
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index ebd957c..e8c849e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -195,9 +195,9 @@ class SPIRVLegalizePointerCast : public FunctionPass {
if (DstType->getElementType() != SrcType->getElementType()) {
// Support bitcast between vectors of different sizes only if
// the total bitwidth is the same.
- auto dstBitWidth =
+ [[maybe_unused]] auto dstBitWidth =
DstType->getElementType()->getScalarSizeInBits() * dstNumElements;
- auto srcBitWidth =
+ [[maybe_unused]] auto srcBitWidth =
SrcType->getElementType()->getScalarSizeInBits() * srcNumElements;
assert(dstBitWidth == srcBitWidth &&
"Unsupported bitcast between vectors of different sizes.");
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index ddb95a4..faeab95 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -40,6 +41,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InterleavedRange.h"
+#include "llvm/Support/SHA1.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
@@ -60,6 +62,9 @@ STATISTIC(FunctionClonesThinBackend,
"Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
"Number of functions that had clones created during ThinLTO backend");
+STATISTIC(
+ FunctionCloneDuplicatesThinBackend,
+ "Number of function clone duplicates detected during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
"cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
@@ -5186,19 +5191,127 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
return Changed;
}
+// Compute a SHA1 hash of the callsite and alloc version information of clone I
+// in the summary, used to detect duplicate clones.
+static uint64_t ComputeHash(const FunctionSummary *FS, unsigned I) {
+ SHA1 Hasher;
+ // Update hash with any callsites that call non-default (non-zero) callee
+ // versions.
+ for (auto &SN : FS->callsites()) {
+ // In theory all callsites and allocs in this function should have the same
+ // number of clone entries, but handle any discrepancies gracefully below
+ // for NDEBUG builds.
+ assert(
+ SN.Clones.size() > I &&
+ "Callsite summary has fewer entries than other summaries in function");
+ if (SN.Clones.size() <= I || !SN.Clones[I])
+ continue;
+ uint8_t Data[sizeof(SN.Clones[I])];
+ support::endian::write32le(Data, SN.Clones[I]);
+ Hasher.update(Data);
+ }
+ // Update hash with any allocs that have non-default (non-None) hints.
+ for (auto &AN : FS->allocs()) {
+ // In theory all callsites and allocs in this function should have the same
+ // number of clone entries, but handle any discrepancies gracefully below
+ // for NDEBUG builds.
+ assert(AN.Versions.size() > I &&
+ "Alloc summary has fewer entries than other summaries in function");
+ if (AN.Versions.size() <= I ||
+ (AllocationType)AN.Versions[I] == AllocationType::None)
+ continue;
+ Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
+ }
+ return support::endian::read64le(Hasher.result().data());
+}
+
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
- &FuncToAliasMap) {
+ &FuncToAliasMap,
+ FunctionSummary *FS) {
+ auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
+ // We might have created this when adjusting callsite in another
+ // function. It should be a declaration.
+ assert(DeclGV->isDeclaration());
+ NewGV->takeName(DeclGV);
+ DeclGV->replaceAllUsesWith(NewGV);
+ DeclGV->eraseFromParent();
+ };
+
+ // Handle aliases to this function, and create analogous alias clones to the
+ // provided clone of this function.
+ auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
+ if (!FuncToAliasMap.count(&F))
+ return;
+ for (auto *A : FuncToAliasMap[&F]) {
+ std::string AliasName = getMemProfFuncName(A->getName(), I);
+ auto *PrevA = M.getNamedAlias(AliasName);
+ auto *NewA = GlobalAlias::create(A->getValueType(),
+ A->getType()->getPointerAddressSpace(),
+ A->getLinkage(), AliasName, NewF);
+ NewA->copyAttributesFrom(A);
+ if (PrevA)
+ TakeDeclNameAndReplace(PrevA, NewA);
+ }
+ };
+
// The first "clone" is the original copy, we should only call this if we
// needed to create new clones.
assert(NumClones > 1);
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
VMaps.reserve(NumClones - 1);
FunctionsClonedThinBackend++;
+
+ // Map of hash of callsite/alloc versions to the instantiated function clone
+ // (possibly the original) implementing those calls. Used to avoid
+ // instantiating duplicate function clones.
+ // FIXME: Ideally the thin link would not generate such duplicate clones to
+  // start with, but right now it happens due to phase ordering in the function
+  // assignment and the additional clones it can produce. We simply make each
+ // duplicate an alias to the matching instantiated clone recorded in the map
+ // (except for available_externally which are made declarations as they would
+ // be aliases in the prevailing module, and available_externally aliases are
+ // not well supported right now).
+ DenseMap<uint64_t, Function *> HashToFunc;
+
+ // Save the hash of the original function version.
+ HashToFunc[ComputeHash(FS, 0)] = &F;
+
for (unsigned I = 1; I < NumClones; I++) {
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
+ std::string Name = getMemProfFuncName(F.getName(), I);
+ auto Hash = ComputeHash(FS, I);
+ // If this clone would duplicate a previously seen clone, don't generate the
+ // duplicate clone body, just make an alias to satisfy any (potentially
+ // cross-module) references.
+ if (HashToFunc.contains(Hash)) {
+ FunctionCloneDuplicatesThinBackend++;
+ auto *Func = HashToFunc[Hash];
+ if (Func->hasAvailableExternallyLinkage()) {
+ // Skip these as EliminateAvailableExternallyPass does not handle
+ // available_externally aliases correctly and we end up with an
+ // available_externally alias to a declaration. Just create a
+ // declaration for now as we know we will have a definition in another
+ // module.
+ auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
+ << "created clone decl " << ore::NV("Decl", Decl.getCallee()));
+ continue;
+ }
+ auto *PrevF = M.getFunction(Name);
+ auto *Alias = GlobalAlias::create(Name, Func);
+ if (PrevF)
+ TakeDeclNameAndReplace(PrevF, Alias);
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
+ << "created clone alias " << ore::NV("Alias", Alias));
+
+ // Now handle aliases to this function, and clone those as well.
+ CloneFuncAliases(Func, I);
+ continue;
+ }
auto *NewF = CloneFunction(&F, *VMaps.back());
+ HashToFunc[Hash] = NewF;
FunctionClonesThinBackend++;
// Strip memprof and callsite metadata from clone as they are no longer
// needed.
@@ -5208,40 +5321,17 @@ static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
}
}
- std::string Name = getMemProfFuncName(F.getName(), I);
auto *PrevF = M.getFunction(Name);
- if (PrevF) {
- // We might have created this when adjusting callsite in another
- // function. It should be a declaration.
- assert(PrevF->isDeclaration());
- NewF->takeName(PrevF);
- PrevF->replaceAllUsesWith(NewF);
- PrevF->eraseFromParent();
- } else
+ if (PrevF)
+ TakeDeclNameAndReplace(PrevF, NewF);
+ else
NewF->setName(Name);
updateSubprogramLinkageName(NewF, Name);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
<< "created clone " << ore::NV("NewFunction", NewF));
// Now handle aliases to this function, and clone those as well.
- if (!FuncToAliasMap.count(&F))
- continue;
- for (auto *A : FuncToAliasMap[&F]) {
- std::string Name = getMemProfFuncName(A->getName(), I);
- auto *PrevA = M.getNamedAlias(Name);
- auto *NewA = GlobalAlias::create(A->getValueType(),
- A->getType()->getPointerAddressSpace(),
- A->getLinkage(), Name, NewF);
- NewA->copyAttributesFrom(A);
- if (PrevA) {
- // We might have created this when adjusting callsite in another
- // function. It should be a declaration.
- assert(PrevA->isDeclaration());
- NewA->takeName(PrevA);
- PrevA->replaceAllUsesWith(NewA);
- PrevA->eraseFromParent();
- }
- }
+ CloneFuncAliases(NewF, I);
}
return VMaps;
}
@@ -5401,7 +5491,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
bool ClonesCreated = false;
unsigned NumClonesCreated = 0;
- auto CloneFuncIfNeeded = [&](unsigned NumClones) {
+ auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
// We should at least have version 0 which is the original copy.
assert(NumClones > 0);
// If only one copy needed use original.
@@ -5415,7 +5505,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
assert(NumClonesCreated == NumClones);
return;
}
- VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
+ VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
// The first "clone" is the original copy, which doesn't have a VMap.
assert(VMaps.size() == NumClones - 1);
Changed = true;
@@ -5424,9 +5514,9 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
};
auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
- Function *CalledFunction) {
+ Function *CalledFunction, FunctionSummary *FS) {
// Perform cloning if not yet done.
- CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
+ CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
assert(!isMemProfClone(*CalledFunction));
@@ -5448,6 +5538,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// below.
auto CalleeOrigName = CalledFunction->getName();
for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
+ // If the VMap is empty, this clone was a duplicate of another and was
+ // created as an alias or a declaration.
+ if (J > 0 && VMaps[J - 1]->empty())
+ continue;
// Do nothing if this version calls the original version of its
// callee.
if (!StackNode.Clones[J])
@@ -5591,7 +5685,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
#endif
// Perform cloning if not yet done.
- CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
+ CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
OrigAllocsThinBackend++;
AllocVersionsThinBackend += AllocNode.Versions.size();
@@ -5624,6 +5718,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// Update the allocation types per the summary info.
for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
+ // If the VMap is empty, this clone was a duplicate of another and
+ // was created as an alias or a declaration.
+ if (J > 0 && VMaps[J - 1]->empty())
+ continue;
// Ignore any that didn't get an assigned allocation type.
if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
continue;
@@ -5670,7 +5768,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// we don't need to do ICP, but might need to clone this
// function as it is the target of other cloned calls.
if (NumClones)
- CloneFuncIfNeeded(NumClones);
+ CloneFuncIfNeeded(NumClones, FS);
}
else {
@@ -5690,7 +5788,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
}
#endif
- CloneCallsite(StackNode, CB, CalledFunction);
+ CloneCallsite(StackNode, CB, CalledFunction, FS);
}
} else if (CB->isTailCall() && CalledFunction) {
// Locate the synthesized callsite info for the callee VI, if any was
@@ -5700,7 +5798,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
assert(Callsite != MapTailCallCalleeVIToCallsite.end());
- CloneCallsite(Callsite->second, CB, CalledFunction);
+ CloneCallsite(Callsite->second, CB, CalledFunction, FS);
}
}
}
@@ -5846,6 +5944,10 @@ void MemProfContextDisambiguation::performICP(
// check.
CallBase *CBClone = CB;
for (unsigned J = 0; J < NumClones; J++) {
+ // If the VMap is empty, this clone was a duplicate of another and was
+ // created as an alias or a declaration.
+ if (J > 0 && VMaps[J - 1]->empty())
+ continue;
// Copy 0 is the original function.
if (J > 0)
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
@@ -5891,6 +5993,10 @@ void MemProfContextDisambiguation::performICP(
// TotalCount and the number promoted.
CallBase *CBClone = CB;
for (unsigned J = 0; J < NumClones; J++) {
+ // If the VMap is empty, this clone was a duplicate of another and was
+ // created as an alias or a declaration.
+ if (J > 0 && VMaps[J - 1]->empty())
+ continue;
// Copy 0 is the original function.
if (J > 0)
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
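
The same `J > 0 && VMaps[J - 1]->empty()` guard now precedes every per-clone loop in applyImport and performICP: an empty VMap means the clone was deduplicated into an alias or a declaration, so no cloned body exists to update. An illustrative reduction of the pattern (updatePerClone is a hypothetical stand-in for the respective loop body):

    // Walk all clone versions of a call site; index 0 is the original
    // function, so it has no entry in VMaps.
    CallBase *CBClone = CB;
    for (unsigned J = 0; J < NumClones; ++J) {
      // Skip clones that were emitted as an alias or a declaration;
      // their ValueToValueMapTy was never populated.
      if (J > 0 && VMaps[J - 1]->empty())
        continue;
      if (J > 0)
        CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
      updatePerClone(CBClone, J); // hypothetical per-clone update
    }
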
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
index cba282c..a2e8c69 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
@@ -15,13 +15,12 @@
using namespace llvm;
-namespace {
/// Return true if and only if the given instruction does not modify the memory
/// location referenced. Note that an idempotent atomicrmw may still have
/// ordering effects on nearby instructions, or be volatile.
/// TODO: Common w/ the version in AtomicExpandPass, and change the term used.
/// Idempotent is confusing in this context.
-bool isIdempotentRMW(AtomicRMWInst& RMWI) {
+static bool isIdempotentRMW(AtomicRMWInst &RMWI) {
if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
switch(RMWI.getOperation()) {
case AtomicRMWInst::FAdd: // -0.0
@@ -59,7 +58,7 @@ bool isIdempotentRMW(AtomicRMWInst& RMWI) {
/// Return true if the given instruction always produces a value in memory
/// equivalent to its value operand.
-bool isSaturating(AtomicRMWInst& RMWI) {
+static bool isSaturating(AtomicRMWInst &RMWI) {
if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
switch (RMWI.getOperation()) {
case AtomicRMWInst::FMax:
@@ -98,7 +97,6 @@ bool isSaturating(AtomicRMWInst& RMWI) {
return C->isMaxValue(false);
};
}
-} // namespace
Instruction *InstCombinerImpl::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
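
The change above swaps an anonymous namespace for `static` on free functions, in line with the LLVM coding standard that reserves anonymous namespaces for class declarations and marks file-local functions `static`. Side by side, under that reading:

    namespace {                                  // before: internal linkage
    bool isIdempotentRMW(AtomicRMWInst &RMWI) { /* ... */ }
    } // namespace

    // after: same internal linkage, preferred spelling for functions
    static bool isIdempotentRMW(AtomicRMWInst &RMWI) { /* ... */ }
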
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 917004c..048cdf4 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -132,8 +132,6 @@ STATISTIC(NumReassoc , "Number of reassociations");
DEBUG_COUNTER(VisitCounter, "instcombine-visit",
"Controls which instructions are visited");
-namespace llvm {
-
static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking",
cl::desc("Enable code sinking"),
cl::init(true));
@@ -146,7 +144,9 @@ static cl::opt<unsigned>
MaxArraySize("instcombine-maxarray-size", cl::init(1024),
cl::desc("Maximum array size considered when doing a combine"));
+namespace llvm {
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // end namespace llvm
// FIXME: Remove this flag when it is no longer necessary to convert
// llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
@@ -158,8 +158,6 @@ extern cl::opt<bool> ProfcheckDisableMetadataFixes;
static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
cl::Hidden, cl::init(true));
-} // end namespace llvm
-
std::optional<Instruction *>
InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) {
// Handle target specific intrinsics
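
In the same spirit, the hunks above shrink the `namespace llvm { ... }` region so it wraps only the extern declaration, whose definition lives in namespace llvm in another translation unit; the `static cl::opt` definitions are file-local and need no namespace. The resulting layout, sketched with one representative option from this file:

    static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking",
                                           cl::desc("Enable code sinking"),
                                           cl::init(true));

    namespace llvm {
    extern cl::opt<bool> ProfcheckDisableMetadataFixes; // defined elsewhere
    } // end namespace llvm
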
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 8bba634..48055ad 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5152,14 +5152,18 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
if (ExtraCase && Values.size() < 2)
return false;
- // TODO: Preserve branch weight metadata, similarly to how
- // foldValueComparisonIntoPredecessors preserves it.
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasProfile = !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*BI, BranchWeights);
// Figure out which block is which destination.
BasicBlock *DefaultBB = BI->getSuccessor(1);
BasicBlock *EdgeBB = BI->getSuccessor(0);
- if (!TrueWhenEqual)
+ if (!TrueWhenEqual) {
std::swap(DefaultBB, EdgeBB);
+ if (HasProfile)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+ }
BasicBlock *BB = BI->getParent();
@@ -5190,10 +5194,11 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
if (!isGuaranteedNotToBeUndefOrPoison(ExtraCase, AC, BI, nullptr))
ExtraCase = Builder.CreateFreeze(ExtraCase);
- if (TrueWhenEqual)
- Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
- else
- Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
+ // We don't have any info about this condition.
+ auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB)
+ : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(),
+ DEBUG_TYPE);
OldTI->eraseFromParent();
@@ -5220,6 +5225,17 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
// Create the new switch instruction now.
SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+ if (HasProfile) {
+ // We know the weight of the default case. We don't know the weight of the
+ // other cases, but rather than completely lose profiling info, we split
+ // the remaining probability equally over them.
+ SmallVector<uint32_t> NewWeights(Values.size() + 1);
+ NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if
+ // TrueWhenEqual.
+ for (auto &V : drop_begin(NewWeights))
+ V = BranchWeights[0] / Values.size();
+ setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
+ }
// Add all of the 'cases' to the switch instruction.
for (ConstantInt *Val : Values)
@@ -7211,6 +7227,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
BranchInst *RangeCheckBranch = nullptr;
+ BranchInst *CondBranch = nullptr;
Builder.SetInsertPoint(SI);
const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
@@ -7225,6 +7242,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize));
RangeCheckBranch =
Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+ CondBranch = RangeCheckBranch;
if (DTU)
Updates.push_back({DominatorTree::Insert, BB, LookupBB});
}
@@ -7263,7 +7281,7 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted");
Value *LoBit = Builder.CreateTrunc(
Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit");
- Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest());
+ CondBranch = Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest());
if (DTU) {
Updates.push_back({DominatorTree::Insert, MaskBB, LookupBB});
Updates.push_back({DominatorTree::Insert, MaskBB, SI->getDefaultDest()});
@@ -7303,19 +7321,32 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
if (DTU)
Updates.push_back({DominatorTree::Insert, LookupBB, CommonDest});
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights = CondBranch && !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*SI, BranchWeights);
+ uint64_t ToLookupWeight = 0;
+ uint64_t ToDefaultWeight = 0;
+
// Remove the switch.
SmallPtrSet<BasicBlock *, 8> RemovedSuccessors;
- for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
- BasicBlock *Succ = SI->getSuccessor(i);
+ for (unsigned I = 0, E = SI->getNumSuccessors(); I < E; ++I) {
+ BasicBlock *Succ = SI->getSuccessor(I);
- if (Succ == SI->getDefaultDest())
+ if (Succ == SI->getDefaultDest()) {
+ if (HasBranchWeights)
+ ToDefaultWeight += BranchWeights[I];
continue;
+ }
Succ->removePredecessor(BB);
if (DTU && RemovedSuccessors.insert(Succ).second)
Updates.push_back({DominatorTree::Delete, BB, Succ});
+ if (HasBranchWeights)
+ ToLookupWeight += BranchWeights[I];
}
SI->eraseFromParent();
-
+ if (HasBranchWeights)
+ setFittedBranchWeights(*CondBranch, {ToLookupWeight, ToDefaultWeight},
+ /*IsExpected=*/false);
if (DTU)
DTU->applyUpdates(Updates);