//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains a DAG scheduling mutation to add latency to:
///       1. Barrier edges between ATOMIC_FENCE instructions and preceding
///          memory accesses potentially affected by the fence.
///          This encourages the scheduling of more instructions before
///          ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
///          introduce wait counting or indicate an impending S_BARRIER
///          wait. Having more instructions in-flight across these
///          constructs improves latency hiding.
///       2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT.
///          This encourages independent work to be scheduled between
///          signal and wait, hiding barrier synchronization latency.
// //===----------------------------------------------------------------------===// #include "AMDGPUBarrierLatency.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/CommandLine.h" using namespace llvm; static cl::opt BarrierSignalWaitLatencyOpt( "amdgpu-barrier-signal-wait-latency", cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " "to encourage scheduling independent work between them"), cl::init(16), cl::Hidden); namespace { class BarrierLatency : public ScheduleDAGMutation { private: SmallSet IgnoredScopes; public: BarrierLatency(MachineFunction *MF) { LLVMContext &Context = MF->getFunction().getContext(); IgnoredScopes.insert(SyncScope::SingleThread); IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront")); IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as")); IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as")); } void apply(ScheduleDAGInstrs *DAG) override; }; void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) { SUnit *PredSU = PredDep.getSUnit(); SDep ForwardD = PredDep; ForwardD.setSUnit(&SU); for (SDep &SuccDep : PredSU->Succs) { if (SuccDep == ForwardD) { SuccDep.setLatency(SuccDep.getLatency() + Latency); break; } } PredDep.setLatency(PredDep.getLatency() + Latency); PredSU->setDepthDirty(); SU.setDepthDirty(); } void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { const SIInstrInfo *TII = static_cast(DAG->TII); constexpr unsigned FenceLatency = 2000; const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt; for (SUnit &SU : DAG->SUnits) { const MachineInstr *MI = SU.getInstr(); unsigned Op = MI->getOpcode(); if (Op == AMDGPU::ATOMIC_FENCE) { // Update latency on barrier edges of ATOMIC_FENCE. // Ignore scopes not expected to have any latency. 
SyncScope::ID SSID = static_cast(MI->getOperand(1).getImm()); if (IgnoredScopes.contains(SSID)) continue; for (SDep &PredDep : SU.Preds) { if (!PredDep.isBarrier()) continue; SUnit *PredSU = PredDep.getSUnit(); MachineInstr *MI = PredSU->getInstr(); // Only consider memory loads if (!MI->mayLoad() || MI->mayStore()) continue; addLatencyToEdge(PredDep, SU, FenceLatency); } } else if (Op == AMDGPU::S_BARRIER_WAIT) { for (SDep &PredDep : SU.Preds) { SUnit *PredSU = PredDep.getSUnit(); const MachineInstr *PredMI = PredSU->getInstr(); if (TII->isBarrierStart(PredMI->getOpcode())) { addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency); } } } } } } // end namespace std::unique_ptr llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { return std::make_unique(MF); }