Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp |   7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp      |  73
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h        |  21
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp |  16
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp       |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt                |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.cpp              |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h                |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h                   |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp           |  12
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp            |  13
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp               | 140
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h                 |   5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td             |  28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp         | 201
15 files changed, 332 insertions, 194 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
index 19e2a6a..93732a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp
@@ -244,11 +244,8 @@ void getInterestingMemoryOperands(
// Masked store has an initial operand for the value.
unsigned OpOffset = IsWrite ? 1 : 0;
Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
- MaybeAlign Alignment = Align(1);
- // Otherwise no alignment guarantees. We probably got Undef.
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
- Alignment = Op->getMaybeAlignValue();
- Value *Mask = CI->getOperand(2 + OpOffset);
+ MaybeAlign Alignment = CI->getParamAlign(OpOffset);
+ Value *Mask = CI->getOperand(1 + OpOffset);
Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
break;
}
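
For reference, a minimal sketch (not part of the patch; the helper name is hypothetical) of what the simplified hunk relies on: the alignment of a masked load/store now travels as an align parameter attribute on the pointer argument, so CallBase::getParamAlign can be queried directly instead of decoding a dedicated ConstantInt operand:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Hypothetical helper mirroring the hunk above. For llvm.masked.store the
// pointer is argument 1 (argument 0 is the stored value); for
// llvm.masked.load it is argument 0.
static MaybeAlign getMaskedPtrAlign(const CallInst *CI, bool IsWrite) {
  unsigned PtrIdx = IsWrite ? 1 : 0;
  // Yields std::nullopt when the frontend attached no align attribute.
  return CI->getParamAlign(PtrIdx);
}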
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
new file mode 100644
index 0000000..30a1f05
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -0,0 +1,73 @@
+//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to add latency to
+/// barrier edges between ATOMIC_FENCE instructions and preceding
+/// memory accesses potentially affected by the fence.
+/// This encourages the scheduling of more instructions before
+/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
+/// introduce wait counting or indicate an impending S_BARRIER
+/// wait. Having more instructions in-flight across these
+/// constructs improves latency hiding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUBarrierLatency.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class BarrierLatency : public ScheduleDAGMutation {
+public:
+ BarrierLatency() = default;
+ void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
+ constexpr unsigned SyntheticLatency = 2000;
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
+ continue;
+
+ // Update latency on barrier edges of ATOMIC_FENCE.
+ // We don't consider the scope of the fence or type of instruction
+ // involved in the barrier edge.
+ for (SDep &PredDep : SU.Preds) {
+ if (!PredDep.isBarrier())
+ continue;
+ SUnit *PredSU = PredDep.getSUnit();
+ MachineInstr *PredMI = PredSU->getInstr();
+ // Only consider memory loads.
+ if (!PredMI->mayLoad() || PredMI->mayStore())
+ continue;
+ SDep ForwardD = PredDep;
+ ForwardD.setSUnit(&SU);
+ for (SDep &SuccDep : PredSU->Succs) {
+ if (SuccDep == ForwardD) {
+ SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency);
+ break;
+ }
+ }
+ PredDep.setLatency(PredDep.getLatency() + SyntheticLatency);
+ PredSU->setDepthDirty();
+ SU.setDepthDirty();
+ }
+ }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUBarrierLatencyDAGMutation() {
+ return std::make_unique<BarrierLatency>();
+}
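
As a usage sketch (illustrative only; the real registration is in the AMDGPUTargetMachine hunks below), a mutation like this is attached to a scheduler instance via ScheduleDAGMI::addMutation:

#include "AMDGPUBarrierLatency.h"
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;

// Sketch: createGenericSchedLive is the stock live-interval scheduler
// factory; a target would normally do this inside its scheduler hooks.
static ScheduleDAGInstrs *
createSchedulerWithBarrierLatency(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
  return DAG;
}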
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
new file mode 100644
index 0000000..c23f0b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -0,0 +1,21 @@
+//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 12915c73..97c2c9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3446,10 +3446,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
: 0); // swz
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ // Don't set the offset value here because the pointer points to the base of
+ // the buffer.
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
- LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+
MachinePointerInfo StorePtrI = LoadPtrI;
- StorePtrI.V = nullptr;
+ LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+ AMDGPUAS::BUFFER_RESOURCE));
+ LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
@@ -3627,13 +3631,17 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
if (isSGPR(Addr))
MIB.addReg(VOffset);
- MIB.add(MI.getOperand(4)) // offset
- .add(MI.getOperand(5)); // cpol
+ MIB.add(MI.getOperand(4)); // offset
+
+ unsigned Aux = MI.getOperand(5).getImm();
+ MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = MI.getOperand(4).getImm();
MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+ AMDGPUAS::GLOBAL_ADDRESS));
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4958a20..996b55f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
@@ -639,6 +640,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
@@ -659,6 +661,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
@@ -1197,6 +1200,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
EnableVOPD)
DAG->addMutation(createVOPDPairingMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 13f727b68..a1e0e52 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUAsmPrinter.cpp
AMDGPUAtomicOptimizer.cpp
AMDGPUAttributor.cpp
+ AMDGPUBarrierLatency.cpp
AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index f291e37..c8bbcbb 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -169,7 +169,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
: // clang-format off
AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
AMDGPUSubtarget(TT),
- TargetTriple(TT),
TargetID(*this),
InstrItins(getInstrItineraryForCPU(GPU)),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index c2e6078..a466780 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -60,7 +60,6 @@ private:
protected:
// Basic subtarget description.
- Triple TargetTriple;
AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
unsigned Gen = INVALID;
InstrItineraryData InstrItins;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index ecc2824..b7a92a0 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -423,6 +423,9 @@ enum CPol {
// Volatile (used to preserve/signal operation volatility for buffer
// operations not a real instruction bit)
VOLATILE = 1 << 31,
+ // The set of "cache policy" bits used for compiler features that
+ // do not correspond to hardware features.
+ VIRTUAL_BITS = VOLATILE,
};
} // namespace CPol
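
A small sketch (not part of the patch) of the intended protocol: virtual bits steer compiler behavior and must be stripped before a cpol immediate is encoded into a real instruction, which is what the selection and lowering hunks in this patch do with Aux & ~AMDGPU::CPol::VIRTUAL_BITS:

#include "SIDefines.h"

// Sketch: split an intrinsic aux value into the compiler-only part and the
// hardware cache-policy bits that may be encoded into an instruction.
static unsigned takeHardwareCPol(unsigned Aux, bool &IsVolatile) {
  IsVolatile = (Aux & llvm::AMDGPU::CPol::VOLATILE) != 0; // virtual bit
  return Aux & ~llvm::AMDGPU::CPol::VIRTUAL_BITS;
}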
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0189e7b..5c39f7a 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1034,16 +1034,13 @@ void SIFrameLowering::emitCSRSpillStores(
StoreWWMRegisters(WWMCalleeSavedRegs);
if (FuncInfo->isWholeWaveFunction()) {
- // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
- // it now. If we have already saved some WWM CSR registers, then the EXEC is
- // already -1 and we don't need to do anything else. Otherwise, set EXEC to
- // -1 here.
+ // If we have already saved some WWM CSR registers, then the EXEC is already
+ // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here.
if (!ScratchExecCopy)
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
/*EnableInactiveLanes*/ true);
else if (WWMCalleeSavedRegs.empty())
EnableAllLanes();
- TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
} else if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
@@ -1340,6 +1337,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
"Needed to save BP but didn't save it anywhere");
assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
+
+ if (FuncInfo->isWholeWaveFunction()) {
+ // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose.
+ TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
+ }
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a2841c11..a757421 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1651,6 +1651,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
Info.ptrVal = CI.getArgOperand(1);
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
+ if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+ Info.flags |= MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11219,8 +11222,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
MachinePointerInfo StorePtrI = LoadPtrI;
LoadPtrI.V = PoisonValue::get(
- PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
- LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+ PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
+ LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
@@ -11307,7 +11310,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
Ops.push_back(Op.getOperand(5)); // Offset
- Ops.push_back(Op.getOperand(6)); // CPol
+
+ unsigned Aux = Op.getConstantOperandVal(6);
+ Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
+ MVT::i32)); // CPol
+
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 50447f4..2ff2d2f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4032,28 +4032,31 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
}
}
+/// Helper struct for the implementation of 3-address conversion to communicate
+/// updates made to instruction operands.
+struct SIInstrInfo::ThreeAddressUpdates {
+ /// Other instruction whose def is no longer used by the converted
+ /// instruction.
+ MachineInstr *RemoveMIUse = nullptr;
+};
+
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
- unsigned Opc = MI.getOpcode();
+ ThreeAddressUpdates U;
+ MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
- // Handle MFMA.
- int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
- if (NewMFMAOpc != -1) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
- MIB.add(MI.getOperand(I));
- updateLiveVariables(LV, MI, *MIB);
+ if (NewMI) {
+ updateLiveVariables(LV, MI, *NewMI);
if (LIS) {
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
// SlotIndex of defs needs to be updated when converting to early-clobber
- MachineOperand &Def = MIB->getOperand(0);
+ MachineOperand &Def = NewMI->getOperand(0);
if (Def.isEarlyClobber() && Def.isReg() &&
LIS->hasInterval(Def.getReg())) {
- SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
- SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
+ SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
+ SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
auto &LI = LIS->getInterval(Def.getReg());
auto UpdateDefIndex = [&](LiveRange &LR) {
auto *S = LR.find(OldIndex);
@@ -4068,6 +4071,58 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
UpdateDefIndex(SR);
}
}
+ }
+
+ if (U.RemoveMIUse) {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ // The only user is the instruction which will be killed.
+ Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
+
+ if (MRI.hasOneNonDBGUse(DefReg)) {
+ // We cannot just remove the DefMI here, calling pass will crash.
+ U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
+ U.RemoveMIUse->getOperand(0).setIsDead(true);
+ for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
+ U.RemoveMIUse->removeOperand(I);
+ if (LV)
+ LV->getVarInfo(DefReg).AliveBlocks.clear();
+ }
+
+ if (LIS) {
+ LiveInterval &DefLI = LIS->getInterval(DefReg);
+
+ // We cannot delete the original instruction here, so hack out the use
+ // in the original instruction with a dummy register so we can use
+ // shrinkToUses to deal with any multi-use edge cases. Other targets do
+ // not have the complexity of deleting a use to consider here.
+ Register DummyReg = MRI.cloneVirtualRegister(DefReg);
+ for (MachineOperand &MIOp : MI.uses()) {
+ if (MIOp.isReg() && MIOp.getReg() == DefReg) {
+ MIOp.setIsUndef(true);
+ MIOp.setReg(DummyReg);
+ }
+ }
+
+ LIS->shrinkToUses(&DefLI);
+ }
+ }
+
+ return NewMI;
+}
+
+MachineInstr *
+SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
+ ThreeAddressUpdates &U) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ unsigned Opc = MI.getOpcode();
+
+ // Handle MFMA.
+ int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
+ if (NewMFMAOpc != -1) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
return MIB;
}
@@ -4077,11 +4132,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.setMIFlags(MI.getFlags());
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
MIB->addOperand(MI.getOperand(I));
-
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
-
return MIB;
}
@@ -4152,39 +4202,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
MachineInstr *DefMI;
- const auto killDef = [&]() -> void {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- // The only user is the instruction which will be killed.
- Register DefReg = DefMI->getOperand(0).getReg();
-
- if (MRI.hasOneNonDBGUse(DefReg)) {
- // We cannot just remove the DefMI here, calling pass will crash.
- DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
- DefMI->getOperand(0).setIsDead(true);
- for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
- DefMI->removeOperand(I);
- if (LV)
- LV->getVarInfo(DefReg).AliveBlocks.clear();
- }
-
- if (LIS) {
- LiveInterval &DefLI = LIS->getInterval(DefReg);
-
- // We cannot delete the original instruction here, so hack out the use
- // in the original instruction with a dummy register so we can use
- // shrinkToUses to deal with any multi-use edge cases. Other targets do
- // not have the complexity of deleting a use to consider here.
- Register DummyReg = MRI.cloneVirtualRegister(DefReg);
- for (MachineOperand &MIOp : MI.uses()) {
- if (MIOp.isReg() && MIOp.getReg() == DefReg) {
- MIOp.setIsUndef(true);
- MIOp.setReg(DummyReg);
- }
- }
-
- LIS->shrinkToUses(&DefLI);
- }
- };
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
@@ -4196,10 +4213,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Src1)
.addImm(Imm)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4212,11 +4226,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.addImm(Imm)
.add(*Src2)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
-
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4235,12 +4245,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.addImm(Imm)
.add(*Src2)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
-
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- if (DefMI)
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4269,9 +4274,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.setMIFlags(MI.getFlags());
if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
MIB.addImm(OpSel ? OpSel->getImm() : 0);
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
return MIB;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index df27ec1..e1d7a07 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -88,6 +88,8 @@ private:
};
class SIInstrInfo final : public AMDGPUGenInstrInfo {
+ struct ThreeAddressUpdates;
+
private:
const SIRegisterInfo RI;
const GCNSubtarget &ST;
@@ -190,6 +192,9 @@ private:
bool resultDependsOnExec(const MachineInstr &MI) const;
+ MachineInstr *convertToThreeAddressImpl(MachineInstr &MI,
+ ThreeAddressUpdates &Updates) const;
+
protected:
/// If the specific machine instruction is a instruction that moves/copies
/// value from one register to another register return destination and source
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 27e5ee9c..6f1feb1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2223,8 +2223,8 @@ def : GCNPat <
def : GCNPat <
(DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
- (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src,
- 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0),
+ (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src,
+ !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0),
0, 0, 0, 0, 0)
> {
let SubtargetPredicate = HasPackedFP32Ops;
@@ -3481,30 +3481,6 @@ def : GCNPat<
>;
} // End True16Predicate
-let True16Predicate = UseRealTrue16Insts in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0)
->;
-
-def : GCNPat<
- (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
- (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0)
->;
-} // End True16Predicate
-
-let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
->;
-
-def : GCNPat<
- (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
- (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
->;
-} // End True16Predicate
-
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
(V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
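
The first SIInstructions.td hunk above replaces the magic constant 11 with named SRCMODS values. A standalone check (bit values assumed to mirror llvm::SISrcMods in SIDefines.h, where NEG_HI aliases ABS) confirms the OR reproduces the old literal:

#include <cassert>

// Assumed to mirror llvm::SISrcMods: NEG = 1, NEG_HI (= ABS) = 2,
// OP_SEL_1 = 8.
enum : unsigned { NEG = 1u << 0, NEG_HI = 1u << 1, OP_SEL_1 = 1u << 3 };

int main() {
  // OP_SEL_1 | NEG | NEG_HI == 8 | 1 | 2 == 11, the old magic constant.
  assert((OP_SEL_1 | NEG | NEG_HI) == 11);
  return 0;
}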
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 484861d..07264d9 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -277,6 +278,12 @@ public:
/// rmw operation, "std::nullopt" otherwise.
std::optional<SIMemOpInfo>
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
+
+ /// \returns DMA-to-LDS info if \p MI is a direct-to/from-LDS load/store,
+ /// along with an indication of whether this is a load or store. If it is not
+ /// a direct-to-LDS operation, returns std::nullopt.
+ std::optional<SIMemOpInfo>
+ getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
};
class SICacheControl {
@@ -360,11 +367,13 @@ public:
/// between memory instructions to enforce the order they become visible as
/// observed by other memory instructions executing in memory scope \p Scope.
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
- /// address spaces. Returns true iff any instructions inserted.
+ /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
+ /// that are used by atomic instructions.
+ /// Returns true iff any instructions inserted.
virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const = 0;
+ AtomicOrdering Order, bool AtomicsOnly) const = 0;
/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure any subsequent memory instructions of this
@@ -388,12 +397,6 @@ public:
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
- /// Inserts any necessary instructions before the barrier start instruction
- /// \p MI in order to support pairing of barriers and fences.
- virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
- return false;
- };
-
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
};
@@ -437,7 +440,7 @@ public:
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -484,7 +487,7 @@ public:
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -572,14 +575,10 @@ public:
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
-
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
- bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos) const override;
};
class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -629,7 +628,7 @@ public:
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const override;
+ AtomicOrdering Order, bool AtomicsOnly) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
@@ -701,6 +700,9 @@ private:
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
+ /// Expands LDS DMA operation \p MI. Returns true if instructions are
+ /// added/deleted or \p MI is modified, false otherwise.
+ bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
public:
SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
@@ -830,6 +832,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::SCRATCH;
if (AS == AMDGPUAS::REGION_ADDRESS)
return SIAtomicAddrSpace::GDS;
+ if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
+ AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
+ return SIAtomicAddrSpace::GLOBAL;
return SIAtomicAddrSpace::OTHER;
}
@@ -985,6 +990,16 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
return constructFromMIWithMMO(MI);
}
+std::optional<SIMemOpInfo>
+SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
+ assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+ if (!SIInstrInfo::isLDSDMA(*MI))
+ return std::nullopt;
+
+ return constructFromMIWithMMO(MI);
+}
+
SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
@@ -1097,7 +1112,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1120,7 +1135,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1140,7 +1156,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
- AtomicOrdering Order) const {
+ AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -1294,7 +1311,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
}
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1424,7 +1442,7 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1447,7 +1465,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1467,8 +1486,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
- Position Pos,
- AtomicOrdering Order) const {
+ Position Pos, AtomicOrdering Order,
+ bool AtomicsOnly) const {
if (ST.isTgSplitEnabled()) {
// In threadgroup split mode the waves of a work-group can be executing on
// different CUs. Therefore need to wait for global or GDS memory operations
@@ -1488,7 +1507,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
AddrSpace &= ~SIAtomicAddrSpace::LDS;
}
return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
- IsCrossAddrSpaceOrdering, Pos, Order);
+ IsCrossAddrSpaceOrdering, Pos, Order,
+ AtomicsOnly);
}
bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1726,7 +1746,7 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1747,7 +1767,8 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1904,7 +1925,8 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
// S_WAITCNT needed.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -1959,7 +1981,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1984,7 +2006,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -2007,7 +2030,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order) const {
+ Position Pos, AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -2035,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// the WGP. Therefore need to wait for operations to complete to ensure
// they are visible to waves in the other CU as the L0 is per CU.
// Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
- if (!ST.isCuModeEnabled()) {
+ // which shares the same L0. Note that we still need to wait when
+ // performing a release in this mode to respect the transitivity of
+ // happens-before, e.g. other waves of the workgroup must be able to
+ // release the memory from another wave at a wider scope.
+ if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
VMCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2191,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx10CacheControl::insertBarrierStart(
- MachineBasicBlock::iterator &MI) const {
- // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
- // mode. This is because a CU mode release fence does not emit any wait, which
- // is fine when only dealing with vmem, but isn't sufficient in the presence
- // of barriers which do not go through vmem.
- // GFX12.5 does not require this additional wait.
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
- return false;
-
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
- return true;
-}
-
bool SIGfx11CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
@@ -2255,7 +2266,7 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2281,7 +2292,8 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -2354,7 +2366,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
- Position Pos, AtomicOrdering Order) const {
+ Position Pos, AtomicOrdering Order,
+ bool AtomicsOnly) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -2383,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// In WGP mode the waves of a work-group can be executing on either CU
// of the WGP. Therefore need to wait for operations to complete to
// ensure they are visible to waves in the other CU as the L0 is per CU.
+ //
// Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
+ // which shares the same L0. Note that we still need to wait when
+ // performing a release in this mode to respect the transitivity of
+ // happens-before, e.g. other waves of the workgroup must be able to
+ // release the memory from another wave at a wider scope.
//
// GFX12.5:
// CU$ has two ports. To ensure operations are visible at the workgroup
// level, we need to ensure all operations in this port have completed
// so the other SIMDs in the WG can see them. There is no ordering
// guarantee between the ports.
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
+ isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2444,7 +2462,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
//
// This also applies to fences. Fences cannot pair with an instruction
// tracked with bvh/samplecnt as we don't have any atomics that do that.
- if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
+ if (!AtomicsOnly && ST.hasImageInsts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
}
@@ -2587,7 +2605,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
// we of course need to wait for that as well.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+ /*AtomicsOnly=*/false);
return Changed;
}
@@ -2597,7 +2616,7 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
// Only handle load and store, not atomic read-modify-write instructions.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2624,7 +2643,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
- Position::AFTER, AtomicOrdering::Unordered);
+ Position::AFTER, AtomicOrdering::Unordered,
+ /*AtomicsOnly=*/false);
}
return Changed;
@@ -2748,13 +2768,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE, Order);
+ Position::BEFORE, Order, /*AtomicsOnly=*/false);
if (Order == AtomicOrdering::Acquire ||
Order == AtomicOrdering::SequentiallyConsistent) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
- MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
+ // The wait below only needs to wait on the prior atomic.
+ Changed |=
+ CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
+ SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
+ Position::AFTER, Order, /*AtomicsOnly=*/true);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
@@ -2830,9 +2852,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
if (Order == AtomicOrdering::Acquire) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
+ // Acquire fences only need to wait on the previous atomic they pair with.
+ Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE, Order, /*AtomicsOnly=*/true);
}
if (Order == AtomicOrdering::Release ||
@@ -2897,10 +2921,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Order == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= CC->insertWait(
- MI, MOI.getScope(), MOI.getInstrAddrSpace(),
- isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
+ // Only wait on the previous atomic.
+ Changed |=
+ CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
+ isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
+ Order, /*AtomicsOnly=*/true);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
@@ -2913,6 +2939,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
return Changed;
}
+bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoad() && MI->mayStore());
+
+ // Whether the operation is volatile or nontemporal is a property of the
+ // global memory access, not of the LDS access.
+ SIMemOp OpKind =
+ SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
+
+ // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
+ // stores; the markers are applied to the global-memory side of the DMA.
+ return CC->enableVolatileAndOrNonTemporal(
+ MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
+ MOI.isNonTemporal(), MOI.isLastUse());
+}
+
bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
const MachineModuleInfo &MMI =
getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
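
A compilable sketch of the classification rule expandLDSDMA applies above (isLDSDMA and mayWriteLDSThroughDMA are real SIInstrInfo helpers; the wrapper is hypothetical): an LDS DMA both loads and stores, and only the global-memory side decides how cache-control hints apply:

#include "SIInstrInfo.h"
#include <cassert>
using namespace llvm;

// Sketch: a DMA that writes LDS reads global memory, so its global side is
// a load; a DMA that reads LDS writes global memory, so it is a store.
static bool ldsDMAGlobalSideIsLoad(const MachineInstr &MI) {
  assert(SIInstrInfo::isLDSDMA(MI) && "expected a direct-to/from-LDS op");
  return SIInstrInfo::mayWriteLDSThroughDMA(MI);
}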
@@ -2956,22 +2999,20 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
MI = II->getIterator();
}
- if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
- Changed |= CC->insertBarrierStart(MI);
- continue;
- }
-
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
- if (const auto &MOI = MOA.getLoadInfo(MI))
+ if (const auto &MOI = MOA.getLoadInfo(MI)) {
Changed |= expandLoad(*MOI, MI);
- else if (const auto &MOI = MOA.getStoreInfo(MI)) {
+ } else if (const auto &MOI = MOA.getStoreInfo(MI)) {
Changed |= expandStore(*MOI, MI);
- } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
+ } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
+ Changed |= expandLDSDMA(*MOI, MI);
+ } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
Changed |= expandAtomicFence(*MOI, MI);
- else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
+ } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
+ }
}
}