[AMDGPU][gfx1250] Implement SIMemoryLegalizer (#154726)

Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model. Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model.
author: Pierre van Houtryve <pierre.vanhoutryve@amd.com> 2025-09-10 10:18:11 +0200
committer: GitHub <noreply@github.com> 2025-09-10 10:18:11 +0200
commit: bed9be954d5a8e4166629e489052c96e8cb24f99 (patch)
tree: 30243a6d72a6506559aef761895adc0bbab8a76a /llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
parent: 4d9a7fa9bad0d3f7eba801b784d4caa896d224d7 (diff)
download: llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.zip
llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.tar.gz
llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.tar.bz2
1 files changed, 56 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6ab4eb4..c20fcac 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -606,7 +606,11 @@ protected:
                       SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
 
 public:
-  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+    // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
+    // the behavior is the same if assuming GFX12.0 in CU mode.
+    assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
+  }
 
   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2198,7 +2202,8 @@ bool SIGfx10CacheControl::insertBarrierStart(
   // mode. This is because a CU mode release fence does not emit any wait, which
   // is fine when only dealing with vmem, but isn't sufficient in the presence
   // of barriers which do not go through vmem.
-  if (!ST.isCuModeEnabled())
+  // GFX12.5 does not require this additional wait.
+  if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
     return false;
 
   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
@@ -2378,12 +2383,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
         STORECnt |= true;
       break;
     case SIAtomicScope::WORKGROUP:
-      // In WGP mode the waves of a work-group can be executing on either CU of
-      // the WGP. Therefore need to wait for operations to complete to ensure
-      // they are visible to waves in the other CU as the L0 is per CU.
-      // Otherwise in CU mode and all waves of a work-group are on the same CU
-      // which shares the same L0.
-      if (!ST.isCuModeEnabled()) {
+      // GFX12.0:
+      //   In WGP mode the waves of a work-group can be executing on either CU
+      //   of the WGP. Therefore need to wait for operations to complete to
+      //   ensure they are visible to waves in the other CU as the L0 is per CU.
+      //   Otherwise in CU mode and all waves of a work-group are on the same CU
+      //   which shares the same L0.
+      //
+      // GFX12.5:
+      //   TODO DOCS
+      if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
           LOADCnt |= true;
         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2435,7 +2444,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     //
     // This also applies to fences. Fences cannot pair with an instruction
     // tracked with bvh/samplecnt as we don't have any atomics that do that.
-    if (Order != AtomicOrdering::Acquire) {
+    if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
     }
@@ -2487,10 +2496,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
     break;
   case SIAtomicScope::WORKGROUP:
-    // In WGP mode the waves of a work-group can be executing on either CU of
-    // the WGP. Therefore we need to invalidate the L0 which is per CU.
-    // Otherwise in CU mode all waves of a work-group are on the same CU, and so
-    // the L0 does not need to be invalidated.
+    // GFX12.0:
+    //  In WGP mode the waves of a work-group can be executing on either CU of
+    //  the WGP. Therefore we need to invalidate the L0 which is per CU.
+    //  Otherwise in CU mode all waves of a work-group are on the same CU, and
+    //  so the L0 does not need to be invalidated.
+    //
+    // GFX12.5
+    //   TODO DOCS
     if (ST.isCuModeEnabled())
       return false;
 
@@ -2535,7 +2548,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
   if (Pos == Position::AFTER)
     ++MI;
 
-  // global_wb is only necessary at system scope for gfx120x targets.
+  // global_wb is only necessary at system scope for GFX12.0,
+  // they're also necessary at device scope for GFX12.5.
   //
   // Emitting it for lower scopes is a slow no-op, so we omit it
   // for performance.
@@ -2545,6 +2559,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
         .addImm(AMDGPU::CPol::SCOPE_SYS);
     break;
   case SIAtomicScope::AGENT:
+    // TODO DOCS
+    if (ST.hasGFX1250Insts()) {
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+          .addImm(AMDGPU::CPol::SCOPE_DEV);
+    }
+    break;
   case SIAtomicScope::WORKGROUP:
     // No WB necessary, but we still have to wait.
     break;
@@ -2607,17 +2627,32 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
 }
 
 bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
-  MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
-  if (!CPol)
-    return false;
+  assert(MI.mayStore() && "Not a Store inst");
+  const bool IsRMW = (MI.mayLoad() && MI.mayStore());
+  bool Changed = false;
+
+  // GFX12.5 only: xcnt wait is needed before flat and global atomics
+  // stores/rmw.
+  if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+    MachineBasicBlock &MBB = *MI.getParent();
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+    Changed = true;
+  }
+
+  // Remaining fixes do not apply to RMWs.
+  if (IsRMW)
+    return Changed;
 
+  MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+  if (!CPol) // Some vmem operations do not have a scope and are not concerned.
+    return Changed;
   const unsigned Scope = CPol->getImm() & CPol::SCOPE;
 
   // GFX12.0 only: Extra waits needed before system scope stores.
   if (!ST.hasGFX1250Insts()) {
     if (!Atomic && Scope == CPol::SCOPE_SYS)
       return insertWaitsBeforeSystemScopeStore(MI);
-    return false;
+    return Changed;
   }
 
   // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
@@ -2627,7 +2662,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
       (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
     return setScope(MI, CPol::SCOPE_SE);
 
-  return false;
+  return Changed;
 }
 
 bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
@@ -2839,6 +2874,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
   assert(MI->mayLoad() && MI->mayStore());
 
   bool Changed = false;
+  MachineInstr &RMWMI = *MI;
 
   if (MOI.isAtomic()) {
     const AtomicOrdering Order = MOI.getOrdering();
@@ -2873,6 +2909,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                    Position::AFTER);
     }
 
+    Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
     return Changed;
   }
author	Pierre van Houtryve <pierre.vanhoutryve@amd.com>	2025-09-10 10:18:11 +0200
committer	GitHub <noreply@github.com>	2025-09-10 10:18:11 +0200
commit	bed9be954d5a8e4166629e489052c96e8cb24f99 (patch)
tree	30243a6d72a6506559aef761895adc0bbab8a76a /llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
parent	4d9a7fa9bad0d3f7eba801b784d4caa896d224d7 (diff)
download	llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.zip llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.tar.gz llvm-bed9be954d5a8e4166629e489052c96e8cb24f99.tar.bz2