diff options
43 files changed, 3604 insertions, 1817 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 24f8788..21ea17e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -609,6 +609,9 @@ protected: bool insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const; + bool setAtomicScope(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; + public: SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} @@ -625,6 +628,28 @@ public: bool IsLastUse) const override; bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return setAtomicScope(MI, Scope, AddrSpace); + } + + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return setAtomicScope(MI, Scope, AddrSpace); + } + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return setAtomicScope(MI, Scope, AddrSpace); + } }; class SIMemoryLegalizer final : public MachineFunctionPass { @@ -2429,6 +2454,72 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return true; } +bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + // The scratch address space does not need the global memory cache + // writeback as all memory operations by the same thread are + // sequentially consistent, and no other thread can access scratch + // memory. + + // Other address spaces do not have a cache. + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) + return false; + + if (Pos == Position::AFTER) + ++MI; + + // GLOBAL_WB is always needed, even for write-through caches, as it + // additionally ensures all operations have reached the desired cache level. + bool SkipWB = false; + AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; + switch (Scope) { + case SIAtomicScope::SYSTEM: + ScopeImm = AMDGPU::CPol::SCOPE_SYS; + break; + case SIAtomicScope::AGENT: + ScopeImm = AMDGPU::CPol::SCOPE_DEV; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore we need to ensure all operations have reached L1, + // hence the SCOPE_SE WB. + // For CU mode, we need operations to reach L0, so the wait is enough - + // there are no ways for an operation to report completion without reaching + // at least L0. + if (ST.isCuModeEnabled()) + SkipWB = true; + else + ScopeImm = AMDGPU::CPol::SCOPE_SE; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + return false; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + + if (!SkipWB) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm); + + if (Pos == Position::AFTER) + --MI; + + // We always have to wait for previous memory operations (load/store) to + // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), + // we of course need to wait for that as well. + insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos); + + return true; +} + bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { @@ -2479,6 +2570,44 @@ bool SIGfx12CacheControl::expandSystemScopeStore( return false; } +bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + break; + case SIAtomicScope::AGENT: + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV); + break; + case SIAtomicScope::WORKGROUP: + // In workgroup mode, SCOPE_SE is needed as waves can executes on + // different CUs that access different L0s. + if (!ST.isCuModeEnabled()) + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + // The scratch address space does not need the global memory caches + // to be bypassed as all memory operations by the same thread are + // sequentially consistent, and no other thread can access scratch + // memory. + + // Other address spaces do not have a cache. + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 0a8e805..c701e87 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -18,6 +18,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -90,6 +91,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -162,6 +164,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -238,6 +241,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -324,8 +328,9 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -538,8 +543,9 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -746,8 +752,9 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -972,8 +979,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -1186,8 +1194,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1395,8 +1404,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1598,8 +1608,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -1823,8 +1834,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -2035,11 +2047,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2285,9 +2297,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2527,11 +2540,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2800,10 +2813,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index fb81176..90110e6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -18,6 +18,7 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -90,6 +91,7 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -162,6 +164,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -238,6 +241,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -324,8 +328,9 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -538,8 +543,9 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -746,8 +752,9 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -972,8 +979,9 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -1186,8 +1194,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1395,8 +1404,9 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1598,8 +1608,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -1823,8 +1834,9 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -2035,11 +2047,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v4, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2285,9 +2297,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2527,11 +2540,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2800,10 +2813,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index a9f0e54..48217b9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -1293,7 +1293,8 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr ; GFX12-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4095: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0 -; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1344,7 +1345,8 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, 2 -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1385,7 +1387,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt ; GFX12-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4095: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 2 -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1433,7 +1436,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 2 -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1485,7 +1489,8 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1529,7 +1534,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1582,7 +1588,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1624,7 +1631,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr, ; GFX12-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4095: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1672,7 +1680,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1725,7 +1734,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index de7fc76..bc5d266 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -279,7 +279,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB0_2: @@ -314,7 +315,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB0_2: @@ -608,7 +610,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB1_2: @@ -645,7 +648,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mov_b32_e32 v1, s2 ; GFX1232-NEXT: s_mov_b32 s8, s6 ; GFX1232-NEXT: s_mov_b32 s9, s7 -; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB1_2: @@ -1035,7 +1039,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB2_4: @@ -1085,7 +1090,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB2_4: @@ -1539,7 +1545,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1264_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB2_2: @@ -1602,7 +1609,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1232_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_DPP-NEXT: .LBB2_2: @@ -1908,7 +1916,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB3_2: @@ -1945,7 +1954,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB3_2: @@ -2294,7 +2304,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB4_2: @@ -2334,7 +2345,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX1232-NEXT: s_mov_b32 s12, s6 ; GFX1232-NEXT: s_mov_b32 s13, s7 -; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB4_2: @@ -2781,7 +2793,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB5_4: @@ -2834,7 +2847,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB5_4: @@ -3601,7 +3615,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB5_2: @@ -3696,7 +3711,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1232_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_DPP-NEXT: .LBB5_2: @@ -3987,7 +4003,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB6_2: @@ -4023,7 +4040,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB6_2: @@ -4322,7 +4340,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB7_2: @@ -4360,7 +4379,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mov_b32_e32 v1, s2 ; GFX1232-NEXT: s_mov_b32 s8, s6 ; GFX1232-NEXT: s_mov_b32 s9, s7 -; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB7_2: @@ -4751,7 +4771,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB8_4: @@ -4801,7 +4822,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB8_4: @@ -5255,7 +5277,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1264_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB8_2: @@ -5318,7 +5341,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1232_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_DPP-NEXT: .LBB8_2: @@ -5638,7 +5662,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB9_2: @@ -5678,7 +5703,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB9_2: @@ -6043,7 +6069,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB10_2: @@ -6087,7 +6114,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX1232-NEXT: s_mov_b32 s12, s6 ; GFX1232-NEXT: s_mov_b32 s13, s7 -; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB10_2: @@ -6538,7 +6566,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB11_4: @@ -6591,7 +6620,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB11_4: @@ -7358,7 +7388,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_DPP-NEXT: .LBB11_2: @@ -7453,7 +7484,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 ; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 -; GFX1232_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 ; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_DPP-NEXT: .LBB11_2: diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index c9076a9..22e00b2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -102,8 +102,9 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX1200-NEXT: v_mov_b32_e32 v4, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX1200-NEXT: global_wb scope:SCOPE_SYS ; GFX1200-NEXT: s_wait_storecnt 0x0 -; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SYS ; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -213,8 +214,9 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_wb scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt 0x0 -; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: s_setpc_b64 s[30:31] @@ -347,8 +349,9 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_wb scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt 0x0 -; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: s_setpc_b64 s[30:31] @@ -446,8 +449,9 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX1200-NEXT: v_mov_b32_e32 v4, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX1200-NEXT: global_wb scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt 0x0 -; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 0cdd6b9..23e8f98 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -22,6 +22,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -234,6 +235,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -424,6 +426,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -853,9 +856,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1127,8 +1131,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] ; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1407,8 +1411,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 @@ -1924,14 +1928,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2294,14 +2298,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2670,14 +2674,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3294,22 +3298,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3719,22 +3723,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4150,22 +4154,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 @@ -4815,6 +4819,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5086,6 +5091,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -5338,6 +5344,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -5838,6 +5845,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6245,6 +6253,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -6643,6 +6652,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7313,9 +7323,10 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 503065c..ec04082 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -22,6 +22,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -221,6 +222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -416,6 +418,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -821,11 +824,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1037,10 +1040,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1265,9 +1269,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 @@ -1679,15 +1684,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2064,15 +2070,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2455,15 +2462,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 @@ -3094,22 +3102,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3521,22 +3529,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3954,22 +3962,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 @@ -4631,11 +4639,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4973,9 +4981,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5324,9 +5333,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 @@ -5922,27 +5932,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6367,11 +6377,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 @@ -6818,11 +6828,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 @@ -7513,11 +7523,11 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index e0e6ccd..cd01cc7 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -22,6 +22,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -221,6 +222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -416,6 +418,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -821,11 +824,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1037,10 +1040,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1265,9 +1269,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 @@ -1679,15 +1684,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2064,15 +2070,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2455,15 +2462,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 @@ -3094,22 +3102,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3521,22 +3529,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3954,22 +3962,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 @@ -4631,11 +4639,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4973,9 +4981,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5324,9 +5333,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 @@ -5922,27 +5932,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6367,11 +6377,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 @@ -6818,11 +6828,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 @@ -7513,11 +7523,11 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index af7e111..c5c44d2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -20,8 +20,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -196,8 +197,9 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -382,8 +384,9 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -576,8 +579,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -782,8 +786,9 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -999,8 +1004,9 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1229,8 +1235,9 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1413,8 +1420,9 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1593,8 +1601,9 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1765,8 +1774,9 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1982,8 +1992,9 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2199,8 +2210,9 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2420,8 +2432,9 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2596,8 +2609,9 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2782,8 +2796,9 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2976,8 +2991,9 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3182,8 +3198,9 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3399,8 +3416,9 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3629,8 +3647,9 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3813,8 +3832,9 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3995,8 +4015,9 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -4179,8 +4200,9 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -4353,8 +4375,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4529,8 +4552,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4735,8 +4759,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4911,8 +4936,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5117,8 +5143,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5293,8 +5320,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5499,8 +5527,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5675,8 +5704,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5893,8 +5923,9 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6073,8 +6104,9 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6254,8 +6286,9 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6445,8 +6478,9 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6613,8 +6647,9 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6788,8 +6823,9 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6996,8 +7032,9 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7284,8 +7321,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7581,8 +7619,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7876,8 +7915,9 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8153,8 +8193,9 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8439,8 +8480,9 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8714,8 +8756,9 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8929,8 +8972,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9162,8 +9206,9 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9459,8 +9504,9 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9760,8 +9806,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10102,8 +10149,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10454,8 +10502,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10805,8 +10854,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11146,8 +11196,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11478,8 +11529,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11759,8 +11811,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12044,8 +12097,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12377,8 +12431,9 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -12729,8 +12784,9 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13046,8 +13102,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13233,8 +13290,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13423,8 +13481,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13627,8 +13686,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13806,8 +13866,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13992,8 +14053,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14193,8 +14255,9 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14385,8 +14448,9 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14573,8 +14637,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14760,8 +14825,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14939,8 +15005,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15126,8 +15193,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15309,8 +15377,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15584,8 +15653,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15862,8 +15932,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -16154,8 +16225,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -16421,8 +16493,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -16695,8 +16768,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -16984,8 +17058,9 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -17264,8 +17339,9 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -17540,8 +17616,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -17815,8 +17892,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -18082,8 +18160,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -18357,8 +18436,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index e6d6652..a342479 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -20,8 +20,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -162,8 +163,9 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -310,8 +312,9 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -473,8 +476,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -613,8 +617,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -760,8 +765,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -933,8 +939,9 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1148,8 +1155,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1350,8 +1358,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1492,8 +1501,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1638,8 +1648,9 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1780,8 +1791,9 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1928,8 +1940,9 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2091,8 +2104,9 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2231,8 +2245,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2378,8 +2393,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2551,8 +2567,9 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2766,8 +2783,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -2982,8 +3000,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3135,8 +3154,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3293,8 +3313,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3464,8 +3485,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3612,8 +3634,9 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3767,8 +3790,9 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3937,8 +3961,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4090,8 +4115,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4261,8 +4287,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4563,8 +4590,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4874,8 +4902,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5184,8 +5213,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5477,8 +5507,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -5779,8 +5810,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6071,8 +6103,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6305,8 +6338,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6547,8 +6581,9 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6859,8 +6894,9 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -7173,8 +7209,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7516,8 +7553,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7869,8 +7907,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8219,8 +8258,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8551,8 +8591,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -8893,8 +8934,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9226,8 +9268,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9508,8 +9551,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9798,8 +9842,9 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10151,8 +10196,9 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10479,8 +10525,9 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10713,8 +10760,9 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10950,8 +10998,9 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11204,8 +11253,9 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11429,8 +11479,9 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11661,8 +11712,9 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11912,8 +11964,9 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12150,8 +12203,9 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12407,8 +12461,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -12751,8 +12806,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13098,8 +13154,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13461,8 +13518,9 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13794,8 +13852,9 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14134,8 +14193,9 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14494,8 +14554,9 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14841,8 +14902,9 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 6babcfd..0d954e2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -20,8 +20,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -162,8 +163,9 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -310,8 +312,9 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -473,8 +476,9 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -613,8 +617,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -760,8 +765,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -933,8 +939,9 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1148,8 +1155,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1350,8 +1358,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1492,8 +1501,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1638,8 +1648,9 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1780,8 +1791,9 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1928,8 +1940,9 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2091,8 +2104,9 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2231,8 +2245,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2378,8 +2393,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2551,8 +2567,9 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2766,8 +2783,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -2982,8 +3000,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3135,8 +3154,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3293,8 +3313,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3464,8 +3485,9 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3612,8 +3634,9 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3767,8 +3790,9 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3937,8 +3961,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4090,8 +4115,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4261,8 +4287,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4563,8 +4590,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4874,8 +4902,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5184,8 +5213,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5477,8 +5507,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -5779,8 +5810,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6071,8 +6103,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6305,8 +6338,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6547,8 +6581,9 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6859,8 +6894,9 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -7173,8 +7209,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7516,8 +7553,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7869,8 +7907,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8219,8 +8258,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8551,8 +8591,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -8893,8 +8934,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9226,8 +9268,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9508,8 +9551,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9798,8 +9842,9 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10151,8 +10196,9 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10479,8 +10525,9 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10713,8 +10760,9 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10950,8 +10998,9 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11204,8 +11253,9 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11429,8 +11479,9 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11661,8 +11712,9 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11912,8 +11964,9 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12150,8 +12203,9 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12407,8 +12461,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -12751,8 +12806,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13098,8 +13154,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13461,8 +13518,9 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13794,8 +13852,9 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14134,8 +14193,9 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14494,8 +14554,9 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14841,8 +14902,9 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index bce522c..47eb89e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -28,8 +28,9 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -222,8 +223,9 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -420,8 +422,9 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -634,8 +637,9 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -818,8 +822,9 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1009,8 +1014,9 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1220,8 +1226,9 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1418,8 +1425,9 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1617,8 +1625,9 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1811,8 +1820,9 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2009,8 +2019,9 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2223,8 +2234,9 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2407,8 +2419,9 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2598,8 +2611,9 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2809,8 +2823,9 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3007,8 +3022,9 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3206,8 +3222,9 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3416,8 +3433,9 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3627,8 +3645,9 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3851,8 +3870,9 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4045,8 +4065,9 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4246,8 +4267,9 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4485,8 +4507,9 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4773,8 +4796,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5070,8 +5094,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5365,8 +5390,9 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5642,8 +5668,9 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5928,8 +5955,9 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6205,8 +6233,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6424,8 +6453,9 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6651,8 +6681,9 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6948,8 +6979,9 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7249,8 +7281,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7591,8 +7624,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7943,8 +7977,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8292,8 +8327,9 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8623,8 +8659,9 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -8964,8 +9001,9 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9296,8 +9334,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9577,8 +9616,9 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9866,8 +9906,9 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10218,8 +10259,9 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10543,8 +10585,9 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10760,8 +10803,9 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10980,8 +11024,9 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11216,8 +11261,9 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11422,8 +11468,9 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11635,8 +11682,9 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11868,8 +11916,9 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12088,8 +12137,9 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12329,8 +12379,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -12673,8 +12724,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13020,8 +13072,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13383,8 +13436,9 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13716,8 +13770,9 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14056,8 +14111,9 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14416,8 +14472,9 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14763,8 +14820,9 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 5420733..86e6224 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -40,7 +40,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -97,7 +98,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -160,7 +162,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -223,7 +226,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -270,7 +274,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -322,7 +327,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -380,7 +386,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -438,7 +445,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -488,7 +496,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -545,7 +554,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -608,7 +618,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -671,7 +682,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -718,7 +730,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -770,7 +783,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -828,7 +842,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -886,7 +901,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -936,7 +952,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -993,7 +1010,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1056,7 +1074,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1119,7 +1138,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1166,7 +1186,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1218,7 +1239,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1276,7 +1298,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1334,7 +1357,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1382,7 +1406,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1439,7 +1464,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1500,7 +1526,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1563,7 +1590,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1608,7 +1636,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1660,7 +1689,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1716,7 +1746,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1774,7 +1805,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1822,7 +1854,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1879,7 +1912,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1940,7 +1974,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2003,7 +2038,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2048,7 +2084,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2100,7 +2137,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -2156,7 +2194,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2214,7 +2253,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2262,7 +2302,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2319,7 +2360,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -2380,7 +2422,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2443,7 +2486,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2488,7 +2532,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2540,7 +2585,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -2596,7 +2642,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2654,7 +2701,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2702,7 +2750,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2759,7 +2808,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -2820,7 +2870,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2883,7 +2934,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2928,7 +2980,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2980,7 +3033,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -3036,7 +3090,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3094,7 +3149,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3144,7 +3200,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3201,7 +3258,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -3264,7 +3322,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3327,7 +3386,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3374,7 +3434,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3426,7 +3487,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -3484,7 +3546,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3542,7 +3605,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3592,7 +3656,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3639,7 +3704,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3686,7 +3752,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3743,7 +3810,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -3806,7 +3874,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3869,7 +3938,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3916,7 +3986,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3968,7 +4039,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -4026,7 +4098,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4084,7 +4157,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4134,7 +4208,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4191,7 +4266,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -4254,7 +4330,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4317,7 +4394,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4364,7 +4442,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4416,7 +4495,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -4474,7 +4554,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4532,7 +4613,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4584,7 +4666,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4631,7 +4713,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4694,7 +4776,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4755,7 +4837,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4800,7 +4882,8 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -4837,7 +4920,8 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: store atomic i64 %in, ptr %out seq_cst, align 8 @@ -4890,7 +4974,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4941,7 +5026,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4995,7 +5081,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5051,7 +5138,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5109,7 +5197,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5173,7 +5262,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5244,7 +5334,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -5301,7 +5392,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5354,7 +5446,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5413,7 +5506,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5479,7 +5573,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -5532,7 +5627,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5579,7 +5674,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5642,7 +5737,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5703,7 +5798,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5748,7 +5843,8 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 @@ -5785,7 +5881,8 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: store atomic double %in, ptr %out seq_cst, align 8 @@ -5838,7 +5935,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index @@ -5889,7 +5987,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index @@ -5934,7 +6033,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5991,7 +6091,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -6054,7 +6155,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6117,7 +6219,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6164,7 +6267,8 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6216,7 +6320,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -6274,7 +6379,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6332,7 +6438,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6382,7 +6489,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6439,7 +6547,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -6502,7 +6611,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6565,7 +6675,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6612,7 +6723,8 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6664,7 +6776,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -6722,7 +6835,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6780,7 +6894,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 6504668..1914b74 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -19,6 +19,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE @@ -29,6 +30,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE @@ -43,6 +45,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE @@ -53,6 +56,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE @@ -69,6 +73,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 @@ -82,6 +87,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 @@ -99,6 +105,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 @@ -112,6 +119,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index cdfc8f4..95d8ca3 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -58,7 +58,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -97,7 +98,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -160,8 +162,9 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -329,6 +332,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -352,6 +356,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -377,6 +382,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -400,6 +406,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 52fe234..5c4ded9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -21,8 +21,9 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -191,8 +192,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -363,8 +365,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -545,8 +548,9 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -697,8 +701,9 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -852,8 +857,9 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1023,8 +1029,9 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1241,8 +1248,9 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1450,8 +1458,9 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1620,8 +1629,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1792,8 +1802,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1974,8 +1985,9 @@ define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2126,8 +2138,9 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2281,8 +2294,9 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2452,8 +2466,9 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2670,8 +2685,9 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2887,8 +2903,9 @@ define double @global_agent_atomic_fadd_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3107,8 +3124,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3328,8 +3346,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3555,8 +3574,9 @@ define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3757,8 +3777,9 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3962,8 +3983,9 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4195,8 +4217,9 @@ define half @global_agent_atomic_fadd_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4533,8 +4556,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4882,8 +4906,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5229,8 +5254,9 @@ define void @global_agent_atomic_fadd_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5555,8 +5581,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5891,8 +5918,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6218,8 +6246,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6476,8 +6505,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6740,8 +6770,9 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7089,8 +7120,9 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7440,8 +7472,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7832,8 +7865,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8236,8 +8270,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8637,8 +8672,9 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9017,8 +9053,9 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9408,8 +9445,9 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9790,8 +9828,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10110,8 +10149,9 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10436,8 +10476,9 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10840,8 +10881,9 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11207,8 +11249,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -11437,8 +11480,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -11669,8 +11713,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -11905,8 +11950,9 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -12113,8 +12159,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -12324,8 +12371,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -12543,8 +12591,9 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -12777,8 +12826,9 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13007,8 +13057,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13335,8 +13386,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13665,8 +13717,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13999,8 +14052,9 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14317,8 +14371,9 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14638,8 +14693,9 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14967,8 +15023,9 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15299,8 +15356,9 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15635,7 +15693,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX12-NEXT: .LBB58_2: ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index ae5dca4..4f7b616 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -21,8 +21,9 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -182,8 +183,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -345,8 +347,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -508,8 +511,9 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -663,8 +667,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -821,8 +826,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -989,8 +995,9 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1240,8 +1247,9 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1479,8 +1487,9 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1640,8 +1649,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1803,8 +1813,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1966,8 +1977,9 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2121,8 +2133,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2279,8 +2292,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2447,8 +2461,9 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2698,8 +2713,9 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -2947,8 +2963,9 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3118,8 +3135,9 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3290,8 +3308,9 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3461,8 +3480,9 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3622,8 +3642,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3786,8 +3807,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3969,8 +3991,9 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4321,8 +4344,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4684,8 +4708,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5046,8 +5071,9 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5388,8 +5414,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -5740,8 +5767,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6082,8 +6110,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6355,8 +6384,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6634,8 +6664,9 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6998,8 +7029,9 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -7362,8 +7394,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7756,8 +7789,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8162,8 +8196,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8565,8 +8600,9 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8947,8 +8983,9 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9340,8 +9377,9 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9724,8 +9762,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10046,8 +10085,9 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10374,8 +10414,9 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10780,8 +10821,9 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11159,8 +11201,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11450,8 +11493,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11743,8 +11787,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12039,8 +12084,9 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12319,8 +12365,9 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12602,8 +12649,9 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12894,8 +12942,9 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13188,8 +13237,9 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13496,8 +13546,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13893,8 +13944,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14292,8 +14344,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14693,8 +14746,9 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15077,8 +15131,9 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15464,8 +15519,9 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15861,8 +15917,9 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -16260,8 +16317,9 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 915ce74..591e01b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -21,8 +21,9 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -182,8 +183,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -345,8 +347,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -508,8 +511,9 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -663,8 +667,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -821,8 +826,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -989,8 +995,9 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1240,8 +1247,9 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1479,8 +1487,9 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1640,8 +1649,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1803,8 +1813,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1966,8 +1977,9 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2121,8 +2133,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2279,8 +2292,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2447,8 +2461,9 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2698,8 +2713,9 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -2947,8 +2963,9 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3118,8 +3135,9 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3290,8 +3308,9 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3461,8 +3480,9 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3622,8 +3642,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3786,8 +3807,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3969,8 +3991,9 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4321,8 +4344,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4684,8 +4708,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5046,8 +5071,9 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5388,8 +5414,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -5740,8 +5767,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6082,8 +6110,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6355,8 +6384,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6634,8 +6664,9 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6998,8 +7029,9 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -7362,8 +7394,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7756,8 +7789,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8162,8 +8196,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8565,8 +8600,9 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8947,8 +8983,9 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9340,8 +9377,9 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9724,8 +9762,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10046,8 +10085,9 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10374,8 +10414,9 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10780,8 +10821,9 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11159,8 +11201,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11450,8 +11493,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11743,8 +11787,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12039,8 +12084,9 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12319,8 +12365,9 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12602,8 +12649,9 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12894,8 +12942,9 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13188,8 +13237,9 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13496,8 +13546,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13893,8 +13944,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14292,8 +14344,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14693,8 +14746,9 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15077,8 +15131,9 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15464,8 +15519,9 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15861,8 +15917,9 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -16260,8 +16317,9 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index e26619a..8e58f30 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -29,8 +29,9 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -259,8 +260,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -491,8 +493,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -731,8 +734,9 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -950,8 +954,9 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1172,8 +1177,9 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1404,8 +1410,9 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1636,8 +1643,9 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1866,8 +1874,9 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2096,8 +2105,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2328,8 +2338,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2568,8 +2579,9 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2787,8 +2799,9 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3009,8 +3022,9 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3241,8 +3255,9 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3473,8 +3488,9 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3703,8 +3719,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3953,8 +3970,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4204,8 +4222,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4461,8 +4480,9 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4689,8 +4709,9 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4920,8 +4941,9 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -5179,8 +5201,9 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5517,8 +5540,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5866,8 +5890,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6213,8 +6238,9 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6539,8 +6565,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6875,8 +6902,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7202,8 +7230,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7460,8 +7489,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7724,8 +7754,9 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8073,8 +8104,9 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8424,8 +8456,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8816,8 +8849,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9220,8 +9254,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9621,8 +9656,9 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10001,8 +10037,9 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10392,8 +10429,9 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10774,8 +10812,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11094,8 +11133,9 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11420,8 +11460,9 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -11824,8 +11865,9 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12199,8 +12241,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12473,8 +12516,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12749,8 +12793,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13027,8 +13072,9 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13288,8 +13334,9 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13552,8 +13599,9 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13826,8 +13874,9 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -14102,8 +14151,9 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -14394,8 +14444,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14791,8 +14842,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -15190,8 +15242,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -15591,8 +15644,9 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15975,8 +16029,9 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -16362,8 +16417,9 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -16759,8 +16815,9 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -17158,8 +17215,9 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 2f7e91f..157f91c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -3697,7 +3697,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba ; ; GFX12-LABEL: atomic_global_load_saddr_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_NT +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog @@ -3734,7 +3734,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1) ; ; GFX12-LABEL: atomic_global_load_saddr_i32_immneg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog @@ -3772,7 +3772,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre ; ; GFX12-LABEL: atomic_global_load_saddr_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog @@ -3809,7 +3809,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa ; ; GFX12-LABEL: atomic_global_load_saddr_i64_immneg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll index 1102f9d..790056b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll @@ -1466,7 +1466,8 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(ptr addrspace(1) ; ; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1491,7 +1492,8 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(ptr ; ; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1517,7 +1519,8 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(ptr addrspace(1) ; ; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1542,7 +1545,8 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(ptr ; ; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 3bf52a5..40f0acf 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -49,7 +49,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -120,7 +121,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -198,7 +200,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -279,7 +282,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -343,7 +347,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -413,7 +418,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -488,7 +494,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -566,7 +573,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -625,7 +633,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -696,7 +705,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -774,7 +784,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -855,7 +866,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -919,7 +931,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -989,7 +1002,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1064,7 +1078,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1142,7 +1157,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1201,7 +1217,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1272,7 +1289,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1350,7 +1368,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1431,7 +1450,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1495,7 +1515,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1565,7 +1586,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1640,7 +1662,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1718,7 +1741,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1771,7 +1795,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1839,7 +1864,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1911,7 +1937,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1989,7 +2016,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2047,7 +2075,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2114,7 +2143,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2183,7 +2213,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2258,7 +2289,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2311,7 +2343,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2379,7 +2412,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2451,7 +2485,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2529,7 +2564,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2587,7 +2623,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2654,7 +2691,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2723,7 +2761,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2798,7 +2837,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2851,7 +2891,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2919,7 +2960,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2991,7 +3033,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3069,7 +3112,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3127,7 +3171,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3194,7 +3239,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3263,7 +3309,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3338,7 +3385,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3391,7 +3439,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3459,7 +3508,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3531,7 +3581,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3609,7 +3660,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3667,7 +3719,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3734,7 +3787,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3803,7 +3857,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3878,7 +3933,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3937,7 +3993,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4008,7 +4065,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4086,7 +4144,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4167,7 +4226,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4231,7 +4291,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4301,7 +4362,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4376,7 +4438,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4454,7 +4517,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4513,7 +4577,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4568,7 +4633,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4623,7 +4689,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4694,7 +4761,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4772,7 +4840,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4853,7 +4922,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4917,7 +4987,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4987,7 +5058,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5062,7 +5134,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5140,7 +5213,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5199,7 +5273,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5270,7 +5345,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5348,7 +5424,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5429,7 +5506,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5493,7 +5571,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5563,7 +5642,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5638,7 +5718,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5716,7 +5797,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5791,7 +5873,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5864,7 +5947,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5937,7 +6021,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] @@ -6019,7 +6104,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6112,7 +6198,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] @@ -6189,7 +6276,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6261,7 +6349,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] @@ -6340,7 +6429,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6430,7 +6520,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] @@ -6496,7 +6587,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6563,7 +6654,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6626,7 +6717,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6707,7 +6798,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6788,7 +6879,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6870,7 +6961,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6928,7 +7019,8 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6979,7 +7071,8 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7047,7 +7140,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7115,7 +7209,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7184,7 +7279,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7240,7 +7336,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -7311,7 +7408,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -7389,7 +7487,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -7445,7 +7544,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -7516,7 +7616,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -7594,7 +7695,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index aca4730..b9dc27c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -130,8 +130,9 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -290,8 +291,9 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v2, v3 ; GFX12-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -806,6 +808,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, 5 ; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -874,8 +877,9 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1051,6 +1055,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s1, s1, 5 ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -1655,4 +1660,3 @@ entry: %bc = bitcast <2 x i32> %r.1 to <2 x float> ret <2 x float> %bc } - diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 3178396..65614a1 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -22,6 +22,7 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -137,6 +138,7 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -253,6 +255,7 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -366,6 +369,7 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -493,6 +497,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -683,6 +688,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -871,6 +877,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1052,6 +1059,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1253,6 +1261,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1558,6 +1567,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1870,6 +1880,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2163,6 +2174,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2456,6 +2468,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2690,6 +2703,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2938,6 +2952,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3291,6 +3306,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3651,6 +3667,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3992,6 +4009,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4333,6 +4351,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4622,6 +4641,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4889,6 +4909,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5112,6 +5133,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5335,6 +5357,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5549,6 +5572,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_f16 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5769,6 +5793,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6076,6 +6101,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6384,6 +6410,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6681,6 +6708,7 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6996,6 +7024,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_lshl_b32 s5, s1, 3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -7015,6 +7044,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_lshl_b32 s0, s1, 4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -7051,6 +7081,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: ; %bb.7: ; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -8689,6 +8720,7 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -8804,6 +8836,7 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 9a6ac12..6dec36c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -22,6 +22,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -111,6 +112,7 @@ define float @local_atomic_fmax_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -202,6 +204,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -291,6 +294,7 @@ define void @local_atomic_fmax_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -387,6 +391,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -484,6 +489,7 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -583,6 +589,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -680,6 +687,7 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -802,6 +810,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1114,6 +1123,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1433,6 +1443,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1734,6 +1745,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2035,6 +2047,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2277,6 +2290,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2532,6 +2546,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2887,6 +2902,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3249,6 +3265,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3592,6 +3609,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3935,6 +3953,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4226,6 +4245,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4505,6 +4525,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4775,6 +4796,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5044,6 +5066,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5304,6 +5327,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5588,6 +5612,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5963,6 +5988,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6337,6 +6363,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6699,6 +6726,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7043,6 +7071,7 @@ define float @local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7132,6 +7161,7 @@ define void @local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 4bc9404..b3132a2 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -22,6 +22,7 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -111,6 +112,7 @@ define float @local_atomic_fmin_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -202,6 +204,7 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -291,6 +294,7 @@ define void @local_atomic_fmin_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -387,6 +391,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -484,6 +489,7 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -583,6 +589,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -680,6 +687,7 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -802,6 +810,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1114,6 +1123,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1433,6 +1443,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1734,6 +1745,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2035,6 +2047,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2277,6 +2290,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2532,6 +2546,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2887,6 +2902,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3249,6 +3265,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3592,6 +3609,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3935,6 +3953,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4226,6 +4245,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4505,6 +4525,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4775,6 +4796,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5044,6 +5066,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5304,6 +5327,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5588,6 +5612,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5963,6 +5988,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6337,6 +6363,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6699,6 +6726,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7043,6 +7071,7 @@ define float @local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7132,6 +7161,7 @@ define void @local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 760bb52..5ebeddd 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -29,6 +29,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -239,6 +240,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -448,6 +450,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -647,6 +650,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -855,6 +859,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], -4.0, v[3:4] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1070,6 +1075,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], -4.0, v[3:4] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1283,6 +1289,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1487,6 +1494,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1711,6 +1719,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2016,6 +2025,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2328,6 +2338,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2621,6 +2632,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2914,6 +2926,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3148,6 +3161,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3396,6 +3410,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3749,6 +3764,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4109,6 +4125,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4450,6 +4467,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4791,6 +4809,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5080,6 +5099,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5355,6 +5375,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5608,6 +5629,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5859,6 +5881,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6100,6 +6123,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6368,6 +6392,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6743,6 +6768,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7117,6 +7143,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7479,6 +7506,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7830,6 +7858,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -8038,6 +8067,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 7a68fb5..6b7a6fb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -138,6 +138,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -209,6 +210,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -281,6 +283,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -421,6 +424,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -492,6 +496,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -564,6 +569,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -739,6 +745,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-WGP-LABEL: agent_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -747,6 +754,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -835,6 +843,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -844,6 +853,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -933,6 +943,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -942,6 +953,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1113,6 +1125,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1121,6 +1134,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX12-CU-LABEL: agent_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1209,6 +1223,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1218,6 +1233,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1307,6 +1323,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1316,6 +1333,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1491,6 +1509,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-WGP-LABEL: system_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1499,6 +1518,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1591,6 +1611,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1600,6 +1621,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1693,6 +1715,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1702,6 +1725,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1877,6 +1901,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX12-WGP-LABEL: system_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1885,6 +1910,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX12-CU-LABEL: system_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1977,6 +2003,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1986,6 +2013,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: system_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2079,6 +2107,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2088,6 +2117,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: system_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index e1794db..f564f8e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -137,12 +137,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"local"} @@ -205,12 +203,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-as", !"local"} @@ -273,12 +269,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-as", !"local"} @@ -637,12 +631,10 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-WGP-LABEL: agent_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-as", !"local"} @@ -705,12 +697,10 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-as", !"local"} @@ -773,12 +763,10 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-as", !"local"} @@ -1137,12 +1125,10 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-WGP-LABEL: system_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-as", !"local"} @@ -1205,12 +1191,10 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-as", !"local"} @@ -1273,12 +1257,10 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-as", !"local"} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 4128bfe..af7c66a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1065,6 +1065,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1144,6 +1145,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1224,6 +1226,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1365,6 +1368,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1436,6 +1440,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1508,6 +1513,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1683,6 +1689,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-WGP-LABEL: agent_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1691,6 +1698,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -1779,6 +1787,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1788,6 +1797,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -1877,6 +1887,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1886,6 +1897,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -2057,6 +2069,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2065,6 +2078,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX12-CU-LABEL: agent_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2153,6 +2167,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2162,6 +2177,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2251,6 +2267,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2260,6 +2277,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2435,6 +2453,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-WGP-LABEL: system_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2443,6 +2462,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -2535,6 +2555,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2544,6 +2565,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -2637,6 +2659,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2646,6 +2669,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -2821,6 +2845,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX12-WGP-LABEL: system_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2829,6 +2854,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX12-CU-LABEL: system_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2921,6 +2947,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2930,6 +2957,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: system_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3023,6 +3051,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -3032,6 +3061,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: system_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 193f642..45e8b3b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -354,7 +354,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -369,7 +369,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -552,7 +552,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -570,7 +570,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -774,7 +774,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -796,7 +796,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_monotonic_store: @@ -1084,7 +1084,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1229,11 +1229,12 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_release_store: @@ -1244,11 +1245,12 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1393,11 +1395,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_seq_cst_store: @@ -1408,11 +1411,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1552,7 +1556,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw: @@ -1564,7 +1568,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1731,7 +1735,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -1745,7 +1749,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -1903,11 +1907,12 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_release_atomicrmw: @@ -1919,11 +1924,12 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -2106,11 +2112,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -2124,11 +2131,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2313,11 +2321,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -2331,11 +2340,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2515,7 +2525,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2533,7 +2543,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2735,11 +2745,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2757,11 +2768,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2963,11 +2975,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2985,11 +2998,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3211,7 +3225,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: @@ -3227,7 +3241,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -3469,7 +3483,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -3487,7 +3501,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -3720,11 +3734,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg: @@ -3740,11 +3755,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -4002,11 +4018,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4024,11 +4041,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4288,11 +4306,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4310,11 +4329,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4558,7 +4578,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4576,7 +4596,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4820,7 +4840,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4838,7 +4858,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5098,11 +5118,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5120,11 +5141,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5384,11 +5406,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5406,11 +5429,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5670,11 +5694,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5692,11 +5717,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5956,11 +5982,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5978,11 +6005,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6242,11 +6270,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6264,11 +6293,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6528,11 +6558,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6550,11 +6581,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6814,11 +6846,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6836,11 +6869,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -7100,11 +7134,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -7122,11 +7157,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -7375,7 +7411,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7395,7 +7431,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7661,7 +7697,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7684,7 +7720,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7956,11 +7992,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7980,11 +8017,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8266,11 +8304,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8293,11 +8332,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8582,11 +8622,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8609,11 +8650,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8882,7 +8924,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8905,7 +8947,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9174,7 +9216,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9197,7 +9239,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9482,11 +9524,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9509,11 +9552,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9798,11 +9842,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9825,11 +9870,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10114,11 +10160,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10141,11 +10188,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10430,11 +10478,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10457,11 +10506,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10746,11 +10796,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10773,11 +10824,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11062,11 +11114,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11089,11 +11142,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11378,11 +11432,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11405,11 +11460,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11694,11 +11750,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11721,11 +11778,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12085,7 +12143,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12100,7 +12158,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12291,7 +12349,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12310,7 +12368,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12523,7 +12581,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12546,7 +12604,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12824,7 +12882,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_store: @@ -12835,7 +12893,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -12980,11 +13038,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_release_store: @@ -12995,11 +13054,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13144,11 +13204,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_store: @@ -13159,11 +13220,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13303,7 +13365,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: @@ -13315,7 +13377,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13478,7 +13540,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -13492,7 +13554,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -13650,11 +13712,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw: @@ -13666,11 +13729,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13849,11 +13913,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -13867,11 +13932,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14052,11 +14118,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -14070,11 +14137,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14262,7 +14330,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14281,7 +14349,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14492,11 +14560,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14515,11 +14584,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14730,11 +14800,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14753,11 +14824,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14980,7 +15052,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: @@ -14996,7 +15068,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15234,7 +15306,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -15252,7 +15324,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -15485,11 +15557,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: @@ -15505,11 +15578,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15763,11 +15837,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -15785,11 +15860,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16045,11 +16121,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16067,11 +16144,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16311,7 +16389,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16329,7 +16407,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16569,7 +16647,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16587,7 +16665,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16843,11 +16921,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16865,11 +16944,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17125,11 +17205,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17147,11 +17228,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17407,11 +17489,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17429,11 +17512,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17689,11 +17773,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17711,11 +17796,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17971,11 +18057,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17993,11 +18080,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18253,11 +18341,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18275,11 +18364,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18535,11 +18625,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18557,11 +18648,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18817,11 +18909,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18839,11 +18932,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -19092,7 +19186,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19112,7 +19206,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19386,7 +19480,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19410,7 +19504,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19683,11 +19777,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19707,11 +19802,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -20001,11 +20097,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20029,11 +20126,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20327,11 +20425,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20355,11 +20454,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20637,7 +20737,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20661,7 +20761,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20939,7 +21039,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20963,7 +21063,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21257,11 +21357,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21285,11 +21386,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21583,11 +21685,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21611,11 +21714,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21909,11 +22013,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21937,11 +22042,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22235,11 +22341,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22263,11 +22370,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22561,11 +22669,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22589,11 +22698,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22887,11 +22997,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22915,11 +23026,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23213,11 +23325,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23241,11 +23354,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23539,11 +23653,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23567,11 +23682,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 3bb8714..e77f143 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -354,7 +354,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -369,7 +369,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -554,7 +554,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -572,7 +572,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -778,7 +778,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -800,7 +800,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1077,7 +1077,7 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_monotonic_store: @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1235,11 +1235,12 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_release_store: @@ -1250,11 +1251,12 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1401,11 +1403,12 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_seq_cst_store: @@ -1416,11 +1419,12 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1560,7 +1564,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw: @@ -1572,7 +1576,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1741,7 +1745,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -1755,7 +1759,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -1915,11 +1919,12 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_release_atomicrmw: @@ -1931,11 +1936,12 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -2122,11 +2128,12 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -2140,11 +2147,12 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2333,11 +2341,12 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -2351,11 +2360,12 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2537,7 +2547,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2555,7 +2565,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2761,11 +2771,12 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2783,11 +2794,12 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2993,11 +3005,12 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3015,11 +3028,12 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3241,7 +3255,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: @@ -3257,7 +3271,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -3501,7 +3515,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -3519,7 +3533,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -3754,11 +3768,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg: @@ -3774,11 +3789,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -4040,11 +4056,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4062,11 +4079,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4330,11 +4348,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4352,11 +4371,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4602,7 +4622,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4620,7 +4640,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4866,7 +4886,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4884,7 +4904,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5148,11 +5168,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5170,11 +5191,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5438,11 +5460,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5460,11 +5483,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5728,11 +5752,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5750,11 +5775,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6018,11 +6044,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6040,11 +6067,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6308,11 +6336,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6330,11 +6359,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6598,11 +6628,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6620,11 +6651,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6888,11 +6920,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6910,11 +6943,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -7178,11 +7212,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -7200,11 +7235,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -7453,7 +7489,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7473,7 +7509,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7741,7 +7777,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7764,7 +7800,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8038,11 +8074,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8062,11 +8099,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8352,11 +8390,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8379,11 +8418,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8672,11 +8712,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8699,11 +8740,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8974,7 +9016,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8997,7 +9039,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9268,7 +9310,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9291,7 +9333,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9580,11 +9622,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9607,11 +9650,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9900,11 +9944,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9927,11 +9972,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10220,11 +10266,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10247,11 +10294,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10540,11 +10588,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10567,11 +10616,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10860,11 +10910,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10887,11 +10938,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11180,11 +11232,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11207,11 +11260,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11500,11 +11554,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11527,11 +11582,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11820,11 +11876,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11847,11 +11904,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12211,7 +12269,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12226,7 +12284,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12419,7 +12477,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12438,7 +12496,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12653,7 +12711,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12676,7 +12734,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12954,7 +13012,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_store: @@ -12965,7 +13023,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13112,11 +13170,12 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_release_store: @@ -13127,11 +13186,12 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13278,11 +13338,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_store: @@ -13293,11 +13354,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13437,7 +13499,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: @@ -13449,7 +13511,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13614,7 +13676,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -13628,7 +13690,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -13788,11 +13850,12 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw: @@ -13804,11 +13867,12 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13991,11 +14055,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -14009,11 +14074,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14198,11 +14264,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -14216,11 +14283,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14410,7 +14478,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14429,7 +14497,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14644,11 +14712,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14667,11 +14736,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14886,11 +14956,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14909,11 +14980,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -15136,7 +15208,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: @@ -15152,7 +15224,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15392,7 +15464,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15410,7 +15482,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15645,11 +15717,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: @@ -15665,11 +15738,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15927,11 +16001,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15949,11 +16024,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16213,11 +16289,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16235,11 +16312,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16481,7 +16559,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16499,7 +16577,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16741,7 +16819,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16759,7 +16837,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17019,11 +17097,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17041,11 +17120,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17305,11 +17385,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17327,11 +17408,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17591,11 +17673,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17613,11 +17696,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17877,11 +17961,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17899,11 +17984,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -18163,11 +18249,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -18185,11 +18272,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -18449,11 +18537,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -18471,11 +18560,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -18735,11 +18825,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -18757,11 +18848,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -19021,11 +19113,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -19043,11 +19136,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -19296,7 +19390,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19316,7 +19410,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19592,7 +19686,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19616,7 +19710,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19891,11 +19985,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19915,11 +20010,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -20213,11 +20309,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20241,11 +20338,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20543,11 +20641,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20571,11 +20670,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20855,7 +20955,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20879,7 +20979,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21159,7 +21259,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21183,7 +21283,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21481,11 +21581,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21509,11 +21610,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21811,11 +21913,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21839,11 +21942,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22141,11 +22245,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22169,11 +22274,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22471,11 +22577,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22499,11 +22606,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22801,11 +22909,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22829,11 +22938,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23131,11 +23241,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23159,11 +23270,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23461,11 +23573,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23489,11 +23602,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23791,11 +23905,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23819,11 +23934,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 8708fcf..6bf54cc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -901,7 +901,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1018,11 +1018,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_volatile_workgroup_release_store: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 405168a..8949e4b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -354,7 +354,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -549,7 +549,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -764,7 +764,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_store: @@ -1210,11 +1210,12 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_release_store: @@ -1367,11 +1368,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_store: @@ -1523,7 +1525,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw: @@ -1691,7 +1693,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -1858,11 +1860,12 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw: @@ -2043,11 +2046,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2231,11 +2235,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2426,7 +2431,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2637,11 +2642,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2853,11 +2859,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3096,7 +3103,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: @@ -3343,7 +3350,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -3589,11 +3596,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: @@ -3853,11 +3861,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4120,11 +4129,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4375,7 +4385,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4625,7 +4635,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4887,11 +4897,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5154,11 +5165,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5421,11 +5433,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5688,11 +5701,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5959,7 +5973,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6242,7 +6256,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6531,11 +6545,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6831,11 +6846,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7135,11 +7151,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7427,7 +7444,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7714,7 +7731,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8013,11 +8030,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8317,11 +8335,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8621,11 +8640,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8925,11 +8945,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9229,11 +9250,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9533,11 +9555,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9837,11 +9860,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10141,11 +10165,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10527,7 +10552,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10718,7 +10743,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10923,7 +10948,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11216,7 +11241,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_store: @@ -11362,11 +11387,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_store: @@ -11512,11 +11538,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_store: @@ -11667,7 +11694,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: @@ -11827,7 +11854,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -11987,11 +12014,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: @@ -12157,11 +12185,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -12329,11 +12358,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -12518,7 +12548,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12719,11 +12749,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12924,11 +12955,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13166,7 +13198,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: @@ -13405,7 +13437,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -13644,11 +13676,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: @@ -13893,11 +13926,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14144,11 +14178,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14389,7 +14424,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14630,7 +14665,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14877,11 +14912,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15128,11 +15164,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15379,11 +15416,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15630,11 +15668,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15881,11 +15920,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16132,11 +16172,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16383,11 +16424,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16634,11 +16676,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16903,7 +16946,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -17182,7 +17225,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -17465,11 +17508,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -17754,11 +17798,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18047,11 +18092,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18334,7 +18380,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18617,7 +18663,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18906,11 +18952,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19199,11 +19246,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19492,11 +19540,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19785,11 +19834,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20078,11 +20128,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20371,11 +20422,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20664,11 +20716,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20957,11 +21010,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 793a1ba..b5686099 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -382,7 +382,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -394,7 +394,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -592,7 +592,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -607,7 +607,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -819,7 +819,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -838,7 +838,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_monotonic_store: @@ -1178,7 +1178,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1351,11 +1351,12 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_release_store: @@ -1366,11 +1367,12 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1543,11 +1545,12 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_seq_cst_store: @@ -1558,11 +1561,12 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1710,7 +1714,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_monotonic_atomicrmw: @@ -1720,7 +1724,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1893,7 +1897,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -1905,7 +1909,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2072,11 +2076,12 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_release_atomicrmw: @@ -2086,11 +2091,12 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -2280,11 +2286,12 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -2296,11 +2303,12 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2492,11 +2500,12 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -2508,11 +2517,12 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2699,7 +2709,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2714,7 +2724,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2922,11 +2932,12 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2941,11 +2952,12 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3153,11 +3165,12 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -3172,11 +3185,12 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3390,7 +3404,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: @@ -3405,7 +3419,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -3639,7 +3653,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -3656,7 +3670,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -3884,11 +3898,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg: @@ -3903,11 +3918,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -4158,11 +4174,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4179,11 +4196,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4436,11 +4454,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4457,11 +4476,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4697,7 +4717,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4714,7 +4734,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4950,7 +4970,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4967,7 +4987,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5220,11 +5240,12 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5241,11 +5262,12 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5498,11 +5520,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5519,11 +5542,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5776,11 +5800,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5797,11 +5822,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6054,11 +6080,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6075,11 +6102,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6332,11 +6360,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6353,11 +6382,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6610,11 +6640,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6631,11 +6662,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6888,11 +6920,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6909,11 +6942,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -7166,11 +7200,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -7187,11 +7222,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -7427,7 +7463,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -7444,7 +7480,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -7697,7 +7733,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7717,7 +7753,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -7975,11 +8011,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -7996,11 +8033,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -8270,11 +8308,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8294,11 +8333,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8571,11 +8611,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8595,11 +8636,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8855,7 +8897,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8875,7 +8917,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9131,7 +9173,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9151,7 +9193,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9424,11 +9466,12 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9448,11 +9491,12 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9725,11 +9769,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9749,11 +9794,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10026,11 +10072,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10050,11 +10097,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10327,11 +10375,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10351,11 +10400,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10628,11 +10678,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10652,11 +10703,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10929,11 +10981,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10953,11 +11006,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11230,11 +11284,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11254,11 +11309,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11531,11 +11587,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11555,11 +11612,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11944,7 +12002,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -11956,7 +12014,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -12154,7 +12212,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12169,7 +12227,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12381,7 +12439,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12400,7 +12458,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12729,7 +12787,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_store: @@ -12740,7 +12798,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -12913,11 +12971,12 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_release_store: @@ -12928,11 +12987,12 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -13105,11 +13165,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store: @@ -13120,11 +13181,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -13272,7 +13334,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: @@ -13282,7 +13344,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -13455,7 +13517,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -13467,7 +13529,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -13634,11 +13696,12 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_release_atomicrmw: @@ -13648,11 +13711,12 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -13842,11 +13906,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -13858,11 +13923,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14054,11 +14120,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -14070,11 +14137,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14261,7 +14329,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14276,7 +14344,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14484,11 +14552,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14503,11 +14572,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14715,11 +14785,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14734,11 +14805,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14952,7 +15024,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: @@ -14967,7 +15039,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -15201,7 +15273,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -15218,7 +15290,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -15446,11 +15518,12 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: @@ -15465,11 +15538,12 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -15720,11 +15794,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -15741,11 +15816,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -15998,11 +16074,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16019,11 +16096,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16259,7 +16337,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16276,7 +16354,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16512,7 +16590,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16529,7 +16607,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16782,11 +16860,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16803,11 +16882,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17060,11 +17140,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17081,11 +17162,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17338,11 +17420,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17359,11 +17442,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17616,11 +17700,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17637,11 +17722,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17894,11 +17980,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17915,11 +18002,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18172,11 +18260,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18193,11 +18282,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18450,11 +18540,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18471,11 +18562,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18728,11 +18820,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18749,11 +18842,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18989,7 +19083,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -19006,7 +19100,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -19259,7 +19353,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19279,7 +19373,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19552,11 +19646,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19576,11 +19671,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19853,11 +19949,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19877,11 +19974,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20137,7 +20235,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20157,7 +20255,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20413,7 +20511,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20433,7 +20531,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20706,11 +20804,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20730,11 +20829,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21007,11 +21107,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21031,11 +21132,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21308,11 +21410,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21332,11 +21435,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21609,11 +21713,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21633,11 +21738,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21910,11 +22016,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21934,11 +22041,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22211,11 +22319,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22235,11 +22344,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22512,11 +22622,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22536,11 +22647,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22813,11 +22925,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22837,11 +22950,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 1dbf0b2..62a4f3b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -382,7 +382,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -394,7 +394,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -594,7 +594,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -609,7 +609,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -823,7 +823,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -842,7 +842,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1171,7 +1171,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_monotonic_store: @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1357,11 +1357,12 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_release_store: @@ -1372,11 +1373,12 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1551,11 +1553,12 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_seq_cst_store: @@ -1566,11 +1569,12 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1718,7 +1722,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_monotonic_atomicrmw: @@ -1728,7 +1732,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1903,7 +1907,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -1915,7 +1919,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2084,11 +2088,12 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_release_atomicrmw: @@ -2098,11 +2103,12 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -2296,11 +2302,12 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -2312,11 +2319,12 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2512,11 +2520,12 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -2528,11 +2537,12 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2721,7 +2731,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2736,7 +2746,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2948,11 +2958,12 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2967,11 +2978,12 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3183,11 +3195,12 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -3202,11 +3215,12 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3420,7 +3434,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: @@ -3435,7 +3449,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -3671,7 +3685,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -3688,7 +3702,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -3918,11 +3932,12 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg: @@ -3937,11 +3952,12 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -4196,11 +4212,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4217,11 +4234,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4478,11 +4496,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4499,11 +4518,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4741,7 +4761,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4758,7 +4778,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4996,7 +5016,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5013,7 +5033,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5270,11 +5290,12 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5291,11 +5312,12 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5552,11 +5574,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5573,11 +5596,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5834,11 +5858,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5855,11 +5880,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6116,11 +6142,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6137,11 +6164,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6377,7 +6405,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -6394,7 +6422,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -6649,7 +6677,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -6669,7 +6697,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -6946,11 +6974,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -6970,11 +6999,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -7251,11 +7281,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7275,11 +7306,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -7537,7 +7569,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7557,7 +7589,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -7815,7 +7847,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7835,7 +7867,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8112,11 +8144,12 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8136,11 +8169,12 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8417,11 +8451,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8441,11 +8476,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8722,11 +8758,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8746,11 +8783,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9027,11 +9065,12 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9051,11 +9090,12 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9332,11 +9372,12 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9356,11 +9397,12 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9637,11 +9679,12 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9661,11 +9704,12 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9942,11 +9986,12 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9966,11 +10011,12 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10247,11 +10293,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10271,11 +10318,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10660,7 +10708,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -10672,7 +10720,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -10872,7 +10920,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10887,7 +10935,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11101,7 +11149,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11120,7 +11168,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11449,7 +11497,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_store: @@ -11460,7 +11508,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -11635,11 +11683,12 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_release_store: @@ -11650,11 +11699,12 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -11829,11 +11879,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_store: @@ -11844,11 +11895,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -11996,7 +12048,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_atomicrmw: @@ -12006,7 +12058,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -12181,7 +12233,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -12193,7 +12245,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -12362,11 +12414,12 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_release_atomicrmw: @@ -12376,11 +12429,12 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -12574,11 +12628,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -12590,11 +12645,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -12790,11 +12846,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -12806,11 +12863,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -12999,7 +13057,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13014,7 +13072,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -13226,11 +13284,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13245,11 +13304,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -13461,11 +13521,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13480,11 +13541,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -13698,7 +13760,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: @@ -13713,7 +13775,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -13949,7 +14011,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -13966,7 +14028,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14196,11 +14258,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: @@ -14215,11 +14278,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -14474,11 +14538,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -14495,11 +14560,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14756,11 +14822,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -14777,11 +14844,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15019,7 +15087,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15036,7 +15104,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15274,7 +15342,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15291,7 +15359,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15548,11 +15616,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15569,11 +15638,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15830,11 +15900,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15851,11 +15922,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16112,11 +16184,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16133,11 +16206,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16394,11 +16468,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16415,11 +16490,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16676,11 +16752,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16697,11 +16774,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16958,11 +17036,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16979,11 +17058,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17240,11 +17320,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17261,11 +17342,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17522,11 +17604,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17543,11 +17626,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17783,7 +17867,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -17800,7 +17884,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -18055,7 +18139,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18075,7 +18159,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -18335,11 +18419,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -18356,11 +18441,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -18634,11 +18720,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18658,11 +18745,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -18939,11 +19027,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18963,11 +19052,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19225,7 +19315,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19245,7 +19335,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19503,7 +19593,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19523,7 +19613,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19800,11 +19890,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19824,11 +19915,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20105,11 +20197,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20129,11 +20222,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20410,11 +20504,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20434,11 +20529,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20715,11 +20811,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20739,11 +20836,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21020,11 +21118,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21044,11 +21143,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21325,11 +21425,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21349,11 +21450,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21630,11 +21732,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21654,11 +21757,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21935,11 +22039,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21959,11 +22064,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 11206cb..a98efb4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -837,7 +837,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -972,11 +972,12 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_volatile_workgroup_release_store: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 643684b..30bf492 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -382,7 +382,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -582,7 +582,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -794,7 +794,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_monotonic_store: @@ -1316,11 +1316,12 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_release_store: @@ -1501,11 +1502,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_store: @@ -1665,7 +1667,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_monotonic_atomicrmw: @@ -1831,7 +1833,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2004,11 +2006,12 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_release_atomicrmw: @@ -2188,11 +2191,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2374,11 +2378,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2566,7 +2571,7 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2772,11 +2777,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2983,11 +2989,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -3214,7 +3221,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: @@ -3446,7 +3453,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -3685,11 +3692,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: @@ -3935,11 +3943,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4187,11 +4196,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4426,7 +4436,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4660,7 +4670,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4907,11 +4917,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5159,11 +5170,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5411,11 +5423,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5663,11 +5676,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5915,11 +5929,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -6167,11 +6182,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -6419,11 +6435,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -6671,11 +6688,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -6927,7 +6945,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -7186,7 +7204,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7457,11 +7475,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -7734,11 +7753,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8014,11 +8034,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8281,7 +8302,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8543,7 +8564,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8818,11 +8839,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9098,11 +9120,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9378,11 +9401,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9658,11 +9682,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9938,11 +9963,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10218,11 +10244,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10498,11 +10525,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10778,11 +10806,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11185,7 +11214,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -11385,7 +11414,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11594,7 +11623,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11935,7 +11964,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store: @@ -12108,11 +12137,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_store: @@ -12285,11 +12315,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store: @@ -12448,7 +12479,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: @@ -12614,7 +12645,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -12780,11 +12811,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_atomicrmw: @@ -12956,11 +12988,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -13134,11 +13167,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -13325,7 +13359,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13524,11 +13558,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13727,11 +13762,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13957,7 +13993,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: @@ -14189,7 +14225,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14421,11 +14457,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: @@ -14663,11 +14700,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14907,11 +14945,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15145,7 +15184,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15379,7 +15418,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15619,11 +15658,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15863,11 +15903,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16107,11 +16148,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16351,11 +16393,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16595,11 +16638,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16839,11 +16883,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -17083,11 +17128,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -17327,11 +17373,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -17582,7 +17629,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -17841,7 +17888,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18105,11 +18152,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -18374,11 +18422,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18646,11 +18695,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18912,7 +18962,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19174,7 +19224,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19442,11 +19492,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19714,11 +19765,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19986,11 +20038,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20258,11 +20311,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20530,11 +20584,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20802,11 +20857,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21074,11 +21130,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21346,11 +21403,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index ff1538b..02cd97c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -1140,6 +1140,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1298,6 +1299,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1752,6 +1754,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1923,6 +1926,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2097,6 +2101,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2486,6 +2491,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2692,6 +2698,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3270,6 +3277,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3481,6 +3489,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3695,6 +3704,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4301,6 +4311,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4515,6 +4526,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4729,6 +4741,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4943,6 +4956,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5157,6 +5171,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5371,6 +5386,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5585,6 +5601,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5799,6 +5816,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6480,6 +6498,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6723,6 +6742,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6967,6 +6987,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7663,6 +7684,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7907,6 +7929,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8151,6 +8174,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8395,6 +8419,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8639,6 +8664,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8883,6 +8909,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9127,6 +9154,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9371,6 +9399,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 6d52a4d..1c4c8d4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -1140,6 +1140,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1298,6 +1299,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1752,6 +1754,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1923,6 +1926,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2097,6 +2101,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2486,6 +2491,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2692,6 +2698,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3270,6 +3277,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3481,6 +3489,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3695,6 +3704,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4301,6 +4311,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4515,6 +4526,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4729,6 +4741,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4943,6 +4956,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5157,6 +5171,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5371,6 +5386,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5585,6 +5601,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5799,6 +5816,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6480,6 +6498,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6723,6 +6742,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6967,6 +6987,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7663,6 +7684,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7907,6 +7929,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8151,6 +8174,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8395,6 +8419,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8639,6 +8664,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8883,6 +8909,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9127,6 +9154,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9371,6 +9399,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 2a15a0d..a52dd9b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -834,6 +834,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 0f56342..c242963 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -1140,6 +1140,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1298,6 +1299,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1752,6 +1754,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1923,6 +1926,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2097,6 +2101,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2486,6 +2491,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2692,6 +2698,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3270,6 +3277,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3481,6 +3489,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3695,6 +3704,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4301,6 +4311,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4515,6 +4526,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4729,6 +4741,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4943,6 +4956,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5157,6 +5171,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5371,6 +5386,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5585,6 +5601,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5799,6 +5816,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6480,6 +6498,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6723,6 +6742,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6967,6 +6987,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7663,6 +7684,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7907,6 +7929,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8151,6 +8174,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8395,6 +8419,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8639,6 +8664,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8883,6 +8909,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9127,6 +9154,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9371,6 +9399,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 |