; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s

; Test single atomic RMW - s_wait_xcnt should be kept.
; A lone seq_cst atomicrmw add: the expected output has s_wait_xcnt 0x0 in the
; wait sequence ahead of the global_atomic_add_u32.
define amdgpu_kernel void @single_atomic_rmw(ptr addrspace(1) %ptr) {
; GFX1250-LABEL: single_atomic_rmw:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  ret void
}

; Test back-to-back atomic RMW operations - only first s_wait_xcnt should remain.
; Three consecutive seq_cst atomicrmw adds to adjacent i32 slots with nothing in
; between: only the first atomic is expected to be preceded by s_wait_xcnt 0x0.
define amdgpu_kernel void @atomic_rmw_back_to_back(ptr addrspace(1) %ptr) {
; GFX1250-LABEL: atomic_rmw_back_to_back:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  ret void
}

; Test atomic RMW block with interleaved ALU ops - should not break the block.
; An integer add and an integer mul are computed between the atomics (expected
; as s_add_co_i32 / s_mul_i32 plus v_mov_b32); the second and third atomics are
; expected without an s_wait_xcnt of their own.
define amdgpu_kernel void @atomic_rmw_with_alu(ptr addrspace(1) %ptr, i32 %a, i32 %b) {
; GFX1250-LABEL: atomic_rmw_with_alu:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0xc
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_add_co_i32 s4, s2, s3
; GFX1250-NEXT: v_mov_b32_e32 v1, s4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mul_i32 s2, s2, s3
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %sum = add i32 %a, %b
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 %sum seq_cst
  %prod = mul i32 %a, %b
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %prod seq_cst
  ret void
}

; Test atomic RMW block broken by global load (VMEM).
; A plain global load feeds the third atomic; the expected output has
; s_wait_xcnt 0x0 again in the wait sequence ahead of that atomic.
define amdgpu_kernel void @atomic_rmw_broken_by_global_load(ptr addrspace(1) %ptr, ptr addrspace(1) %load_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_global_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %load = load i32, ptr addrspace(1) %load_ptr
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
  ret void
}

; Test atomic RMW block broken by global store (VMEM).
; A plain global store sits between the second and third atomics; the expected
; output has s_wait_xcnt 0x0 right after the store and again ahead of the third
; atomic.
define amdgpu_kernel void @atomic_rmw_broken_by_global_store(ptr addrspace(1) %ptr, ptr addrspace(1) %store_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_global_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 42
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  store i32 42, ptr addrspace(1) %store_ptr
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  ret void
}

; Test atomic RMW block broken by FLAT load (VMEM).
; A load through a generic (flat) pointer feeds the third atomic.
; NOTE(review): unlike the global-load case, the expected output here has no
; s_wait_xcnt between flat_load_b32 and the third atomic - confirm that this
; matches the "broken" intent stated above.
define amdgpu_kernel void @atomic_rmw_broken_by_flat_load(ptr addrspace(1) %ptr, ptr %flat_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_flat_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %load = load i32, ptr %flat_ptr, align 4
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
  ret void
}

; Test atomic RMW block broken by FLAT store (VMEM).
; A store through a generic (flat) pointer sits between the second and third
; atomics; the expected output has s_wait_xcnt 0x0 right after flat_store_b32.
define amdgpu_kernel void @atomic_rmw_broken_by_flat_store(ptr addrspace(1) %ptr, ptr %flat_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_flat_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 42
; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  store i32 42, ptr %flat_ptr, align 4
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  ret void
}

; Test atomic RMW block broken by SMEM load.
; The addrspace(4) load is expected to be emitted as s_load_b32 and feeds the
; third atomic, which is expected to be preceded by s_wait_xcnt 0x0 again.
define amdgpu_kernel void @atomic_rmw_broken_by_smem_load(ptr addrspace(1) %ptr, ptr addrspace(4) %const_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_smem_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %load = load i32, ptr addrspace(4) %const_ptr
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
  ret void
}

; Test atomic RMW block broken by atomic store.
; A seq_cst atomic store sits between the second and third atomics; the
; expected output has s_wait_xcnt 0x0 before the store, right after it, and
; again ahead of the third atomic.
define amdgpu_kernel void @atomic_rmw_broken_by_atomic_store(ptr addrspace(1) %ptr, ptr addrspace(1) %store_ptr) {
; GFX1250-LABEL: atomic_rmw_broken_by_atomic_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 42
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  store atomic i32 42, ptr addrspace(1) %store_ptr seq_cst, align 4
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  ret void
}

; Test LDS load should not break atomic RMW block.
; The addrspace(3) load is expected to be emitted as ds_load_b32; the third
; atomic is expected without an s_wait_xcnt of its own.
define amdgpu_kernel void @atomic_rmw_with_lds_load(ptr addrspace(1) %ptr, ptr addrspace(3) %lds_ptr) {
; GFX1250-LABEL: atomic_rmw_with_lds_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: ds_load_b32 v1, v1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %load = load i32, ptr addrspace(3) %lds_ptr, align 4
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
  ret void
}

; Test LDS store should not break atomic RMW block.
; The addrspace(3) store is expected to be emitted as ds_store_b32; the third
; atomic is expected without an s_wait_xcnt of its own.
define amdgpu_kernel void @atomic_rmw_with_lds_store(ptr addrspace(1) %ptr, ptr addrspace(3) %lds_ptr) {
; GFX1250-LABEL: atomic_rmw_with_lds_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v2, 42
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: ds_store_b32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  store i32 42, ptr addrspace(3) %lds_ptr, align 4
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  ret void
}

; Test FLAT load from LDS should not break atomic RMW block.
; The LDS pointer is addrspacecast to a generic pointer before the load, so the
; load is expected to be emitted as flat_load_b32; the third atomic is expected
; without an s_wait_xcnt of its own.
define amdgpu_kernel void @atomic_rmw_with_flat_lds_load(ptr addrspace(1) %ptr, ptr addrspace(3) %lds_ptr) {
; GFX1250-LABEL: atomic_rmw_with_flat_lds_load:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mov_b64 s[6:7], 0
; GFX1250-NEXT: s_mov_b32 s4, s7
; GFX1250-NEXT: s_mov_b64 s[8:9], src_shared_base
; GFX1250-NEXT: s_mov_b32 s3, s9
; GFX1250-NEXT: s_mov_b32 s5, -1
; GFX1250-NEXT: s_cmp_lg_u32 s2, s5
; GFX1250-NEXT: s_cselect_b32 s4, s3, s4
; GFX1250-NEXT: s_mov_b32 s3, s6
; GFX1250-NEXT: s_cselect_b32 s2, s2, s3
; GFX1250-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GFX1250-NEXT: s_mov_b32 s3, s4
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %flat_lds = addrspacecast ptr addrspace(3) %lds_ptr to ptr
  %load = load i32, ptr %flat_lds, align 4
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
  ret void
}

; Test FLAT store to LDS should not break atomic RMW block.
; The LDS pointer is addrspacecast to a generic pointer before the store; the
; expected output has s_wait_xcnt 0x0 right after flat_store_b32 but none in the
; third atomic's own wait sequence.
define amdgpu_kernel void @atomic_rmw_with_flat_lds_store(ptr addrspace(1) %ptr, ptr addrspace(3) %lds_ptr) {
; GFX1250-LABEL: atomic_rmw_with_flat_lds_store:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_mov_b64 s[6:7], 0
; GFX1250-NEXT: s_mov_b32 s4, s7
; GFX1250-NEXT: s_mov_b64 s[8:9], src_shared_base
; GFX1250-NEXT: s_mov_b32 s3, s9
; GFX1250-NEXT: s_mov_b32 s5, -1
; GFX1250-NEXT: s_cmp_lg_u32 s2, s5
; GFX1250-NEXT: s_cselect_b32 s4, s3, s4
; GFX1250-NEXT: s_mov_b32 s3, s6
; GFX1250-NEXT: s_cselect_b32 s2, s2, s3
; GFX1250-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GFX1250-NEXT: s_mov_b32 s3, s4
; GFX1250-NEXT: v_mov_b32_e32 v1, 42
; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %flat_lds = addrspacecast ptr addrspace(3) %lds_ptr to ptr
  store i32 42, ptr %flat_lds, align 4
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  ret void
}

; Test atomic RMW block broken by async copy from global to LDS.
; The expected output has s_wait_xcnt 0x0 again ahead of the third atomic.
; NOTE(review): the IR below is an ordinary global load followed by an LDS
; store rather than an async-copy intrinsic, and the function name spells
; "broken" as "borken" - confirm whether either should change (a rename also
; requires regenerating the LABEL line).
define amdgpu_kernel void @atomic_rmw_borken_by_async_lds_copy(ptr addrspace(1) %ptr, ptr addrspace(1) %src, ptr addrspace(3) %dst) {
; GFX1250-LABEL: atomic_rmw_borken_by_async_lds_copy:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b64 s[2:3], s[4:5]
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[2:3], 0x8
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x10
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_b32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %load = load i32, ptr addrspace(1) %src, align 4
  store i32 %load, ptr addrspace(3) %dst, align 4
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  ret void
}

; Test multiple separate atomic RMW blocks.
; Two atomics, a global load, then two more atomics: s_wait_xcnt 0x0 is
; expected ahead of the first and third atomics only.
define amdgpu_kernel void @multiple_atomic_rmw_blocks(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) {
; First block
; GFX1250-LABEL: multiple_atomic_rmw_blocks:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:12 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr1, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr1, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr1, i64 2
  %gep4 = getelementptr i32, ptr addrspace(1) %ptr1, i64 3
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %load = load i32, ptr addrspace(1) %ptr2
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 %load seq_cst
  %val4 = atomicrmw add ptr addrspace(1) %gep4, i32 4 seq_cst
  ret void
}

; Test different atomic RMW operations in a block.
; add, sub, and, or are issued back-to-back on adjacent i32 slots; only the
; first atomic is expected to be preceded by s_wait_xcnt 0x0.
define amdgpu_kernel void @different_atomic_ops(ptr addrspace(1) %ptr) {
; GFX1250-LABEL: different_atomic_ops:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_sub_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 3
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_and_b32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 4
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_atomic_or_b32 v0, v1, s[0:1] offset:12 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_endpgm
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %gep4 = getelementptr i32, ptr addrspace(1) %ptr, i64 3
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw sub ptr addrspace(1) %gep2, i32 2 seq_cst
  %val3 = atomicrmw and ptr addrspace(1) %gep3, i32 3 seq_cst
  %val4 = atomicrmw or ptr addrspace(1) %gep4, i32 4 seq_cst
  ret void
}

; Test atomic RMW block reset at basic block boundary.
define amdgpu_kernel void @atomic_rmw_across_basic_blocks(ptr addrspace(1) %ptr, i32 %cond) {
; GFX1250-LABEL: atomic_rmw_across_basic_blocks:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
; GFX1250-NEXT:    s_wait_kmcnt 0x0
; GFX1250-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX1250-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GFX1250-NEXT:    v_writelane_b32 v2, s4, 0
; GFX1250-NEXT:    v_writelane_b32 v2, s5, 1
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_store_b32 off, v2, off nv ; 4-byte Folded Spill
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 1
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 2
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[2:3] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_mov_b32 s1, 0
; GFX1250-NEXT:    s_cmp_lg_u32 s0, s1
; GFX1250-NEXT:    s_cbranch_scc1 .LBB16_2
; GFX1250-NEXT:  ; %bb.1: ; %then
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_readlane_b32 s0, v2, 0
; GFX1250-NEXT:    v_readlane_b32 s1, v2, 1
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 3
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:  .LBB16_2: ; %exit
; GFX1250-NEXT:    s_endpgm
entry:
  ; Two back-to-back atomics in %entry. In the expected output above,
  ; s_wait_xcnt precedes the first one (%val1) and not the second (%val2).
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %then, label %exit

then:
  ; A single atomic in a new basic block. The expected output has an
  ; s_wait_xcnt in front of it again. The other s_wait_xcnt lines in the
  ; expected output directly follow the scratch spill/reload instructions.
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  br label %exit

exit:
  ret void
}

; Test atomic RMW block in loop.
define amdgpu_kernel void @atomic_rmw_in_loop(ptr addrspace(1) %ptr, i32 %n) {
; GFX1250-LABEL: atomic_rmw_in_loop:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
; GFX1250-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GFX1250-NEXT:    s_wait_kmcnt 0x0
; GFX1250-NEXT:    v_writelane_b32 v2, s2, 0
; GFX1250-NEXT:    v_writelane_b32 v2, s3, 1
; GFX1250-NEXT:    s_mov_b32 s0, 0
; GFX1250-NEXT:    v_writelane_b32 v2, s1, 2
; GFX1250-NEXT:    v_writelane_b32 v2, s0, 3
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_store_b32 off, v2, off nv ; 4-byte Folded Spill
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:  .LBB17_1: ; %loop
; GFX1250-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_readlane_b32 s0, v2, 3
; GFX1250-NEXT:    v_readlane_b32 s1, v2, 2
; GFX1250-NEXT:    v_readlane_b32 s2, v2, 0
; GFX1250-NEXT:    v_readlane_b32 s3, v2, 1
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[2:3] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_mov_b32 s2, 1
; GFX1250-NEXT:    s_add_co_i32 s0, s0, s2
; GFX1250-NEXT:    s_cmp_lt_u32 s0, s1
; GFX1250-NEXT:    v_writelane_b32 v2, s0, 3
; GFX1250-NEXT:    s_mov_b32 s6, exec_lo
; GFX1250-NEXT:    s_mov_b32 exec_lo, -1
; GFX1250-NEXT:    scratch_store_b32 off, v2, off nv ; 4-byte Folded Spill
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_cbranch_scc1 .LBB17_1
; GFX1250-NEXT:  ; %bb.2: ; %exit
; GFX1250-NEXT:    s_endpgm
entry:
  br label %loop

loop:
  ; %i counts up from 0 and the loop repeats while %i.next < %n (unsigned).
  ; Both atomics add the current %i. In the expected loop body above,
  ; s_wait_xcnt precedes the first atomic only; the remaining s_wait_xcnt
  ; lines directly follow the scratch spill/reload instructions.
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 %i seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 %i seq_cst
  %i.next = add i32 %i, 1
  %cmp = icmp ult i32 %i.next, %n
  br i1 %cmp, label %loop, label %exit

exit:
  ret void
}

; Test atomic RMW block with branch in between - state reset at new block.
define amdgpu_kernel void @atomic_rmw_with_branch(ptr addrspace(1) %ptr, i32 %cond) {
; GFX1250-LABEL: atomic_rmw_with_branch:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
; GFX1250-NEXT:    s_wait_kmcnt 0x0
; GFX1250-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX1250-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GFX1250-NEXT:    v_writelane_b32 v2, s4, 0
; GFX1250-NEXT:    v_writelane_b32 v2, s5, 1
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 1
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 2
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[2:3] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_mov_b32 s0, -1
; GFX1250-NEXT:    s_mov_b32 s2, 0
; GFX1250-NEXT:    s_cmp_lg_u32 s1, s2
; GFX1250-NEXT:    v_writelane_b32 v2, s0, 2
; GFX1250-NEXT:    s_mov_b32 s6, exec_lo
; GFX1250-NEXT:    s_mov_b32 exec_lo, -1
; GFX1250-NEXT:    scratch_store_b32 off, v2, off nv ; 4-byte Folded Spill
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_cbranch_scc1 .LBB18_3
; GFX1250-NEXT:  .LBB18_1: ; %Flow
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_readlane_b32 s0, v2, 2
; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX1250-NEXT:    s_mov_b32 s0, 1
; GFX1250-NEXT:    v_cmp_ne_u32_e64 s0, v0, s0
; GFX1250-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
; GFX1250-NEXT:    s_cbranch_vccnz .LBB18_4
; GFX1250-NEXT:  ; %bb.2: ; %bb1
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_readlane_b32 s0, v2, 0
; GFX1250-NEXT:    v_readlane_b32 s1, v2, 1
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 3
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_branch .LBB18_4
; GFX1250-NEXT:  .LBB18_3: ; %bb2
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_readlane_b32 s0, v2, 0
; GFX1250-NEXT:    v_readlane_b32 s1, v2, 1
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 4
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:12 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_mov_b32 s0, 0
; GFX1250-NEXT:    v_writelane_b32 v2, s0, 2
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_store_b32 off, v2, off nv ; 4-byte Folded Spill
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_branch .LBB18_1
; GFX1250-NEXT:  .LBB18_4: ; %merge
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_readlane_b32 s0, v2, 0
; GFX1250-NEXT:    v_readlane_b32 s1, v2, 1
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 5
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:16 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_endpgm
entry:
  ; Two back-to-back atomics, then a two-way branch on %cond == 0. In the
  ; expected output above, s_wait_xcnt precedes %val1 and not %val2.
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %bb1, label %bb2

  ; %bb1, %bb2 and %merge each hold exactly one atomic, and each of those is
  ; preceded by its own s_wait_xcnt in the expected output. The expected
  ; output also contains a %Flow block that has no counterpart in this IR.
bb1:
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  br label %merge

bb2:
  %gep4 = getelementptr i32, ptr addrspace(1) %ptr, i64 3
  %val4 = atomicrmw add ptr addrspace(1) %gep4, i32 4 seq_cst
  br label %merge

merge:
  %gep5 = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val5 = atomicrmw add ptr addrspace(1) %gep5, i32 5 seq_cst
  ret void
}

; Test fall-through block.
define amdgpu_kernel void @atomic_rmw_fallthrough(ptr addrspace(1) %ptr) {
; GFX1250-LABEL: atomic_rmw_fallthrough:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX1250-NEXT:    s_wait_kmcnt 0x0
; GFX1250-NEXT:    s_mov_b64 s[2:3], s[0:1]
; GFX1250-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GFX1250-NEXT:    v_writelane_b32 v2, s2, 0
; GFX1250-NEXT:    v_writelane_b32 v2, s3, 1
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_store_b32 off, v2, off nv ; 4-byte Folded Spill
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 1
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 2
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:  ; %bb.1: ; %next
; GFX1250-NEXT:    s_or_saveexec_b32 s6, -1
; GFX1250-NEXT:    scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_mov_b32 exec_lo, s6
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_readlane_b32 s0, v2, 0
; GFX1250-NEXT:    v_readlane_b32 s1, v2, 1
; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 3
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_xcnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_mov_b32_e32 v1, 4
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] offset:12 scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_storecnt 0x0
; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    s_endpgm
entry:
  ; Two back-to-back atomics, then an unconditional branch to %next. The
  ; expected output above has no branch instruction between %entry and %next.
  %gep1 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %gep2 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %val1 = atomicrmw add ptr addrspace(1) %gep1, i32 1 seq_cst
  %val2 = atomicrmw add ptr addrspace(1) %gep2, i32 2 seq_cst
  br label %next

next:
  ; Two more back-to-back atomics in the successor block. In the expected
  ; output, s_wait_xcnt precedes the first atomic of each block (%val1 and
  ; %val3) and is absent in front of %val2 and %val4.
  %gep3 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %gep4 = getelementptr i32, ptr addrspace(1) %ptr, i64 3
  %val3 = atomicrmw add ptr addrspace(1) %gep3, i32 3 seq_cst
  %val4 = atomicrmw add ptr addrspace(1) %gep4, i32 4 seq_cst
  ret void
}